ocr process — Showing 4 changed files with 159 additions and 61 deletions
| 1 | aiohttp==3.6.2 | ||
| 2 | async-timeout==3.0.1 | ||
| 3 | attrs==19.3.0 | ||
| 1 | certifi==2016.2.28 | 4 | certifi==2016.2.28 |
| 5 | chardet==3.0.4 | ||
| 2 | Django==2.1 | 6 | Django==2.1 |
| 3 | # django-mysqlpool @ https://github.com/smartfile/django-mysqlpool/archive/master.zip | 7 | # django-mysqlpool @ https://github.com/smartfile/django-mysqlpool/archive/master.zip |
| 4 | djangorestframework==3.9.0 | 8 | djangorestframework==3.9.0 |
| 5 | djangorestframework-jwt==1.11.0 | 9 | djangorestframework-jwt==1.11.0 |
| 10 | idna==2.9 | ||
| 11 | idna-ssl==1.1.0 | ||
| 6 | marshmallow==3.6.1 | 12 | marshmallow==3.6.1 |
| 13 | multidict==4.7.6 | ||
| 7 | pdfminer3k==1.3.4 | 14 | pdfminer3k==1.3.4 |
| 8 | Pillow==7.1.2 | 15 | Pillow==7.1.2 |
| 9 | ply==3.11 | 16 | ply==3.11 |
| ... | @@ -17,4 +24,7 @@ redis==3.4.1 | ... | @@ -17,4 +24,7 @@ redis==3.4.1 |
| 17 | # situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master | 24 | # situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master |
| 18 | six==1.14.0 | 25 | six==1.14.0 |
| 19 | SQLAlchemy==0.9.10 | 26 | SQLAlchemy==0.9.10 |
| 27 | typing-extensions==3.7.4.2 | ||
| 20 | webargs==6.1.0 | 28 | webargs==6.1.0 |
| 29 | xlwt==1.3.0 | ||
| 30 | yarl==1.4.2 | ... | ... |
| 1 | import time | ||
| 2 | import os | 1 | import os |
| 3 | import signal | 2 | import time |
| 4 | import fitz | 3 | import fitz |
| 4 | import xlwt | ||
| 5 | import signal | ||
| 6 | import base64 | ||
| 7 | import asyncio | ||
| 8 | import aiohttp | ||
| 5 | from PIL import Image | 9 | from PIL import Image |
| 6 | from io import BytesIO | 10 | from io import BytesIO |
| 7 | 11 | ||
| 8 | from django.core.management import BaseCommand | 12 | from django.core.management import BaseCommand |
| 9 | from common.mixins import LoggerMixin | 13 | from common.mixins import LoggerMixin |
| 10 | from common.redis_cache import redis_handler as rh | 14 | from common.redis_cache import redis_handler as rh |
| 15 | from common.tools.file_tools import write_zip_file | ||
| 11 | from apps.doc.models import UploadDocRecords, DocStatus | 16 | from apps.doc.models import UploadDocRecords, DocStatus |
| 12 | from settings import conf | 17 | from settings import conf |
| 13 | 18 | ||
| ... | @@ -25,6 +30,12 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -25,6 +30,12 @@ class Command(BaseCommand, LoggerMixin): |
| 25 | self.zoom_x = 2.0 | 30 | self.zoom_x = 2.0 |
| 26 | self.zoom_y = 2.0 | 31 | self.zoom_y = 2.0 |
| 27 | self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0) # zoom factor 2 in each dimension | 32 | self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0) # zoom factor 2 in each dimension |
| 33 | # ocr相关 | ||
| 34 | self.ocr_url = conf.OCR_URL | ||
| 35 | self.ocr_header = { | ||
| 36 | 'X-Auth-Token': conf.OCR_TOKEN, | ||
| 37 | 'Content-Type': 'application/json' | ||
| 38 | } | ||
| 28 | # 优雅退出信号:15 | 39 | # 优雅退出信号:15 |
| 29 | signal.signal(signal.SIGTERM, self.signal_handler) | 40 | signal.signal(signal.SIGTERM, self.signal_handler) |
| 30 | 41 | ||
| ... | @@ -47,16 +58,52 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -47,16 +58,52 @@ class Command(BaseCommand, LoggerMixin): |
| 47 | 58 | ||
def pdf_download(self, doc_info):
    """Resolve the local working paths for a queued document.

    NOTE(review): the actual EDMS download is still a TODO — this only
    derives the paths where the PDF and Excel artifacts will live, and
    the "download success" log line presumes the file is already there.

    :param doc_info: dict carrying at least an ``'id'`` key, or ``None``
                     when the queue was empty.
    :return: ``(doc_data_path, excel_path, pdf_path, doc_id)``; all four
             are ``None`` when *doc_info* is ``None``.
    """
    if doc_info is None:
        return None, None, None, None
    # TODO: download the PDF from EDMS.
    # pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf'
    # doc_data_path = os.path.dirname(pdf_path)
    doc_id = doc_info['id']
    workspace = os.path.join(self.data_dir, str(doc_id))
    pdf_file = os.path.join(workspace, '{0}.pdf'.format(doc_id))
    xls_file = os.path.join(workspace, '{0}.xls'.format(doc_id))
    self.cronjob_log.info('{0} [pdf download success] [doc_info={1}] [pdf_path={2}]'.format(
        self.log_base, doc_info, pdf_file))
    return workspace, xls_file, pdf_file, doc_id
| 72 | |||
@staticmethod
def append_sheet(wb, sheets_list, img_name):
    """Append one worksheet per recognized OCR table to the workbook.

    Sheets are named ``<img_name>_<index>``. Each cell dict supplies the
    inclusive merge bounds and the recognized text.

    :param wb: an ``xlwt.Workbook``-compatible object (``add_sheet``).
    :param sheets_list: iterable of OCR table dicts; each may contain a
        ``'cells'`` list with ``start_row``/``end_row``/``start_column``/
        ``end_column``/``words`` entries.
    :param img_name: source image name used to build sheet names.
    """
    for idx, sheet in enumerate(sheets_list):
        ws = wb.add_sheet('{0}_{1}'.format(img_name, idx))
        # Tolerate OCR results without a 'cells' payload instead of
        # raising TypeError when iterating None.
        for cell in sheet.get('cells') or ():
            start_row = cell.get('start_row')
            end_row = cell.get('end_row')
            start_col = cell.get('start_column')
            end_col = cell.get('end_column')
            # write_merge spans (r1, r2, c1, c2) are inclusive.
            ws.write_merge(start_row, end_row, start_col, end_col,
                           label=cell.get('words'))
| 85 | |||
@staticmethod
def get_ocr_json(img_path):
    """Build the OCR request payload for one image.

    Reads the image bytes and returns ``{'imgBase64': <base64 text>}``,
    the JSON body the OCR endpoint expects.
    """
    with open(img_path, 'rb') as img_file:
        encoded = base64.b64encode(img_file.read()).decode('utf-8')
    return {'imgBase64': encoded}
| 91 | |||
| 92 | |||
async def fetch_ocr_result(self, img_path):
    """POST the image at *img_path* to the OCR service, return its JSON reply.

    Uses the instance's configured URL and auth headers. TLS certificate
    verification is disabled (``ssl=False``), matching the original code —
    presumably the OCR service is internal; confirm before exposing.
    """
    payload = self.get_ocr_json(img_path)
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(headers=self.ocr_header,
                                     connector=connector) as session:
        async with session.post(self.ocr_url, json=payload) as resp:
            return await resp.json()
| 100 | |||
async def img_ocr_excel(self, wb, img_path):
    """OCR one image and append its recognized tables to the workbook.

    Awaits the OCR response for *img_path*, logs it, then writes every
    table found under ``result -> res`` as sheets named after the image
    file's basename.
    """
    ocr_res = await self.fetch_ocr_result(img_path)
    self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, ocr_res))
    self.append_sheet(wb, ocr_res.get('result').get('res'), os.path.basename(img_path))
| 60 | 107 | ||
| 61 | @staticmethod | 108 | @staticmethod |
| 62 | def getimage(pix): | 109 | def getimage(pix): |
| ... | @@ -143,65 +190,91 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -143,65 +190,91 @@ class Command(BaseCommand, LoggerMixin): |
| 143 | # 从队列获取文件信息 | 190 | # 从队列获取文件信息 |
| 144 | doc_info = self.get_doc_info() | 191 | doc_info = self.get_doc_info() |
| 145 | # 从EDMS获取PDF文件 | 192 | # 从EDMS获取PDF文件 |
| 146 | pdf_path, doc_data_path = self.pdf_download(doc_info) | 193 | doc_data_path, excel_path, pdf_path, doc_id = self.pdf_download(doc_info) |
| 147 | # 队列为空时的处理 | 194 | # 队列为空时的处理 |
| 148 | if pdf_path is None: | 195 | if pdf_path is None: |
| 149 | time.sleep(10) | 196 | time.sleep(10) |
| 150 | continue | 197 | continue |
| 151 | # PDF文件提取图片 | 198 | try: |
| 152 | img_save_path = os.path.join(doc_data_path, 'img') | 199 | # PDF文件提取图片 |
| 153 | os.makedirs(img_save_path, exist_ok=True) | 200 | img_save_path = os.path.join(doc_data_path, 'img') |
| 154 | with fitz.Document(pdf_path) as pdf: | 201 | os.makedirs(img_save_path, exist_ok=True) |
| 155 | self.cronjob_log.info('{0} [pdf_path={1}] [pdf_metadata={2}]'.format( | 202 | img_path_list = [] |
| 156 | self.log_base, pdf_path, pdf.metadata)) | 203 | with fitz.Document(pdf_path) as pdf: |
| 157 | # xref_list = [] # TODO 图片去重 | 204 | self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format( |
| 158 | for pno in range(pdf.pageCount): | 205 | self.log_base, pdf_path, pdf.metadata)) |
| 159 | il = pdf.getPageImageList(pno) | 206 | # xref_list = [] # TODO 图片去重 |
| 160 | il.sort(key=lambda x: x[0]) | 207 | for pno in range(pdf.pageCount): |
| 161 | img_il_list = self.split_il(il) | 208 | il = pdf.getPageImageList(pno) |
| 162 | del il | 209 | il.sort(key=lambda x: x[0]) |
| 163 | 210 | img_il_list = self.split_il(il) | |
| 164 | if len(img_il_list) > 3: # 单页无规律小图过多时,使用页面转图片 | 211 | del il |
| 165 | page = pdf.loadPage(pno) | 212 | |
| 166 | pm = page.getPixmap(matrix=self.trans, alpha=False) | 213 | if len(img_il_list) > 3: # 单页无规律小图过多时,使用页面转图片 |
| 167 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | 214 | page = pdf.loadPage(pno) |
| 168 | pm.writePNG(save_path) | 215 | pm = page.getPixmap(matrix=self.trans, alpha=False) |
| 169 | else: # 提取图片 | 216 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) |
| 170 | for img_count, img_il in enumerate(img_il_list): | 217 | pm.writePNG(save_path) |
| 171 | if len(img_il) == 1: # 当只有一张图片时, 简化处理 | 218 | img_path_list.append(save_path) |
| 172 | pix = self.recoverpix(pdf, img_il[0]) | 219 | self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format( |
| 173 | ext, img_data = self.get_img_data(pix) | 220 | self.log_base, pdf_path, page.number)) |
| 174 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | 221 | else: # 提取图片 |
| 175 | pno, img_count, ext)) | 222 | for img_index, img_il in enumerate(img_il_list): |
| 176 | with open(save_path, "wb") as f: | 223 | if len(img_il) == 1: # 当只有一张图片时, 简化处理 |
| 177 | f.write(img_data) | 224 | pix = self.recoverpix(pdf, img_il[0]) |
| 178 | else: # 多张图片,竖向拼接 | ||
| 179 | height_sum = 0 | ||
| 180 | im_list = [] | ||
| 181 | width = img_il[0][2] | ||
| 182 | for img in img_il: | ||
| 183 | # xref = img[0] | ||
| 184 | # if xref in xref_list: | ||
| 185 | # continue | ||
| 186 | height = img[3] | ||
| 187 | pix = self.recoverpix(pdf, img) | ||
| 188 | ext, img_data = self.get_img_data(pix) | 225 | ext, img_data = self.get_img_data(pix) |
| 226 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
| 227 | pno, img_index, ext)) | ||
| 228 | with open(save_path, "wb") as f: | ||
| 229 | f.write(img_data) | ||
| 230 | img_path_list.append(save_path) | ||
| 231 | self.cronjob_log.info( | ||
| 232 | '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format( | ||
| 233 | self.log_base, pdf_path, pno, img_index)) | ||
| 234 | else: # 多张图片,竖向拼接 | ||
| 235 | height_sum = 0 | ||
| 236 | im_list = [] | ||
| 237 | width = img_il[0][2] | ||
| 238 | for img in img_il: | ||
| 239 | # xref = img[0] | ||
| 240 | # if xref in xref_list: | ||
| 241 | # continue | ||
| 242 | height = img[3] | ||
| 243 | pix = self.recoverpix(pdf, img) | ||
| 244 | ext, img_data = self.get_img_data(pix) | ||
| 189 | 245 | ||
| 190 | # xref_list.append(xref) | 246 | # xref_list.append(xref) |
| 191 | 247 | ||
| 192 | im = Image.open(BytesIO(img_data)) | 248 | im = Image.open(BytesIO(img_data)) |
| 193 | im_list.append((height, im, ext)) | 249 | im_list.append((height, im, ext)) |
| 194 | height_sum += height | 250 | height_sum += height |
| 195 | 251 | ||
| 196 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | 252 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( |
| 197 | pno, img_count, im_list[0][2])) | 253 | pno, img_index, im_list[0][2])) |
| 198 | res = Image.new(im_list[0][1].mode, (width, height_sum)) | 254 | res = Image.new(im_list[0][1].mode, (width, height_sum)) |
| 199 | h_now = 0 | 255 | h_now = 0 |
| 200 | for h, m, _ in im_list: | 256 | for h, m, _ in im_list: |
| 201 | res.paste(m, box=(0, h_now)) | 257 | res.paste(m, box=(0, h_now)) |
| 202 | h_now += h | 258 | h_now += h |
| 203 | res.save(save_path) | 259 | res.save(save_path) |
| 260 | img_path_list.append(save_path) | ||
| 261 | self.cronjob_log.info( | ||
| 262 | '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format( | ||
| 263 | self.log_base, pdf_path, pno, img_index)) | ||
| 264 | self.cronjob_log.info('{0} [pdf to img success]'.format(self.log_base)) | ||
| 204 | 265 | ||
| 205 | # 图片调用算法判断是否为银行流水 | 266 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc_id))) |
| 206 | # 图片调用算法OCR为excel文件 | 267 | # 图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 |
| 207 | # 整合excel文件上传至EDMS | 268 | wb = xlwt.Workbook() |
| 269 | loop = asyncio.get_event_loop() | ||
| 270 | tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list] | ||
| 271 | loop.run_until_complete(asyncio.wait(tasks)) | ||
| 272 | loop.close() | ||
| 273 | wb.save(excel_path) | ||
| 274 | # 整合excel文件上传至EDMS | ||
| 275 | except Exception as e: | ||
| 276 | UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value) | ||
| 277 | self.cronjob_log.error('{0} [process failed] [err={1}]'.format(self.log_base, e)) | ||
| 278 | else: | ||
| 279 | UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value) | ||
| 280 | self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id)) | ... | ... |
| ... | @@ -7,13 +7,12 @@ class DocHandler: | ... | @@ -7,13 +7,12 @@ class DocHandler: |
| 7 | 7 | ||
@staticmethod
def get_link(doc_id, file='pdf'):
    """Return the public ``/data/`` URL path for a document artifact.

    :param doc_id: document id, used for both the directory and the
                   file name.
    :param file: ``'pdf'`` for the source PDF, ``'img'`` for the image
                 zip; any other value maps to the Excel result.
    """
    name_templates = {'pdf': '{0}.pdf', 'img': '{0}_img.zip'}
    file_name = name_templates.get(file, '{0}.xls').format(doc_id)
    return '/data/{0}/{1}'.format(doc_id, file_name)
| 17 | 16 | ||
| 18 | def get_doc_list(self, doc_queryset): | 17 | def get_doc_list(self, doc_queryset): |
| 19 | for doc_dict in doc_queryset: | 18 | for doc_dict in doc_queryset: | ... | ... |
| 1 | import os | ||
| 2 | from zipfile import ZipFile | ||
| 3 | |||
| 4 | |||
def file_write(file, file_path):
    """Stream an uploaded file to *file_path* chunk by chunk.

    ``file`` is expected to expose a ``chunks()`` iterator of byte
    chunks (as Django's ``UploadedFile`` does), so large uploads never
    have to be held in memory at once.
    """
    with open(file_path, 'wb+') as out:
        for piece in file.chunks():
            out.write(piece)
| 9 | |||
| 10 | |||
def write_zip_file(dir_name, zipfile_path):
    """Recursively zip the contents of *dir_name* into *zipfile_path*.

    Archive member names are stored relative to *dir_name*. Silently
    does nothing when *dir_name* is not an existing directory.

    :param dir_name: directory whose files are archived.
    :param zipfile_path: destination ``.zip`` path (overwritten).
    """
    if not os.path.isdir(dir_name):
        return
    with ZipFile(zipfile_path, 'w') as z:
        for root, _dirs, files in os.walk(dir_name):
            # os.path.relpath strips exactly the dir_name prefix; the
            # previous str.replace(dir_name, '') removed EVERY occurrence
            # of dir_name, corrupting member names whenever the directory
            # name recurred inside a subpath.
            rel_root = os.path.relpath(root, dir_name)
            for single_file in files:
                src_file_path = os.path.join(root, single_file)
                if rel_root == os.curdir:
                    arcname = single_file
                else:
                    arcname = os.path.join(rel_root, single_file)
                z.write(src_file_path, arcname)
Please register or sign in to post a comment.