ocr process
Showing
4 changed files
with
113 additions
and
15 deletions
1 | aiohttp==3.6.2 | ||
2 | async-timeout==3.0.1 | ||
3 | attrs==19.3.0 | ||
1 | certifi==2016.2.28 | 4 | certifi==2016.2.28 |
5 | chardet==3.0.4 | ||
2 | Django==2.1 | 6 | Django==2.1 |
3 | # django-mysqlpool @ https://github.com/smartfile/django-mysqlpool/archive/master.zip | 7 | # django-mysqlpool @ https://github.com/smartfile/django-mysqlpool/archive/master.zip |
4 | djangorestframework==3.9.0 | 8 | djangorestframework==3.9.0 |
5 | djangorestframework-jwt==1.11.0 | 9 | djangorestframework-jwt==1.11.0 |
10 | idna==2.9 | ||
11 | idna-ssl==1.1.0 | ||
6 | marshmallow==3.6.1 | 12 | marshmallow==3.6.1 |
13 | multidict==4.7.6 | ||
7 | pdfminer3k==1.3.4 | 14 | pdfminer3k==1.3.4 |
8 | Pillow==7.1.2 | 15 | Pillow==7.1.2 |
9 | ply==3.11 | 16 | ply==3.11 |
... | @@ -17,4 +24,7 @@ redis==3.4.1 | ... | @@ -17,4 +24,7 @@ redis==3.4.1 |
17 | # situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master | 24 | # situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master |
18 | six==1.14.0 | 25 | six==1.14.0 |
19 | SQLAlchemy==0.9.10 | 26 | SQLAlchemy==0.9.10 |
27 | typing-extensions==3.7.4.2 | ||
20 | webargs==6.1.0 | 28 | webargs==6.1.0 |
29 | xlwt==1.3.0 | ||
30 | yarl==1.4.2 | ... | ... |
1 | import time | ||
2 | import os | 1 | import os |
3 | import signal | 2 | import time |
4 | import fitz | 3 | import fitz |
4 | import xlwt | ||
5 | import signal | ||
6 | import base64 | ||
7 | import asyncio | ||
8 | import aiohttp | ||
5 | from PIL import Image | 9 | from PIL import Image |
6 | from io import BytesIO | 10 | from io import BytesIO |
7 | 11 | ||
8 | from django.core.management import BaseCommand | 12 | from django.core.management import BaseCommand |
9 | from common.mixins import LoggerMixin | 13 | from common.mixins import LoggerMixin |
10 | from common.redis_cache import redis_handler as rh | 14 | from common.redis_cache import redis_handler as rh |
15 | from common.tools.file_tools import write_zip_file | ||
11 | from apps.doc.models import UploadDocRecords, DocStatus | 16 | from apps.doc.models import UploadDocRecords, DocStatus |
12 | from settings import conf | 17 | from settings import conf |
13 | 18 | ||
... | @@ -25,6 +30,12 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -25,6 +30,12 @@ class Command(BaseCommand, LoggerMixin): |
25 | self.zoom_x = 2.0 | 30 | self.zoom_x = 2.0 |
26 | self.zoom_y = 2.0 | 31 | self.zoom_y = 2.0 |
27 | self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0) # zoom factor 2 in each dimension | 32 | self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0) # zoom factor 2 in each dimension |
33 | # ocr相关 | ||
34 | self.ocr_url = conf.OCR_URL | ||
35 | self.ocr_header = { | ||
36 | 'X-Auth-Token': conf.OCR_TOKEN, | ||
37 | 'Content-Type': 'application/json' | ||
38 | } | ||
28 | # 优雅退出信号:15 | 39 | # 优雅退出信号:15 |
29 | signal.signal(signal.SIGTERM, self.signal_handler) | 40 | signal.signal(signal.SIGTERM, self.signal_handler) |
30 | 41 | ||
... | @@ -47,16 +58,52 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -47,16 +58,52 @@ class Command(BaseCommand, LoggerMixin): |
47 | 58 | ||
def pdf_download(self, doc_info):
    """Build the working paths for a queued document.

    Args:
        doc_info: Mapping that describes the document (must contain ``'id'``),
            or ``None`` when there is nothing to process.

    Returns:
        A 4-tuple ``(doc_data_path, excel_path, pdf_path, doc_id)``. All four
        items are ``None`` when ``doc_info`` is ``None``.
    """
    if doc_info is None:
        return (None,) * 4

    # TODO: download the PDF from EDMS (not implemented yet).
    doc_id = doc_info['id']
    work_dir = os.path.join(self.data_dir, str(doc_id))
    pdf_file, excel_file = (
        os.path.join(work_dir, pattern.format(doc_id))
        for pattern in ('{0}.pdf', '{0}.xls')
    )
    message = '{0} [pdf download success] [doc_info={1}] [pdf_path={2}]'.format(
        self.log_base, doc_info, pdf_file)
    self.cronjob_log.info(message)
    return work_dir, excel_file, pdf_file, doc_id
72 | |||
@staticmethod
def append_sheet(wb, sheets_list, img_name):
    """Add one worksheet per OCR table to the workbook.

    Each entry of ``sheets_list`` becomes a sheet named
    ``'<img_name>_<index>'``. Every cell of the entry is written with
    ``write_merge`` over the row/column span it reports.

    Args:
        wb: Workbook object that provides ``add_sheet(name)``.
        sheets_list: Iterable of dicts, each optionally holding a ``'cells'``
            list. ``None`` is treated as "no tables".
        img_name: Image file name, used as the sheet-name prefix.
    """
    # A missing table list or a table without cells yields no writes
    # instead of a TypeError from iterating over None.
    for index, sheet in enumerate(sheets_list or []):
        ws = wb.add_sheet('{0}_{1}'.format(img_name, index))
        for cell in sheet.get('cells') or []:
            ws.write_merge(
                cell.get('start_row'),
                cell.get('end_row'),
                cell.get('start_column'),
                cell.get('end_column'),
                label=cell.get('words'),
            )
85 | |||
@staticmethod
def get_ocr_json(img_path):
    """Read an image file and wrap it as the OCR request payload.

    Args:
        img_path: Path of the image file to read.

    Returns:
        ``{'imgBase64': <base64 text of the file content>}``.
    """
    with open(img_path, "rb") as img_file:
        raw_bytes = img_file.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    return {'imgBase64': encoded}
91 | |||
92 | |||
async def fetch_ocr_result(self, img_path):
    """POST one image to the OCR service and return the decoded JSON reply.

    Args:
        img_path: Path of the image file to send.

    Returns:
        The JSON body of the response, decoded by ``response.json()``.

    Raises:
        aiohttp.ClientResponseError: If the service answers with an HTTP
            error status (>= 400).
    """
    # Read and encode the image first so a missing or unreadable file fails
    # before any connection resources are allocated.
    json_data = self.get_ocr_json(img_path)
    # NOTE(review): ssl=False disables TLS certificate verification; confirm
    # this is intended for the OCR endpoint.
    async with aiohttp.ClientSession(
            headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
    ) as session:
        async with session.post(self.ocr_url, json=json_data) as response:
            # Surface HTTP errors explicitly instead of trying to parse an
            # error page as an OCR result.
            response.raise_for_status()
            return await response.json()
100 | |||
async def img_ocr_excel(self, wb, img_path):
    """Run OCR on one image and append the recognised tables to the workbook.

    Args:
        wb: Workbook that receives the new sheets.
        img_path: Path of the image to recognise.

    Raises:
        ValueError: If the OCR reply does not contain ``result.res``.
    """
    res = await self.fetch_ocr_result(img_path)
    self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))

    # Validate the payload step by step so a malformed or error reply gives
    # a clear message instead of an AttributeError on None.
    result = res.get('result') if isinstance(res, dict) else None
    sheets_list = result.get('res') if isinstance(result, dict) else None
    if sheets_list is None:
        raise ValueError('unexpected ocr response for {0}: {1}'.format(img_path, res))

    img_name = os.path.basename(img_path)
    self.append_sheet(wb, sheets_list, img_name)
60 | 107 | ||
61 | @staticmethod | 108 | @staticmethod |
62 | def getimage(pix): | 109 | def getimage(pix): |
... | @@ -143,16 +190,18 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -143,16 +190,18 @@ class Command(BaseCommand, LoggerMixin): |
143 | # 从队列获取文件信息 | 190 | # 从队列获取文件信息 |
144 | doc_info = self.get_doc_info() | 191 | doc_info = self.get_doc_info() |
145 | # 从EDMS获取PDF文件 | 192 | # 从EDMS获取PDF文件 |
146 | pdf_path, doc_data_path = self.pdf_download(doc_info) | 193 | doc_data_path, excel_path, pdf_path, doc_id = self.pdf_download(doc_info) |
147 | # 队列为空时的处理 | 194 | # 队列为空时的处理 |
148 | if pdf_path is None: | 195 | if pdf_path is None: |
149 | time.sleep(10) | 196 | time.sleep(10) |
150 | continue | 197 | continue |
198 | try: | ||
151 | # PDF文件提取图片 | 199 | # PDF文件提取图片 |
152 | img_save_path = os.path.join(doc_data_path, 'img') | 200 | img_save_path = os.path.join(doc_data_path, 'img') |
153 | os.makedirs(img_save_path, exist_ok=True) | 201 | os.makedirs(img_save_path, exist_ok=True) |
202 | img_path_list = [] | ||
154 | with fitz.Document(pdf_path) as pdf: | 203 | with fitz.Document(pdf_path) as pdf: |
155 | self.cronjob_log.info('{0} [pdf_path={1}] [pdf_metadata={2}]'.format( | 204 | self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format( |
156 | self.log_base, pdf_path, pdf.metadata)) | 205 | self.log_base, pdf_path, pdf.metadata)) |
157 | # xref_list = [] # TODO 图片去重 | 206 | # xref_list = [] # TODO 图片去重 |
158 | for pno in range(pdf.pageCount): | 207 | for pno in range(pdf.pageCount): |
... | @@ -166,15 +215,22 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -166,15 +215,22 @@ class Command(BaseCommand, LoggerMixin): |
166 | pm = page.getPixmap(matrix=self.trans, alpha=False) | 215 | pm = page.getPixmap(matrix=self.trans, alpha=False) |
167 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | 216 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) |
168 | pm.writePNG(save_path) | 217 | pm.writePNG(save_path) |
218 | img_path_list.append(save_path) | ||
219 | self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format( | ||
220 | self.log_base, pdf_path, page.number)) | ||
169 | else: # 提取图片 | 221 | else: # 提取图片 |
170 | for img_count, img_il in enumerate(img_il_list): | 222 | for img_index, img_il in enumerate(img_il_list): |
171 | if len(img_il) == 1: # 当只有一张图片时, 简化处理 | 223 | if len(img_il) == 1: # 当只有一张图片时, 简化处理 |
172 | pix = self.recoverpix(pdf, img_il[0]) | 224 | pix = self.recoverpix(pdf, img_il[0]) |
173 | ext, img_data = self.get_img_data(pix) | 225 | ext, img_data = self.get_img_data(pix) |
174 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | 226 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( |
175 | pno, img_count, ext)) | 227 | pno, img_index, ext)) |
176 | with open(save_path, "wb") as f: | 228 | with open(save_path, "wb") as f: |
177 | f.write(img_data) | 229 | f.write(img_data) |
230 | img_path_list.append(save_path) | ||
231 | self.cronjob_log.info( | ||
232 | '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format( | ||
233 | self.log_base, pdf_path, pno, img_index)) | ||
178 | else: # 多张图片,竖向拼接 | 234 | else: # 多张图片,竖向拼接 |
179 | height_sum = 0 | 235 | height_sum = 0 |
180 | im_list = [] | 236 | im_list = [] |
... | @@ -194,14 +250,31 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -194,14 +250,31 @@ class Command(BaseCommand, LoggerMixin): |
194 | height_sum += height | 250 | height_sum += height |
195 | 251 | ||
196 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | 252 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( |
197 | pno, img_count, im_list[0][2])) | 253 | pno, img_index, im_list[0][2])) |
198 | res = Image.new(im_list[0][1].mode, (width, height_sum)) | 254 | res = Image.new(im_list[0][1].mode, (width, height_sum)) |
199 | h_now = 0 | 255 | h_now = 0 |
200 | for h, m, _ in im_list: | 256 | for h, m, _ in im_list: |
201 | res.paste(m, box=(0, h_now)) | 257 | res.paste(m, box=(0, h_now)) |
202 | h_now += h | 258 | h_now += h |
203 | res.save(save_path) | 259 | res.save(save_path) |
260 | img_path_list.append(save_path) | ||
261 | self.cronjob_log.info( | ||
262 | '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format( | ||
263 | self.log_base, pdf_path, pno, img_index)) | ||
264 | self.cronjob_log.info('{0} [pdf to img success]'.format(self.log_base)) | ||
204 | 265 | ||
205 | # 图片调用算法判断是否为银行流水 | 266 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc_id))) |
206 | # 图片调用算法OCR为excel文件 | 267 | # 图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 |
268 | wb = xlwt.Workbook() | ||
269 | loop = asyncio.get_event_loop() | ||
270 | tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list] | ||
271 | loop.run_until_complete(asyncio.wait(tasks)) | ||
272 | loop.close() | ||
273 | wb.save(excel_path) | ||
207 | # 整合excel文件上传至EDMS | 274 | # 整合excel文件上传至EDMS |
275 | except Exception as e: | ||
276 | UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value) | ||
277 | self.cronjob_log.error('{0} [process failed] [err={1}]'.format(self.log_base, e)) | ||
278 | else: | ||
279 | UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value) | ||
280 | self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id)) | ... | ... |
... | @@ -7,13 +7,12 @@ class DocHandler: | ... | @@ -7,13 +7,12 @@ class DocHandler: |
7 | 7 | ||
@staticmethod
def get_link(doc_id, file='pdf'):
    """Return the download link of one artefact of a document.

    Args:
        doc_id: Identifier of the document.
        file: ``'pdf'`` for the PDF, ``'img'`` for the image archive; any
            other value selects the Excel file.

    Returns:
        A path string under ``/data/<doc_id>/``.
    """
    known_links = (
        ('pdf', '/data/{0}/{0}.pdf'),
        ('img', '/data/{0}/{0}_img.zip'),
    )
    for kind, template in known_links:
        if file == kind:
            return template.format(doc_id)
    return '/data/{0}/{0}.xls'.format(doc_id)
18 | def get_doc_list(self, doc_queryset): | 17 | def get_doc_list(self, doc_queryset): |
19 | for doc_dict in doc_queryset: | 18 | for doc_dict in doc_queryset: | ... | ... |
1 | import os | ||
2 | from zipfile import ZipFile | ||
3 | |||
4 | |||
def file_write(file, file_path):
    """Write an uploaded file to disk chunk by chunk.

    Args:
        file: Object that provides ``chunks()``, an iterable of byte strings.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, 'wb+') as destination:
        destination.writelines(file.chunks())
9 | |||
10 | |||
def write_zip_file(dir_name, zipfile_path):
    """Pack every file below ``dir_name`` into a zip archive.

    Archive member names are the file paths relative to ``dir_name``, so the
    directory layout is preserved inside the archive.

    Args:
        dir_name: Directory whose content is archived. Nothing is written
            when it is not an existing directory.
        zipfile_path: Path of the zip file to create (overwritten if present).
    """
    if not os.path.isdir(dir_name):
        return
    archive_abs_path = os.path.abspath(zipfile_path)
    with ZipFile(zipfile_path, 'w') as z:
        for root, _dirs, files in os.walk(dir_name):
            for single_file in files:
                src_file_path = os.path.join(root, single_file)
                # Do not pack the archive into itself when it is created
                # inside the directory being archived.
                if os.path.abspath(src_file_path) == archive_abs_path:
                    continue
                # relpath strips only the leading dir_name; str.replace would
                # also drop later occurrences and flatten nested directories
                # that repeat the name.
                file_target_path = os.path.relpath(src_file_path, dir_name)
                z.write(src_file_path, file_target_path)
-
Please register or sign in to post a comment