a02a957e by 周伟奇

ocr process: extract page images from the downloaded PDF, zip them, send each image to the OCR service asynchronously, write the recognized tables into an .xls workbook, and update the document status on success or failure

1 parent f8904dcb
+aiohttp==3.6.2
+async-timeout==3.0.1
+attrs==19.3.0
 certifi==2016.2.28
+chardet==3.0.4
 Django==2.1
 # django-mysqlpool @ https://github.com/smartfile/django-mysqlpool/archive/master.zip
 djangorestframework==3.9.0
 djangorestframework-jwt==1.11.0
+idna==2.9
+idna-ssl==1.1.0
 marshmallow==3.6.1
+multidict==4.7.6
 pdfminer3k==1.3.4
 Pillow==7.1.2
 ply==3.11
@@ -17,4 +24,7 @@ redis==3.4.1
 # situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master
 six==1.14.0
 SQLAlchemy==0.9.10
+typing-extensions==3.7.4.2
 webargs==6.1.0
+xlwt==1.3.0
+yarl==1.4.2
......
-import time
 import os
-import signal
+import time
 import fitz
+import xlwt
+import signal
+import base64
+import asyncio
+import aiohttp
 from PIL import Image
 from io import BytesIO
 
 from django.core.management import BaseCommand
 from common.mixins import LoggerMixin
 from common.redis_cache import redis_handler as rh
+from common.tools.file_tools import write_zip_file
 from apps.doc.models import UploadDocRecords, DocStatus
 from settings import conf
 
@@ -25,6 +30,12 @@ class Command(BaseCommand, LoggerMixin):
         self.zoom_x = 2.0
         self.zoom_y = 2.0
         self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0)  # zoom factor 2 in each dimension
+        # OCR-related settings
+        self.ocr_url = conf.OCR_URL
+        self.ocr_header = {
+            'X-Auth-Token': conf.OCR_TOKEN,
+            'Content-Type': 'application/json'
+        }
         # graceful-exit signal: SIGTERM (15)
         signal.signal(signal.SIGTERM, self.signal_handler)
 
@@ -47,16 +58,52 @@ class Command(BaseCommand, LoggerMixin):
 
     def pdf_download(self, doc_info):
         if doc_info is None:
-            return
+            return None, None, None, None
         # TODO download the pdf from EDMS
         # pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf'
         # doc_data_path = os.path.dirname(pdf_path)
         doc_id = doc_info['id']
         doc_data_path = os.path.join(self.data_dir, str(doc_id))
         pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id))
+        excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc_id))
         self.cronjob_log.info('{0} [pdf download success] [doc_info={1}] [pdf_path={2}]'.format(
             self.log_base, doc_info, pdf_path))
-        return pdf_path, doc_data_path
+        return doc_data_path, excel_path, pdf_path, doc_id
+
+    @staticmethod
+    def append_sheet(wb, sheets_list, img_name):
+        for i, sheet in enumerate(sheets_list):
+            ws = wb.add_sheet('{0}_{1}'.format(img_name, i))
+            cells = sheet.get('cells')
+            for cell in cells:
+                c1 = cell.get('start_column')
+                c2 = cell.get('end_column')
+                r1 = cell.get('start_row')
+                r2 = cell.get('end_row')
+                label = cell.get('words')
+                ws.write_merge(r1, r2, c1, c2, label=label)
+
+    @staticmethod
+    def get_ocr_json(img_path):
+        with open(img_path, "rb") as f:
+            base64_data = base64.b64encode(f.read())
+        return {'imgBase64': base64_data.decode('utf-8')}
+
+
+    async def fetch_ocr_result(self, img_path):
+        async with aiohttp.ClientSession(
+                headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
+        ) as session:
+            json_data = self.get_ocr_json(img_path)
+            async with session.post(self.ocr_url, json=json_data) as response:
+                return await response.json()
+
+    async def img_ocr_excel(self, wb, img_path):
+        res = await self.fetch_ocr_result(img_path)
+        self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
+        sheets_list = res.get('result').get('res')
+        img_name = os.path.basename(img_path)
+        self.append_sheet(wb, sheets_list, img_name)
 
     @staticmethod
     def getimage(pix):
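For context, the new helpers assume the OCR service returns a JSON body whose `result.res` entry is a list of detected tables, each carrying `cells` with `start_row`/`end_row`/`start_column`/`end_column`/`words`; `append_sheet` maps each cell onto a merged xlwt range. A self-contained sketch of just that mapping, with an invented sample response (only the field names come from the code above):

```python
import xlwt

# Invented example of res.get('result').get('res'): one entry per detected table,
# each cell giving its (possibly merged) row/column span and recognized text.
sheets_list = [
    {
        'cells': [
            {'start_row': 0, 'end_row': 0, 'start_column': 0, 'end_column': 1, 'words': 'Transactions'},
            {'start_row': 1, 'end_row': 1, 'start_column': 0, 'end_column': 0, 'words': '2020-06-01'},
            {'start_row': 1, 'end_row': 1, 'start_column': 1, 'end_column': 1, 'words': '100.00'},
        ]
    }
]

wb = xlwt.Workbook()
img_name = 'page_0_img_0.png'  # invented file name, mirroring the cron command's naming
for i, sheet in enumerate(sheets_list):
    ws = wb.add_sheet('{0}_{1}'.format(img_name, i))
    for cell in sheet.get('cells'):
        # write_merge(r1, r2, c1, c2, label) fills the block spanning rows r1..r2 and columns c1..c2.
        ws.write_merge(cell.get('start_row'), cell.get('end_row'),
                       cell.get('start_column'), cell.get('end_column'),
                       label=cell.get('words'))
wb.save('demo.xls')
```

Single cells simply pass equal start and end indices, which is why `append_sheet` can call `write_merge` for every cell without special-casing unmerged ones.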
@@ -143,16 +190,18 @@ class Command(BaseCommand, LoggerMixin):
             # fetch the file info from the queue
             doc_info = self.get_doc_info()
             # fetch the PDF file from EDMS
-            pdf_path, doc_data_path = self.pdf_download(doc_info)
+            doc_data_path, excel_path, pdf_path, doc_id = self.pdf_download(doc_info)
             # handle the empty-queue case
             if pdf_path is None:
                 time.sleep(10)
                 continue
+            try:
                 # extract images from the PDF file
                 img_save_path = os.path.join(doc_data_path, 'img')
                 os.makedirs(img_save_path, exist_ok=True)
+                img_path_list = []
                 with fitz.Document(pdf_path) as pdf:
-                    self.cronjob_log.info('{0} [pdf_path={1}] [pdf_metadata={2}]'.format(
+                    self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format(
                         self.log_base, pdf_path, pdf.metadata))
                     # xref_list = []  # TODO image de-duplication
                     for pno in range(pdf.pageCount):
@@ -166,15 +215,22 @@ class Command(BaseCommand, LoggerMixin):
                             pm = page.getPixmap(matrix=self.trans, alpha=False)
                             save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                             pm.writePNG(save_path)
+                            img_path_list.append(save_path)
+                            self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format(
+                                self.log_base, pdf_path, page.number))
                         else:  # extract the embedded images
-                            for img_count, img_il in enumerate(img_il_list):
+                            for img_index, img_il in enumerate(img_il_list):
                                 if len(img_il) == 1:  # only one image on the page, simplified handling
                                     pix = self.recoverpix(pdf, img_il[0])
                                     ext, img_data = self.get_img_data(pix)
                                     save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
-                                        pno, img_count, ext))
+                                        pno, img_index, ext))
                                     with open(save_path, "wb") as f:
                                         f.write(img_data)
+                                    img_path_list.append(save_path)
+                                    self.cronjob_log.info(
+                                        '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
+                                            self.log_base, pdf_path, pno, img_index))
                                 else:  # multiple images on the page, stitch them vertically
                                     height_sum = 0
                                     im_list = []
@@ -194,14 +250,31 @@ class Command(BaseCommand, LoggerMixin):
                                     height_sum += height
 
                                     save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
-                                        pno, img_count, im_list[0][2]))
+                                        pno, img_index, im_list[0][2]))
                                     res = Image.new(im_list[0][1].mode, (width, height_sum))
                                     h_now = 0
                                     for h, m, _ in im_list:
                                         res.paste(m, box=(0, h_now))
                                         h_now += h
                                     res.save(save_path)
+                                    img_path_list.append(save_path)
+                                    self.cronjob_log.info(
+                                        '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
+                                            self.log_base, pdf_path, pno, img_index))
+                self.cronjob_log.info('{0} [pdf to img success]'.format(self.log_base))
 
-            # run the images through the model to check whether they are bank statements
-            # run the images through the OCR model to build an Excel file
+                write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc_id)))
+                # run the images through the model to check for bank statements, and through OCR to build an Excel file
+                wb = xlwt.Workbook()
+                loop = asyncio.get_event_loop()
+                tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list]
+                loop.run_until_complete(asyncio.wait(tasks))
+                loop.close()
+                wb.save(excel_path)
                 # merge the Excel file and upload it to EDMS
+            except Exception as e:
+                UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value)
+                self.cronjob_log.error('{0} [process failed] [err={1}]'.format(self.log_base, e))
+            else:
+                UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value)
+                self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id))
......
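One note on the event-loop handling in the hunk above: the OCR batch is driven with `asyncio.get_event_loop()` / `run_until_complete()` / `loop.close()`, and since `handle()` keeps looping over documents, a pattern that builds and disposes of a loop per batch keeps later iterations from picking up an already-closed loop. A minimal sketch, assuming Python 3.7+ and using a stand-in coroutine in place of `img_ocr_excel`:

```python
import asyncio


async def img_ocr_excel_stub(wb, img_path):
    # Stand-in for Command.img_ocr_excel: pretend to OCR one image and fill the workbook.
    await asyncio.sleep(0)
    print('processed', img_path)


def run_ocr_batch(wb, img_path_list):
    async def batch():
        # gather() also propagates any exception raised by an individual coroutine,
        # whereas asyncio.wait() only hands back the done/pending task sets.
        await asyncio.gather(*(img_ocr_excel_stub(wb, p) for p in img_path_list))

    # asyncio.run() creates a fresh event loop, runs the batch and closes the loop,
    # so the next document handled by the cron loop starts with a clean slate.
    asyncio.run(batch())


if __name__ == '__main__':
    run_ocr_batch(None, ['page_0_img_0.png', 'page_1_img_0.png'])
```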
@@ -7,13 +7,12 @@ class DocHandler:
 
     @staticmethod
     def get_link(doc_id, file='pdf'):
-        data_path = os.path.join(conf.DATA_DIR, str(doc_id))
         if file == 'pdf':
-            return os.path.join(data_path, '{0}.pdf'.format(str(doc_id)))
+            return '/data/{0}/{0}.pdf'.format(doc_id)
         elif file == 'img':
-            return os.path.join(data_path, '{0}_img.zip'.format(str(doc_id)))
+            return '/data/{0}/{0}_img.zip'.format(doc_id)
         else:
-            return os.path.join(data_path, '{0}.xlsx'.format(str(doc_id)))
+            return '/data/{0}/{0}.xls'.format(doc_id)
 
     def get_doc_list(self, doc_queryset):
         for doc_dict in doc_queryset:
......
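The reworked `get_link` now returns fixed URL-style paths instead of joining `conf.DATA_DIR` on every call. A tiny stand-in (the body is copied from the hunk above, since the module's import path is not part of this diff) shows the resulting links:

```python
# Stand-in mirroring the new get_link; the real DocHandler lives in the file shown above.
class DocHandler:
    @staticmethod
    def get_link(doc_id, file='pdf'):
        if file == 'pdf':
            return '/data/{0}/{0}.pdf'.format(doc_id)
        elif file == 'img':
            return '/data/{0}/{0}_img.zip'.format(doc_id)
        return '/data/{0}/{0}.xls'.format(doc_id)


assert DocHandler.get_link(42) == '/data/42/42.pdf'
assert DocHandler.get_link(42, file='img') == '/data/42/42_img.zip'
assert DocHandler.get_link(42, file='xls') == '/data/42/42.xls'  # any other value falls through to the .xls link
```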
+import os
+from zipfile import ZipFile
+
+
 def file_write(file, file_path):
     with open(file_path, 'wb+') as f:
         for chunk in file.chunks():
             f.write(chunk)
+
+
+def write_zip_file(dir_name, zipfile_path):
+    if not os.path.isdir(dir_name):
+        return
+    with ZipFile(zipfile_path, 'w') as z:
+        for root, dirs, files in os.walk(dir_name):
+            root_target_path = root.replace(dir_name, '')
+            for single_file in files:
+                src_file_path = os.path.join(root, single_file)
+                file_target_path = os.path.join(root_target_path, single_file)
+                z.write(src_file_path, file_target_path)
......
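A usage sketch for the new `write_zip_file` helper (directory and file names are invented; the import path is the one the cron command uses). Files end up in the archive under their path relative to `dir_name`:

```python
import os
import tempfile
from zipfile import ZipFile

from common.tools.file_tools import write_zip_file  # import path used by the cron command above

with tempfile.TemporaryDirectory() as data_dir:
    # Build a fake extracted-image folder.
    img_dir = os.path.join(data_dir, 'img')
    os.makedirs(img_dir)
    with open(os.path.join(img_dir, 'page_0_img_0.png'), 'wb') as f:
        f.write(b'fake image bytes')

    zip_path = os.path.join(data_dir, '42_img.zip')
    write_zip_file(img_dir, zip_path)

    with ZipFile(zip_path) as z:
        print(z.namelist())  # ['page_0_img_0.png']
```

`os.path.relpath(root, dir_name)` would compute the same archive-internal prefix a bit more defensively than `root.replace(dir_name, '')`, since `str.replace` also rewrites any later occurrence of the prefix inside nested paths.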