add edms retry
Showing 3 changed files with 165 additions and 124 deletions
... | @@ -35,6 +35,8 @@ DEALER_CODE_META_FIELD_id = 13 | ... | @@ -35,6 +35,8 @@ DEALER_CODE_META_FIELD_id = 13 |
35 | BUSINESS_TYPE_META_FIELD_id = 93 | 35 | BUSINESS_TYPE_META_FIELD_id = 93 |
36 | DEALER_CODE = 'ocr_situ_group' | 36 | DEALER_CODE = 'ocr_situ_group' |
37 | 37 | ||
38 | RETRY_TIMES = 3 | ||
39 | |||
38 | # ---------银行流水模板相关-------------------------------------------------------------------------------------------- | 40 | # ---------银行流水模板相关-------------------------------------------------------------------------------------------- |
39 | 41 | ||
40 | TRANS_MAP = { | 42 | TRANS_MAP = { | ... | ... |
src/apps/doc/exceptions.py — new file (mode 0 → 100644)
... | @@ -19,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords | ... | @@ -19,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords |
19 | from apps.doc.named_enum import KeywordsType | 19 | from apps.doc.named_enum import KeywordsType |
20 | from apps.doc import consts | 20 | from apps.doc import consts |
21 | from apps.doc.ocr.edms import EDMS, rh | 21 | from apps.doc.ocr.edms import EDMS, rh |
22 | from apps.doc.exceptions import EDMSException | ||
22 | 23 | ||
23 | 24 | ||
24 | class Command(BaseCommand, LoggerMixin): | 25 | class Command(BaseCommand, LoggerMixin): |
... | @@ -72,11 +73,21 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -72,11 +73,21 @@ class Command(BaseCommand, LoggerMixin): |
72 | os.makedirs(doc_data_path, exist_ok=True) | 73 | os.makedirs(doc_data_path, exist_ok=True) |
73 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 74 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
74 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 75 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
75 | self.edms.download(pdf_path, doc.metadata_version_id) | 76 | for times in range(consts.RETRY_TIMES): |
77 | try: | ||
78 | self.edms.download(pdf_path, doc.metadata_version_id) | ||
79 | except Exception as e: | ||
80 | self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
81 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
82 | edms_exc = str(e) | ||
83 | else: | ||
84 | break | ||
85 | else: | ||
86 | raise EDMSException(edms_exc) | ||
76 | 87 | ||
77 | excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) | 88 | excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) |
78 | src_excel_path = os.path.join(doc_data_path, 'src.xlsx') | 89 | src_excel_path = os.path.join(doc_data_path, 'src.xlsx') |
79 | self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( | 90 | self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( |
80 | self.log_base, business_type, doc.id, pdf_path)) | 91 | self.log_base, business_type, doc.id, pdf_path)) |
81 | return doc_data_path, excel_path, src_excel_path, pdf_path | 92 | return doc_data_path, excel_path, src_excel_path, pdf_path |
82 | 93 | ||
... | @@ -177,105 +188,21 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -177,105 +188,21 @@ class Command(BaseCommand, LoggerMixin): |
177 | else: | 188 | else: |
178 | skip_img.append(self.parse_img_path(img_path)) | 189 | skip_img.append(self.parse_img_path(img_path)) |
179 | 190 | ||
180 | # async def fetch_ocr_result(self, url, json_data): | 191 | @staticmethod |
181 | # async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: | 192 | async def fetch_ocr_1_result(url, json_data): |
182 | # async with session.post(url, json=json_data) as response: | 193 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: |
183 | # if response.status == 200: | 194 | async with session.post(url, json=json_data) as response: |
184 | # return await response.json() | 195 | if response.status == 200: |
185 | # | 196 | return await response.json() |
186 | # async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary): | 197 | |
187 | # with open(img_path, 'rb') as f: | 198 | @staticmethod |
188 | # base64_data = base64.b64encode(f.read()) | 199 | async def fetch_ocr_2_result(url, json_data): |
189 | # # 获取解码后的base64值 | 200 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: |
190 | # file_data = base64_data.decode() | 201 | async with session.post(url, data=json_data) as response: |
191 | # json_data_1 = { | 202 | if response.status == 200: |
192 | # "file": file_data | 203 | return await response.json() |
193 | # } | 204 | |
194 | # ocr_res_1 = await self.fetch_ocr_result(self.ocr_url_1, json_data_1) | 205 | async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): |
195 | # if ocr_res_1 is None: | ||
196 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
197 | # else: | ||
198 | # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | ||
199 | # self.log_base, img_path, ocr_res_1)) | ||
200 | # | ||
201 | # if ocr_res_1.get('code') == 1: | ||
202 | # ocr_data = ocr_res_1.get('data', {}) | ||
203 | # classify = ocr_data.get('classify') | ||
204 | # if classify is None: | ||
205 | # return | ||
206 | # elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 | ||
207 | # return | ||
208 | # elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 | ||
209 | # self.license1_process(ocr_data, license_summary, classify) | ||
210 | # elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 | ||
211 | # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) | ||
212 | # json_data_2 = { | ||
213 | # "pid": str(pid), | ||
214 | # "key": conf.OCR_KEY, | ||
215 | # "secret": conf.OCR_SECRET, | ||
216 | # "file": file_data | ||
217 | # } | ||
218 | # ocr_res_2 = await self.fetch_ocr_result(self.ocr_url_2, json_data_2) | ||
219 | # if ocr_res_2 is None: | ||
220 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
221 | # else: | ||
222 | # # 识别结果 | ||
223 | # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | ||
224 | # self.log_base, img_path, ocr_res_2)) | ||
225 | # self.license2_process(ocr_res_2, license_summary, pid, classify) | ||
226 | # else: # 流水处理 | ||
227 | # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify) | ||
228 | |||
229 | def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): | ||
230 | # # 流水 | ||
231 | # res = { | ||
232 | # 'code': 1, | ||
233 | # 'msg': 'success', | ||
234 | # 'data': { | ||
235 | # 'classify': 0, | ||
236 | # 'confidence': 0.999, | ||
237 | # 'data': [ | ||
238 | # { | ||
239 | # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
240 | # 'cells': [] | ||
241 | # }, | ||
242 | # { | ||
243 | # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
244 | # 'cells': [] | ||
245 | # } | ||
246 | # ] | ||
247 | # } | ||
248 | # } | ||
249 | # | ||
250 | # # 证件-1 | ||
251 | # res = { | ||
252 | # 'code': 1, | ||
253 | # 'msg': 'success', | ||
254 | # 'data': { | ||
255 | # 'classify': 0, | ||
256 | # 'confidence': 0.999, | ||
257 | # 'data': [ | ||
258 | # { | ||
259 | # 'cn_key': 'value', | ||
260 | # 'cn_key': 'value', | ||
261 | # }, | ||
262 | # { | ||
263 | # 'cn_key': 'value', | ||
264 | # 'cn_key': 'value', | ||
265 | # }, | ||
266 | # ] | ||
267 | # } | ||
268 | # } | ||
269 | # | ||
270 | # # 证件-2 or 其他类 | ||
271 | # res = { | ||
272 | # 'code': 1, | ||
273 | # 'msg': 'success', | ||
274 | # 'data': { | ||
275 | # 'classify': 0, | ||
276 | # 'confidence': 0.999, | ||
277 | # } | ||
278 | # } | ||
279 | with open(img_path, 'rb') as f: | 206 | with open(img_path, 'rb') as f: |
280 | base64_data = base64.b64encode(f.read()) | 207 | base64_data = base64.b64encode(f.read()) |
281 | # 获取解码后的base64值 | 208 | # 获取解码后的base64值 |
... | @@ -283,9 +210,10 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -283,9 +210,10 @@ class Command(BaseCommand, LoggerMixin): |
283 | json_data_1 = { | 210 | json_data_1 = { |
284 | "file": file_data | 211 | "file": file_data |
285 | } | 212 | } |
286 | response_1 = requests.post(self.ocr_url_1, json=json_data_1) | 213 | ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1) |
287 | if response_1.status_code == 200: | 214 | if ocr_res_1 is None: |
288 | ocr_res_1 = response_1.json() | 215 | raise Exception('ocr 1 error, img_path={0}'.format(img_path)) |
216 | else: | ||
289 | self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | 217 | self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( |
290 | self.log_base, img_path, ocr_res_1)) | 218 | self.log_base, img_path, ocr_res_1)) |
291 | 219 | ||
... | @@ -308,26 +236,121 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -308,26 +236,121 @@ class Command(BaseCommand, LoggerMixin): |
308 | "secret": conf.OCR_SECRET, | 236 | "secret": conf.OCR_SECRET, |
309 | "file": file_data | 237 | "file": file_data |
310 | } | 238 | } |
311 | response_2 = requests.post(self.ocr_url_2, data=json_data_2) | 239 | ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2) |
312 | if response_2.status_code == 200: | 240 | if ocr_res_2 is None: |
241 | raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
242 | else: | ||
313 | # 识别结果 | 243 | # 识别结果 |
314 | ocr_res_2 = response_2.json() | ||
315 | self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | 244 | self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( |
316 | self.log_base, img_path, ocr_res_2)) | 245 | self.log_base, img_path, ocr_res_2)) |
317 | self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) | 246 | self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) |
318 | else: | ||
319 | raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
320 | else: # 流水处理 | 247 | else: # 流水处理 |
321 | self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) | 248 | self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) |
322 | else: | 249 | else: |
323 | skip_img.append(self.parse_img_path(img_path)) | 250 | skip_img.append(self.parse_img_path(img_path)) |
324 | else: | 251 | |
325 | raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | 252 | # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): |
253 | # # # 流水 | ||
254 | # # res = { | ||
255 | # # 'code': 1, | ||
256 | # # 'msg': 'success', | ||
257 | # # 'data': { | ||
258 | # # 'classify': 0, | ||
259 | # # 'confidence': 0.999, | ||
260 | # # 'data': [ | ||
261 | # # { | ||
262 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
263 | # # 'cells': [] | ||
264 | # # }, | ||
265 | # # { | ||
266 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
267 | # # 'cells': [] | ||
268 | # # } | ||
269 | # # ] | ||
270 | # # } | ||
271 | # # } | ||
272 | # # | ||
273 | # # # 证件-1 | ||
274 | # # res = { | ||
275 | # # 'code': 1, | ||
276 | # # 'msg': 'success', | ||
277 | # # 'data': { | ||
278 | # # 'classify': 0, | ||
279 | # # 'confidence': 0.999, | ||
280 | # # 'data': [ | ||
281 | # # { | ||
282 | # # 'cn_key': 'value', | ||
283 | # # 'cn_key': 'value', | ||
284 | # # }, | ||
285 | # # { | ||
286 | # # 'cn_key': 'value', | ||
287 | # # 'cn_key': 'value', | ||
288 | # # }, | ||
289 | # # ] | ||
290 | # # } | ||
291 | # # } | ||
292 | # # | ||
293 | # # # 证件-2 or 其他类 | ||
294 | # # res = { | ||
295 | # # 'code': 1, | ||
296 | # # 'msg': 'success', | ||
297 | # # 'data': { | ||
298 | # # 'classify': 0, | ||
299 | # # 'confidence': 0.999, | ||
300 | # # } | ||
301 | # # } | ||
302 | # with open(img_path, 'rb') as f: | ||
303 | # base64_data = base64.b64encode(f.read()) | ||
304 | # # 获取解码后的base64值 | ||
305 | # file_data = base64_data.decode() | ||
306 | # json_data_1 = { | ||
307 | # "file": file_data | ||
308 | # } | ||
309 | # response_1 = requests.post(self.ocr_url_1, json=json_data_1) | ||
310 | # if response_1.status_code == 200: | ||
311 | # ocr_res_1 = response_1.json() | ||
312 | # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | ||
313 | # self.log_base, img_path, ocr_res_1)) | ||
314 | # | ||
315 | # if ocr_res_1.get('code') == 1: | ||
316 | # ocr_data = ocr_res_1.get('data', {}) | ||
317 | # classify = ocr_data.get('classify') | ||
318 | # if classify is None: | ||
319 | # skip_img.append(self.parse_img_path(img_path)) | ||
320 | # return | ||
321 | # elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 | ||
322 | # skip_img.append(self.parse_img_path(img_path)) | ||
323 | # return | ||
324 | # elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 | ||
325 | # self.license1_process(ocr_data, license_summary, classify, skip_img, img_path) | ||
326 | # elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 | ||
327 | # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) | ||
328 | # json_data_2 = { | ||
329 | # "pid": str(pid), | ||
330 | # "key": conf.OCR_KEY, | ||
331 | # "secret": conf.OCR_SECRET, | ||
332 | # "file": file_data | ||
333 | # } | ||
334 | # response_2 = requests.post(self.ocr_url_2, data=json_data_2) | ||
335 | # if response_2.status_code == 200: | ||
336 | # # 识别结果 | ||
337 | # ocr_res_2 = response_2.json() | ||
338 | # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | ||
339 | # self.log_base, img_path, ocr_res_2)) | ||
340 | # self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) | ||
341 | # else: | ||
342 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
343 | # else: # 流水处理 | ||
344 | # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) | ||
345 | # else: | ||
346 | # skip_img.append(self.parse_img_path(img_path)) | ||
347 | # else: | ||
348 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
326 | 349 | ||
327 | @staticmethod | 350 | @staticmethod |
328 | def parse_img_path(img_path): | 351 | def parse_img_path(img_path): |
329 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | 352 | img_name, _ = os.path.splitext(os.path.basename(img_path)) |
330 | return img_name[5], img_name[11] | 353 | return int(img_name[5])+1, int(img_name[11])+1 |
331 | 354 | ||
332 | @staticmethod | 355 | @staticmethod |
333 | def get_most(value_list): | 356 | def get_most(value_list): |
... | @@ -520,14 +543,14 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -520,14 +543,14 @@ class Command(BaseCommand, LoggerMixin): |
520 | # wb = Workbook() | 543 | # wb = Workbook() |
521 | 544 | ||
522 | # 4.1 获取OCR结果 | 545 | # 4.1 获取OCR结果 |
523 | # loop = asyncio.get_event_loop() | 546 | loop = asyncio.get_event_loop() |
524 | # tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary) | 547 | tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) |
525 | # for img_path in pdf_handler.img_path_list] | 548 | for img_path in pdf_handler.img_path_list] |
526 | # loop.run_until_complete(asyncio.wait(tasks)) | 549 | loop.run_until_complete(asyncio.wait(tasks)) |
527 | # loop.close() | 550 | # loop.close() |
528 | 551 | ||
529 | for img_path in pdf_handler.img_path_list: | 552 | # for img_path in pdf_handler.img_path_list: |
530 | self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) | 553 | # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) |
531 | 554 | ||
532 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' | 555 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' |
533 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, | 556 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, |
... | @@ -545,23 +568,37 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -545,23 +568,37 @@ class Command(BaseCommand, LoggerMixin): |
545 | wb.save(src_excel_path) | 568 | wb.save(src_excel_path) |
546 | wb.rebuild(merged_bs_summary, license_summary, skip_img) | 569 | wb.rebuild(merged_bs_summary, license_summary, skip_img) |
547 | wb.save(excel_path) | 570 | wb.save(excel_path) |
571 | except EDMSException as e: | ||
572 | self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] ' | ||
573 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) | ||
548 | except Exception as e: | 574 | except Exception as e: |
549 | doc.status = DocStatus.PROCESS_FAILED.value | 575 | doc.status = DocStatus.PROCESS_FAILED.value |
550 | doc.save() | 576 | doc.save() |
551 | self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( | 577 | self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] ' |
552 | self.log_base, business_type, doc.id, e)) | 578 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) |
553 | else: | 579 | else: |
554 | try: | 580 | try: |
555 | # 5.上传至EDMS | 581 | # 5.上传至EDMS |
556 | self.edms.upload(excel_path, doc, business_type) | 582 | for times in range(consts.RETRY_TIMES): |
557 | # print('upload pass') | 583 | try: |
584 | self.edms.upload(excel_path, doc, business_type) | ||
585 | except Exception as e: | ||
586 | self.cronjob_log.warn( | ||
587 | '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
588 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
589 | edms_exc = str(e) | ||
590 | else: | ||
591 | break | ||
592 | else: | ||
593 | raise EDMSException(edms_exc) | ||
558 | except Exception as e: | 594 | except Exception as e: |
559 | doc.status = DocStatus.UPLOAD_FAILED.value | 595 | doc.status = DocStatus.UPLOAD_FAILED.value |
560 | doc.save() | 596 | doc.save() |
561 | end_time = time.time() | 597 | end_time = time.time() |
562 | speed_time = int(end_time - start_time) | 598 | speed_time = int(end_time - start_time) |
563 | self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] ' | 599 | self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] ' |
564 | '[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e)) | 600 | '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id, |
601 | speed_time, e)) | ||
565 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | 602 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) |
566 | 603 | ||
567 | else: | 604 | else: | ... | ... |
-
Please register or sign in to post a comment