e570371a by 周伟奇

add edms retry

1 parent 04020101
...@@ -35,6 +35,8 @@ DEALER_CODE_META_FIELD_id = 13 ...@@ -35,6 +35,8 @@ DEALER_CODE_META_FIELD_id = 13
35 BUSINESS_TYPE_META_FIELD_id = 93 35 BUSINESS_TYPE_META_FIELD_id = 93
36 DEALER_CODE = 'ocr_situ_group' 36 DEALER_CODE = 'ocr_situ_group'
37 37
38 RETRY_TIMES = 3
39
38 # ---------银行流水模板相关-------------------------------------------------------------------------------------------- 40 # ---------银行流水模板相关--------------------------------------------------------------------------------------------
39 41
40 TRANS_MAP = { 42 TRANS_MAP = {
......
1 class EDMSException(Exception):
2 pass
...@@ -19,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords ...@@ -19,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
19 from apps.doc.named_enum import KeywordsType 19 from apps.doc.named_enum import KeywordsType
20 from apps.doc import consts 20 from apps.doc import consts
21 from apps.doc.ocr.edms import EDMS, rh 21 from apps.doc.ocr.edms import EDMS, rh
22 from apps.doc.exceptions import EDMSException
22 23
23 24
24 class Command(BaseCommand, LoggerMixin): 25 class Command(BaseCommand, LoggerMixin):
...@@ -72,11 +73,21 @@ class Command(BaseCommand, LoggerMixin): ...@@ -72,11 +73,21 @@ class Command(BaseCommand, LoggerMixin):
72 os.makedirs(doc_data_path, exist_ok=True) 73 os.makedirs(doc_data_path, exist_ok=True)
73 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 74 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
74 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): 75 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
75 self.edms.download(pdf_path, doc.metadata_version_id) 76 for times in range(consts.RETRY_TIMES):
77 try:
78 self.edms.download(pdf_path, doc.metadata_version_id)
79 except Exception as e:
80 self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
81 '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
82 edms_exc = str(e)
83 else:
84 break
85 else:
86 raise EDMSException(edms_exc)
76 87
77 excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) 88 excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
78 src_excel_path = os.path.join(doc_data_path, 'src.xlsx') 89 src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
79 self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( 90 self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
80 self.log_base, business_type, doc.id, pdf_path)) 91 self.log_base, business_type, doc.id, pdf_path))
81 return doc_data_path, excel_path, src_excel_path, pdf_path 92 return doc_data_path, excel_path, src_excel_path, pdf_path
82 93
...@@ -177,105 +188,21 @@ class Command(BaseCommand, LoggerMixin): ...@@ -177,105 +188,21 @@ class Command(BaseCommand, LoggerMixin):
177 else: 188 else:
178 skip_img.append(self.parse_img_path(img_path)) 189 skip_img.append(self.parse_img_path(img_path))
179 190
180 # async def fetch_ocr_result(self, url, json_data): 191 @staticmethod
181 # async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: 192 async def fetch_ocr_1_result(url, json_data):
182 # async with session.post(url, json=json_data) as response: 193 async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
183 # if response.status == 200: 194 async with session.post(url, json=json_data) as response:
184 # return await response.json() 195 if response.status == 200:
185 # 196 return await response.json()
186 # async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary): 197
187 # with open(img_path, 'rb') as f: 198 @staticmethod
188 # base64_data = base64.b64encode(f.read()) 199 async def fetch_ocr_2_result(url, json_data):
189 # # 获取解码后的base64值 200 async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
190 # file_data = base64_data.decode() 201 async with session.post(url, data=json_data) as response:
191 # json_data_1 = { 202 if response.status == 200:
192 # "file": file_data 203 return await response.json()
193 # } 204
194 # ocr_res_1 = await self.fetch_ocr_result(self.ocr_url_1, json_data_1) 205 async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
195 # if ocr_res_1 is None:
196 # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
197 # else:
198 # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
199 # self.log_base, img_path, ocr_res_1))
200 #
201 # if ocr_res_1.get('code') == 1:
202 # ocr_data = ocr_res_1.get('data', {})
203 # classify = ocr_data.get('classify')
204 # if classify is None:
205 # return
206 # elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
207 # return
208 # elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
209 # self.license1_process(ocr_data, license_summary, classify)
210 # elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
211 # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
212 # json_data_2 = {
213 # "pid": str(pid),
214 # "key": conf.OCR_KEY,
215 # "secret": conf.OCR_SECRET,
216 # "file": file_data
217 # }
218 # ocr_res_2 = await self.fetch_ocr_result(self.ocr_url_2, json_data_2)
219 # if ocr_res_2 is None:
220 # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
221 # else:
222 # # 识别结果
223 # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
224 # self.log_base, img_path, ocr_res_2))
225 # self.license2_process(ocr_res_2, license_summary, pid, classify)
226 # else: # 流水处理
227 # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify)
228
229 def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
230 # # 流水
231 # res = {
232 # 'code': 1,
233 # 'msg': 'success',
234 # 'data': {
235 # 'classify': 0,
236 # 'confidence': 0.999,
237 # 'data': [
238 # {
239 # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
240 # 'cells': []
241 # },
242 # {
243 # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
244 # 'cells': []
245 # }
246 # ]
247 # }
248 # }
249 #
250 # # 证件-1
251 # res = {
252 # 'code': 1,
253 # 'msg': 'success',
254 # 'data': {
255 # 'classify': 0,
256 # 'confidence': 0.999,
257 # 'data': [
258 # {
259 # 'cn_key': 'value',
260 # 'cn_key': 'value',
261 # },
262 # {
263 # 'cn_key': 'value',
264 # 'cn_key': 'value',
265 # },
266 # ]
267 # }
268 # }
269 #
270 # # 证件-2 or 其他类
271 # res = {
272 # 'code': 1,
273 # 'msg': 'success',
274 # 'data': {
275 # 'classify': 0,
276 # 'confidence': 0.999,
277 # }
278 # }
279 with open(img_path, 'rb') as f: 206 with open(img_path, 'rb') as f:
280 base64_data = base64.b64encode(f.read()) 207 base64_data = base64.b64encode(f.read())
281 # 获取解码后的base64值 208 # 获取解码后的base64值
...@@ -283,9 +210,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -283,9 +210,10 @@ class Command(BaseCommand, LoggerMixin):
283 json_data_1 = { 210 json_data_1 = {
284 "file": file_data 211 "file": file_data
285 } 212 }
286 response_1 = requests.post(self.ocr_url_1, json=json_data_1) 213 ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
287 if response_1.status_code == 200: 214 if ocr_res_1 is None:
288 ocr_res_1 = response_1.json() 215 raise Exception('ocr 1 error, img_path={0}'.format(img_path))
216 else:
289 self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( 217 self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
290 self.log_base, img_path, ocr_res_1)) 218 self.log_base, img_path, ocr_res_1))
291 219
...@@ -308,26 +236,121 @@ class Command(BaseCommand, LoggerMixin): ...@@ -308,26 +236,121 @@ class Command(BaseCommand, LoggerMixin):
308 "secret": conf.OCR_SECRET, 236 "secret": conf.OCR_SECRET,
309 "file": file_data 237 "file": file_data
310 } 238 }
311 response_2 = requests.post(self.ocr_url_2, data=json_data_2) 239 ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
312 if response_2.status_code == 200: 240 if ocr_res_2 is None:
241 raise Exception('ocr 2 error, img_path={0}'.format(img_path))
242 else:
313 # 识别结果 243 # 识别结果
314 ocr_res_2 = response_2.json()
315 self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( 244 self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
316 self.log_base, img_path, ocr_res_2)) 245 self.log_base, img_path, ocr_res_2))
317 self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) 246 self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
318 else:
319 raise Exception('ocr 2 error, img_path={0}'.format(img_path))
320 else: # 流水处理 247 else: # 流水处理
321 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) 248 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
322 else: 249 else:
323 skip_img.append(self.parse_img_path(img_path)) 250 skip_img.append(self.parse_img_path(img_path))
324 else: 251
325 raise Exception('ocr 1 error, img_path={0}'.format(img_path)) 252 # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
253 # # # 流水
254 # # res = {
255 # # 'code': 1,
256 # # 'msg': 'success',
257 # # 'data': {
258 # # 'classify': 0,
259 # # 'confidence': 0.999,
260 # # 'data': [
261 # # {
262 # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
263 # # 'cells': []
264 # # },
265 # # {
266 # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
267 # # 'cells': []
268 # # }
269 # # ]
270 # # }
271 # # }
272 # #
273 # # # 证件-1
274 # # res = {
275 # # 'code': 1,
276 # # 'msg': 'success',
277 # # 'data': {
278 # # 'classify': 0,
279 # # 'confidence': 0.999,
280 # # 'data': [
281 # # {
282 # # 'cn_key': 'value',
283 # # 'cn_key': 'value',
284 # # },
285 # # {
286 # # 'cn_key': 'value',
287 # # 'cn_key': 'value',
288 # # },
289 # # ]
290 # # }
291 # # }
292 # #
293 # # # 证件-2 or 其他类
294 # # res = {
295 # # 'code': 1,
296 # # 'msg': 'success',
297 # # 'data': {
298 # # 'classify': 0,
299 # # 'confidence': 0.999,
300 # # }
301 # # }
302 # with open(img_path, 'rb') as f:
303 # base64_data = base64.b64encode(f.read())
304 # # 获取解码后的base64值
305 # file_data = base64_data.decode()
306 # json_data_1 = {
307 # "file": file_data
308 # }
309 # response_1 = requests.post(self.ocr_url_1, json=json_data_1)
310 # if response_1.status_code == 200:
311 # ocr_res_1 = response_1.json()
312 # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
313 # self.log_base, img_path, ocr_res_1))
314 #
315 # if ocr_res_1.get('code') == 1:
316 # ocr_data = ocr_res_1.get('data', {})
317 # classify = ocr_data.get('classify')
318 # if classify is None:
319 # skip_img.append(self.parse_img_path(img_path))
320 # return
321 # elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
322 # skip_img.append(self.parse_img_path(img_path))
323 # return
324 # elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
325 # self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
326 # elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
327 # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
328 # json_data_2 = {
329 # "pid": str(pid),
330 # "key": conf.OCR_KEY,
331 # "secret": conf.OCR_SECRET,
332 # "file": file_data
333 # }
334 # response_2 = requests.post(self.ocr_url_2, data=json_data_2)
335 # if response_2.status_code == 200:
336 # # 识别结果
337 # ocr_res_2 = response_2.json()
338 # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
339 # self.log_base, img_path, ocr_res_2))
340 # self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
341 # else:
342 # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
343 # else: # 流水处理
344 # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
345 # else:
346 # skip_img.append(self.parse_img_path(img_path))
347 # else:
348 # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
326 349
327 @staticmethod 350 @staticmethod
328 def parse_img_path(img_path): 351 def parse_img_path(img_path):
329 img_name, _ = os.path.splitext(os.path.basename(img_path)) 352 img_name, _ = os.path.splitext(os.path.basename(img_path))
330 return img_name[5], img_name[11] 353 return int(img_name[5])+1, int(img_name[11])+1
331 354
332 @staticmethod 355 @staticmethod
333 def get_most(value_list): 356 def get_most(value_list):
...@@ -520,14 +543,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -520,14 +543,14 @@ class Command(BaseCommand, LoggerMixin):
520 # wb = Workbook() 543 # wb = Workbook()
521 544
522 # 4.1 获取OCR结果 545 # 4.1 获取OCR结果
523 # loop = asyncio.get_event_loop() 546 loop = asyncio.get_event_loop()
524 # tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary) 547 tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
525 # for img_path in pdf_handler.img_path_list] 548 for img_path in pdf_handler.img_path_list]
526 # loop.run_until_complete(asyncio.wait(tasks)) 549 loop.run_until_complete(asyncio.wait(tasks))
527 # loop.close() 550 # loop.close()
528 551
529 for img_path in pdf_handler.img_path_list: 552 # for img_path in pdf_handler.img_path_list:
530 self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) 553 # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
531 554
532 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' 555 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
533 '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, 556 '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
...@@ -545,23 +568,37 @@ class Command(BaseCommand, LoggerMixin): ...@@ -545,23 +568,37 @@ class Command(BaseCommand, LoggerMixin):
545 wb.save(src_excel_path) 568 wb.save(src_excel_path)
546 wb.rebuild(merged_bs_summary, license_summary, skip_img) 569 wb.rebuild(merged_bs_summary, license_summary, skip_img)
547 wb.save(excel_path) 570 wb.save(excel_path)
571 except EDMSException as e:
572 self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
573 '[err={3}]'.format(self.log_base, business_type, doc.id, e))
548 except Exception as e: 574 except Exception as e:
549 doc.status = DocStatus.PROCESS_FAILED.value 575 doc.status = DocStatus.PROCESS_FAILED.value
550 doc.save() 576 doc.save()
551 self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( 577 self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
552 self.log_base, business_type, doc.id, e)) 578 '[err={3}]'.format(self.log_base, business_type, doc.id, e))
553 else: 579 else:
554 try: 580 try:
555 # 5.上传至EDMS 581 # 5.上传至EDMS
556 self.edms.upload(excel_path, doc, business_type) 582 for times in range(consts.RETRY_TIMES):
557 # print('upload pass') 583 try:
584 self.edms.upload(excel_path, doc, business_type)
585 except Exception as e:
586 self.cronjob_log.warn(
587 '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
588 '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
589 edms_exc = str(e)
590 else:
591 break
592 else:
593 raise EDMSException(edms_exc)
558 except Exception as e: 594 except Exception as e:
559 doc.status = DocStatus.UPLOAD_FAILED.value 595 doc.status = DocStatus.UPLOAD_FAILED.value
560 doc.save() 596 doc.save()
561 end_time = time.time() 597 end_time = time.time()
562 speed_time = int(end_time - start_time) 598 speed_time = int(end_time - start_time)
563 self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] ' 599 self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
564 '[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e)) 600 '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id,
601 speed_time, e))
565 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) 602 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
566 603
567 else: 604 else:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!