6e296e35 by 冯轩

merge

2 parents 0e53f7a3 bba2102f
...@@ -1287,6 +1287,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1287,6 +1287,9 @@ class Command(BaseCommand, LoggerMixin):
1287 target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id)) 1287 target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id))
1288 shutil.move(pdf_path, target_pdf_path) 1288 shutil.move(pdf_path, target_pdf_path)
1289 1289
1290 if '微信支付交易明细证明' in os.path.basename(pdf_path) or '微信流水' in os.path.basename(pdf_path):
1291 pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '12', re_ocr_flag])
1292 else:
1290 pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0', re_ocr_flag]) 1293 pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0', re_ocr_flag])
1291 pdf_task_str_list.append(pdf_task_str) 1294 pdf_task_str_list.append(pdf_task_str)
1292 except Exception as e: 1295 except Exception as e:
...@@ -1507,6 +1510,138 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1507,6 +1510,138 @@ class Command(BaseCommand, LoggerMixin):
1507 self.log_base, traceback.format_exc())) 1510 self.log_base, traceback.format_exc()))
1508 # error_list.append(1) 1511 # error_list.append(1)
1509 # return 1512 # return
1513 elif classify_1_str == '12' or classify_1_str == '29': # weixin e-bs 或e-invoice 都走微信电子流水逻辑
1514 try:
1515 max_img_count = 500
1516 for times in range(consts.RETRY_TIMES):
1517 try:
1518 if doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
1519 self.online_log.info('{0} [mo ni xia dan] [task={1}] [times={2}] '
1520 '[pdf_path={3}]'.format(self.log_base, task_str,
1521 times, pdf_path))
1522 elif os.path.exists(pdf_path):
1523 self.online_log.info('{0} [pdf from zip file] [task={1}] [times={2}] '
1524 '[pdf_path={3}]'.format(self.log_base, task_str,
1525 times, pdf_path))
1526 else:
1527 # self.edms.download(pdf_path, doc.metadata_version_id)
1528 self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme,
1529 business_type)
1530 self.online_log.info('{0} [ecm download success] [task={1}] [times={2}] '
1531 '[pdf_path={3}]'.format(self.log_base, task_str,
1532 times, pdf_path))
1533
1534 # 3.PDF文件提取图片
1535 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
1536 self.log_base, task_str, times))
1537 start_time = time.time()
1538 pdf_handler.extract_image_for_weixin(max_img_count)
1539 end_time = time.time()
1540 speed_time = int(end_time - start_time)
1541 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
1542 self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
1543 except Exception as e:
1544 self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
1545 '[error={3}]'.format(self.log_base, task_str, times,
1546 traceback.format_exc()))
1547 else:
1548 break
1549 else:
1550 raise Exception('download or pdf to img failed')
1551
1552 if pdf_handler.img_count == 0:
1553 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
1554 self.log_base, task_str))
1555 raise Exception('pdf img empty')
1556 elif pdf_handler.img_count >= max_img_count:
1557 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
1558 self.log_base, task_str, pdf_handler.img_count))
1559
1560 try:
1561 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
1562 report_table.objects.create(
1563 case_number=doc.application_id,
1564 request_team=RequestTeam.get_value(doc.document_scheme, 0),
1565 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
1566 input_file=doc.document_name,
1567 transaction_start=doc.start_time,
1568 transaction_end=doc.start_time,
1569 successful_at_this_level=False,
1570 failure_reason=FailureReason.IMG_LIMIT.value,
1571 process_name=ProcessName.ALL.value,
1572 notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
1573 )
1574 except Exception as e:
1575 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
1576 self.log_base, traceback.format_exc()))
1577
1578 try:
1579 doc.status = DocStatus.PROCESS_FAILED.value
1580 doc.page_count = pdf_handler.page_count
1581 doc.save()
1582 except Exception as e:
1583 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1584 self.log_base, traceback.format_exc()))
1585 else:
1586 try:
1587 if pdf_handler.is_e_pdf:
1588 doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \
1589 json.dumps(pdf_handler.metadata)
1590 doc.page_count = pdf_handler.page_count
1591 doc.save()
1592 except Exception as e:
1593 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1594 self.log_base, traceback.format_exc()))
1595
1596 with lock:
1597 todo_count_dict[task_str] = pdf_handler.img_count
1598
1599 self.online_log.info('{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'.format(
1600 self.log_base, task_str, pdf_handler.is_ebank
1601 ))
1602 for img_idx, img_path in enumerate(pdf_handler.img_path_list):
1603 while img_queue.full():
1604 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
1605 time.sleep(self.sleep_time_img_put)
1606 if pdf_handler.is_e_weixin_bs:
1607 try:
1608 #self.online_log.info('{0} [pdf_2_img_2_queue] [img_idx={1}] [page_text_list={2}]'.format(self.log_base, img_idx, pdf_handler.page_text_list))
1609 text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
1610 except Exception as e:
1611 text_list = []
1612 else:
1613 text_list = []
1614 img_queue.put((business_type, img_path, text_list))
1615 except Exception as e:
1616 try:
1617 end_time = timezone.now()
1618 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
1619 report_table.objects.create(
1620 case_number=doc.application_id,
1621 request_team=RequestTeam.get_value(doc.document_scheme, 0),
1622 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
1623 input_file=doc.document_name,
1624 transaction_start=doc.start_time,
1625 transaction_end=end_time,
1626 successful_at_this_level=False,
1627 failure_reason=FailureReason.PDF.value,
1628 process_name=ProcessName.ALL.value,
1629 )
1630 except Exception as e:
1631 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
1632 self.log_base, traceback.format_exc()))
1633
1634 try:
1635 doc.status = DocStatus.PROCESS_FAILED.value
1636 doc.page_count = pdf_handler.page_count
1637 doc.save()
1638 self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
1639 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
1640 except Exception as e:
1641 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1642 self.log_base, traceback.format_exc()))
1643 # error_list.append(1)
1644 # return
1510 else: # e-contract or or e-fsm-contract or e-hmh 1645 else: # e-contract or or e-fsm-contract or e-hmh
1511 try: 1646 try:
1512 # pdf下载 处理 图片存储 识别 1647 # pdf下载 处理 图片存储 识别
...@@ -1677,6 +1812,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1677,6 +1812,7 @@ class Command(BaseCommand, LoggerMixin):
1677 json_data_1['text_list'] = text_list 1812 json_data_1['text_list'] = text_list
1678 1813
1679 start_time = time.time() 1814 start_time = time.time()
1815 self.online_log.info('{0} [ocr_1 api] [img={1}] [json_data_1={2}]'.format(self.log_base, img_path, json_data_1))
1680 ocr_1_response = requests.post(url, json=json_data_1) 1816 ocr_1_response = requests.post(url, json=json_data_1)
1681 if ocr_1_response.status_code != 200: 1817 if ocr_1_response.status_code != 200:
1682 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) 1818 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
...@@ -1687,6 +1823,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1687,6 +1823,7 @@ class Command(BaseCommand, LoggerMixin):
1687 else: 1823 else:
1688 self.online_log.info('{0} [ocr_1 start] [img={1}] [url={2}]'.format(self.log_base, img_path, url)) 1824 self.online_log.info('{0} [ocr_1 start] [img={1}] [url={2}]'.format(self.log_base, img_path, url))
1689 ocr_1_res = ocr_1_response.json() 1825 ocr_1_res = ocr_1_response.json()
1826 self.online_log.info('{0} [ocr_1 api res] [img={1}] [ocr_1_res={2}]'.format(self.log_base, img_path, ocr_1_res))
1690 end_time = time.time() 1827 end_time = time.time()
1691 speed_time = int(end_time - start_time) 1828 speed_time = int(end_time - start_time)
1692 self.online_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format( 1829 self.online_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format(
......
...@@ -11,6 +11,8 @@ from openpyxl import Workbook ...@@ -11,6 +11,8 @@ from openpyxl import Workbook
11 from openpyxl.styles import PatternFill, numbers 11 from openpyxl.styles import PatternFill, numbers
12 from openpyxl.utils import get_column_letter 12 from openpyxl.utils import get_column_letter
13 from apps.doc import consts 13 from apps.doc import consts
14 import logging
15 online_log = logging.getLogger('online')
14 16
15 17
16 class BSWorkbook(Workbook): 18 class BSWorkbook(Workbook):
...@@ -562,6 +564,8 @@ class BSWorkbook(Workbook): ...@@ -562,6 +564,8 @@ class BSWorkbook(Workbook):
562 borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx] 564 borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx]
563 565
564 summary_cell_value = None if summary_cell is None else summary_cell.value 566 summary_cell_value = None if summary_cell is None else summary_cell.value
567 if summary_cell.value is not None:
568 summary_cell_value = summary_cell_value.strip()
565 date_cell_value = None if date_cell is None else date_cell.value 569 date_cell_value = None if date_cell is None else date_cell.value
566 amount_cell_value = None if amount_cell is None else amount_cell.value 570 amount_cell_value = None if amount_cell is None else amount_cell.value
567 over_cell_value = None if over_cell is None else over_cell.value 571 over_cell_value = None if over_cell is None else over_cell.value
...@@ -638,6 +642,7 @@ class BSWorkbook(Workbook): ...@@ -638,6 +642,7 @@ class BSWorkbook(Workbook):
638 642
639 # 3.2.提取信息、高亮 643 # 3.2.提取信息、高亮
640 # row = summary_cell.row 644 # row = summary_cell.row
645 # online_log.info('[ti qu xin xi gao liang =========== >] [summary_cell_value={0}]'.format(summary_cell_value))
641 if summary_cell is not None: 646 if summary_cell is not None:
642 # 关键词1提取 647 # 关键词1提取
643 if summary_cell_value in self.interest_keyword: 648 if summary_cell_value in self.interest_keyword:
......
...@@ -704,6 +704,14 @@ class UploadDocView(GenericView, DocHandler): ...@@ -704,6 +704,14 @@ class UploadDocView(GenericView, DocHandler):
704 classify_1 = classify_1_tmp 704 classify_1 = classify_1_tmp
705 break 705 break
706 706
707 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
708 classify_1 = 12
709 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
710
711 if classify_1 == 0 and (document_name.startswith("dzfp") or '电子发票' in document_name):
712 classify_1 = 29
713 self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
714
707 715
708 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ 716 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
709 or document_name.endswith('.RAR'): 717 or document_name.endswith('.RAR'):
...@@ -1259,6 +1267,14 @@ class DocView(DocGenericView, DocHandler): ...@@ -1259,6 +1267,14 @@ class DocView(DocGenericView, DocHandler):
1259 classify_1 = classify_1_tmp 1267 classify_1 = classify_1_tmp
1260 break 1268 break
1261 1269
1270 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
1271 classify_1 = 12
1272 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
1273
1274 if classify_1 == 0 and (document_name.startswith("dzfp") or '电子发票' in document_name):
1275 classify_1 = 29
1276 self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
1277
1262 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] 1278 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
1263 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N']) 1279 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
1264 enqueue_res = rh.enqueue([task], is_priority) 1280 enqueue_res = rh.enqueue([task], is_priority)
......
...@@ -2731,6 +2731,13 @@ def se_compare_license(license_en, ocr_res_dict, field_list, is_auto): ...@@ -2731,6 +2731,13 @@ def se_compare_license(license_en, ocr_res_dict, field_list, is_auto):
2731 value = json.dumps(value, ensure_ascii=False) 2731 value = json.dumps(value, ensure_ascii=False)
2732 error_type = empty_error_type if result == consts.RESULT_Y else ErrorType.OCR.value 2732 error_type = empty_error_type if result == consts.RESULT_Y else ErrorType.OCR.value
2733 result_field_list.append((name, value, result, ocr_str, img_path, error_type, compare_logic[name][3])) 2733 result_field_list.append((name, value, result, ocr_str, img_path, error_type, compare_logic[name][3]))
2734 # CHINARPA-5620 电子发票返回CMS时 增加COMMENTS "此单为电子数控发票"
2735 if ocr_field == consts.MVI_OCR_FIELD :
2736 e_invoice = ocr_res_list[res_idx].get('电子发票')
2737 if e_invoice is not None and e_invoice == '是':
2738 result_field_list.append(('是否电子发票', '是/否', 'Y', '是', empty_str, empty_error_type, '此单为电子数控发票'))
2739 else:
2740 result_field_list.append(('是否电子发票', '是/否', 'Y', '否', empty_str, empty_error_type, '此单为电子数控发票'))
2734 else: 2741 else:
2735 no_ocr_result = True 2742 no_ocr_result = True
2736 2743
...@@ -3523,6 +3530,9 @@ def se_compare_process(compare_info, ocr_res_dict, is_gsyh, is_auto, id_res_list ...@@ -3523,6 +3530,9 @@ def se_compare_process(compare_info, ocr_res_dict, is_gsyh, is_auto, id_res_list
3523 elif isinstance(cn_reason, list): 3530 elif isinstance(cn_reason, list):
3524 cn_reason_list.extend(cn_reason) 3531 cn_reason_list.extend(cn_reason)
3525 rpa_failure_reason.setdefault('、'.join(cn_reason), []).append(value) 3532 rpa_failure_reason.setdefault('、'.join(cn_reason), []).append(value)
3533 #
3534 if license_en == consts.MVI_EN and name == '是否电子发票' and ocr_str == '是':
3535 cn_reason_list.append(cn_reason)
3526 compare_result.append( 3536 compare_result.append(
3527 { 3537 {
3528 consts.HEAD_LIST[0]: info_key, 3538 consts.HEAD_LIST[0]: info_key,
......
...@@ -69,6 +69,7 @@ class PDFHandler: ...@@ -69,6 +69,7 @@ class PDFHandler:
69 self.suffix = self.get_suffix(document_name) 69 self.suffix = self.get_suffix(document_name)
70 self.is_ebank = False 70 self.is_ebank = False
71 self.is_e_pdf = False 71 self.is_e_pdf = False
72 self.is_e_weixin_bs = False
72 self.page_text_list = [] 73 self.page_text_list = []
73 self.pdf_info = {} 74 self.pdf_info = {}
74 self.img_path_pno_list = [] 75 self.img_path_pno_list = []
...@@ -186,6 +187,8 @@ class PDFHandler: ...@@ -186,6 +187,8 @@ class PDFHandler:
186 self.img_path_list.append(img_save_path) 187 self.img_path_list.append(img_save_path)
187 if self.is_ebank: 188 if self.is_ebank:
188 self.rebuild_bbox(pm.width, pm.height, page.number) 189 self.rebuild_bbox(pm.width, pm.height, page.number)
190 if self.is_e_weixin_bs:
191 self.rebuild_bbox(pm.width, pm.height, page.number)
189 192
190 @staticmethod 193 @staticmethod
191 def getimage(pix): 194 def getimage(pix):
...@@ -407,6 +410,57 @@ class PDFHandler: ...@@ -407,6 +410,57 @@ class PDFHandler:
407 self.is_e_pdf = True 410 self.is_e_pdf = True
408 self.page_text_list = page_text_list 411 self.page_text_list = page_text_list
409 412
def put_text(self, pdf):
    """Collect the embedded text layer of every page of ``pdf``.

    For each page the ``(bbox, text)`` pair of every non-blank text span is
    gathered from the page's text dictionary.  When every page passes the
    checks below, the per-page records (``width``, ``height``, ``rotation``,
    ``text``) are stored in ``self.page_text_list`` and both
    ``self.is_e_pdf`` and ``self.is_e_weixin_bs`` are set to True.

    The method returns early, leaving all attributes untouched, when:

    * a page rotation is neither None nor an integer multiple of 90, or
    * after any page the cumulative number of collected text items is
      below five per processed page.

    Args:
        pdf: opened document exposing ``pageCount`` and ``loadPage(pno)``.

    Returns:
        None. Results are published through instance attributes.
    """
    import sys

    # Spans that the process's stdout could not encode are skipped (this is
    # how special emoji get filtered out).  The check is done with
    # ``str.encode`` so nothing is actually written to stdout and an
    # unrelated stdout failure cannot silently drop text.
    # NOTE(review): the filter therefore depends on the stdout encoding of
    # the running process - confirm whether a fixed encoding is wanted.
    stdout = sys.stdout
    out_encoding = getattr(stdout, 'encoding', None)
    if not isinstance(out_encoding, str):
        out_encoding = None
    out_errors = getattr(stdout, 'errors', None)
    if not isinstance(out_errors, str):
        out_errors = 'strict'

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)

        # Normalise the rotation to a quarter-turn count in 0..3.
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            divisor, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = divisor % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # ``or ()`` tolerates blocks/lines that carry no text children
        # (for example image blocks, which have no 'lines' entry).
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text')
                    if not char or not char.strip():
                        continue

                    if out_encoding is not None:
                        try:
                            char.encode(out_encoding, out_errors)
                        except UnicodeError:
                            # Not representable on stdout: skip the span.
                            continue
                        except LookupError:
                            # Unknown codec/handler name: keep the span.
                            pass

                    text_list.append((span.get('bbox'), char))

        # Require an average of at least five text items per page so far;
        # otherwise the document is not handled as an electronic statement.
        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * 5:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
463
410 def e_contract_process(self): 464 def e_contract_process(self):
411 os.makedirs(self.img_dir_path, exist_ok=True) 465 os.makedirs(self.img_dir_path, exist_ok=True)
412 with fitz.Document(self.path) as pdf: 466 with fitz.Document(self.path) as pdf:
...@@ -473,6 +527,59 @@ class PDFHandler: ...@@ -473,6 +527,59 @@ class PDFHandler:
473 self.merge_il(pdf, pno, il) 527 self.merge_il(pdf, pno, il)
474 self.img_count = len(self.img_path_list) 528 self.img_count = len(self.img_path_list)
475 529
def extract_image_for_weixin(self, max_img_count=None):
    """Turn the source file into page images for the WeChat statement flow.

    Resets ``self.img_path_list`` / ``self.xref_set``, makes sure
    ``self.img_dir_path`` exists, then:

    * if the file suffix is one of ``self.img_suffixs`` the file is copied
      as the single output image;
    * otherwise the file is opened as a PDF, decrypted with the candidate
      passwords in ``self.pwd_list``, its text layer is collected through
      ``self.put_text`` and every page is rendered or has its embedded
      image extracted.

    Args:
        max_img_count: optional int. When the PDF has at least this many
            pages, ``self.img_count`` is set to the page count and the
            method returns without producing any image.

    Returns:
        None. ``self.img_path_list`` and ``self.img_count`` hold the result;
        for PDFs ``self.metadata`` and ``self.page_count`` are set as well.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    if self.suffix in self.img_suffixs:
        # The input is already an image: copy it as page 0.
        img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, img_save_path)
        self.img_path_list.append(img_save_path)
    else:
        with fitz.Document(self.path) as pdf:
            # Decrypt: try each known password until the document opens.
            for pwd in self.pwd_list:
                if not pdf.isEncrypted:
                    break
                pdf.authenticate(pwd)

            self.metadata = pdf.metadata
            self.page_count = pdf.pageCount
            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                # Too many pages: report the count and let the caller decide.
                self.img_count = pdf.pageCount
                return

            self.put_text(pdf)
            for pno in range(pdf.pageCount):
                # Image objects on the page; each entry starts with
                # (xref, smask, width, height, bpc, colorspace, ...).
                il = pdf.getPageImageList(pno)

                # 1. Electronic PDF / e-bank statement, or no image object:
                #    save the whole page as a PNG.
                if self.is_e_pdf or self.is_ebank or len(il) == 0:
                    self.page_to_png(pdf.loadPage(pno))
                # 2. Exactly one image object:
                #    small image (e.g. a stamp on an e-bill): render the page;
                #    large image: render the page as well;
                #    otherwise extract the image object itself.
                elif len(il) == 1:
                    # Only the leading fields are needed; slicing keeps this
                    # working whether the entry has 9 or 10 fields.
                    xref, smask, width, height, _, colorspace = il[0][:6]
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        self.page_to_png(pdf.loadPage(pno))
                    elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                        self.is_new_modify = 1
                        # Guards against overly large images.
                        is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1])
                        self.page_to_png(pdf.loadPage(pno), is_big_img=is_big_img)
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. More than one image object: handled by merge_il.
                else:
                    self.merge_il(pdf, pno, il)
    self.img_count = len(self.img_path_list)
582
476 def extract_page_image(self): 583 def extract_page_image(self):
477 self.img_path_list = [] 584 self.img_path_list = []
478 self.xref_set = set() 585 self.xref_set = set()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!