272692c8 by 冯轩

Merge branch 'feature/weixin-bs-2'

2 parents e08e5c00 df94248b
...@@ -1287,7 +1287,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1287,7 +1287,10 @@ class Command(BaseCommand, LoggerMixin):
1287 target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id)) 1287 target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id))
1288 shutil.move(pdf_path, target_pdf_path) 1288 shutil.move(pdf_path, target_pdf_path)
1289 1289
1290 pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0']) 1290 if '微信支付交易明细证明' in os.path.basename(pdf_path) or '微信流水' in os.path.basename(pdf_path):
1291 pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '12'])
1292 else:
1293 pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0'])
1291 pdf_task_str_list.append(pdf_task_str) 1294 pdf_task_str_list.append(pdf_task_str)
1292 except Exception as e: 1295 except Exception as e:
1293 self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]' 1296 self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]'
...@@ -1504,6 +1507,138 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1504,6 +1507,138 @@ class Command(BaseCommand, LoggerMixin):
1504 self.log_base, traceback.format_exc())) 1507 self.log_base, traceback.format_exc()))
1505 # error_list.append(1) 1508 # error_list.append(1)
1506 # return 1509 # return
1510 elif classify_1_str == '12' or classify_1_str == '29': # weixin e-bs 或e-invoice 都走微信电子流水逻辑
1511 try:
1512 max_img_count = 500
1513 for times in range(consts.RETRY_TIMES):
1514 try:
1515 if doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
1516 self.online_log.info('{0} [mo ni xia dan] [task={1}] [times={2}] '
1517 '[pdf_path={3}]'.format(self.log_base, task_str,
1518 times, pdf_path))
1519 elif os.path.exists(pdf_path):
1520 self.online_log.info('{0} [pdf from zip file] [task={1}] [times={2}] '
1521 '[pdf_path={3}]'.format(self.log_base, task_str,
1522 times, pdf_path))
1523 else:
1524 # self.edms.download(pdf_path, doc.metadata_version_id)
1525 self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme,
1526 business_type)
1527 self.online_log.info('{0} [ecm download success] [task={1}] [times={2}] '
1528 '[pdf_path={3}]'.format(self.log_base, task_str,
1529 times, pdf_path))
1530
1531 # 3.PDF文件提取图片
1532 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
1533 self.log_base, task_str, times))
1534 start_time = time.time()
1535 pdf_handler.extract_image_for_weixin(max_img_count)
1536 end_time = time.time()
1537 speed_time = int(end_time - start_time)
1538 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
1539 self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
1540 except Exception as e:
1541 self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
1542 '[error={3}]'.format(self.log_base, task_str, times,
1543 traceback.format_exc()))
1544 else:
1545 break
1546 else:
1547 raise Exception('download or pdf to img failed')
1548
1549 if pdf_handler.img_count == 0:
1550 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
1551 self.log_base, task_str))
1552 raise Exception('pdf img empty')
1553 elif pdf_handler.img_count >= max_img_count:
1554 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
1555 self.log_base, task_str, pdf_handler.img_count))
1556
1557 try:
1558 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
1559 report_table.objects.create(
1560 case_number=doc.application_id,
1561 request_team=RequestTeam.get_value(doc.document_scheme, 0),
1562 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
1563 input_file=doc.document_name,
1564 transaction_start=doc.start_time,
1565 transaction_end=doc.start_time,
1566 successful_at_this_level=False,
1567 failure_reason=FailureReason.IMG_LIMIT.value,
1568 process_name=ProcessName.ALL.value,
1569 notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
1570 )
1571 except Exception as e:
1572 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
1573 self.log_base, traceback.format_exc()))
1574
1575 try:
1576 doc.status = DocStatus.PROCESS_FAILED.value
1577 doc.page_count = pdf_handler.page_count
1578 doc.save()
1579 except Exception as e:
1580 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1581 self.log_base, traceback.format_exc()))
1582 else:
1583 try:
1584 if pdf_handler.is_e_pdf:
1585 doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \
1586 json.dumps(pdf_handler.metadata)
1587 doc.page_count = pdf_handler.page_count
1588 doc.save()
1589 except Exception as e:
1590 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1591 self.log_base, traceback.format_exc()))
1592
1593 with lock:
1594 todo_count_dict[task_str] = pdf_handler.img_count
1595
1596 self.online_log.info('{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'.format(
1597 self.log_base, task_str, pdf_handler.is_ebank
1598 ))
1599 for img_idx, img_path in enumerate(pdf_handler.img_path_list):
1600 while img_queue.full():
1601 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
1602 time.sleep(self.sleep_time_img_put)
1603 if pdf_handler.is_e_weixin_bs:
1604 try:
1605 #self.online_log.info('{0} [pdf_2_img_2_queue] [img_idx={1}] [page_text_list={2}]'.format(self.log_base, img_idx, pdf_handler.page_text_list))
1606 text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
1607 except Exception as e:
1608 text_list = []
1609 else:
1610 text_list = []
1611 img_queue.put((business_type, img_path, text_list))
1612 except Exception as e:
1613 try:
1614 end_time = timezone.now()
1615 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
1616 report_table.objects.create(
1617 case_number=doc.application_id,
1618 request_team=RequestTeam.get_value(doc.document_scheme, 0),
1619 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
1620 input_file=doc.document_name,
1621 transaction_start=doc.start_time,
1622 transaction_end=end_time,
1623 successful_at_this_level=False,
1624 failure_reason=FailureReason.PDF.value,
1625 process_name=ProcessName.ALL.value,
1626 )
1627 except Exception as e:
1628 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
1629 self.log_base, traceback.format_exc()))
1630
1631 try:
1632 doc.status = DocStatus.PROCESS_FAILED.value
1633 doc.page_count = pdf_handler.page_count
1634 doc.save()
1635 self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
1636 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
1637 except Exception as e:
1638 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1639 self.log_base, traceback.format_exc()))
1640 # error_list.append(1)
1641 # return
1507 else: # e-contract or or e-fsm-contract or e-hmh 1642 else: # e-contract or or e-fsm-contract or e-hmh
1508 try: 1643 try:
1509 # pdf下载 处理 图片存储 识别 1644 # pdf下载 处理 图片存储 识别
...@@ -1674,6 +1809,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1674,6 +1809,7 @@ class Command(BaseCommand, LoggerMixin):
1674 json_data_1['text_list'] = text_list 1809 json_data_1['text_list'] = text_list
1675 1810
1676 start_time = time.time() 1811 start_time = time.time()
1812 self.online_log.info('{0} [ocr_1 api] [img={1}] [json_data_1={2}]'.format(self.log_base, img_path, json_data_1))
1677 ocr_1_response = requests.post(url, json=json_data_1) 1813 ocr_1_response = requests.post(url, json=json_data_1)
1678 if ocr_1_response.status_code != 200: 1814 if ocr_1_response.status_code != 200:
1679 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) 1815 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
...@@ -1684,6 +1820,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1684,6 +1820,7 @@ class Command(BaseCommand, LoggerMixin):
1684 else: 1820 else:
1685 self.online_log.info('{0} [ocr_1 start] [img={1}] [url={2}]'.format(self.log_base, img_path, url)) 1821 self.online_log.info('{0} [ocr_1 start] [img={1}] [url={2}]'.format(self.log_base, img_path, url))
1686 ocr_1_res = ocr_1_response.json() 1822 ocr_1_res = ocr_1_response.json()
1823 self.online_log.info('{0} [ocr_1 api res] [img={1}] [ocr_1_res={2}]'.format(self.log_base, img_path, ocr_1_res))
1687 end_time = time.time() 1824 end_time = time.time()
1688 speed_time = int(end_time - start_time) 1825 speed_time = int(end_time - start_time)
1689 self.online_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format( 1826 self.online_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format(
......
...@@ -11,6 +11,8 @@ from openpyxl import Workbook ...@@ -11,6 +11,8 @@ from openpyxl import Workbook
11 from openpyxl.styles import PatternFill, numbers 11 from openpyxl.styles import PatternFill, numbers
12 from openpyxl.utils import get_column_letter 12 from openpyxl.utils import get_column_letter
13 from apps.doc import consts 13 from apps.doc import consts
14 import logging
15 online_log = logging.getLogger('online')
14 16
15 17
16 class BSWorkbook(Workbook): 18 class BSWorkbook(Workbook):
...@@ -562,6 +564,8 @@ class BSWorkbook(Workbook): ...@@ -562,6 +564,8 @@ class BSWorkbook(Workbook):
562 borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx] 564 borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx]
563 565
564 summary_cell_value = None if summary_cell is None else summary_cell.value 566 summary_cell_value = None if summary_cell is None else summary_cell.value
567 if summary_cell.value is not None:
568 summary_cell_value = summary_cell_value.strip()
565 date_cell_value = None if date_cell is None else date_cell.value 569 date_cell_value = None if date_cell is None else date_cell.value
566 amount_cell_value = None if amount_cell is None else amount_cell.value 570 amount_cell_value = None if amount_cell is None else amount_cell.value
567 over_cell_value = None if over_cell is None else over_cell.value 571 over_cell_value = None if over_cell is None else over_cell.value
...@@ -638,6 +642,7 @@ class BSWorkbook(Workbook): ...@@ -638,6 +642,7 @@ class BSWorkbook(Workbook):
638 642
639 # 3.2.提取信息、高亮 643 # 3.2.提取信息、高亮
640 # row = summary_cell.row 644 # row = summary_cell.row
645 # online_log.info('[ti qu xin xi gao liang =========== >] [summary_cell_value={0}]'.format(summary_cell_value))
641 if summary_cell is not None: 646 if summary_cell is not None:
642 # 关键词1提取 647 # 关键词1提取
643 if summary_cell_value in self.interest_keyword: 648 if summary_cell_value in self.interest_keyword:
......
...@@ -693,6 +693,14 @@ class UploadDocView(GenericView, DocHandler): ...@@ -693,6 +693,14 @@ class UploadDocView(GenericView, DocHandler):
693 classify_1 = classify_1_tmp 693 classify_1 = classify_1_tmp
694 break 694 break
695 695
696 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
697 classify_1 = 12
698 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
699
700 if classify_1 == 0 and (document_name.startswith("dzfp_")):
701 classify_1 = 0
702 self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
703
696 704
697 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ 705 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
698 or document_name.endswith('.RAR'): 706 or document_name.endswith('.RAR'):
...@@ -1247,6 +1255,14 @@ class DocView(DocGenericView, DocHandler): ...@@ -1247,6 +1255,14 @@ class DocView(DocGenericView, DocHandler):
1247 if keyword in document_name: 1255 if keyword in document_name:
1248 classify_1 = classify_1_tmp 1256 classify_1 = classify_1_tmp
1249 break 1257 break
1258
1259 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
1260 classify_1 = 12
1261 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
1262
1263 if classify_1 == 0 and (document_name.startswith("dzfp_")):
1264 classify_1 = 0
1265 self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
1250 1266
1251 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] 1267 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
1252 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) 1268 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
......
...@@ -69,6 +69,7 @@ class PDFHandler: ...@@ -69,6 +69,7 @@ class PDFHandler:
69 self.suffix = self.get_suffix(document_name) 69 self.suffix = self.get_suffix(document_name)
70 self.is_ebank = False 70 self.is_ebank = False
71 self.is_e_pdf = False 71 self.is_e_pdf = False
72 self.is_e_weixin_bs = False
72 self.page_text_list = [] 73 self.page_text_list = []
73 self.pdf_info = {} 74 self.pdf_info = {}
74 self.img_path_pno_list = [] 75 self.img_path_pno_list = []
...@@ -186,6 +187,8 @@ class PDFHandler: ...@@ -186,6 +187,8 @@ class PDFHandler:
186 self.img_path_list.append(img_save_path) 187 self.img_path_list.append(img_save_path)
187 if self.is_ebank: 188 if self.is_ebank:
188 self.rebuild_bbox(pm.width, pm.height, page.number) 189 self.rebuild_bbox(pm.width, pm.height, page.number)
190 if self.is_e_weixin_bs:
191 self.rebuild_bbox(pm.width, pm.height, page.number)
189 192
190 @staticmethod 193 @staticmethod
191 def getimage(pix): 194 def getimage(pix):
...@@ -407,6 +410,57 @@ class PDFHandler: ...@@ -407,6 +410,57 @@ class PDFHandler:
407 self.is_e_pdf = True 410 self.is_e_pdf = True
408 self.page_text_list = page_text_list 411 self.page_text_list = page_text_list
409 412
def put_text(self, pdf):
    """Collect per-page text spans from ``pdf`` and mark it as an e-WeChat statement.

    For every page the span texts and their bounding boxes are read from the
    page's text dictionary. On success ``self.page_text_list`` receives one
    dict per page with the keys ``width``, ``height``, ``rotation`` (number
    of quarter turns, 0-3) and ``text`` (a list of ``(bbox, text)`` tuples),
    and both ``self.is_e_pdf`` and ``self.is_e_weixin_bs`` are set to True.

    The method returns early, leaving the instance untouched, when a page
    rotation is not an integer multiple of 90 degrees or when the running
    number of collected spans drops below five per processed page.

    :param pdf: an opened document exposing ``pageCount`` and ``loadPage``.
    :return: None
    """
    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            quarter_turns, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = quarter_turns % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # Blocks without a 'lines' entry (and any missing keys) are treated
        # as empty instead of raising while iterating None.
        for block in text.get('blocks') or []:
            for line in block.get('lines') or []:
                for span in line.get('spans') or []:
                    char = span.get('text') or ''
                    if char.strip() == '':
                        continue

                    # Skip spans that cannot be encoded (e.g. broken emoji).
                    # The previous code probed this with print(char), which
                    # dumped every span to stdout and made the outcome depend
                    # on the console encoding; a strict UTF-8 encode rejects
                    # what print() rejects on a UTF-8 stdout, without output.
                    try:
                        char.encode('utf-8')
                    except UnicodeEncodeError:
                        continue

                    text_list.append((span.get('bbox'), char))

        text_item_sum += len(text_list)
        # Fewer than five spans per page on average so far: give up.
        if text_item_sum < (pno + 1) * 5:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list
            }
        )

    # Publish the results only after every page passed the checks above.
    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
463
410 def e_contract_process(self): 464 def e_contract_process(self):
411 os.makedirs(self.img_dir_path, exist_ok=True) 465 os.makedirs(self.img_dir_path, exist_ok=True)
412 with fitz.Document(self.path) as pdf: 466 with fitz.Document(self.path) as pdf:
...@@ -473,6 +527,59 @@ class PDFHandler: ...@@ -473,6 +527,59 @@ class PDFHandler:
473 self.merge_il(pdf, pno, il) 527 self.merge_il(pdf, pno, il)
474 self.img_count = len(self.img_path_list) 528 self.img_count = len(self.img_path_list)
475 529
def extract_image_for_weixin(self, max_img_count=None):
    """Turn the source document into page images for the WeChat statement flow.

    Resets ``self.img_path_list`` and ``self.xref_set``, makes sure the image
    directory exists and then fills ``self.img_path_list``. ``self.img_count``
    ends up as the number of produced images, except when the page-count
    limit is hit (see below).

    :param max_img_count: optional int; when the PDF has at least this many
        pages nothing is extracted and ``self.img_count`` is set to the page
        count instead.
    :return: None
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    # The input is already an image: copy it as the single "page".
    if self.suffix in self.img_suffixs:
        target = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, target)
        self.img_path_list.append(target)
        self.img_count = len(self.img_path_list)
        return

    with fitz.Document(self.path) as pdf:
        # Decrypt: try the known passwords until the document opens.
        for password in self.pwd_list:
            if not pdf.isEncrypted:
                break
            pdf.authenticate(password)

        self.metadata = pdf.metadata
        self.page_count = pdf.pageCount
        over_limit = isinstance(max_img_count, int) and pdf.pageCount >= max_img_count
        if over_limit:
            self.img_count = pdf.pageCount
            return

        self.put_text(pdf)
        for page_no in range(pdf.pageCount):
            # Image objects on the page, as
            # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter)
            images = pdf.getPageImageList(page_no)

            # 1. Electronic PDF / e-bank PDF / page without image objects:
            #    save the whole page as a PNG.
            if self.is_e_pdf or self.is_ebank or not images:
                self.page_to_png(pdf.loadPage(page_no))
                continue

            # 3. More than one image object on the page: special handling.
            if len(images) != 1:
                self.merge_il(pdf, page_no, images)
                continue

            # 2. Exactly one image object on the page.
            xref, smask, width, height, _, colorspace, _, _, _ = images[0]
            if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                # Small image (e.g. the stamp on an electronic statement):
                # save the whole page as a PNG.
                self.page_to_png(pdf.loadPage(page_no))
            elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                # Large image: render the page; the flag guards against
                # oversized output.
                self.is_new_modify = 1
                within_cap = width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]
                self.page_to_png(pdf.loadPage(page_no), is_big_img=within_cap)
            elif xref not in self.xref_set:
                self.extract_single_image(pdf, xref, smask, colorspace, page_no)

    self.img_count = len(self.img_path_list)
582
476 def extract_page_image(self): 583 def extract_page_image(self):
477 self.img_path_list = [] 584 self.img_path_list = []
478 self.xref_set = set() 585 self.xref_set = set()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!