01c8ebe1 by 冯轩

Merge branch 'feature/weixin-bs-2' into feature/uat-tmp

2 parents 8b18dc07 01854092
...@@ -1504,6 +1504,138 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1504,6 +1504,138 @@ class Command(BaseCommand, LoggerMixin):
1504 self.log_base, traceback.format_exc())) 1504 self.log_base, traceback.format_exc()))
1505 # error_list.append(1) 1505 # error_list.append(1)
1506 # return 1506 # return
1507 elif classify_1_str == '12': # weixin e-bs
1508 try:
1509 max_img_count = 500
1510 for times in range(consts.RETRY_TIMES):
1511 try:
1512 if doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
1513 self.online_log.info('{0} [mo ni xia dan] [task={1}] [times={2}] '
1514 '[pdf_path={3}]'.format(self.log_base, task_str,
1515 times, pdf_path))
1516 elif os.path.exists(pdf_path):
1517 self.online_log.info('{0} [pdf from zip file] [task={1}] [times={2}] '
1518 '[pdf_path={3}]'.format(self.log_base, task_str,
1519 times, pdf_path))
1520 else:
1521 # self.edms.download(pdf_path, doc.metadata_version_id)
1522 self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme,
1523 business_type)
1524 self.online_log.info('{0} [ecm download success] [task={1}] [times={2}] '
1525 '[pdf_path={3}]'.format(self.log_base, task_str,
1526 times, pdf_path))
1527
1528 # 3.PDF文件提取图片
1529 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
1530 self.log_base, task_str, times))
1531 start_time = time.time()
1532 pdf_handler.extract_image_for_weixin(max_img_count)
1533 end_time = time.time()
1534 speed_time = int(end_time - start_time)
1535 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
1536 self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
1537 except Exception as e:
1538 self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
1539 '[error={3}]'.format(self.log_base, task_str, times,
1540 traceback.format_exc()))
1541 else:
1542 break
1543 else:
1544 raise Exception('download or pdf to img failed')
1545
1546 if pdf_handler.img_count == 0:
1547 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
1548 self.log_base, task_str))
1549 raise Exception('pdf img empty')
1550 elif pdf_handler.img_count >= max_img_count:
1551 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
1552 self.log_base, task_str, pdf_handler.img_count))
1553
1554 try:
1555 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
1556 report_table.objects.create(
1557 case_number=doc.application_id,
1558 request_team=RequestTeam.get_value(doc.document_scheme, 0),
1559 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
1560 input_file=doc.document_name,
1561 transaction_start=doc.start_time,
1562 transaction_end=doc.start_time,
1563 successful_at_this_level=False,
1564 failure_reason=FailureReason.IMG_LIMIT.value,
1565 process_name=ProcessName.ALL.value,
1566 notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
1567 )
1568 except Exception as e:
1569 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
1570 self.log_base, traceback.format_exc()))
1571
1572 try:
1573 doc.status = DocStatus.PROCESS_FAILED.value
1574 doc.page_count = pdf_handler.page_count
1575 doc.save()
1576 except Exception as e:
1577 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1578 self.log_base, traceback.format_exc()))
1579 else:
1580 try:
1581 if pdf_handler.is_e_pdf:
1582 doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \
1583 json.dumps(pdf_handler.metadata)
1584 doc.page_count = pdf_handler.page_count
1585 doc.save()
1586 except Exception as e:
1587 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1588 self.log_base, traceback.format_exc()))
1589
1590 with lock:
1591 todo_count_dict[task_str] = pdf_handler.img_count
1592
1593 self.online_log.info('{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'.format(
1594 self.log_base, task_str, pdf_handler.is_ebank
1595 ))
1596 for img_idx, img_path in enumerate(pdf_handler.img_path_list):
1597 while img_queue.full():
1598 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
1599 time.sleep(self.sleep_time_img_put)
1600 if pdf_handler.is_e_weixin_bs:
1601 try:
1602 #self.online_log.info('{0} [pdf_2_img_2_queue] [img_idx={1}] [page_text_list={2}]'.format(self.log_base, img_idx, pdf_handler.page_text_list))
1603 text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
1604 except Exception as e:
1605 text_list = []
1606 else:
1607 text_list = []
1608 img_queue.put((business_type, img_path, text_list))
1609 except Exception as e:
1610 try:
1611 end_time = timezone.now()
1612 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
1613 report_table.objects.create(
1614 case_number=doc.application_id,
1615 request_team=RequestTeam.get_value(doc.document_scheme, 0),
1616 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
1617 input_file=doc.document_name,
1618 transaction_start=doc.start_time,
1619 transaction_end=end_time,
1620 successful_at_this_level=False,
1621 failure_reason=FailureReason.PDF.value,
1622 process_name=ProcessName.ALL.value,
1623 )
1624 except Exception as e:
1625 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
1626 self.log_base, traceback.format_exc()))
1627
1628 try:
1629 doc.status = DocStatus.PROCESS_FAILED.value
1630 doc.page_count = pdf_handler.page_count
1631 doc.save()
1632 self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
1633 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
1634 except Exception as e:
1635 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1636 self.log_base, traceback.format_exc()))
1637 # error_list.append(1)
1638 # return
1507 else: # e-contract or or e-fsm-contract or e-hmh 1639 else: # e-contract or or e-fsm-contract or e-hmh
1508 try: 1640 try:
1509 # pdf下载 处理 图片存储 识别 1641 # pdf下载 处理 图片存储 识别
...@@ -1674,6 +1806,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1674,6 +1806,7 @@ class Command(BaseCommand, LoggerMixin):
1674 json_data_1['text_list'] = text_list 1806 json_data_1['text_list'] = text_list
1675 1807
1676 start_time = time.time() 1808 start_time = time.time()
1809 self.online_log.info('{0} [ocr_1 api] [img={1}] [json_data_1={2}]'.format(self.log_base, img_path, json_data_1))
1677 ocr_1_response = requests.post(url, json=json_data_1) 1810 ocr_1_response = requests.post(url, json=json_data_1)
1678 if ocr_1_response.status_code != 200: 1811 if ocr_1_response.status_code != 200:
1679 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) 1812 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
......
...@@ -696,6 +696,10 @@ class UploadDocView(GenericView, DocHandler): ...@@ -696,6 +696,10 @@ class UploadDocView(GenericView, DocHandler):
696 classify_1 = classify_1_tmp 696 classify_1 = classify_1_tmp
697 break 697 break
698 698
699 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
700 classify_1 = 12
701 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
702
699 703
700 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ 704 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
701 or document_name.endswith('.RAR'): 705 or document_name.endswith('.RAR'):
...@@ -1271,6 +1275,10 @@ class DocView(DocGenericView, DocHandler): ...@@ -1271,6 +1275,10 @@ class DocView(DocGenericView, DocHandler):
1271 if keyword in document_name: 1275 if keyword in document_name:
1272 classify_1 = classify_1_tmp 1276 classify_1 = classify_1_tmp
1273 break 1277 break
1278
1279 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
1280 classify_1 = 12
1281 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
1274 1282
1275 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] 1283 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
1276 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) 1284 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
......
...@@ -69,6 +69,7 @@ class PDFHandler: ...@@ -69,6 +69,7 @@ class PDFHandler:
69 self.suffix = self.get_suffix(document_name) 69 self.suffix = self.get_suffix(document_name)
70 self.is_ebank = False 70 self.is_ebank = False
71 self.is_e_pdf = False 71 self.is_e_pdf = False
72 self.is_e_weixin_bs = False
72 self.page_text_list = [] 73 self.page_text_list = []
73 self.pdf_info = {} 74 self.pdf_info = {}
74 self.img_path_pno_list = [] 75 self.img_path_pno_list = []
...@@ -186,6 +187,8 @@ class PDFHandler: ...@@ -186,6 +187,8 @@ class PDFHandler:
186 self.img_path_list.append(img_save_path) 187 self.img_path_list.append(img_save_path)
187 if self.is_ebank: 188 if self.is_ebank:
188 self.rebuild_bbox(pm.width, pm.height, page.number) 189 self.rebuild_bbox(pm.width, pm.height, page.number)
190 if self.is_e_weixin_bs:
191 self.rebuild_bbox(pm.width, pm.height, page.number)
189 192
190 @staticmethod 193 @staticmethod
191 def getimage(pix): 194 def getimage(pix):
...@@ -407,6 +410,57 @@ class PDFHandler: ...@@ -407,6 +410,57 @@ class PDFHandler:
407 self.is_e_pdf = True 410 self.is_e_pdf = True
408 self.page_text_list = page_text_list 411 self.page_text_list = page_text_list
409 412
def put_text(self, pdf):
    """Collect the text spans of every page and mark the PDF as an e-WeChat statement.

    Walks all pages of ``pdf`` and gathers ``(bbox, text)`` tuples for each
    non-blank text span. When every page passes the checks below, the
    result is stored on the instance:

    * ``self.is_e_pdf`` and ``self.is_e_weixin_bs`` are set to ``True``;
    * ``self.page_text_list`` receives one dict per page with the keys
      ``width``, ``height``, ``rotation`` (number of quarter turns, 0-3)
      and ``text`` (the list of ``(bbox, text)`` tuples).

    The method returns early, leaving the instance untouched, when

    * a page rotation is neither ``None`` nor an integer multiple of 90, or
    * the cumulative number of collected spans falls below five per page
      processed so far (too little embedded text to trust).

    :param pdf: an opened document exposing ``pageCount`` and ``loadPage``
        (the legacy camelCase PyMuPDF API used throughout this file).
    :return: ``None``; results are published through instance attributes.
    """
    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)

        # Normalise the rotation to a number of quarter turns in 0..3.
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            quarter_turns, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = quarter_turns % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # ``or ()`` guards against blocks/lines without the expected key
        # (e.g. non-text blocks), which would otherwise raise TypeError.
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text') or ''

                    if char.strip() == '':
                        continue

                    # Skip spans holding characters that cannot be encoded
                    # (e.g. lone surrogates left behind by special emoji).
                    # This replaces the former ``print(char)`` probe, which
                    # wrote every span to stdout and whose outcome depended
                    # on the stdout encoding and error handler.
                    try:
                        char.encode('utf-8')
                    except UnicodeError:
                        continue

                    text_list.append((span.get('bbox'), char))

        text_item_sum += len(text_list)
        # Require at least five spans per page on average so far.
        if text_item_sum < (pno + 1) * 5:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list,
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
463
410 def e_contract_process(self): 464 def e_contract_process(self):
411 os.makedirs(self.img_dir_path, exist_ok=True) 465 os.makedirs(self.img_dir_path, exist_ok=True)
412 with fitz.Document(self.path) as pdf: 466 with fitz.Document(self.path) as pdf:
...@@ -473,6 +527,59 @@ class PDFHandler: ...@@ -473,6 +527,59 @@ class PDFHandler:
473 self.merge_il(pdf, pno, il) 527 self.merge_il(pdf, pno, il)
474 self.img_count = len(self.img_path_list) 528 self.img_count = len(self.img_path_list)
475 529
def extract_image_for_weixin(self, max_img_count=None):
    """Turn the source document into page images for the WeChat statement flow.

    Resets ``self.img_path_list`` / ``self.xref_set`` and makes sure
    ``self.img_dir_path`` exists, then:

    * if ``self.suffix`` is one of ``self.img_suffixs`` the file is copied
      as the single image of the document;
    * otherwise the file is opened with ``fitz``; each entry of
      ``self.pwd_list`` is tried while the document reports itself as
      encrypted, ``self.metadata`` and ``self.page_count`` are recorded,
      ``self.put_text`` is run, and every page is rendered or has its
      image extracted according to the rules commented inline.

    ``self.img_count`` ends up as the number of images produced, except
    when the page count reaches ``max_img_count``: in that case nothing is
    extracted and ``self.img_count`` is set to the page count so the caller
    can detect the overflow.

    :param max_img_count: optional page limit; ignored unless it is an int.
    :return: ``None``; results are published through instance attributes.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    if self.suffix in self.img_suffixs:
        # The input already is an image: copy it as page 0.
        img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, img_save_path)
        self.img_path_list.append(img_save_path)
    else:
        with fitz.Document(self.path) as pdf:
            # Decrypt: try the known passwords while still encrypted.
            for pwd in self.pwd_list:
                if not pdf.isEncrypted:
                    break
                pdf.authenticate(pwd)

            self.metadata = pdf.metadata
            self.page_count = pdf.pageCount
            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                # Too many pages: report the page count and extract nothing.
                self.img_count = pdf.pageCount
                return

            self.put_text(pdf)
            for pno in range(pdf.pageCount):
                # Image objects of the page; each entry starts with
                # (xref, smask, width, height, bpc, colorspace, ...).
                il = pdf.getPageImageList(pno)

                # 1. Electronic PDF / e-bank, or no image object on the
                #    page: save the whole page as a PNG.
                if self.is_e_pdf or self.is_ebank or len(il) == 0:
                    page = pdf.loadPage(pno)
                    self.page_to_png(page)
                # 2. Exactly one image object:
                #    small image (e.g. a stamp on an e-statement): render
                #    the whole page; large image: render the page as well;
                #    otherwise extract the image object itself.
                elif len(il) == 1:
                    # Only the leading six fields are needed. Slicing keeps
                    # this working whether the entry has 9 or 10 fields.
                    xref, smask, width, height, _, colorspace = il[0][:6]
                    # Small image
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
                    # Large image
                    elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                        self.is_new_modify = 1
                        # Guard against oversized images.
                        is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1])
                        page = pdf.loadPage(pno)
                        self.page_to_png(page, is_big_img=is_big_img)
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. More than one image object: special handling.
                else:
                    self.merge_il(pdf, pno, il)
    self.img_count = len(self.img_path_list)
582
476 def extract_page_image(self): 583 def extract_page_image(self):
477 self.img_path_list = [] 584 self.img_path_list = []
478 self.xref_set = set() 585 self.xref_set = set()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!