6e296e35 by 冯轩

merge

2 parents 0e53f7a3 bba2102f
...@@ -11,6 +11,8 @@ from openpyxl import Workbook ...@@ -11,6 +11,8 @@ from openpyxl import Workbook
11 from openpyxl.styles import PatternFill, numbers 11 from openpyxl.styles import PatternFill, numbers
12 from openpyxl.utils import get_column_letter 12 from openpyxl.utils import get_column_letter
13 from apps.doc import consts 13 from apps.doc import consts
14 import logging
15 online_log = logging.getLogger('online')
14 16
15 17
16 class BSWorkbook(Workbook): 18 class BSWorkbook(Workbook):
...@@ -562,6 +564,8 @@ class BSWorkbook(Workbook): ...@@ -562,6 +564,8 @@ class BSWorkbook(Workbook):
562 borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx] 564 borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx]
563 565
564 summary_cell_value = None if summary_cell is None else summary_cell.value 566 summary_cell_value = None if summary_cell is None else summary_cell.value
567 if summary_cell.value is not None:
568 summary_cell_value = summary_cell_value.strip()
565 date_cell_value = None if date_cell is None else date_cell.value 569 date_cell_value = None if date_cell is None else date_cell.value
566 amount_cell_value = None if amount_cell is None else amount_cell.value 570 amount_cell_value = None if amount_cell is None else amount_cell.value
567 over_cell_value = None if over_cell is None else over_cell.value 571 over_cell_value = None if over_cell is None else over_cell.value
...@@ -638,6 +642,7 @@ class BSWorkbook(Workbook): ...@@ -638,6 +642,7 @@ class BSWorkbook(Workbook):
638 642
639 # 3.2.提取信息、高亮 643 # 3.2.提取信息、高亮
640 # row = summary_cell.row 644 # row = summary_cell.row
645 # online_log.info('[ti qu xin xi gao liang =========== >] [summary_cell_value={0}]'.format(summary_cell_value))
641 if summary_cell is not None: 646 if summary_cell is not None:
642 # 关键词1提取 647 # 关键词1提取
643 if summary_cell_value in self.interest_keyword: 648 if summary_cell_value in self.interest_keyword:
......
...@@ -704,6 +704,14 @@ class UploadDocView(GenericView, DocHandler): ...@@ -704,6 +704,14 @@ class UploadDocView(GenericView, DocHandler):
704 classify_1 = classify_1_tmp 704 classify_1 = classify_1_tmp
705 break 705 break
706 706
707 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
708 classify_1 = 12
709 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
710
711 if classify_1 == 0 and (document_name.startswith("dzfp") or '电子发票' in document_name):
712 classify_1 = 29
713 self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
714
707 715
708 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ 716 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
709 or document_name.endswith('.RAR'): 717 or document_name.endswith('.RAR'):
...@@ -1258,6 +1266,14 @@ class DocView(DocGenericView, DocHandler): ...@@ -1258,6 +1266,14 @@ class DocView(DocGenericView, DocHandler):
1258 if keyword in document_name: 1266 if keyword in document_name:
1259 classify_1 = classify_1_tmp 1267 classify_1 = classify_1_tmp
1260 break 1268 break
1269
1270 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
1271 classify_1 = 12
1272 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
1273
1274 if classify_1 == 0 and (document_name.startswith("dzfp") or '电子发票' in document_name):
1275 classify_1 = 29
1276 self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
1261 1277
1262 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] 1278 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
1263 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N']) 1279 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
......
...@@ -2731,6 +2731,13 @@ def se_compare_license(license_en, ocr_res_dict, field_list, is_auto): ...@@ -2731,6 +2731,13 @@ def se_compare_license(license_en, ocr_res_dict, field_list, is_auto):
2731 value = json.dumps(value, ensure_ascii=False) 2731 value = json.dumps(value, ensure_ascii=False)
2732 error_type = empty_error_type if result == consts.RESULT_Y else ErrorType.OCR.value 2732 error_type = empty_error_type if result == consts.RESULT_Y else ErrorType.OCR.value
2733 result_field_list.append((name, value, result, ocr_str, img_path, error_type, compare_logic[name][3])) 2733 result_field_list.append((name, value, result, ocr_str, img_path, error_type, compare_logic[name][3]))
2734 # CHINARPA-5620 电子发票返回CMS时 增加COMMENTS "此单为电子数控发票"
2735 if ocr_field == consts.MVI_OCR_FIELD :
2736 e_invoice = ocr_res_list[res_idx].get('电子发票')
2737 if e_invoice is not None and e_invoice == '是':
2738 result_field_list.append(('是否电子发票', '是/否', 'Y', '是', empty_str, empty_error_type, '此单为电子数控发票'))
2739 else:
2740 result_field_list.append(('是否电子发票', '是/否', 'Y', '否', empty_str, empty_error_type, '此单为电子数控发票'))
2734 else: 2741 else:
2735 no_ocr_result = True 2742 no_ocr_result = True
2736 2743
...@@ -3523,6 +3530,9 @@ def se_compare_process(compare_info, ocr_res_dict, is_gsyh, is_auto, id_res_list ...@@ -3523,6 +3530,9 @@ def se_compare_process(compare_info, ocr_res_dict, is_gsyh, is_auto, id_res_list
3523 elif isinstance(cn_reason, list): 3530 elif isinstance(cn_reason, list):
3524 cn_reason_list.extend(cn_reason) 3531 cn_reason_list.extend(cn_reason)
3525 rpa_failure_reason.setdefault('、'.join(cn_reason), []).append(value) 3532 rpa_failure_reason.setdefault('、'.join(cn_reason), []).append(value)
3533 #
3534 if license_en == consts.MVI_EN and name == '是否电子发票' and ocr_str == '是':
3535 cn_reason_list.append(cn_reason)
3526 compare_result.append( 3536 compare_result.append(
3527 { 3537 {
3528 consts.HEAD_LIST[0]: info_key, 3538 consts.HEAD_LIST[0]: info_key,
......
...@@ -69,6 +69,7 @@ class PDFHandler: ...@@ -69,6 +69,7 @@ class PDFHandler:
69 self.suffix = self.get_suffix(document_name) 69 self.suffix = self.get_suffix(document_name)
70 self.is_ebank = False 70 self.is_ebank = False
71 self.is_e_pdf = False 71 self.is_e_pdf = False
72 self.is_e_weixin_bs = False
72 self.page_text_list = [] 73 self.page_text_list = []
73 self.pdf_info = {} 74 self.pdf_info = {}
74 self.img_path_pno_list = [] 75 self.img_path_pno_list = []
...@@ -186,6 +187,8 @@ class PDFHandler: ...@@ -186,6 +187,8 @@ class PDFHandler:
186 self.img_path_list.append(img_save_path) 187 self.img_path_list.append(img_save_path)
187 if self.is_ebank: 188 if self.is_ebank:
188 self.rebuild_bbox(pm.width, pm.height, page.number) 189 self.rebuild_bbox(pm.width, pm.height, page.number)
190 if self.is_e_weixin_bs:
191 self.rebuild_bbox(pm.width, pm.height, page.number)
189 192
190 @staticmethod 193 @staticmethod
191 def getimage(pix): 194 def getimage(pix):
...@@ -407,6 +410,57 @@ class PDFHandler: ...@@ -407,6 +410,57 @@ class PDFHandler:
407 self.is_e_pdf = True 410 self.is_e_pdf = True
408 self.page_text_list = page_text_list 411 self.page_text_list = page_text_list
409 412
def put_text(self, pdf):
    """Collect the text spans of every page and flag the PDF as a text PDF.

    For each page the non-blank text spans are gathered as ``(bbox, text)``
    pairs.  When every page passes the checks below, the method sets
    ``self.is_e_pdf`` and ``self.is_e_weixin_bs`` to ``True`` and stores the
    per-page data in ``self.page_text_list``.  The method gives up (returns
    without touching any attribute) as soon as:

    * a page rotation is neither ``None`` nor an ``int`` multiple of 90, or
    * the cumulative number of collected spans drops below 5 per page.

    NOTE(review): the indentation of this method was flattened in the text
    this block was recovered from; the nesting below is reconstructed from
    the control flow (per-page span count, per-page append) -- confirm
    against the real file.

    :param pdf: an opened document exposing ``pageCount`` and ``loadPage``.
    :return: ``None`` in every case; results are reported via attributes.
    """
    import sys

    # The original probed "can this text be written to stdout?" by actually
    # print()-ing every span, which dumps the whole document to stdout.
    # Encode with the stream's own codec/error handler instead: same
    # skip decision for unencodable characters, no output.
    stdout_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    stdout_errors = getattr(sys.stdout, 'errors', None) or 'strict'

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)

        # Normalise the rotation to a number of quarter turns (0..3).
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            quarter_turns, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = quarter_turns % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # ``or ()`` guards: a block without a 'lines' key (e.g. an image
        # block) or a missing 'blocks'/'spans' entry must not raise.
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text')

                    # Skip missing / whitespace-only spans.
                    if not char or char.strip() == '':
                        continue

                    # Skip text (special emoji etc.) stdout cannot encode.
                    try:
                        char.encode(stdout_encoding, stdout_errors)
                    except UnicodeError:
                        continue

                    bbox = span.get('bbox')
                    if pno == 0:
                        # NOTE(review): the result used to be stored in a
                        # local that was never read.  The call is kept in
                        # case title_is_ebank has side effects -- confirm
                        # and drop it if it is pure.
                        self.title_is_ebank(char)
                    text_list.append((bbox, char))

        # Require on average at least 5 text spans per page so far.
        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * 5:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
463
410 def e_contract_process(self): 464 def e_contract_process(self):
411 os.makedirs(self.img_dir_path, exist_ok=True) 465 os.makedirs(self.img_dir_path, exist_ok=True)
412 with fitz.Document(self.path) as pdf: 466 with fitz.Document(self.path) as pdf:
...@@ -473,6 +527,59 @@ class PDFHandler: ...@@ -473,6 +527,59 @@ class PDFHandler:
473 self.merge_il(pdf, pno, il) 527 self.merge_il(pdf, pno, il)
474 self.img_count = len(self.img_path_list) 528 self.img_count = len(self.img_path_list)
475 529
def extract_image_for_weixin(self, max_img_count=None):
    """Produce the page images of the document, running ``put_text`` first.

    Image files (suffix listed in ``self.img_suffixs``) are copied as the
    single page image.  Any other file is opened with ``fitz``: it is
    authenticated with the passwords in ``self.pwd_list`` while it reports
    being encrypted, its metadata and page count are recorded,
    ``self.put_text`` is run, and then each page is either rendered as a
    whole or has its embedded image extracted.

    NOTE(review): the indentation of this method was flattened in the text
    this block was recovered from; the nesting below is reconstructed from
    the control flow -- confirm against the real file.

    :param max_img_count: when an ``int`` and the document has at least
        that many pages, only ``self.img_count`` is set (to the page count)
        and no image is produced.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    if self.suffix not in self.img_suffixs:
        with fitz.Document(self.path) as pdf:
            # Decrypt: try passwords until the document is readable.
            for pwd in self.pwd_list:
                if not pdf.isEncrypted:
                    break
                pdf.authenticate(pwd)

            self.metadata = pdf.metadata
            self.page_count = pdf.pageCount
            too_many_pages = isinstance(max_img_count, int) and pdf.pageCount >= max_img_count
            if too_many_pages:
                self.img_count = pdf.pageCount
                return

            self.put_text(pdf)
            for pno in range(pdf.pageCount):
                # Items: (xref, smask, width, height, bpc, colorspace,
                #         alt.colorspace, name, filter)
                page_images = pdf.getPageImageList(pno)

                # 1. Text PDF / e-bank PDF / no image object: render the page.
                if self.is_e_pdf or self.is_ebank or len(page_images) == 0:
                    self.page_to_png(pdf.loadPage(pno))
                    continue

                # 3. More than one image object: special merge handling.
                if len(page_images) != 1:
                    self.merge_il(pdf, pno, page_images)
                    continue

                # 2. Exactly one image object.
                xref, smask, width, height, _, colorspace, _, _, _ = page_images[0]
                if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                    # Small image (e.g. the stamp on an e-statement):
                    # render the whole page instead.
                    self.page_to_png(pdf.loadPage(pno))
                elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                    # Large image: render the page, telling page_to_png
                    # whether it stays under the WH_COUPLE_7 bounds.
                    self.is_new_modify = 1
                    within_limit = width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]
                    self.page_to_png(pdf.loadPage(pno), is_big_img=within_limit)
                elif xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno)
    else:
        # The input already is an image: copy it as page 0.
        target_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, target_path)
        self.img_path_list.append(target_path)

    self.img_count = len(self.img_path_list)
582
476 def extract_page_image(self): 583 def extract_page_image(self):
477 self.img_path_list = [] 584 self.img_path_list = []
478 self.xref_set = set() 585 self.xref_set = set()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!