merge
Showing
5 changed files
with
138 additions
and
0 deletions
This diff is collapsed.
Click to expand it.
| ... | @@ -11,6 +11,8 @@ from openpyxl import Workbook | ... | @@ -11,6 +11,8 @@ from openpyxl import Workbook |
| 11 | from openpyxl.styles import PatternFill, numbers | 11 | from openpyxl.styles import PatternFill, numbers |
| 12 | from openpyxl.utils import get_column_letter | 12 | from openpyxl.utils import get_column_letter |
| 13 | from apps.doc import consts | 13 | from apps.doc import consts |
| 14 | import logging | ||
| 15 | online_log = logging.getLogger('online') | ||
| 14 | 16 | ||
| 15 | 17 | ||
| 16 | class BSWorkbook(Workbook): | 18 | class BSWorkbook(Workbook): |
| ... | @@ -562,6 +564,8 @@ class BSWorkbook(Workbook): | ... | @@ -562,6 +564,8 @@ class BSWorkbook(Workbook): |
| 562 | borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx] | 564 | borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx] |
| 563 | 565 | ||
| 564 | summary_cell_value = None if summary_cell is None else summary_cell.value | 566 | summary_cell_value = None if summary_cell is None else summary_cell.value |
| 567 | if summary_cell.value is not None: | ||
| 568 | summary_cell_value = summary_cell_value.strip() | ||
| 565 | date_cell_value = None if date_cell is None else date_cell.value | 569 | date_cell_value = None if date_cell is None else date_cell.value |
| 566 | amount_cell_value = None if amount_cell is None else amount_cell.value | 570 | amount_cell_value = None if amount_cell is None else amount_cell.value |
| 567 | over_cell_value = None if over_cell is None else over_cell.value | 571 | over_cell_value = None if over_cell is None else over_cell.value |
| ... | @@ -638,6 +642,7 @@ class BSWorkbook(Workbook): | ... | @@ -638,6 +642,7 @@ class BSWorkbook(Workbook): |
| 638 | 642 | ||
| 639 | # 3.2.提取信息、高亮 | 643 | # 3.2.提取信息、高亮 |
| 640 | # row = summary_cell.row | 644 | # row = summary_cell.row |
| 645 | # online_log.info('[ti qu xin xi gao liang =========== >] [summary_cell_value={0}]'.format(summary_cell_value)) | ||
| 641 | if summary_cell is not None: | 646 | if summary_cell is not None: |
| 642 | # 关键词1提取 | 647 | # 关键词1提取 |
| 643 | if summary_cell_value in self.interest_keyword: | 648 | if summary_cell_value in self.interest_keyword: | ... | ... |
| ... | @@ -704,6 +704,14 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -704,6 +704,14 @@ class UploadDocView(GenericView, DocHandler): |
| 704 | classify_1 = classify_1_tmp | 704 | classify_1 = classify_1_tmp |
| 705 | break | 705 | break |
| 706 | 706 | ||
| 707 | if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name): | ||
| 708 | classify_1 = 12 | ||
| 709 | self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id)) | ||
| 710 | |||
| 711 | if classify_1 == 0 and (document_name.startswith("dzfp") or '电子发票' in document_name): | ||
| 712 | classify_1 = 29 | ||
| 713 | self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id)) | ||
| 714 | |||
| 707 | 715 | ||
| 708 | if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ | 716 | if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ |
| 709 | or document_name.endswith('.RAR'): | 717 | or document_name.endswith('.RAR'): |
| ... | @@ -1259,6 +1267,14 @@ class DocView(DocGenericView, DocHandler): | ... | @@ -1259,6 +1267,14 @@ class DocView(DocGenericView, DocHandler): |
| 1259 | classify_1 = classify_1_tmp | 1267 | classify_1 = classify_1_tmp |
| 1260 | break | 1268 | break |
| 1261 | 1269 | ||
| 1270 | if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name): | ||
| 1271 | classify_1 = 12 | ||
| 1272 | self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id)) | ||
| 1273 | |||
| 1274 | if classify_1 == 0 and (document_name.startswith("dzfp") or '电子发票' in document_name): | ||
| 1275 | classify_1 = 29 | ||
| 1276 | self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id)) | ||
| 1277 | |||
| 1262 | # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] | 1278 | # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] |
| 1263 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N']) | 1279 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N']) |
| 1264 | enqueue_res = rh.enqueue([task], is_priority) | 1280 | enqueue_res = rh.enqueue([task], is_priority) | ... | ... |
| ... | @@ -2731,6 +2731,13 @@ def se_compare_license(license_en, ocr_res_dict, field_list, is_auto): | ... | @@ -2731,6 +2731,13 @@ def se_compare_license(license_en, ocr_res_dict, field_list, is_auto): |
| 2731 | value = json.dumps(value, ensure_ascii=False) | 2731 | value = json.dumps(value, ensure_ascii=False) |
| 2732 | error_type = empty_error_type if result == consts.RESULT_Y else ErrorType.OCR.value | 2732 | error_type = empty_error_type if result == consts.RESULT_Y else ErrorType.OCR.value |
| 2733 | result_field_list.append((name, value, result, ocr_str, img_path, error_type, compare_logic[name][3])) | 2733 | result_field_list.append((name, value, result, ocr_str, img_path, error_type, compare_logic[name][3])) |
| 2734 | # CHINARPA-5620 电子发票返回CMS时 增加COMMENTS "此单为电子数控发票" | ||
| 2735 | if ocr_field == consts.MVI_OCR_FIELD : | ||
| 2736 | e_invoice = ocr_res_list[res_idx].get('电子发票') | ||
| 2737 | if e_invoice is not None and e_invoice == '是': | ||
| 2738 | result_field_list.append(('是否电子发票', '是/否', 'Y', '是', empty_str, empty_error_type, '此单为电子数控发票')) | ||
| 2739 | else: | ||
| 2740 | result_field_list.append(('是否电子发票', '是/否', 'Y', '否', empty_str, empty_error_type, '此单为电子数控发票')) | ||
| 2734 | else: | 2741 | else: |
| 2735 | no_ocr_result = True | 2742 | no_ocr_result = True |
| 2736 | 2743 | ||
| ... | @@ -3523,6 +3530,9 @@ def se_compare_process(compare_info, ocr_res_dict, is_gsyh, is_auto, id_res_list | ... | @@ -3523,6 +3530,9 @@ def se_compare_process(compare_info, ocr_res_dict, is_gsyh, is_auto, id_res_list |
| 3523 | elif isinstance(cn_reason, list): | 3530 | elif isinstance(cn_reason, list): |
| 3524 | cn_reason_list.extend(cn_reason) | 3531 | cn_reason_list.extend(cn_reason) |
| 3525 | rpa_failure_reason.setdefault('、'.join(cn_reason), []).append(value) | 3532 | rpa_failure_reason.setdefault('、'.join(cn_reason), []).append(value) |
| 3533 | # | ||
| 3534 | if license_en == consts.MVI_EN and name == '是否电子发票' and ocr_str == '是': | ||
| 3535 | cn_reason_list.append(cn_reason) | ||
| 3526 | compare_result.append( | 3536 | compare_result.append( |
| 3527 | { | 3537 | { |
| 3528 | consts.HEAD_LIST[0]: info_key, | 3538 | consts.HEAD_LIST[0]: info_key, | ... | ... |
| ... | @@ -69,6 +69,7 @@ class PDFHandler: | ... | @@ -69,6 +69,7 @@ class PDFHandler: |
| 69 | self.suffix = self.get_suffix(document_name) | 69 | self.suffix = self.get_suffix(document_name) |
| 70 | self.is_ebank = False | 70 | self.is_ebank = False |
| 71 | self.is_e_pdf = False | 71 | self.is_e_pdf = False |
| 72 | self.is_e_weixin_bs = False | ||
| 72 | self.page_text_list = [] | 73 | self.page_text_list = [] |
| 73 | self.pdf_info = {} | 74 | self.pdf_info = {} |
| 74 | self.img_path_pno_list = [] | 75 | self.img_path_pno_list = [] |
| ... | @@ -186,6 +187,8 @@ class PDFHandler: | ... | @@ -186,6 +187,8 @@ class PDFHandler: |
| 186 | self.img_path_list.append(img_save_path) | 187 | self.img_path_list.append(img_save_path) |
| 187 | if self.is_ebank: | 188 | if self.is_ebank: |
| 188 | self.rebuild_bbox(pm.width, pm.height, page.number) | 189 | self.rebuild_bbox(pm.width, pm.height, page.number) |
| 190 | if self.is_e_weixin_bs: | ||
| 191 | self.rebuild_bbox(pm.width, pm.height, page.number) | ||
| 189 | 192 | ||
| 190 | @staticmethod | 193 | @staticmethod |
| 191 | def getimage(pix): | 194 | def getimage(pix): |
| ... | @@ -407,6 +410,57 @@ class PDFHandler: | ... | @@ -407,6 +410,57 @@ class PDFHandler: |
| 407 | self.is_e_pdf = True | 410 | self.is_e_pdf = True |
| 408 | self.page_text_list = page_text_list | 411 | self.page_text_list = page_text_list |
| 409 | 412 | ||
def put_text(self, pdf):
    """Collect the text spans of every page of ``pdf`` for WeChat statement handling.

    For each page the rotation is normalised to a number of quarter turns
    (0-3) and every non-blank text span is recorded as ``(bbox, text)``.
    On success the method sets ``self.is_e_pdf`` and ``self.is_e_weixin_bs``
    to ``True`` and stores the per-page result in ``self.page_text_list``.

    The method returns early, leaving those attributes untouched, when:

    * a page rotation is not an integer multiple of 90, or is not an int;
    * the running span count drops below five spans per page on average.

    :param pdf: an opened document exposing ``pageCount`` and ``loadPage``
        (the PyMuPDF camel-case API used elsewhere in this class).
    :return: None
    """
    import sys  # local import: only needed for the encodability probe below

    # The previous implementation called ``print(char)`` and skipped the span
    # when printing raised. Probe the same condition (can stdout's codec
    # represent the text?) without writing anything to stdout.
    stdout = sys.stdout
    encoding = getattr(stdout, 'encoding', None) or 'utf-8'
    errors = getattr(stdout, 'errors', None) or 'strict'

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            divisor, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = divisor % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # ``or []`` keeps blocks/lines that lack the nested key from raising.
        for block in text.get('blocks') or []:
            for line in block.get('lines') or []:
                for span in line.get('spans') or []:
                    char = span.get('text')
                    if not char or char.strip() == '':
                        continue

                    # Skip special emoji that the output codec cannot encode.
                    try:
                        char.encode(encoding, errors)
                    except (UnicodeError, LookupError):
                        continue

                    # NOTE(review): the result of self.title_is_ebank(char) was
                    # stored in a never-read local, so the call was dropped;
                    # confirm the helper is a pure predicate.
                    text_list.append((span.get('bbox'), char))

        text_item_sum += len(text_list)
        # Too little text so far (fewer than 5 spans per page on average).
        if text_item_sum < (pno + 1) * 5:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
| 463 | |||
| 410 | def e_contract_process(self): | 464 | def e_contract_process(self): |
| 411 | os.makedirs(self.img_dir_path, exist_ok=True) | 465 | os.makedirs(self.img_dir_path, exist_ok=True) |
| 412 | with fitz.Document(self.path) as pdf: | 466 | with fitz.Document(self.path) as pdf: |
| ... | @@ -473,6 +527,59 @@ class PDFHandler: | ... | @@ -473,6 +527,59 @@ class PDFHandler: |
| 473 | self.merge_il(pdf, pno, il) | 527 | self.merge_il(pdf, pno, il) |
| 474 | self.img_count = len(self.img_path_list) | 528 | self.img_count = len(self.img_path_list) |
| 475 | 529 | ||
def extract_image_for_weixin(self, max_img_count=None):
    """Render or extract the images of the document at ``self.path``.

    An image file (suffix listed in ``self.img_suffixs``) is copied as-is.
    Otherwise the file is opened with PyMuPDF, ``self.put_text`` is run on
    it, and each page is rendered or has its embedded image extracted.
    ``self.img_path_list`` and ``self.img_count`` are updated.

    :param max_img_count: when an int and the PDF has at least this many
        pages, only ``self.img_count`` is set to the page count and nothing
        is extracted.
    :return: None
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    # Plain image input: copy it and finish.
    if self.suffix in self.img_suffixs:
        target_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, target_path)
        self.img_path_list.append(target_path)
        self.img_count = len(self.img_path_list)
        return

    with fitz.Document(self.path) as pdf:
        # Decrypt: try each known password while the document is encrypted.
        for password in self.pwd_list:
            if pdf.isEncrypted:
                pdf.authenticate(password)
            else:
                break

        self.metadata = pdf.metadata
        self.page_count = pdf.pageCount
        too_many_pages = isinstance(max_img_count, int) and pdf.pageCount >= max_img_count
        if too_many_pages:
            self.img_count = pdf.pageCount
            return

        self.put_text(pdf)
        for page_no in range(pdf.pageCount):
            # Image objects on the page; each entry is unpacked below as
            # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter).
            image_infos = pdf.getPageImageList(page_no)

            # 1. Electronic PDF / e-bank, or no image object: render the whole page to PNG.
            if self.is_e_pdf or self.is_ebank or len(image_infos) == 0:
                self.page_to_png(pdf.loadPage(page_no))
                continue

            # 3. More than one image object: special handling.
            if len(image_infos) != 1:
                self.merge_il(pdf, page_no, image_infos)
                continue

            # 2. Exactly one image object:
            #    small image (e.g. the stamp on an e-statement): render the whole page;
            #    large image: render the page, or extract the image object.
            xref, smask, width, height, _, colorspace, _, _, _ = image_infos[0]
            is_small = width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]
            if is_small:
                self.page_to_png(pdf.loadPage(page_no))
            elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                self.is_new_modify = 1
                # Guard against oversized images.
                is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1])
                self.page_to_png(pdf.loadPage(page_no), is_big_img=is_big_img)
            elif xref not in self.xref_set:
                self.extract_single_image(pdf, xref, smask, colorspace, page_no)

    self.img_count = len(self.img_path_list)
| 582 | |||
| 476 | def extract_page_image(self): | 583 | def extract_page_image(self): |
| 477 | self.img_path_list = [] | 584 | self.img_path_list = [] |
| 478 | self.xref_set = set() | 585 | self.xref_set = set() | ... | ... |
-
Please register or sign in to post a comment