init
Showing
3 changed files
with
245 additions
and
0 deletions
| ... | @@ -1504,6 +1504,137 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1504,6 +1504,137 @@ class Command(BaseCommand, LoggerMixin): |
| 1504 | self.log_base, traceback.format_exc())) | 1504 | self.log_base, traceback.format_exc())) |
| 1505 | # error_list.append(1) | 1505 | # error_list.append(1) |
| 1506 | # return | 1506 | # return |
| 1507 | elif classify_1_str == '12': # weixin e-bs | ||
| 1508 | try: | ||
| 1509 | max_img_count = 500 | ||
| 1510 | for times in range(consts.RETRY_TIMES): | ||
| 1511 | try: | ||
| 1512 | if doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | ||
| 1513 | self.online_log.info('{0} [mo ni xia dan] [task={1}] [times={2}] ' | ||
| 1514 | '[pdf_path={3}]'.format(self.log_base, task_str, | ||
| 1515 | times, pdf_path)) | ||
| 1516 | elif os.path.exists(pdf_path): | ||
| 1517 | self.online_log.info('{0} [pdf from zip file] [task={1}] [times={2}] ' | ||
| 1518 | '[pdf_path={3}]'.format(self.log_base, task_str, | ||
| 1519 | times, pdf_path)) | ||
| 1520 | else: | ||
| 1521 | # self.edms.download(pdf_path, doc.metadata_version_id) | ||
| 1522 | self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, | ||
| 1523 | business_type) | ||
| 1524 | self.online_log.info('{0} [ecm download success] [task={1}] [times={2}] ' | ||
| 1525 | '[pdf_path={3}]'.format(self.log_base, task_str, | ||
| 1526 | times, pdf_path)) | ||
| 1527 | |||
| 1528 | # 3.PDF文件提取图片 | ||
| 1529 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | ||
| 1530 | self.log_base, task_str, times)) | ||
| 1531 | start_time = time.time() | ||
| 1532 | pdf_handler.extract_image_for_weixin(max_img_count) | ||
| 1533 | end_time = time.time() | ||
| 1534 | speed_time = int(end_time - start_time) | ||
| 1535 | self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format( | ||
| 1536 | self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify)) | ||
| 1537 | except Exception as e: | ||
| 1538 | self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' | ||
| 1539 | '[error={3}]'.format(self.log_base, task_str, times, | ||
| 1540 | traceback.format_exc())) | ||
| 1541 | else: | ||
| 1542 | break | ||
| 1543 | else: | ||
| 1544 | raise Exception('download or pdf to img failed') | ||
| 1545 | |||
| 1546 | if pdf_handler.img_count == 0: | ||
| 1547 | self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( | ||
| 1548 | self.log_base, task_str)) | ||
| 1549 | raise Exception('pdf img empty') | ||
| 1550 | elif pdf_handler.img_count >= max_img_count: | ||
| 1551 | self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format( | ||
| 1552 | self.log_base, task_str, pdf_handler.img_count)) | ||
| 1553 | |||
| 1554 | try: | ||
| 1555 | report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport | ||
| 1556 | report_table.objects.create( | ||
| 1557 | case_number=doc.application_id, | ||
| 1558 | request_team=RequestTeam.get_value(doc.document_scheme, 0), | ||
| 1559 | request_trigger=RequestTrigger.get_value(doc.data_source, 0), | ||
| 1560 | input_file=doc.document_name, | ||
| 1561 | transaction_start=doc.start_time, | ||
| 1562 | transaction_end=doc.start_time, | ||
| 1563 | successful_at_this_level=False, | ||
| 1564 | failure_reason=FailureReason.IMG_LIMIT.value, | ||
| 1565 | process_name=ProcessName.ALL.value, | ||
| 1566 | notes='pdf page count: {0}'.format(str(pdf_handler.img_count)) | ||
| 1567 | ) | ||
| 1568 | except Exception as e: | ||
| 1569 | self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( | ||
| 1570 | self.log_base, traceback.format_exc())) | ||
| 1571 | |||
| 1572 | try: | ||
| 1573 | doc.status = DocStatus.PROCESS_FAILED.value | ||
| 1574 | doc.page_count = pdf_handler.page_count | ||
| 1575 | doc.save() | ||
| 1576 | except Exception as e: | ||
| 1577 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | ||
| 1578 | self.log_base, traceback.format_exc())) | ||
| 1579 | else: | ||
| 1580 | try: | ||
| 1581 | if pdf_handler.is_e_pdf: | ||
| 1582 | doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \ | ||
| 1583 | json.dumps(pdf_handler.metadata) | ||
| 1584 | doc.page_count = pdf_handler.page_count | ||
| 1585 | doc.save() | ||
| 1586 | except Exception as e: | ||
| 1587 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | ||
| 1588 | self.log_base, traceback.format_exc())) | ||
| 1589 | |||
| 1590 | with lock: | ||
| 1591 | todo_count_dict[task_str] = pdf_handler.img_count | ||
| 1592 | |||
| 1593 | self.online_log.info('{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'.format( | ||
| 1594 | self.log_base, task_str, pdf_handler.is_ebank | ||
| 1595 | )) | ||
| 1596 | for img_idx, img_path in enumerate(pdf_handler.img_path_list): | ||
| 1597 | while img_queue.full(): | ||
| 1598 | self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | ||
| 1599 | time.sleep(self.sleep_time_img_put) | ||
| 1600 | if pdf_handler.is_e_weixin_bs: | ||
| 1601 | try: | ||
| 1602 | text_list = pdf_handler.page_text_list | ||
| 1603 | except Exception as e: | ||
| 1604 | text_list = [] | ||
| 1605 | else: | ||
| 1606 | text_list = [] | ||
| 1607 | img_queue.put((business_type, img_path, text_list)) | ||
| 1608 | except Exception as e: | ||
| 1609 | try: | ||
| 1610 | end_time = timezone.now() | ||
| 1611 | report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport | ||
| 1612 | report_table.objects.create( | ||
| 1613 | case_number=doc.application_id, | ||
| 1614 | request_team=RequestTeam.get_value(doc.document_scheme, 0), | ||
| 1615 | request_trigger=RequestTrigger.get_value(doc.data_source, 0), | ||
| 1616 | input_file=doc.document_name, | ||
| 1617 | transaction_start=doc.start_time, | ||
| 1618 | transaction_end=end_time, | ||
| 1619 | successful_at_this_level=False, | ||
| 1620 | failure_reason=FailureReason.PDF.value, | ||
| 1621 | process_name=ProcessName.ALL.value, | ||
| 1622 | ) | ||
| 1623 | except Exception as e: | ||
| 1624 | self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( | ||
| 1625 | self.log_base, traceback.format_exc())) | ||
| 1626 | |||
| 1627 | try: | ||
| 1628 | doc.status = DocStatus.PROCESS_FAILED.value | ||
| 1629 | doc.page_count = pdf_handler.page_count | ||
| 1630 | doc.save() | ||
| 1631 | self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] ' | ||
| 1632 | '[error={2}]'.format(self.log_base, task_str, traceback.format_exc())) | ||
| 1633 | except Exception as e: | ||
| 1634 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | ||
| 1635 | self.log_base, traceback.format_exc())) | ||
| 1636 | # error_list.append(1) | ||
| 1637 | # return | ||
| 1507 | else: # e-contract or or e-fsm-contract or e-hmh | 1638 | else: # e-contract or or e-fsm-contract or e-hmh |
| 1508 | try: | 1639 | try: |
| 1509 | # pdf下载 处理 图片存储 识别 | 1640 | # pdf下载 处理 图片存储 识别 |
| ... | @@ -1674,6 +1805,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1674,6 +1805,7 @@ class Command(BaseCommand, LoggerMixin): |
| 1674 | json_data_1['text_list'] = text_list | 1805 | json_data_1['text_list'] = text_list |
| 1675 | 1806 | ||
| 1676 | start_time = time.time() | 1807 | start_time = time.time() |
| 1808 | self.online_log.info('{0} [ocr_1 api] [img={1}] [json_data_1={2}]'.format(self.log_base, img_path, json_data_1)) | ||
| 1677 | ocr_1_response = requests.post(url, json=json_data_1) | 1809 | ocr_1_response = requests.post(url, json=json_data_1) |
| 1678 | if ocr_1_response.status_code != 200: | 1810 | if ocr_1_response.status_code != 200: |
| 1679 | raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) | 1811 | raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) | ... | ... |
| ... | @@ -684,6 +684,10 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -684,6 +684,10 @@ class UploadDocView(GenericView, DocHandler): |
| 684 | classify_1 = classify_1_tmp | 684 | classify_1 = classify_1_tmp |
| 685 | break | 685 | break |
| 686 | 686 | ||
| 687 | if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name): | ||
| 688 | classify_1 = 12 | ||
| 689 | self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id)) | ||
| 690 | |||
| 687 | 691 | ||
| 688 | if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ | 692 | if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ |
| 689 | or document_name.endswith('.RAR'): | 693 | or document_name.endswith('.RAR'): |
| ... | @@ -1238,6 +1242,10 @@ class DocView(DocGenericView, DocHandler): | ... | @@ -1238,6 +1242,10 @@ class DocView(DocGenericView, DocHandler): |
| 1238 | if keyword in document_name: | 1242 | if keyword in document_name: |
| 1239 | classify_1 = classify_1_tmp | 1243 | classify_1 = classify_1_tmp |
| 1240 | break | 1244 | break |
| 1245 | |||
| 1246 | if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name): | ||
| 1247 | classify_1 = 12 | ||
| 1248 | self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id)) | ||
| 1241 | 1249 | ||
| 1242 | # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] | 1250 | # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] |
| 1243 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) | 1251 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) | ... | ... |
| ... | @@ -69,6 +69,7 @@ class PDFHandler: | ... | @@ -69,6 +69,7 @@ class PDFHandler: |
| 69 | self.suffix = self.get_suffix(document_name) | 69 | self.suffix = self.get_suffix(document_name) |
| 70 | self.is_ebank = False | 70 | self.is_ebank = False |
| 71 | self.is_e_pdf = False | 71 | self.is_e_pdf = False |
| 72 | self.is_e_weixin_bs = False | ||
| 72 | self.page_text_list = [] | 73 | self.page_text_list = [] |
| 73 | self.pdf_info = {} | 74 | self.pdf_info = {} |
| 74 | self.img_path_pno_list = [] | 75 | self.img_path_pno_list = [] |
| ... | @@ -407,6 +408,57 @@ class PDFHandler: | ... | @@ -407,6 +408,57 @@ class PDFHandler: |
| 407 | self.is_e_pdf = True | 408 | self.is_e_pdf = True |
| 408 | self.page_text_list = page_text_list | 409 | self.page_text_list = page_text_list |
| 409 | 410 | ||
def put_text(self, pdf):
    """Collect positioned text spans from every page of ``pdf``.

    For each page the rotation is reduced to a quarter-turn count (0-3) and
    every non-blank text span is recorded as a ``(bbox, text)`` tuple.

    On success the handler is updated in place:
      * ``self.is_e_pdf`` and ``self.is_e_weixin_bs`` are set to ``True``
      * ``self.page_text_list`` becomes a list with one dict per page, with
        the keys ``width``, ``height``, ``rotation`` and ``text``

    The method returns early, leaving those attributes untouched, when:
      * a page rotation is neither ``None`` nor an int multiple of 90
      * the running total of collected spans after page ``pno`` is below
        ``(pno + 1) * 5`` (presumably a "too little embedded text" heuristic
        for scanned PDFs -- TODO confirm the intent of the threshold)

    Args:
        pdf: an opened document exposing ``pageCount`` and ``loadPage``.

    Returns:
        None.
    """
    import sys

    # The original probe was ``print(char)`` inside a try/except: spans that
    # stdout could not encode were skipped. Probe with the same encoding and
    # error handler, but without writing every span to stdout.
    probe_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    probe_errors = getattr(sys.stdout, 'errors', None) or 'strict'

    # NOTE(review): the original also called self.title_is_ebank(char) on page 0
    # and stored the result in a local that was never read. That dead code is
    # dropped here on the assumption that title_is_ebank is a pure predicate
    # -- confirm.

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)

        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            quarter_turns, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = quarter_turns % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # ``or ()`` guards blocks/lines that carry no text structure
        # (e.g. a block without a 'lines' key), which previously raised.
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text')
                    if not char or char.strip() == '':
                        continue

                    # Skip spans (special emoji etc.) that stdout cannot encode.
                    try:
                        char.encode(probe_encoding, probe_errors)
                    except UnicodeError:
                        continue

                    text_list.append((span.get('bbox'), char))

        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * 5:
            return

        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list,
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
| 461 | |||
| 410 | def e_contract_process(self): | 462 | def e_contract_process(self): |
| 411 | os.makedirs(self.img_dir_path, exist_ok=True) | 463 | os.makedirs(self.img_dir_path, exist_ok=True) |
| 412 | with fitz.Document(self.path) as pdf: | 464 | with fitz.Document(self.path) as pdf: |
| ... | @@ -473,6 +525,59 @@ class PDFHandler: | ... | @@ -473,6 +525,59 @@ class PDFHandler: |
| 473 | self.merge_il(pdf, pno, il) | 525 | self.merge_il(pdf, pno, il) |
| 474 | self.img_count = len(self.img_path_list) | 526 | self.img_count = len(self.img_path_list) |
| 475 | 527 | ||
def extract_image_for_weixin(self, max_img_count=None):
    """Turn the source document into page images for the weixin flow.

    Resets ``self.img_path_list`` and ``self.xref_set``, makes sure
    ``self.img_dir_path`` exists, then:

      * if ``self.suffix`` is one of ``self.img_suffixs`` the input file is
        copied as the single image;
      * otherwise the file is opened with ``fitz.Document``, the passwords in
        ``self.pwd_list`` are tried while the document reports itself as
        encrypted, ``self.metadata`` / ``self.page_count`` are recorded,
        ``self.put_text`` is run, and every page is rendered or has its
        embedded image(s) extracted.

    Args:
        max_img_count: optional int. When the PDF page count is greater than
            or equal to it, ``self.img_count`` is set to the page count and
            the method returns without extracting anything.

    Returns:
        None. ``self.img_count`` ends up as ``len(self.img_path_list)`` unless
        the ``max_img_count`` early return was taken.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    if self.suffix in self.img_suffixs:
        # The input is already an image: copy it into the image directory.
        img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, img_save_path)
        self.img_path_list.append(img_save_path)
    else:
        with fitz.Document(self.path) as pdf:
            # Decrypt: try each known password while the document is still locked.
            for pwd in self.pwd_list:
                if not pdf.isEncrypted:
                    break
                pdf.authenticate(pwd)

            self.metadata = pdf.metadata
            self.page_count = pdf.pageCount
            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                self.img_count = pdf.pageCount
                return

            self.put_text(pdf)
            for pno in range(pdf.pageCount):
                # Image objects referenced by the page. Documented entry layout:
                # (xref, smask, width, height, bpc, colorspace, alt.colorspace,
                #  name, filter[, referencer])
                il = pdf.getPageImageList(pno)

                # 1. Text PDF / e-bank PDF / page without image objects:
                #    render the whole page as a PNG.
                if self.is_e_pdf or self.is_ebank or len(il) == 0:
                    page = pdf.loadPage(pno)
                    self.page_to_png(page)
                # 2. Exactly one image object on the page:
                #    small image (e.g. a stamp on an e-statement): render the page
                #    large image: render the page or extract the image object
                elif len(il) == 1:
                    # Slice instead of fixed-arity unpacking so both the 9-field
                    # and the 10-field entry layouts are accepted.
                    xref, smask, width, height, _, colorspace = il[0][:6]
                    # Small image
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
                    # Large image
                    elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                        self.is_new_modify = 1
                        # Guard against oversized images.
                        is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1])
                        page = pdf.loadPage(pno)
                        self.page_to_png(page, is_big_img=is_big_img)
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. More than one image object on the page: merge them.
                else:
                    self.merge_il(pdf, pno, il)
    self.img_count = len(self.img_path_list)
| 580 | |||
| 476 | def extract_page_image(self): | 581 | def extract_page_image(self): |
| 477 | self.img_path_list = [] | 582 | self.img_path_list = [] |
| 478 | self.xref_set = set() | 583 | self.xref_set = set() | ... | ... |
-
Please register or sign in to post a comment