2932c540 by 冯轩

init

1 parent 8ddb1d4c
...@@ -1504,6 +1504,137 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1504,6 +1504,137 @@ class Command(BaseCommand, LoggerMixin):
1504 self.log_base, traceback.format_exc())) 1504 self.log_base, traceback.format_exc()))
1505 # error_list.append(1) 1505 # error_list.append(1)
1506 # return 1506 # return
1507 elif classify_1_str == '12': # weixin e-bs
1508 try:
1509 max_img_count = 500
1510 for times in range(consts.RETRY_TIMES):
1511 try:
1512 if doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
1513 self.online_log.info('{0} [mo ni xia dan] [task={1}] [times={2}] '
1514 '[pdf_path={3}]'.format(self.log_base, task_str,
1515 times, pdf_path))
1516 elif os.path.exists(pdf_path):
1517 self.online_log.info('{0} [pdf from zip file] [task={1}] [times={2}] '
1518 '[pdf_path={3}]'.format(self.log_base, task_str,
1519 times, pdf_path))
1520 else:
1521 # self.edms.download(pdf_path, doc.metadata_version_id)
1522 self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme,
1523 business_type)
1524 self.online_log.info('{0} [ecm download success] [task={1}] [times={2}] '
1525 '[pdf_path={3}]'.format(self.log_base, task_str,
1526 times, pdf_path))
1527
1528 # 3.PDF文件提取图片
1529 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
1530 self.log_base, task_str, times))
1531 start_time = time.time()
1532 pdf_handler.extract_image_for_weixin(max_img_count)
1533 end_time = time.time()
1534 speed_time = int(end_time - start_time)
1535 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
1536 self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
1537 except Exception as e:
1538 self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
1539 '[error={3}]'.format(self.log_base, task_str, times,
1540 traceback.format_exc()))
1541 else:
1542 break
1543 else:
1544 raise Exception('download or pdf to img failed')
1545
1546 if pdf_handler.img_count == 0:
1547 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
1548 self.log_base, task_str))
1549 raise Exception('pdf img empty')
1550 elif pdf_handler.img_count >= max_img_count:
1551 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
1552 self.log_base, task_str, pdf_handler.img_count))
1553
1554 try:
1555 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
1556 report_table.objects.create(
1557 case_number=doc.application_id,
1558 request_team=RequestTeam.get_value(doc.document_scheme, 0),
1559 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
1560 input_file=doc.document_name,
1561 transaction_start=doc.start_time,
1562 transaction_end=doc.start_time,
1563 successful_at_this_level=False,
1564 failure_reason=FailureReason.IMG_LIMIT.value,
1565 process_name=ProcessName.ALL.value,
1566 notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
1567 )
1568 except Exception as e:
1569 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
1570 self.log_base, traceback.format_exc()))
1571
1572 try:
1573 doc.status = DocStatus.PROCESS_FAILED.value
1574 doc.page_count = pdf_handler.page_count
1575 doc.save()
1576 except Exception as e:
1577 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1578 self.log_base, traceback.format_exc()))
1579 else:
1580 try:
1581 if pdf_handler.is_e_pdf:
1582 doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \
1583 json.dumps(pdf_handler.metadata)
1584 doc.page_count = pdf_handler.page_count
1585 doc.save()
1586 except Exception as e:
1587 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1588 self.log_base, traceback.format_exc()))
1589
1590 with lock:
1591 todo_count_dict[task_str] = pdf_handler.img_count
1592
1593 self.online_log.info('{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'.format(
1594 self.log_base, task_str, pdf_handler.is_ebank
1595 ))
1596 for img_idx, img_path in enumerate(pdf_handler.img_path_list):
1597 while img_queue.full():
1598 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
1599 time.sleep(self.sleep_time_img_put)
1600 if pdf_handler.is_e_weixin_bs:
1601 try:
1602 text_list = pdf_handler.page_text_list
1603 except Exception as e:
1604 text_list = []
1605 else:
1606 text_list = []
1607 img_queue.put((business_type, img_path, text_list))
1608 except Exception as e:
1609 try:
1610 end_time = timezone.now()
1611 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
1612 report_table.objects.create(
1613 case_number=doc.application_id,
1614 request_team=RequestTeam.get_value(doc.document_scheme, 0),
1615 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
1616 input_file=doc.document_name,
1617 transaction_start=doc.start_time,
1618 transaction_end=end_time,
1619 successful_at_this_level=False,
1620 failure_reason=FailureReason.PDF.value,
1621 process_name=ProcessName.ALL.value,
1622 )
1623 except Exception as e:
1624 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
1625 self.log_base, traceback.format_exc()))
1626
1627 try:
1628 doc.status = DocStatus.PROCESS_FAILED.value
1629 doc.page_count = pdf_handler.page_count
1630 doc.save()
1631 self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
1632 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
1633 except Exception as e:
1634 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1635 self.log_base, traceback.format_exc()))
1636 # error_list.append(1)
1637 # return
1507 else: # e-contract or or e-fsm-contract or e-hmh 1638 else: # e-contract or or e-fsm-contract or e-hmh
1508 try: 1639 try:
1509 # pdf下载 处理 图片存储 识别 1640 # pdf下载 处理 图片存储 识别
...@@ -1674,6 +1805,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1674,6 +1805,7 @@ class Command(BaseCommand, LoggerMixin):
1674 json_data_1['text_list'] = text_list 1805 json_data_1['text_list'] = text_list
1675 1806
1676 start_time = time.time() 1807 start_time = time.time()
1808 self.online_log.info('{0} [ocr_1 api] [img={1}] [json_data_1={2}]'.format(self.log_base, img_path, json_data_1))
1677 ocr_1_response = requests.post(url, json=json_data_1) 1809 ocr_1_response = requests.post(url, json=json_data_1)
1678 if ocr_1_response.status_code != 200: 1810 if ocr_1_response.status_code != 200:
1679 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) 1811 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
......
...@@ -684,6 +684,10 @@ class UploadDocView(GenericView, DocHandler): ...@@ -684,6 +684,10 @@ class UploadDocView(GenericView, DocHandler):
684 classify_1 = classify_1_tmp 684 classify_1 = classify_1_tmp
685 break 685 break
686 686
687 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
688 classify_1 = 12
689 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
690
687 691
688 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ 692 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
689 or document_name.endswith('.RAR'): 693 or document_name.endswith('.RAR'):
...@@ -1239,6 +1243,10 @@ class DocView(DocGenericView, DocHandler): ...@@ -1239,6 +1243,10 @@ class DocView(DocGenericView, DocHandler):
1239 classify_1 = classify_1_tmp 1243 classify_1 = classify_1_tmp
1240 break 1244 break
1241 1245
1246 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
1247 classify_1 = 12
1248 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
1249
1242 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] 1250 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
1243 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) 1251 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
1244 enqueue_res = rh.enqueue([task], is_priority) 1252 enqueue_res = rh.enqueue([task], is_priority)
......
...@@ -69,6 +69,7 @@ class PDFHandler: ...@@ -69,6 +69,7 @@ class PDFHandler:
69 self.suffix = self.get_suffix(document_name) 69 self.suffix = self.get_suffix(document_name)
70 self.is_ebank = False 70 self.is_ebank = False
71 self.is_e_pdf = False 71 self.is_e_pdf = False
72 self.is_e_weixin_bs = False
72 self.page_text_list = [] 73 self.page_text_list = []
73 self.pdf_info = {} 74 self.pdf_info = {}
74 self.img_path_pno_list = [] 75 self.img_path_pno_list = []
...@@ -407,6 +408,57 @@ class PDFHandler: ...@@ -407,6 +408,57 @@ class PDFHandler:
407 self.is_e_pdf = True 408 self.is_e_pdf = True
408 self.page_text_list = page_text_list 409 self.page_text_list = page_text_list
409 410
def put_text(self, pdf):
    """Collect the text spans of every page of ``pdf`` and store them on the handler.

    For each page the rotation is normalised to a quarter-turn count (0-3) and
    every non-blank text span is recorded as a ``(bbox, text)`` tuple.

    The method gives up silently (plain ``return``, no attribute is modified)
    when:

    * a page rotation is not an ``int``/``None`` or is not a multiple of 90;
    * the cumulative number of spans seen so far is below five per page.

    On success it sets ``self.is_e_pdf`` and ``self.is_e_weixin_bs`` to
    ``True`` and ``self.page_text_list`` to one dict per page with the keys
    ``width``, ``height``, ``rotation`` and ``text``.

    :param pdf: opened document exposing ``pageCount`` and ``loadPage(pno)``.
    :return: ``None``; results are written to attributes of ``self``.
    """
    # Function-scope import: only part of the file is visible here, so the
    # module-level import block is left untouched.
    import sys

    # The original code called ``print(char)`` and skipped the span when the
    # call raised, i.e. when the text could not be encoded for stdout (special
    # emoji).  The same check is done here without writing every span of
    # every page to stdout.
    stdout_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    stdout_errors = getattr(sys.stdout, 'errors', None) or 'strict'

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            divisor, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = divisor % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # ``or ()`` guards: a block without a 'lines' entry (or a line without
        # 'spans') must be skipped instead of raising TypeError.
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text')
                    if not char or char.strip() == '':
                        continue

                    # Skip special emoji / text that stdout could not encode.
                    try:
                        char.encode(stdout_encoding, stdout_errors)
                    except UnicodeError:
                        continue

                    bbox = span.get('bbox')
                    if pno == 0:
                        # NOTE(review): the result used to be stored in a local
                        # (``in_ebank_set``) that nothing in this method reads.
                        # The call is kept in case title_is_ebank has side
                        # effects - confirm and remove if it is a pure check.
                        self.title_is_ebank(char)
                    text_list.append((bbox, char))

        text_item_sum += len(text_list)
        # Fewer than five spans per page on average so far: stop without
        # touching any attribute.
        if text_item_sum < (pno + 1) * 5:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list,
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
461
410 def e_contract_process(self): 462 def e_contract_process(self):
411 os.makedirs(self.img_dir_path, exist_ok=True) 463 os.makedirs(self.img_dir_path, exist_ok=True)
412 with fitz.Document(self.path) as pdf: 464 with fitz.Document(self.path) as pdf:
...@@ -473,6 +525,59 @@ class PDFHandler: ...@@ -473,6 +525,59 @@ class PDFHandler:
473 self.merge_il(pdf, pno, il) 525 self.merge_il(pdf, pno, il)
474 self.img_count = len(self.img_path_list) 526 self.img_count = len(self.img_path_list)
475 527
def extract_image_for_weixin(self, max_img_count=None):
    """Produce the page images for a document handled by the WeChat statement flow.

    * When ``self.suffix`` is one of ``self.img_suffixs`` the source file is
      copied as the single output image.
    * Otherwise the file is opened with ``fitz``, each password in
      ``self.pwd_list`` is tried while the document reports itself encrypted,
      ``self.metadata`` / ``self.page_count`` are recorded, ``self.put_text``
      is run, and every page is either rendered or has its embedded image
      extracted.

    ``self.img_path_list`` and ``self.xref_set`` are reset on every call and
    ``self.img_count`` is set to the number of produced images.

    :param max_img_count: optional ``int``.  When the page count reaches this
        limit nothing is extracted; ``self.img_count`` is set to the page
        count and the method returns early.
    :return: ``None``; results are written to attributes of ``self``.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    if self.suffix in self.img_suffixs:
        # The input is already an image: copy it as page 0.
        img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, img_save_path)
        self.img_path_list.append(img_save_path)
    else:
        with fitz.Document(self.path) as pdf:
            # Decrypt: try each known password until the document opens.
            for pwd in self.pwd_list:
                if not pdf.isEncrypted:
                    break
                pdf.authenticate(pwd)

            self.metadata = pdf.metadata
            self.page_count = pdf.pageCount
            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                # Too many pages: report the page count and extract nothing.
                self.img_count = pdf.pageCount
                return

            self.put_text(pdf)
            for pno in range(pdf.pageCount):
                # Image objects of the page.  Entry layout:
                # (xref, smask, width, height, bpc, colorspace,
                #  alt.colorspace, name, filter[, invoker])
                il = pdf.getPageImageList(pno)

                # 1. Text-based document, or no image object on the page:
                #    render the whole page to PNG.
                if self.is_e_pdf or self.is_ebank or len(il) == 0:
                    page = pdf.loadPage(pno)
                    self.page_to_png(page)
                # 2. Exactly one image object:
                #    small image (e.g. a stamp on an e-statement): render the page;
                #    large image: render the page with the size guard;
                #    otherwise: extract the image object itself.
                elif len(il) == 1:
                    # Read fields by position instead of unpacking a fixed
                    # number of names, so an entry with an extra trailing
                    # field (see layout above) does not raise ValueError.
                    image_info = il[0]
                    xref, smask, width, height = image_info[:4]
                    colorspace = image_info[5]
                    # Small image.
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
                    # Large image.
                    elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                        self.is_new_modify = 1
                        # Guard against oversized images.
                        is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1])
                        page = pdf.loadPage(pno)
                        self.page_to_png(page, is_big_img=is_big_img)
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. More than one image object: special handling.
                else:
                    self.merge_il(pdf, pno, il)
    self.img_count = len(self.img_path_list)
580
476 def extract_page_image(self): 581 def extract_page_image(self):
477 self.img_path_list = [] 582 self.img_path_list = []
478 self.xref_set = set() 583 self.xref_set = set()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!