272692c8 by 冯轩

Merge branch 'feature/weixin-bs-2'

2 parents e08e5c00 df94248b
...@@ -11,6 +11,8 @@ from openpyxl import Workbook ...@@ -11,6 +11,8 @@ from openpyxl import Workbook
11 from openpyxl.styles import PatternFill, numbers 11 from openpyxl.styles import PatternFill, numbers
12 from openpyxl.utils import get_column_letter 12 from openpyxl.utils import get_column_letter
13 from apps.doc import consts 13 from apps.doc import consts
14 import logging
15 online_log = logging.getLogger('online')
14 16
15 17
16 class BSWorkbook(Workbook): 18 class BSWorkbook(Workbook):
...@@ -562,6 +564,8 @@ class BSWorkbook(Workbook): ...@@ -562,6 +564,8 @@ class BSWorkbook(Workbook):
562 borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx] 564 borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx]
563 565
564 summary_cell_value = None if summary_cell is None else summary_cell.value 566 summary_cell_value = None if summary_cell is None else summary_cell.value
567 if summary_cell.value is not None:
568 summary_cell_value = summary_cell_value.strip()
565 date_cell_value = None if date_cell is None else date_cell.value 569 date_cell_value = None if date_cell is None else date_cell.value
566 amount_cell_value = None if amount_cell is None else amount_cell.value 570 amount_cell_value = None if amount_cell is None else amount_cell.value
567 over_cell_value = None if over_cell is None else over_cell.value 571 over_cell_value = None if over_cell is None else over_cell.value
...@@ -638,6 +642,7 @@ class BSWorkbook(Workbook): ...@@ -638,6 +642,7 @@ class BSWorkbook(Workbook):
638 642
639 # 3.2.提取信息、高亮 643 # 3.2.提取信息、高亮
640 # row = summary_cell.row 644 # row = summary_cell.row
645 # online_log.info('[ti qu xin xi gao liang =========== >] [summary_cell_value={0}]'.format(summary_cell_value))
641 if summary_cell is not None: 646 if summary_cell is not None:
642 # 关键词1提取 647 # 关键词1提取
643 if summary_cell_value in self.interest_keyword: 648 if summary_cell_value in self.interest_keyword:
......
...@@ -693,6 +693,14 @@ class UploadDocView(GenericView, DocHandler): ...@@ -693,6 +693,14 @@ class UploadDocView(GenericView, DocHandler):
693 classify_1 = classify_1_tmp 693 classify_1 = classify_1_tmp
694 break 694 break
695 695
696 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
697 classify_1 = 12
698 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
699
700 if classify_1 == 0 and (document_name.startswith("dzfp_")):
701 classify_1 = 0
702 self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
703
696 704
697 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ 705 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
698 or document_name.endswith('.RAR'): 706 or document_name.endswith('.RAR'):
...@@ -1248,6 +1256,14 @@ class DocView(DocGenericView, DocHandler): ...@@ -1248,6 +1256,14 @@ class DocView(DocGenericView, DocHandler):
1248 classify_1 = classify_1_tmp 1256 classify_1 = classify_1_tmp
1249 break 1257 break
1250 1258
1259 if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
1260 classify_1 = 12
1261 self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
1262
1263 if classify_1 == 0 and (document_name.startswith("dzfp_")):
1264 classify_1 = 0
1265 self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
1266
1251 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] 1267 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
1252 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) 1268 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
1253 enqueue_res = rh.enqueue([task], is_priority) 1269 enqueue_res = rh.enqueue([task], is_priority)
......
...@@ -69,6 +69,7 @@ class PDFHandler: ...@@ -69,6 +69,7 @@ class PDFHandler:
69 self.suffix = self.get_suffix(document_name) 69 self.suffix = self.get_suffix(document_name)
70 self.is_ebank = False 70 self.is_ebank = False
71 self.is_e_pdf = False 71 self.is_e_pdf = False
72 self.is_e_weixin_bs = False
72 self.page_text_list = [] 73 self.page_text_list = []
73 self.pdf_info = {} 74 self.pdf_info = {}
74 self.img_path_pno_list = [] 75 self.img_path_pno_list = []
...@@ -186,6 +187,8 @@ class PDFHandler: ...@@ -186,6 +187,8 @@ class PDFHandler:
186 self.img_path_list.append(img_save_path) 187 self.img_path_list.append(img_save_path)
187 if self.is_ebank: 188 if self.is_ebank:
188 self.rebuild_bbox(pm.width, pm.height, page.number) 189 self.rebuild_bbox(pm.width, pm.height, page.number)
190 if self.is_e_weixin_bs:
191 self.rebuild_bbox(pm.width, pm.height, page.number)
189 192
190 @staticmethod 193 @staticmethod
191 def getimage(pix): 194 def getimage(pix):
...@@ -407,6 +410,57 @@ class PDFHandler: ...@@ -407,6 +410,57 @@ class PDFHandler:
407 self.is_e_pdf = True 410 self.is_e_pdf = True
408 self.page_text_list = page_text_list 411 self.page_text_list = page_text_list
409 412
def put_text(self, pdf):
    """Collect the embedded text layer of every page of ``pdf``.

    For each page the text spans returned by ``extractDICT()`` are gathered
    as ``(bbox, text)`` tuples.  When every page carries enough text, the
    result is stored in ``self.page_text_list`` and both ``self.is_e_pdf``
    and ``self.is_e_weixin_bs`` are set to ``True``.

    The method returns early, leaving all three attributes untouched, when:

    * a page rotation is not an ``int`` multiple of 90 (or not ``None``);
    * the running total of collected spans drops below 5 per page
      processed so far.

    :param pdf: an opened document exposing ``pageCount`` and
        ``loadPage(pno)``.
    :return: ``None``; results are published through attributes on ``self``.
    """
    # Local import so the block stays self-contained regardless of the
    # file's top-level import list.
    import sys

    # Spans that cannot be written to the process' stdout (e.g. special
    # emoji under a narrow locale) are skipped.  Checking encodability
    # directly keeps that rule without dumping document text to stdout.
    out_stream = sys.stdout
    out_encoding = getattr(out_stream, 'encoding', None) or 'utf-8'
    out_errors = getattr(out_stream, 'errors', None) or 'strict'

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)

        # Normalise rotation to a quarter-turn count in 0..3.
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            divisor, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = divisor % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        for block in text.get('blocks') or ():
            # Blocks without a 'lines' entry contribute no text.
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text')
                    if not char or char.strip() == '':
                        continue

                    # Skip special emoji / unencodable text.
                    try:
                        char.encode(out_encoding, out_errors)
                    except UnicodeEncodeError:
                        continue

                    text_list.append((span.get('bbox'), char))

        # Require an average of at least 5 text items per page so far;
        # otherwise give up on the text layer for the whole document.
        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * 5:
            return

        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list,
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
463
410 def e_contract_process(self): 464 def e_contract_process(self):
411 os.makedirs(self.img_dir_path, exist_ok=True) 465 os.makedirs(self.img_dir_path, exist_ok=True)
412 with fitz.Document(self.path) as pdf: 466 with fitz.Document(self.path) as pdf:
...@@ -473,6 +527,59 @@ class PDFHandler: ...@@ -473,6 +527,59 @@ class PDFHandler:
473 self.merge_il(pdf, pno, il) 527 self.merge_il(pdf, pno, il)
474 self.img_count = len(self.img_path_list) 528 self.img_count = len(self.img_path_list)
475 529
def extract_image_for_weixin(self, max_img_count=None):
    """Produce per-page images for a (WeChat statement) document.

    Resets ``self.img_path_list`` / ``self.xref_set``, makes sure
    ``self.img_dir_path`` exists, then:

    * if ``self.suffix`` is one of ``self.img_suffixs`` the source file is
      copied as the single image;
    * otherwise the file is opened with ``fitz``, decryption is attempted
      with each entry of ``self.pwd_list``, ``self.put_text(pdf)`` is run,
      and every page is either rendered via ``self.page_to_png`` or has
      its embedded image(s) extracted.

    ``self.img_count`` ends up as the number of saved images, except when
    the page limit is hit (see ``max_img_count``).

    :param max_img_count: optional ``int``; when the document has at least
        this many pages nothing is extracted, ``self.img_count`` is set to
        the page count and the method returns.
    :return: ``None``.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    if self.suffix in self.img_suffixs:
        # The upload is already an image: copy it as page 0.
        img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, img_save_path)
        self.img_path_list.append(img_save_path)
    else:
        with fitz.Document(self.path) as pdf:
            # Decrypt: try each known password until the document opens.
            for pwd in self.pwd_list:
                if not pdf.isEncrypted:
                    break
                pdf.authenticate(pwd)

            self.metadata = pdf.metadata
            self.page_count = pdf.pageCount
            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                # Too many pages: report the page count and extract nothing.
                self.img_count = pdf.pageCount
                return

            self.put_text(pdf)
            for pno in range(pdf.pageCount):
                # Image objects referenced by the page; each entry is
                # (xref, smask, width, height, bpc, colorspace,
                #  alt.colorspace, name, filter[, invoker]).
                il = pdf.getPageImageList(pno)

                # 1. No image objects (or a text-based document): save the
                #    whole page as a PNG.
                if self.is_e_pdf or self.is_ebank or len(il) == 0:
                    page = pdf.loadPage(pno)
                    self.page_to_png(page)
                # 2. Exactly one image object:
                #    small image (e.g. the stamp on an e-statement): save
                #    the whole page as a PNG; large image: see below.
                elif len(il) == 1:
                    # Only the leading fields are needed; slicing keeps
                    # this working whether or not the trailing optional
                    # field is present in the entry.
                    xref, smask, width, height, _, colorspace = il[0][:6]
                    # Small image.
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
                    # Large image.
                    elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                        self.is_new_modify = 1
                        # Guard against oversized images.
                        is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1])
                        page = pdf.loadPage(pno)
                        self.page_to_png(page, is_big_img=is_big_img)
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. More than one image object: special handling.
                else:
                    self.merge_il(pdf, pno, il)
    self.img_count = len(self.img_path_list)
582
476 def extract_page_image(self): 583 def extract_page_image(self):
477 self.img_path_list = [] 584 self.img_path_list = []
478 self.xref_set = set() 585 self.xref_set = set()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!