1a51bc03 by 周伟奇

Merge branch 'feature/main' into feature/mssql

2 parents 05a27cd8 fe25f273
...@@ -642,7 +642,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'), ...@@ -642,7 +642,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'),
642 ('住址', '住址'), 642 ('住址', '住址'),
643 ('性别', '性别'),) 643 ('性别', '性别'),)
644 RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1 644 RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1
645 # 增值税 645 # 增值税
646 VAT_CN_NAME = 'VAT普票' 646 VAT_CN_NAME = 'VAT普票'
647 VAT_CLASSIFY = 0 647 VAT_CLASSIFY = 0
648 VAT_FIELD_ORDER = (('发票代码', '发票代码'), 648 VAT_FIELD_ORDER = (('发票代码', '发票代码'),
...@@ -667,6 +667,32 @@ VAT_FIELD_ORDER = (('发票代码', '发票代码'), ...@@ -667,6 +667,32 @@ VAT_FIELD_ORDER = (('发票代码', '发票代码'),
667 ('销方开户行及账号', '销售方开户行及账号'), 667 ('销方开户行及账号', '销售方开户行及账号'),
668 ('下盖章', '销售方:(章)'), 668 ('下盖章', '销售方:(章)'),
669 ('备注', '备注'),) 669 ('备注', '备注'),)
670 # 增值税专票
671 VATS_CN_NAME = 'VAT专票'
672 VATS_CLASSIFY = 10088
673 VATS_FIELD_ORDER = (('发票代码', '发票代码'),
674 ('发票代码_开具', '发票代码(开具)'),
675 ('发票号码', '发票号码'),
676 ('发票号码_开具', '发票号码(开具)'),
677 ('开票日期', '开票日期'),
678 ('校验码', '校验码'),
679 ('货物或应税劳务、服务名称', '货物或应税劳务、服务名称'),
680 ('金额合计', '开具金额合计(不含税)'),
681 ('税率', '税率'),
682 ('税额合计', '税额合计'),
683 ('价税合计小写', '价税合计(小写)'),
684 ('价税合计大写', '价税合计(大写)'),
685 ('购方名称', '购买方名称'),
686 ('购方纳税人识别号', '购买方纳税人识别号'),
687 ('购方地址、电话', '购买方地址、电话'),
688 ('购方开户行及账号', '购买方开户行及账号'),
689 ('销方名称', '销售方名称'),
690 ('销方纳税人识别号', '销售方纳税人识别号'),
691 ('销方地址、电话', '销售方地址、电话'),
692 ('销方开户行及账号', '销售方开户行及账号'),
693 ('下盖章', '销售方:(章)'),
694 ('车船税', '车船税'),
695 ('备注', '备注'),)
670 # 机动车登记证书 696 # 机动车登记证书
671 MVC_CN_NAME = '机动车登记证书' 697 MVC_CN_NAME = '机动车登记证书'
672 MVC_CLASSIFY = 28 698 MVC_CLASSIFY = 28
...@@ -770,7 +796,7 @@ MVI_FIELD_ORDER = (('发票代码', '发票代码'), ...@@ -770,7 +796,7 @@ MVI_FIELD_ORDER = (('发票代码', '发票代码'),
770 ('主管税务机关及代码', '主管税务机关及代码'), 796 ('主管税务机关及代码', '主管税务机关及代码'),
771 ('吨位', '吨位'), 797 ('吨位', '吨位'),
772 ('限乘人数', '限乘人数'),) 798 ('限乘人数', '限乘人数'),)
773 IC_PID = VAT_PID = MVC_PID = MVI_PID = None 799 IC_PID = VAT_PID = VATS_PID = MVC_PID = MVI_PID = None
774 800
775 # 营业执照 801 # 营业执照
776 BL_CN_NAME = '营业执照' 802 BL_CN_NAME = '营业执照'
...@@ -909,6 +935,11 @@ LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, F ...@@ -909,6 +935,11 @@ LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, F
909 (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True, MODEL_FIELD_MVC)), 935 (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True, MODEL_FIELD_MVC)),
910 (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT))) 936 (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT)))
911 937
938 FOLDER_LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, False, MODEL_FIELD_MVI)),
939 (IC_CLASSIFY, (IC_PID, IC_CN_NAME, None, True, False, MODEL_FIELD_IC)),
940 (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT)),
941 (VATS_CLASSIFY, (VATS_PID, VATS_CN_NAME, VATS_FIELD_ORDER, False, False, MODEL_FIELD_VAT)))
942
912 LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER) 943 LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER)
913 944
914 OTHER_CLASSIFY_SET = {OTHER_CLASSIFY} 945 OTHER_CLASSIFY_SET = {OTHER_CLASSIFY}
......
...@@ -165,7 +165,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -165,7 +165,6 @@ class Command(BaseCommand, LoggerMixin):
165 def folder_process(self, input_dir, classify): 165 def folder_process(self, input_dir, classify):
166 while not os.path.isdir(input_dir): 166 while not os.path.isdir(input_dir):
167 self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir)) 167 self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
168 print(self.switch)
169 if self.switch: 168 if self.switch:
170 time.sleep(self.sleep_time) 169 time.sleep(self.sleep_time)
171 continue 170 continue
...@@ -202,6 +201,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -202,6 +201,9 @@ class Command(BaseCommand, LoggerMixin):
202 else: 201 else:
203 self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) 202 self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
204 self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) 203 self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
204 else:
205 self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, input_dir))
206 shutil.move(path, failed_output_dir)
205 except Exception as e: 207 except Exception as e:
206 try: 208 try:
207 path = os.path.join(input_dir, name) 209 path = os.path.join(input_dir, name)
......
...@@ -102,20 +102,20 @@ class Command(BaseCommand, LoggerMixin): ...@@ -102,20 +102,20 @@ class Command(BaseCommand, LoggerMixin):
102 self.log_base, task_str, is_priority)) 102 self.log_base, task_str, is_priority))
103 return doc, business_type, task_str 103 return doc, business_type, task_str
104 104
105 def pdf_download(self, doc, pdf_path): 105 # def pdf_download(self, doc, pdf_path):
106 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): 106 # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
107 for times in range(consts.RETRY_TIMES): 107 # for times in range(consts.RETRY_TIMES):
108 try: 108 # try:
109 self.edms.download(pdf_path, doc.metadata_version_id) 109 # self.edms.download(pdf_path, doc.metadata_version_id)
110 except Exception as e: 110 # except Exception as e:
111 self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' 111 # self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
112 '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) 112 # '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
113 edms_exc = str(e) 113 # edms_exc = str(e)
114 else: 114 # else:
115 break 115 # break
116 else: 116 # else:
117 raise EDMSException(edms_exc) 117 # raise EDMSException(edms_exc)
118 self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) 118 # self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
119 119
120 def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): 120 def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx):
121 sheets = ocr_data.get('data', []) 121 sheets = ocr_data.get('data', [])
...@@ -439,19 +439,35 @@ class Command(BaseCommand, LoggerMixin): ...@@ -439,19 +439,35 @@ class Command(BaseCommand, LoggerMixin):
439 # 2. 从EDMS获取PDF文件 439 # 2. 从EDMS获取PDF文件
440 doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) 440 doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
441 os.makedirs(doc_data_path, exist_ok=True) 441 os.makedirs(doc_data_path, exist_ok=True)
442 img_save_path = os.path.join(doc_data_path, 'img')
442 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 443 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
443 self.pdf_download(doc, pdf_path)
444 444
445 # 3.PDF文件提取图片
446 self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
447 start_time = time.time()
448 img_save_path = os.path.join(doc_data_path, 'img')
449 pdf_handler = PDFHandler(pdf_path, img_save_path) 445 pdf_handler = PDFHandler(pdf_path, img_save_path)
450 pdf_handler.extract_image() 446
451 end_time = time.time() 447 for times in range(consts.RETRY_TIMES):
452 speed_time = int(end_time - start_time) 448 try:
453 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( 449 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
454 self.log_base, task_str, speed_time)) 450 self.edms.download(pdf_path, doc.metadata_version_id)
451 self.cronjob_log.info('{0} [edms download success] [task={1}] [times={2}] '
452 '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path))
453
454 # 3.PDF文件提取图片
455 self.cronjob_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
456 self.log_base, task_str, times))
457 start_time = time.time()
458 pdf_handler.extract_image()
459 end_time = time.time()
460 speed_time = int(end_time - start_time)
461 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
462 self.log_base, task_str, times, speed_time))
463 except Exception as e:
464 self.cronjob_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
465 '[error={3}]'.format(self.log_base, task_str, times,
466 traceback.format_exc()))
467 else:
468 break
469 else:
470 raise Exception('download or pdf to img failed')
455 471
456 img_count = len(pdf_handler.img_path_list) 472 img_count = len(pdf_handler.img_path_list)
457 if img_count == 0: 473 if img_count == 0:
...@@ -466,25 +482,25 @@ class Command(BaseCommand, LoggerMixin): ...@@ -466,25 +482,25 @@ class Command(BaseCommand, LoggerMixin):
466 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) 482 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
467 time.sleep(self.sleep_time_img_put) 483 time.sleep(self.sleep_time_img_put)
468 img_queue.put(img_path) 484 img_queue.put(img_path)
469 except EDMSException as e: 485 # except EDMSException as e:
470 try: 486 # try:
471 doc.status = DocStatus.PROCESS_FAILED.value 487 # doc.status = DocStatus.PROCESS_FAILED.value
472 doc.save() 488 # doc.save()
473 self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( 489 # self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
474 self.log_base, task_str, traceback.format_exc())) 490 # self.log_base, task_str, traceback.format_exc()))
475 except Exception as e: 491 # except Exception as e:
476 self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( 492 # self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
477 self.log_base, traceback.format_exc())) 493 # self.log_base, traceback.format_exc()))
478 error_list.append(1) 494 # error_list.append(1)
479 return 495 # return
480 except Exception as e: 496 except Exception as e:
481 try: 497 try:
482 doc.status = DocStatus.PROCESS_FAILED.value 498 doc.status = DocStatus.PROCESS_FAILED.value
483 doc.save() 499 doc.save()
484 self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( 500 self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
485 self.log_base, task_str, traceback.format_exc())) 501 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
486 except Exception as e: 502 except Exception as e:
487 self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format( 503 self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
488 self.log_base, traceback.format_exc())) 504 self.log_base, traceback.format_exc()))
489 error_list.append(1) 505 error_list.append(1)
490 return 506 return
...@@ -523,8 +539,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -523,8 +539,8 @@ class Command(BaseCommand, LoggerMixin):
523 ocr_1_res = ocr_1_response.json() 539 ocr_1_res = ocr_1_response.json()
524 end_time = time.time() 540 end_time = time.time()
525 speed_time = int(end_time - start_time) 541 speed_time = int(end_time - start_time)
526 self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}] [speed_time={3}]'.format( 542 self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format(
527 self.log_base, img_path, ocr_1_res, speed_time)) 543 self.log_base, img_path, url, speed_time))
528 break 544 break
529 else: 545 else:
530 ocr_1_res = {} 546 ocr_1_res = {}
...@@ -636,8 +652,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -636,8 +652,8 @@ class Command(BaseCommand, LoggerMixin):
636 end_time = time.time() 652 end_time = time.time()
637 speed_time = int(end_time - start_time) 653 speed_time = int(end_time - start_time)
638 self.cronjob_log.info( 654 self.cronjob_log.info(
639 '{0} [ocr_2 success] [img={1}] [res={2}] [speed_time={3}]'.format( 655 '{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
640 self.log_base, img_path, ocr_2_res, speed_time)) 656 self.log_base, img_path, speed_time))
641 657
642 if classify == consts.BC_CLASSIFY: 658 if classify == consts.BC_CLASSIFY:
643 name = '有' 659 name = '有'
......
...@@ -520,7 +520,7 @@ class BSWorkbook(Workbook): ...@@ -520,7 +520,7 @@ class BSWorkbook(Workbook):
520 520
521 for row in loan_fill_row: 521 for row in loan_fill_row:
522 for cell in new_ws[row]: 522 for cell in new_ws[row]:
523 cell.fill = self.loan_fill 523 cell.fill = self.amount_fill
524 524
525 # 3.6.同一天相同进出账高亮 525 # 3.6.同一天相同进出账高亮
526 del amount_mapping 526 del amount_mapping
...@@ -656,17 +656,24 @@ class BSWorkbook(Workbook): ...@@ -656,17 +656,24 @@ class BSWorkbook(Workbook):
656 count_list.append((field_str, count)) 656 count_list.append((field_str, count))
657 657
658 def simple_license_rebuild(self, license_summary, document_scheme): 658 def simple_license_rebuild(self, license_summary, document_scheme):
659 for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.LICENSE_ORDER: 659 for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
660 if ic_license_dict.get('类别') == '1':
661 license_summary.setdefault(consts.RP_CLASSIFY, []).append(ic_license_dict)
662 continue
663
664 for vat_license_dict in license_summary.get(consts.VAT_CLASSIFY, []):
665 if vat_license_dict.get('发票类型') == 'special':
666 license_summary.setdefault(consts.VATS_CLASSIFY, []).append(vat_license_dict)
667 continue
668
669 for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.FOLDER_LICENSE_ORDER:
660 license_list = license_summary.get(classify) 670 license_list = license_summary.get(classify)
661 if not license_list: 671 if not license_list:
662 continue 672 continue
663 ws = self.create_sheet(name) 673 ws = self.create_sheet(name)
664 if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]: 674 # if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
665 classify = consts.MVC_CLASSIFY_SE 675 # classify = consts.MVC_CLASSIFY_SE
666 for license_dict in license_list: 676 for license_dict in license_list:
667 if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1':
668 license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict)
669 continue
670 if side_diff: 677 if side_diff:
671 key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify) 678 key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify)
672 field_order = field_order_yes if key in license_dict else field_order_no 679 field_order = field_order_yes if key in license_dict else field_order_no
......
...@@ -187,6 +187,8 @@ class PDFHandler: ...@@ -187,6 +187,8 @@ class PDFHandler:
187 self.page_to_png(page) 187 self.page_to_png(page)
188 188
189 def extract_image(self): 189 def extract_image(self):
190 self.img_path_list = []
191 self.xref_set = set()
190 os.makedirs(self.img_dir_path, exist_ok=True) 192 os.makedirs(self.img_dir_path, exist_ok=True)
191 with fitz.Document(self.path) as pdf: 193 with fitz.Document(self.path) as pdf:
192 for pno in range(pdf.pageCount): 194 for pno in range(pdf.pageCount):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!