Merge branch 'feature/main' into feature/mssql
Showing
5 changed files
with
110 additions
and
52 deletions
... | @@ -642,7 +642,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'), | ... | @@ -642,7 +642,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'), |
642 | ('住址', '住址'), | 642 | ('住址', '住址'), |
643 | ('性别', '性别'),) | 643 | ('性别', '性别'),) |
644 | RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1 | 644 | RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1 |
645 | # 增值税发票 | 645 | # 增值税普票 |
646 | VAT_CN_NAME = 'VAT普票' | 646 | VAT_CN_NAME = 'VAT普票' |
647 | VAT_CLASSIFY = 0 | 647 | VAT_CLASSIFY = 0 |
648 | VAT_FIELD_ORDER = (('发票代码', '发票代码'), | 648 | VAT_FIELD_ORDER = (('发票代码', '发票代码'), |
... | @@ -667,6 +667,32 @@ VAT_FIELD_ORDER = (('发票代码', '发票代码'), | ... | @@ -667,6 +667,32 @@ VAT_FIELD_ORDER = (('发票代码', '发票代码'), |
667 | ('销方开户行及账号', '销售方开户行及账号'), | 667 | ('销方开户行及账号', '销售方开户行及账号'), |
668 | ('下盖章', '销售方:(章)'), | 668 | ('下盖章', '销售方:(章)'), |
669 | ('备注', '备注'),) | 669 | ('备注', '备注'),) |
670 | # 增值税专票 | ||
671 | VATS_CN_NAME = 'VAT专票' | ||
672 | VATS_CLASSIFY = 10088 | ||
673 | VATS_FIELD_ORDER = (('发票代码', '发票代码'), | ||
674 | ('发票代码_开具', '发票代码(开具)'), | ||
675 | ('发票号码', '发票号码'), | ||
676 | ('发票号码_开具', '发票号码(开具)'), | ||
677 | ('开票日期', '开票日期'), | ||
678 | ('校验码', '校验码'), | ||
679 | ('货物或应税劳务、服务名称', '货物或应税劳务、服务名称'), | ||
680 | ('金额合计', '开具金额合计(不含税)'), | ||
681 | ('税率', '税率'), | ||
682 | ('税额合计', '税额合计'), | ||
683 | ('价税合计小写', '价税合计(小写)'), | ||
684 | ('价税合计大写', '价税合计(大写)'), | ||
685 | ('购方名称', '购买方名称'), | ||
686 | ('购方纳税人识别号', '购买方纳税人识别号'), | ||
687 | ('购方地址、电话', '购买方地址、电话'), | ||
688 | ('购方开户行及账号', '购买方开户行及账号'), | ||
689 | ('销方名称', '销售方名称'), | ||
690 | ('销方纳税人识别号', '销售方纳税人识别号'), | ||
691 | ('销方地址、电话', '销售方地址、电话'), | ||
692 | ('销方开户行及账号', '销售方开户行及账号'), | ||
693 | ('下盖章', '销售方:(章)'), | ||
694 | ('车船税', '车船税'), | ||
695 | ('备注', '备注'),) | ||
670 | # 机动车登记证书 | 696 | # 机动车登记证书 |
671 | MVC_CN_NAME = '机动车登记证书' | 697 | MVC_CN_NAME = '机动车登记证书' |
672 | MVC_CLASSIFY = 28 | 698 | MVC_CLASSIFY = 28 |
... | @@ -770,7 +796,7 @@ MVI_FIELD_ORDER = (('发票代码', '发票代码'), | ... | @@ -770,7 +796,7 @@ MVI_FIELD_ORDER = (('发票代码', '发票代码'), |
770 | ('主管税务机关及代码', '主管税务机关及代码'), | 796 | ('主管税务机关及代码', '主管税务机关及代码'), |
771 | ('吨位', '吨位'), | 797 | ('吨位', '吨位'), |
772 | ('限乘人数', '限乘人数'),) | 798 | ('限乘人数', '限乘人数'),) |
773 | IC_PID = VAT_PID = MVC_PID = MVI_PID = None | 799 | IC_PID = VAT_PID = VATS_PID = MVC_PID = MVI_PID = None |
774 | 800 | ||
775 | # 营业执照 | 801 | # 营业执照 |
776 | BL_CN_NAME = '营业执照' | 802 | BL_CN_NAME = '营业执照' |
... | @@ -909,6 +935,11 @@ LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, F | ... | @@ -909,6 +935,11 @@ LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, F |
909 | (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True, MODEL_FIELD_MVC)), | 935 | (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True, MODEL_FIELD_MVC)), |
910 | (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT))) | 936 | (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT))) |
911 | 937 | ||
938 | FOLDER_LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, False, MODEL_FIELD_MVI)), | ||
939 | (IC_CLASSIFY, (IC_PID, IC_CN_NAME, None, True, False, MODEL_FIELD_IC)), | ||
940 | (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT)), | ||
941 | (VATS_CLASSIFY, (VATS_PID, VATS_CN_NAME, VATS_FIELD_ORDER, False, False, MODEL_FIELD_VAT))) | ||
942 | |||
912 | LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER) | 943 | LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER) |
913 | 944 | ||
914 | OTHER_CLASSIFY_SET = {OTHER_CLASSIFY} | 945 | OTHER_CLASSIFY_SET = {OTHER_CLASSIFY} | ... | ... |
... | @@ -165,7 +165,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -165,7 +165,6 @@ class Command(BaseCommand, LoggerMixin): |
165 | def folder_process(self, input_dir, classify): | 165 | def folder_process(self, input_dir, classify): |
166 | while not os.path.isdir(input_dir): | 166 | while not os.path.isdir(input_dir): |
167 | self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir)) | 167 | self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir)) |
168 | print(self.switch) | ||
169 | if self.switch: | 168 | if self.switch: |
170 | time.sleep(self.sleep_time) | 169 | time.sleep(self.sleep_time) |
171 | continue | 170 | continue |
... | @@ -202,6 +201,9 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -202,6 +201,9 @@ class Command(BaseCommand, LoggerMixin): |
202 | else: | 201 | else: |
203 | self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) | 202 | self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) |
204 | self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) | 203 | self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) |
204 | else: | ||
205 | self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, input_dir)) | ||
206 | shutil.move(path, failed_output_dir) | ||
205 | except Exception as e: | 207 | except Exception as e: |
206 | try: | 208 | try: |
207 | path = os.path.join(input_dir, name) | 209 | path = os.path.join(input_dir, name) | ... | ... |
... | @@ -102,20 +102,20 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -102,20 +102,20 @@ class Command(BaseCommand, LoggerMixin): |
102 | self.log_base, task_str, is_priority)) | 102 | self.log_base, task_str, is_priority)) |
103 | return doc, business_type, task_str | 103 | return doc, business_type, task_str |
104 | 104 | ||
105 | def pdf_download(self, doc, pdf_path): | 105 | # def pdf_download(self, doc, pdf_path): |
106 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 106 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
107 | for times in range(consts.RETRY_TIMES): | 107 | # for times in range(consts.RETRY_TIMES): |
108 | try: | 108 | # try: |
109 | self.edms.download(pdf_path, doc.metadata_version_id) | 109 | # self.edms.download(pdf_path, doc.metadata_version_id) |
110 | except Exception as e: | 110 | # except Exception as e: |
111 | self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' | 111 | # self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' |
112 | '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) | 112 | # '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) |
113 | edms_exc = str(e) | 113 | # edms_exc = str(e) |
114 | else: | 114 | # else: |
115 | break | 115 | # break |
116 | else: | 116 | # else: |
117 | raise EDMSException(edms_exc) | 117 | # raise EDMSException(edms_exc) |
118 | self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) | 118 | # self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) |
119 | 119 | ||
120 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): | 120 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): |
121 | sheets = ocr_data.get('data', []) | 121 | sheets = ocr_data.get('data', []) |
... | @@ -439,19 +439,35 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -439,19 +439,35 @@ class Command(BaseCommand, LoggerMixin): |
439 | # 2. 从EDMS获取PDF文件 | 439 | # 2. 从EDMS获取PDF文件 |
440 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | 440 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) |
441 | os.makedirs(doc_data_path, exist_ok=True) | 441 | os.makedirs(doc_data_path, exist_ok=True) |
442 | img_save_path = os.path.join(doc_data_path, 'img') | ||
442 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 443 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
443 | self.pdf_download(doc, pdf_path) | ||
444 | 444 | ||
445 | # 3.PDF文件提取图片 | ||
446 | self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str)) | ||
447 | start_time = time.time() | ||
448 | img_save_path = os.path.join(doc_data_path, 'img') | ||
449 | pdf_handler = PDFHandler(pdf_path, img_save_path) | 445 | pdf_handler = PDFHandler(pdf_path, img_save_path) |
450 | pdf_handler.extract_image() | 446 | |
451 | end_time = time.time() | 447 | for times in range(consts.RETRY_TIMES): |
452 | speed_time = int(end_time - start_time) | 448 | try: |
453 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( | 449 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
454 | self.log_base, task_str, speed_time)) | 450 | self.edms.download(pdf_path, doc.metadata_version_id) |
451 | self.cronjob_log.info('{0} [edms download success] [task={1}] [times={2}] ' | ||
452 | '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path)) | ||
453 | |||
454 | # 3.PDF文件提取图片 | ||
455 | self.cronjob_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | ||
456 | self.log_base, task_str, times)) | ||
457 | start_time = time.time() | ||
458 | pdf_handler.extract_image() | ||
459 | end_time = time.time() | ||
460 | speed_time = int(end_time - start_time) | ||
461 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( | ||
462 | self.log_base, task_str, times, speed_time)) | ||
463 | except Exception as e: | ||
464 | self.cronjob_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' | ||
465 | '[error={3}]'.format(self.log_base, task_str, times, | ||
466 | traceback.format_exc())) | ||
467 | else: | ||
468 | break | ||
469 | else: | ||
470 | raise Exception('download or pdf to img failed') | ||
455 | 471 | ||
456 | img_count = len(pdf_handler.img_path_list) | 472 | img_count = len(pdf_handler.img_path_list) |
457 | if img_count == 0: | 473 | if img_count == 0: |
... | @@ -466,25 +482,25 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -466,25 +482,25 @@ class Command(BaseCommand, LoggerMixin): |
466 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | 482 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) |
467 | time.sleep(self.sleep_time_img_put) | 483 | time.sleep(self.sleep_time_img_put) |
468 | img_queue.put(img_path) | 484 | img_queue.put(img_path) |
469 | except EDMSException as e: | 485 | # except EDMSException as e: |
470 | try: | 486 | # try: |
471 | doc.status = DocStatus.PROCESS_FAILED.value | 487 | # doc.status = DocStatus.PROCESS_FAILED.value |
472 | doc.save() | 488 | # doc.save() |
473 | self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | 489 | # self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( |
474 | self.log_base, task_str, traceback.format_exc())) | 490 | # self.log_base, task_str, traceback.format_exc())) |
475 | except Exception as e: | 491 | # except Exception as e: |
476 | self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( | 492 | # self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( |
477 | self.log_base, traceback.format_exc())) | 493 | # self.log_base, traceback.format_exc())) |
478 | error_list.append(1) | 494 | # error_list.append(1) |
479 | return | 495 | # return |
480 | except Exception as e: | 496 | except Exception as e: |
481 | try: | 497 | try: |
482 | doc.status = DocStatus.PROCESS_FAILED.value | 498 | doc.status = DocStatus.PROCESS_FAILED.value |
483 | doc.save() | 499 | doc.save() |
484 | self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( | 500 | self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] ' |
485 | self.log_base, task_str, traceback.format_exc())) | 501 | '[error={2}]'.format(self.log_base, task_str, traceback.format_exc())) |
486 | except Exception as e: | 502 | except Exception as e: |
487 | self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format( | 503 | self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( |
488 | self.log_base, traceback.format_exc())) | 504 | self.log_base, traceback.format_exc())) |
489 | error_list.append(1) | 505 | error_list.append(1) |
490 | return | 506 | return |
... | @@ -523,8 +539,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -523,8 +539,8 @@ class Command(BaseCommand, LoggerMixin): |
523 | ocr_1_res = ocr_1_response.json() | 539 | ocr_1_res = ocr_1_response.json() |
524 | end_time = time.time() | 540 | end_time = time.time() |
525 | speed_time = int(end_time - start_time) | 541 | speed_time = int(end_time - start_time) |
526 | self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}] [speed_time={3}]'.format( | 542 | self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format( |
527 | self.log_base, img_path, ocr_1_res, speed_time)) | 543 | self.log_base, img_path, url, speed_time)) |
528 | break | 544 | break |
529 | else: | 545 | else: |
530 | ocr_1_res = {} | 546 | ocr_1_res = {} |
... | @@ -636,8 +652,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -636,8 +652,8 @@ class Command(BaseCommand, LoggerMixin): |
636 | end_time = time.time() | 652 | end_time = time.time() |
637 | speed_time = int(end_time - start_time) | 653 | speed_time = int(end_time - start_time) |
638 | self.cronjob_log.info( | 654 | self.cronjob_log.info( |
639 | '{0} [ocr_2 success] [img={1}] [res={2}] [speed_time={3}]'.format( | 655 | '{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format( |
640 | self.log_base, img_path, ocr_2_res, speed_time)) | 656 | self.log_base, img_path, speed_time)) |
641 | 657 | ||
642 | if classify == consts.BC_CLASSIFY: | 658 | if classify == consts.BC_CLASSIFY: |
643 | name = '有' | 659 | name = '有' | ... | ... |
... | @@ -520,7 +520,7 @@ class BSWorkbook(Workbook): | ... | @@ -520,7 +520,7 @@ class BSWorkbook(Workbook): |
520 | 520 | ||
521 | for row in loan_fill_row: | 521 | for row in loan_fill_row: |
522 | for cell in new_ws[row]: | 522 | for cell in new_ws[row]: |
523 | cell.fill = self.loan_fill | 523 | cell.fill = self.amount_fill |
524 | 524 | ||
525 | # 3.6.同一天相同进出账高亮 | 525 | # 3.6.同一天相同进出账高亮 |
526 | del amount_mapping | 526 | del amount_mapping |
... | @@ -656,17 +656,24 @@ class BSWorkbook(Workbook): | ... | @@ -656,17 +656,24 @@ class BSWorkbook(Workbook): |
656 | count_list.append((field_str, count)) | 656 | count_list.append((field_str, count)) |
657 | 657 | ||
658 | def simple_license_rebuild(self, license_summary, document_scheme): | 658 | def simple_license_rebuild(self, license_summary, document_scheme): |
659 | for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.LICENSE_ORDER: | 659 | for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []): |
660 | if ic_license_dict.get('类别') == '1': | ||
661 | license_summary.setdefault(consts.RP_CLASSIFY, []).append(ic_license_dict) | ||
662 | continue | ||
663 | |||
664 | for vat_license_dict in license_summary.get(consts.VAT_CLASSIFY, []): | ||
665 | if vat_license_dict.get('发票类型') == 'special': | ||
666 | license_summary.setdefault(consts.VATS_CLASSIFY, []).append(vat_license_dict) | ||
667 | continue | ||
668 | |||
669 | for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.FOLDER_LICENSE_ORDER: | ||
660 | license_list = license_summary.get(classify) | 670 | license_list = license_summary.get(classify) |
661 | if not license_list: | 671 | if not license_list: |
662 | continue | 672 | continue |
663 | ws = self.create_sheet(name) | 673 | ws = self.create_sheet(name) |
664 | if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]: | 674 | # if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]: |
665 | classify = consts.MVC_CLASSIFY_SE | 675 | # classify = consts.MVC_CLASSIFY_SE |
666 | for license_dict in license_list: | 676 | for license_dict in license_list: |
667 | if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1': | ||
668 | license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict) | ||
669 | continue | ||
670 | if side_diff: | 677 | if side_diff: |
671 | key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify) | 678 | key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify) |
672 | field_order = field_order_yes if key in license_dict else field_order_no | 679 | field_order = field_order_yes if key in license_dict else field_order_no | ... | ... |
... | @@ -187,6 +187,8 @@ class PDFHandler: | ... | @@ -187,6 +187,8 @@ class PDFHandler: |
187 | self.page_to_png(page) | 187 | self.page_to_png(page) |
188 | 188 | ||
189 | def extract_image(self): | 189 | def extract_image(self): |
190 | self.img_path_list = [] | ||
191 | self.xref_set = set() | ||
190 | os.makedirs(self.img_dir_path, exist_ok=True) | 192 | os.makedirs(self.img_dir_path, exist_ok=True) |
191 | with fitz.Document(self.path) as pdf: | 193 | with fitz.Document(self.path) as pdf: |
192 | for pno in range(pdf.pageCount): | 194 | for pno in range(pdf.pageCount): | ... | ... |
-
Please register or sign in to post a comment