Merge branch 'feature/main' into feature/mssql
Showing 5 changed files with 106 additions and 48 deletions
| ... | @@ -642,7 +642,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'), | ... | @@ -642,7 +642,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'), |
| 642 | ('住址', '住址'), | 642 | ('住址', '住址'), |
| 643 | ('性别', '性别'),) | 643 | ('性别', '性别'),) |
| 644 | RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1 | 644 | RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1 |
| 645 | # 增值税发票 | 645 | # 增值税普票 |
| 646 | VAT_CN_NAME = 'VAT普票' | 646 | VAT_CN_NAME = 'VAT普票' |
| 647 | VAT_CLASSIFY = 0 | 647 | VAT_CLASSIFY = 0 |
| 648 | VAT_FIELD_ORDER = (('发票代码', '发票代码'), | 648 | VAT_FIELD_ORDER = (('发票代码', '发票代码'), |
| ... | @@ -667,6 +667,32 @@ VAT_FIELD_ORDER = (('发票代码', '发票代码'), | ... | @@ -667,6 +667,32 @@ VAT_FIELD_ORDER = (('发票代码', '发票代码'), |
| 667 | ('销方开户行及账号', '销售方开户行及账号'), | 667 | ('销方开户行及账号', '销售方开户行及账号'), |
| 668 | ('下盖章', '销售方:(章)'), | 668 | ('下盖章', '销售方:(章)'), |
| 669 | ('备注', '备注'),) | 669 | ('备注', '备注'),) |
| 670 | # 增值税专票 | ||
| 671 | VATS_CN_NAME = 'VAT专票' | ||
| 672 | VATS_CLASSIFY = 10088 | ||
| 673 | VATS_FIELD_ORDER = (('发票代码', '发票代码'), | ||
| 674 | ('发票代码_开具', '发票代码(开具)'), | ||
| 675 | ('发票号码', '发票号码'), | ||
| 676 | ('发票号码_开具', '发票号码(开具)'), | ||
| 677 | ('开票日期', '开票日期'), | ||
| 678 | ('校验码', '校验码'), | ||
| 679 | ('货物或应税劳务、服务名称', '货物或应税劳务、服务名称'), | ||
| 680 | ('金额合计', '开具金额合计(不含税)'), | ||
| 681 | ('税率', '税率'), | ||
| 682 | ('税额合计', '税额合计'), | ||
| 683 | ('价税合计小写', '价税合计(小写)'), | ||
| 684 | ('价税合计大写', '价税合计(大写)'), | ||
| 685 | ('购方名称', '购买方名称'), | ||
| 686 | ('购方纳税人识别号', '购买方纳税人识别号'), | ||
| 687 | ('购方地址、电话', '购买方地址、电话'), | ||
| 688 | ('购方开户行及账号', '购买方开户行及账号'), | ||
| 689 | ('销方名称', '销售方名称'), | ||
| 690 | ('销方纳税人识别号', '销售方纳税人识别号'), | ||
| 691 | ('销方地址、电话', '销售方地址、电话'), | ||
| 692 | ('销方开户行及账号', '销售方开户行及账号'), | ||
| 693 | ('下盖章', '销售方:(章)'), | ||
| 694 | ('车船税', '车船税'), | ||
| 695 | ('备注', '备注'),) | ||
| 670 | # 机动车登记证书 | 696 | # 机动车登记证书 |
| 671 | MVC_CN_NAME = '机动车登记证书' | 697 | MVC_CN_NAME = '机动车登记证书' |
| 672 | MVC_CLASSIFY = 28 | 698 | MVC_CLASSIFY = 28 |
| ... | @@ -770,7 +796,7 @@ MVI_FIELD_ORDER = (('发票代码', '发票代码'), | ... | @@ -770,7 +796,7 @@ MVI_FIELD_ORDER = (('发票代码', '发票代码'), |
| 770 | ('主管税务机关及代码', '主管税务机关及代码'), | 796 | ('主管税务机关及代码', '主管税务机关及代码'), |
| 771 | ('吨位', '吨位'), | 797 | ('吨位', '吨位'), |
| 772 | ('限乘人数', '限乘人数'),) | 798 | ('限乘人数', '限乘人数'),) |
| 773 | IC_PID = VAT_PID = MVC_PID = MVI_PID = None | 799 | IC_PID = VAT_PID = VATS_PID = MVC_PID = MVI_PID = None |
| 774 | 800 | ||
| 775 | # 营业执照 | 801 | # 营业执照 |
| 776 | BL_CN_NAME = '营业执照' | 802 | BL_CN_NAME = '营业执照' |
| ... | @@ -909,6 +935,11 @@ LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, F | ... | @@ -909,6 +935,11 @@ LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, F |
| 909 | (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True, MODEL_FIELD_MVC)), | 935 | (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True, MODEL_FIELD_MVC)), |
| 910 | (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT))) | 936 | (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT))) |
| 911 | 937 | ||
| 938 | FOLDER_LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, False, MODEL_FIELD_MVI)), | ||
| 939 | (IC_CLASSIFY, (IC_PID, IC_CN_NAME, None, True, False, MODEL_FIELD_IC)), | ||
| 940 | (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT)), | ||
| 941 | (VATS_CLASSIFY, (VATS_PID, VATS_CN_NAME, VATS_FIELD_ORDER, False, False, MODEL_FIELD_VAT))) | ||
| 942 | |||
| 912 | LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER) | 943 | LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER) |
| 913 | 944 | ||
| 914 | OTHER_CLASSIFY_SET = {OTHER_CLASSIFY} | 945 | OTHER_CLASSIFY_SET = {OTHER_CLASSIFY} | ... | ... |
| ... | @@ -165,7 +165,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -165,7 +165,6 @@ class Command(BaseCommand, LoggerMixin): |
| 165 | def folder_process(self, input_dir, classify): | 165 | def folder_process(self, input_dir, classify): |
| 166 | while not os.path.isdir(input_dir): | 166 | while not os.path.isdir(input_dir): |
| 167 | self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir)) | 167 | self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir)) |
| 168 | print(self.switch) | ||
| 169 | if self.switch: | 168 | if self.switch: |
| 170 | time.sleep(self.sleep_time) | 169 | time.sleep(self.sleep_time) |
| 171 | continue | 170 | continue |
| ... | @@ -202,6 +201,9 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -202,6 +201,9 @@ class Command(BaseCommand, LoggerMixin): |
| 202 | else: | 201 | else: |
| 203 | self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) | 202 | self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) |
| 204 | self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) | 203 | self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) |
| 204 | else: | ||
| 205 | self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, input_dir)) | ||
| 206 | shutil.move(path, failed_output_dir) | ||
| 205 | except Exception as e: | 207 | except Exception as e: |
| 206 | try: | 208 | try: |
| 207 | path = os.path.join(input_dir, name) | 209 | path = os.path.join(input_dir, name) | ... | ... |
| ... | @@ -102,20 +102,20 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -102,20 +102,20 @@ class Command(BaseCommand, LoggerMixin): |
| 102 | self.log_base, task_str, is_priority)) | 102 | self.log_base, task_str, is_priority)) |
| 103 | return doc, business_type, task_str | 103 | return doc, business_type, task_str |
| 104 | 104 | ||
| 105 | def pdf_download(self, doc, pdf_path): | 105 | # def pdf_download(self, doc, pdf_path): |
| 106 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 106 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
| 107 | for times in range(consts.RETRY_TIMES): | 107 | # for times in range(consts.RETRY_TIMES): |
| 108 | try: | 108 | # try: |
| 109 | self.edms.download(pdf_path, doc.metadata_version_id) | 109 | # self.edms.download(pdf_path, doc.metadata_version_id) |
| 110 | except Exception as e: | 110 | # except Exception as e: |
| 111 | self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' | 111 | # self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' |
| 112 | '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) | 112 | # '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) |
| 113 | edms_exc = str(e) | 113 | # edms_exc = str(e) |
| 114 | else: | 114 | # else: |
| 115 | break | 115 | # break |
| 116 | else: | 116 | # else: |
| 117 | raise EDMSException(edms_exc) | 117 | # raise EDMSException(edms_exc) |
| 118 | self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) | 118 | # self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) |
| 119 | 119 | ||
| 120 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): | 120 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): |
| 121 | sheets = ocr_data.get('data', []) | 121 | sheets = ocr_data.get('data', []) |
| ... | @@ -439,19 +439,35 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -439,19 +439,35 @@ class Command(BaseCommand, LoggerMixin): |
| 439 | # 2. 从EDMS获取PDF文件 | 439 | # 2. 从EDMS获取PDF文件 |
| 440 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | 440 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) |
| 441 | os.makedirs(doc_data_path, exist_ok=True) | 441 | os.makedirs(doc_data_path, exist_ok=True) |
| 442 | img_save_path = os.path.join(doc_data_path, 'img') | ||
| 442 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 443 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
| 443 | self.pdf_download(doc, pdf_path) | 444 | |
| 445 | pdf_handler = PDFHandler(pdf_path, img_save_path) | ||
| 446 | |||
| 447 | for times in range(consts.RETRY_TIMES): | ||
| 448 | try: | ||
| 449 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | ||
| 450 | self.edms.download(pdf_path, doc.metadata_version_id) | ||
| 451 | self.cronjob_log.info('{0} [edms download success] [task={1}] [times={2}] ' | ||
| 452 | '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path)) | ||
| 444 | 453 | ||
| 445 | # 3.PDF文件提取图片 | 454 | # 3.PDF文件提取图片 |
| 446 | self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str)) | 455 | self.cronjob_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( |
| 456 | self.log_base, task_str, times)) | ||
| 447 | start_time = time.time() | 457 | start_time = time.time() |
| 448 | img_save_path = os.path.join(doc_data_path, 'img') | ||
| 449 | pdf_handler = PDFHandler(pdf_path, img_save_path) | ||
| 450 | pdf_handler.extract_image() | 458 | pdf_handler.extract_image() |
| 451 | end_time = time.time() | 459 | end_time = time.time() |
| 452 | speed_time = int(end_time - start_time) | 460 | speed_time = int(end_time - start_time) |
| 453 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( | 461 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( |
| 454 | self.log_base, task_str, speed_time)) | 462 | self.log_base, task_str, times, speed_time)) |
| 463 | except Exception as e: | ||
| 464 | self.cronjob_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' | ||
| 465 | '[error={3}]'.format(self.log_base, task_str, times, | ||
| 466 | traceback.format_exc())) | ||
| 467 | else: | ||
| 468 | break | ||
| 469 | else: | ||
| 470 | raise Exception('download or pdf to img failed') | ||
| 455 | 471 | ||
| 456 | img_count = len(pdf_handler.img_path_list) | 472 | img_count = len(pdf_handler.img_path_list) |
| 457 | if img_count == 0: | 473 | if img_count == 0: |
| ... | @@ -466,25 +482,25 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -466,25 +482,25 @@ class Command(BaseCommand, LoggerMixin): |
| 466 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | 482 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) |
| 467 | time.sleep(self.sleep_time_img_put) | 483 | time.sleep(self.sleep_time_img_put) |
| 468 | img_queue.put(img_path) | 484 | img_queue.put(img_path) |
| 469 | except EDMSException as e: | 485 | # except EDMSException as e: |
| 470 | try: | 486 | # try: |
| 471 | doc.status = DocStatus.PROCESS_FAILED.value | 487 | # doc.status = DocStatus.PROCESS_FAILED.value |
| 472 | doc.save() | 488 | # doc.save() |
| 473 | self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | 489 | # self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( |
| 474 | self.log_base, task_str, traceback.format_exc())) | 490 | # self.log_base, task_str, traceback.format_exc())) |
| 475 | except Exception as e: | 491 | # except Exception as e: |
| 476 | self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( | 492 | # self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( |
| 477 | self.log_base, traceback.format_exc())) | 493 | # self.log_base, traceback.format_exc())) |
| 478 | error_list.append(1) | 494 | # error_list.append(1) |
| 479 | return | 495 | # return |
| 480 | except Exception as e: | 496 | except Exception as e: |
| 481 | try: | 497 | try: |
| 482 | doc.status = DocStatus.PROCESS_FAILED.value | 498 | doc.status = DocStatus.PROCESS_FAILED.value |
| 483 | doc.save() | 499 | doc.save() |
| 484 | self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( | 500 | self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] ' |
| 485 | self.log_base, task_str, traceback.format_exc())) | 501 | '[error={2}]'.format(self.log_base, task_str, traceback.format_exc())) |
| 486 | except Exception as e: | 502 | except Exception as e: |
| 487 | self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format( | 503 | self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( |
| 488 | self.log_base, traceback.format_exc())) | 504 | self.log_base, traceback.format_exc())) |
| 489 | error_list.append(1) | 505 | error_list.append(1) |
| 490 | return | 506 | return |
| ... | @@ -523,8 +539,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -523,8 +539,8 @@ class Command(BaseCommand, LoggerMixin): |
| 523 | ocr_1_res = ocr_1_response.json() | 539 | ocr_1_res = ocr_1_response.json() |
| 524 | end_time = time.time() | 540 | end_time = time.time() |
| 525 | speed_time = int(end_time - start_time) | 541 | speed_time = int(end_time - start_time) |
| 526 | self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}] [speed_time={3}]'.format( | 542 | self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format( |
| 527 | self.log_base, img_path, ocr_1_res, speed_time)) | 543 | self.log_base, img_path, url, speed_time)) |
| 528 | break | 544 | break |
| 529 | else: | 545 | else: |
| 530 | ocr_1_res = {} | 546 | ocr_1_res = {} |
| ... | @@ -636,8 +652,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -636,8 +652,8 @@ class Command(BaseCommand, LoggerMixin): |
| 636 | end_time = time.time() | 652 | end_time = time.time() |
| 637 | speed_time = int(end_time - start_time) | 653 | speed_time = int(end_time - start_time) |
| 638 | self.cronjob_log.info( | 654 | self.cronjob_log.info( |
| 639 | '{0} [ocr_2 success] [img={1}] [res={2}] [speed_time={3}]'.format( | 655 | '{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format( |
| 640 | self.log_base, img_path, ocr_2_res, speed_time)) | 656 | self.log_base, img_path, speed_time)) |
| 641 | 657 | ||
| 642 | if classify == consts.BC_CLASSIFY: | 658 | if classify == consts.BC_CLASSIFY: |
| 643 | name = '有' | 659 | name = '有' | ... | ... |
| ... | @@ -520,7 +520,7 @@ class BSWorkbook(Workbook): | ... | @@ -520,7 +520,7 @@ class BSWorkbook(Workbook): |
| 520 | 520 | ||
| 521 | for row in loan_fill_row: | 521 | for row in loan_fill_row: |
| 522 | for cell in new_ws[row]: | 522 | for cell in new_ws[row]: |
| 523 | cell.fill = self.loan_fill | 523 | cell.fill = self.amount_fill |
| 524 | 524 | ||
| 525 | # 3.6.同一天相同进出账高亮 | 525 | # 3.6.同一天相同进出账高亮 |
| 526 | del amount_mapping | 526 | del amount_mapping |
| ... | @@ -656,17 +656,24 @@ class BSWorkbook(Workbook): | ... | @@ -656,17 +656,24 @@ class BSWorkbook(Workbook): |
| 656 | count_list.append((field_str, count)) | 656 | count_list.append((field_str, count)) |
| 657 | 657 | ||
| 658 | def simple_license_rebuild(self, license_summary, document_scheme): | 658 | def simple_license_rebuild(self, license_summary, document_scheme): |
| 659 | for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.LICENSE_ORDER: | 659 | for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []): |
| 660 | if ic_license_dict.get('类别') == '1': | ||
| 661 | license_summary.setdefault(consts.RP_CLASSIFY, []).append(ic_license_dict) | ||
| 662 | continue | ||
| 663 | |||
| 664 | for vat_license_dict in license_summary.get(consts.VAT_CLASSIFY, []): | ||
| 665 | if vat_license_dict.get('发票类型') == 'special': | ||
| 666 | license_summary.setdefault(consts.VATS_CLASSIFY, []).append(vat_license_dict) | ||
| 667 | continue | ||
| 668 | |||
| 669 | for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.FOLDER_LICENSE_ORDER: | ||
| 660 | license_list = license_summary.get(classify) | 670 | license_list = license_summary.get(classify) |
| 661 | if not license_list: | 671 | if not license_list: |
| 662 | continue | 672 | continue |
| 663 | ws = self.create_sheet(name) | 673 | ws = self.create_sheet(name) |
| 664 | if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]: | 674 | # if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]: |
| 665 | classify = consts.MVC_CLASSIFY_SE | 675 | # classify = consts.MVC_CLASSIFY_SE |
| 666 | for license_dict in license_list: | 676 | for license_dict in license_list: |
| 667 | if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1': | ||
| 668 | license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict) | ||
| 669 | continue | ||
| 670 | if side_diff: | 677 | if side_diff: |
| 671 | key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify) | 678 | key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify) |
| 672 | field_order = field_order_yes if key in license_dict else field_order_no | 679 | field_order = field_order_yes if key in license_dict else field_order_no | ... | ... |
| ... | @@ -187,6 +187,8 @@ class PDFHandler: | ... | @@ -187,6 +187,8 @@ class PDFHandler: |
| 187 | self.page_to_png(page) | 187 | self.page_to_png(page) |
| 188 | 188 | ||
| 189 | def extract_image(self): | 189 | def extract_image(self): |
| 190 | self.img_path_list = [] | ||
| 191 | self.xref_set = set() | ||
| 190 | os.makedirs(self.img_dir_path, exist_ok=True) | 192 | os.makedirs(self.img_dir_path, exist_ok=True) |
| 191 | with fitz.Document(self.path) as pdf: | 193 | with fitz.Document(self.path) as pdf: |
| 192 | for pno in range(pdf.pageCount): | 194 | for pno in range(pdf.pageCount): | ... | ... |
-
Please register or sign in to post a comment