Merge branch 'feature/main' into feature/mssql

周伟奇
Showing 5 changed files with 110 additions and 52 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/consts.py
View file @1a51bc0
+++ b/src/apps/doc/consts.py
View file @1a51bc0
@@ -642,7 +642,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'),
                    ('住址', '住址'),
                    ('性别', '性别'),)
 RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1
-# 增值税发票
+# 增值税普票
 VAT_CN_NAME = 'VAT普票'
 VAT_CLASSIFY = 0
 VAT_FIELD_ORDER = (('发票代码', '发票代码'),
@@ -667,6 +667,32 @@ VAT_FIELD_ORDER = (('发票代码', '发票代码'),
                   ('销方开户行及账号', '销售方开户行及账号'),
                   ('下盖章', '销售方：（章）'),
                   ('备注', '备注'),)
+# 增值税专票
+VATS_CN_NAME = 'VAT专票'
+VATS_CLASSIFY = 10088
+VATS_FIELD_ORDER = (('发票代码', '发票代码'),
+                    ('发票代码_开具', '发票代码（开具）'),
+                    ('发票号码', '发票号码'),
+                    ('发票号码_开具', '发票号码（开具）'),
+                    ('开票日期', '开票日期'),
+                    ('校验码', '校验码'),
+                    ('货物或应税劳务、服务名称', '货物或应税劳务、服务名称'),
+                    ('金额合计', '开具金额合计（不含税）'),
+                    ('税率', '税率'),
+                    ('税额合计', '税额合计'),
+                    ('价税合计小写', '价税合计（小写）'),
+                    ('价税合计大写', '价税合计（大写）'),
+                    ('购方名称', '购买方名称'),
+                    ('购方纳税人识别号', '购买方纳税人识别号'),
+                    ('购方地址、电话', '购买方地址、电话'),
+                    ('购方开户行及账号', '购买方开户行及账号'),
+                    ('销方名称', '销售方名称'),
+                    ('销方纳税人识别号', '销售方纳税人识别号'),
+                    ('销方地址、电话', '销售方地址、电话'),
+                    ('销方开户行及账号', '销售方开户行及账号'),
+                    ('下盖章', '销售方：（章）'),
+                    ('车船税', '车船税'),
+                    ('备注', '备注'),)
 # 机动车登记证书
 MVC_CN_NAME = '机动车登记证书'
 MVC_CLASSIFY = 28
@@ -770,7 +796,7 @@ MVI_FIELD_ORDER = (('发票代码', '发票代码'),
                   ('主管税务机关及代码', '主管税务机关及代码'),
                   ('吨位', '吨位'),
                   ('限乘人数', '限乘人数'),)
-IC_PID = VAT_PID = MVC_PID = MVI_PID = None
+IC_PID = VAT_PID = VATS_PID = MVC_PID = MVI_PID = None

 # 营业执照
 BL_CN_NAME = '营业执照'
@@ -909,6 +935,11 @@ LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, F
                 (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True, MODEL_FIELD_MVC)),
                 (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT)))

+FOLDER_LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, False, MODEL_FIELD_MVI)),
+                        (IC_CLASSIFY, (IC_PID, IC_CN_NAME, None, True, False, MODEL_FIELD_IC)),
+                        (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT)),
+                        (VATS_CLASSIFY, (VATS_PID, VATS_CN_NAME, VATS_FIELD_ORDER, False, False, MODEL_FIELD_VAT)))
+
 LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER)

 OTHER_CLASSIFY_SET = {OTHER_CLASSIFY}
--- a/src/apps/doc/management/commands/folder_ocr_process.py
View file @1a51bc0
+++ b/src/apps/doc/management/commands/folder_ocr_process.py
View file @1a51bc0
@@ -165,7 +165,6 @@ class Command(BaseCommand, LoggerMixin):
    def folder_process(self, input_dir, classify):
        while not os.path.isdir(input_dir):
            self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
-            print(self.switch)
            if self.switch:
                time.sleep(self.sleep_time)
                continue
@@ -202,6 +201,9 @@ class Command(BaseCommand, LoggerMixin):
                        else:
                            self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
                        self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
+                    else:  # per the log text below, path is a directory: move it aside to failed_output_dir
+                        self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, path))
+                        shutil.move(path, failed_output_dir)
                except Exception as e:
                    try:
                        path = os.path.join(input_dir, name)
--- a/src/apps/doc/management/commands/ocr_process.py
View file @1a51bc0
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @1a51bc0
@@ -102,20 +102,20 @@ class Command(BaseCommand, LoggerMixin):
                self.log_base, task_str, is_priority))
            return doc, business_type, task_str

-    def pdf_download(self, doc, pdf_path):
-        if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
-            for times in range(consts.RETRY_TIMES):
-                try:
-                    self.edms.download(pdf_path, doc.metadata_version_id)
-                except Exception as e:
-                    self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
-                                          '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
-                    edms_exc = str(e)
-                else:
-                    break
-            else:
-                raise EDMSException(edms_exc)
-        self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
+    # def pdf_download(self, doc, pdf_path):
+    #     if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
+    #         for times in range(consts.RETRY_TIMES):
+    #             try:
+    #                 self.edms.download(pdf_path, doc.metadata_version_id)
+    #             except Exception as e:
+    #                 self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
+    #                                       '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
+    #                 edms_exc = str(e)
+    #             else:
+    #                 break
+    #         else:
+    #             raise EDMSException(edms_exc)
+    #     self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))

    def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx):
        sheets = ocr_data.get('data', [])
@@ -439,19 +439,35 @@ class Command(BaseCommand, LoggerMixin):
                    # 2. 从EDMS获取PDF文件
                    doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
                    os.makedirs(doc_data_path, exist_ok=True)
+                    img_save_path = os.path.join(doc_data_path, 'img')
                    pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
-                    self.pdf_download(doc, pdf_path)

-                    # 3.PDF文件提取图片
-                    self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
-                    start_time = time.time()
-                    img_save_path = os.path.join(doc_data_path, 'img')
                    pdf_handler = PDFHandler(pdf_path, img_save_path)
-                    pdf_handler.extract_image()
-                    end_time = time.time()
-                    speed_time = int(end_time - start_time)
-                    self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
-                        self.log_base, task_str, speed_time))
+
+                    for times in range(consts.RETRY_TIMES):
+                        try:
+                            if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
+                                self.edms.download(pdf_path, doc.metadata_version_id)
+                            self.cronjob_log.info('{0} [edms download success] [task={1}] [times={2}] '
+                                                  '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path))
+
+                            # 3.PDF文件提取图片
+                            self.cronjob_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
+                                self.log_base, task_str, times))
+                            start_time = time.time()
+                            pdf_handler.extract_image()
+                            end_time = time.time()
+                            speed_time = int(end_time - start_time)
+                            self.cronjob_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
+                                self.log_base, task_str, times, speed_time))
+                        except Exception as e:
+                            self.cronjob_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
+                                                  '[error={3}]'.format(self.log_base, task_str, times,
+                                                                       traceback.format_exc()))
+                        else:
+                            break
+                    else:
+                        raise Exception('download or pdf to img failed')

                    img_count = len(pdf_handler.img_path_list)
                    if img_count == 0:
@@ -466,25 +482,25 @@ class Command(BaseCommand, LoggerMixin):
                                self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
                                time.sleep(self.sleep_time_img_put)
                            img_queue.put(img_path)
-                except EDMSException as e:
-                    try:
-                        doc.status = DocStatus.PROCESS_FAILED.value
-                        doc.save()
-                        self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
-                            self.log_base, task_str, traceback.format_exc()))
-                    except Exception as e:
-                        self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
-                            self.log_base, traceback.format_exc()))
-                        error_list.append(1)
-                        return
+                # except EDMSException as e:
+                #     try:
+                #         doc.status = DocStatus.PROCESS_FAILED.value
+                #         doc.save()
+                #         self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
+                #             self.log_base, task_str, traceback.format_exc()))
+                #     except Exception as e:
+                #         self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
+                #             self.log_base, traceback.format_exc()))
+                #         error_list.append(1)
+                #         return
                except Exception as e:
                    try:
                        doc.status = DocStatus.PROCESS_FAILED.value
                        doc.save()
-                        self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
-                            self.log_base, task_str, traceback.format_exc()))
+                        self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
+                                              '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
                    except Exception as e:
-                        self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format(
+                        self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
                            self.log_base, traceback.format_exc()))
                        error_list.append(1)
                        return
@@ -523,8 +539,8 @@ class Command(BaseCommand, LoggerMixin):
                            ocr_1_res = ocr_1_response.json()
                            end_time = time.time()
                            speed_time = int(end_time - start_time)
-                            self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}] [speed_time={3}]'.format(
-                                self.log_base, img_path, ocr_1_res, speed_time))
+                            self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format(
+                                self.log_base, img_path, url, speed_time))
                            break
                    else:
                        ocr_1_res = {}
@@ -636,8 +652,8 @@ class Command(BaseCommand, LoggerMixin):
                                                end_time = time.time()
                                                speed_time = int(end_time - start_time)
                                                self.cronjob_log.info(
-                                                    '{0} [ocr_2 success] [img={1}] [res={2}] [speed_time={3}]'.format(
-                                                        self.log_base, img_path, ocr_2_res, speed_time))
+                                                    '{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
+                                                        self.log_base, img_path, speed_time))

                                                if classify == consts.BC_CLASSIFY:
                                                    name = '有'
--- a/src/apps/doc/ocr/wb.py
View file @1a51bc0
+++ b/src/apps/doc/ocr/wb.py
View file @1a51bc0
@@ -520,7 +520,7 @@ class BSWorkbook(Workbook):

            for row in loan_fill_row:
                for cell in new_ws[row]:
-                    cell.fill = self.loan_fill
+                    cell.fill = self.amount_fill

            # 3.6.同一天相同进出账高亮
            del amount_mapping
@@ -656,17 +656,24 @@ class BSWorkbook(Workbook):
            count_list.append((field_str, count))

    def simple_license_rebuild(self, license_summary, document_scheme):
-        for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.LICENSE_ORDER:
+        for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
+            if ic_license_dict.get('类别') == '1':
+                license_summary.setdefault(consts.RP_CLASSIFY, []).append(ic_license_dict)
+                continue
+
+        for vat_license_dict in license_summary.get(consts.VAT_CLASSIFY, []):
+            if vat_license_dict.get('发票类型') == 'special':
+                license_summary.setdefault(consts.VATS_CLASSIFY, []).append(vat_license_dict)
+                continue
+
+        for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.FOLDER_LICENSE_ORDER:
            license_list = license_summary.get(classify)
            if not license_list:
                continue
            ws = self.create_sheet(name)
-            if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
-                classify = consts.MVC_CLASSIFY_SE
+            # if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
+            #     classify = consts.MVC_CLASSIFY_SE
            for license_dict in license_list:
-                if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1':
-                    license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict)
-                    continue
                if side_diff:
                    key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify)
                    field_order = field_order_yes if key in license_dict else field_order_no
--- a/src/common/tools/pdf_to_img.py
View file @1a51bc0
+++ b/src/common/tools/pdf_to_img.py
View file @1a51bc0
@@ -187,6 +187,8 @@ class PDFHandler:
            self.page_to_png(page)

    def extract_image(self):
+        self.img_path_list = []
+        self.xref_set = set()
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            for pno in range(pdf.pageCount):