add res sheet

周伟奇
Showing 3 changed files with 56 additions and 36 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
--- a/src/apps/doc/consts.py
View file @bffd259
+++ b/src/apps/doc/consts.py
View file @bffd259
@@ -72,8 +72,12 @@ TRANS_MAP = {
 }
 TRANS = str.maketrans(TRANS_MAP)
 ERROR_CHARS = {'.', '。', ':', '：', '•', '·', ',', '，'}
-SKIP_IMG_SHEET_NAME = '未处理图片'
+RES_SHEET_NAME = '结果统计'
-SKIP_IMG_SHEET_HEADER = ('页码', '序号')
+RES_SHEET_HEADER = ('页码', '序号', '结果')
+RES_SUCCESS = '识别成功'
+RES_SUCCESS_OTHER = '识别成功（其他类）'
+RES_SUCCESS_EMPTY = '识别成功（空数据）'
+RES_FAILED = '识别失败'
 CARD_RATIO = 0.9
 UNKNOWN_CARD = '未知卡号'
--- a/src/apps/doc/management/commands/doc_ocr_process.py
View file @bffd259
+++ b/src/apps/doc/management/commands/doc_ocr_process.py
View file @bffd259
@@ -93,18 +93,19 @@ class Command(BaseCommand, LoggerMixin):
            self.log_base, business_type, doc.id, pdf_path))
        return doc_data_path, excel_path, src_excel_path, pdf_path
-    def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img):
+    def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino):
        sheets = ocr_data.get('data', [])
        if not sheets:
-            skip_img.append(self.parse_img_path(img_path))
+            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
        confidence = ocr_data.get('confidence', 1)
-        img_name, _ = os.path.splitext(os.path.basename(img_path))
+        img_name = 'page_{0}_img_{1}'.format(pno, ino)
+        cells_exists = False
        for i, sheet in enumerate(sheets):
            cells = sheet.get('cells')
            if not cells:
-                skip_img.append(self.parse_img_path(img_path))
                continue
+            cells_exists = True
            sheet_name = '{0}_{1}'.format(img_name, i)
            ws = wb.create_sheet(sheet_name)
            for cell in cells:
@@ -160,16 +161,23 @@ class Command(BaseCommand, LoggerMixin):
                if summary[6] is not None:
                    ed_list.append(summary[6])
-    def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path):
+        if cells_exists:
+            res_list.append((pno, ino, consts.RES_SUCCESS))
+        else:
+            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
+    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino):
        # 类别：'0'身份证， '1'居住证
        license_data = ocr_data.get('data', [])
        if not license_data:
-            skip_img.append(self.parse_img_path(img_path))
+            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
+        res_list.append((pno, ino, consts.RES_SUCCESS))
        license_summary.setdefault(classify, []).extend(license_data)
-    def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path):
+    def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino):
        if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
+            res_list.append((pno, ino, consts.RES_SUCCESS))
            if pid == consts.BC_PID:
                # 银行卡
                # res_dict = {}
@@ -184,7 +192,7 @@ class Command(BaseCommand, LoggerMixin):
                        res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
                    license_summary.setdefault(classify, []).append(res_dict)
        else:
-            skip_img.append(self.parse_img_path(img_path))
+            res_list.append((pno, ino, consts.RES_FAILED))
    @staticmethod
    async def fetch_ocr_1_result(url, json_data):
@@ -207,7 +215,8 @@ class Command(BaseCommand, LoggerMixin):
                if response.status == 200:
                    return await response.json()
-    async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
+    async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list):
+        pno, ino = self.parse_img_path(img_path)
        with open(img_path, 'rb') as f:
            base64_data = base64.b64encode(f.read())
            # 获取解码后的base64值
@@ -217,23 +226,26 @@ class Command(BaseCommand, LoggerMixin):
        }
        ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
        if ocr_res_1 is None:
-            skip_img.append(self.parse_img_path(img_path))
+            res_list.append((pno, ino, consts.RES_FAILED))
-            raise Exception('ocr 1 error, img_path={0}'.format(img_path))
+            self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path))
+            # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
        else:
-            self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
+            self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format(
                self.log_base, img_path, ocr_res_1))
            if ocr_res_1.get('code') == 1:
                ocr_data = ocr_res_1.get('data', {})
                classify = ocr_data.get('classify')
                if classify is None:
-                    skip_img.append(self.parse_img_path(img_path))
+                    res_list.append((pno, ino, consts.RES_FAILED))
+                    self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
+                        self.log_base, img_path, ocr_res_1))
                    return
                elif classify in consts.OTHER_CLASSIFY_SET:  # 其他类
-                    skip_img.append(self.parse_img_path(img_path))
+                    res_list.append((pno, ino, consts.RES_SUCCESS_OTHER))
                    return
                elif classify in consts.LICENSE_CLASSIFY_SET_1:  # 证件1
-                    self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
+                    self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino)
                elif classify in consts.LICENSE_CLASSIFY_SET_2:  # 证件2
                    pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
                    json_data_2 = {
@@ -244,11 +256,13 @@ class Command(BaseCommand, LoggerMixin):
                    }
                    ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
                    if ocr_res_2 is None:
-                        raise Exception('ocr 2 error, img_path={0}'.format(img_path))
+                        res_list.append((pno, ino, consts.RES_FAILED))
+                        self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path))
+                        # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
                    else:
                        # 识别结果
                        ocr_res_2 = json.loads(ocr_res_2)
-                        self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
+                        self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format(
                            self.log_base, img_path, ocr_res_2))
                        if classify == consts.BC_CLASSIFY:
                            name = '有'
@@ -258,11 +272,13 @@ class Command(BaseCommand, LoggerMixin):
                                    card_name_res.get('data', {}).get('is_exists_name') == 0:
                                name = '无'
                            ocr_res_2['Name'] = name
-                        self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
+                        self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino)
                else:  # 流水处理
-                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
+                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino)
            else:
-                skip_img.append(self.parse_img_path(img_path))
+                res_list.append((pno, ino, consts.RES_FAILED))
+                self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
+                    self.log_base, img_path, ocr_res_1))
    # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
    #     # # 流水
@@ -559,7 +575,7 @@ class Command(BaseCommand, LoggerMixin):
                bs_summary = {}
                license_summary = {}
                unknown_summary = {}
-                skip_img = []
+                res_list = []
                interest_keyword = Keywords.objects.filter(
                    type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
                salary_keyword = Keywords.objects.filter(
@@ -573,13 +589,13 @@ class Command(BaseCommand, LoggerMixin):
                # 4.1 获取OCR结果
                loop = asyncio.get_event_loop()
-                tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
+                tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
                         for img_path in pdf_handler.img_path_list]
                loop.run_until_complete(asyncio.wait(tasks))
                # loop.close()
                # for img_path in pdf_handler.img_path_list:
-                #     self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
+                #     self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
                                      '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
@@ -588,14 +604,14 @@ class Command(BaseCommand, LoggerMixin):
                merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
-                                      '[unknown_summary={4}] [skip_img={5}]'.format(self.log_base, business_type,
+                                      '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type,
                                                                                    doc.id, merged_bs_summary,
-                                                                                    unknown_summary, skip_img))
+                                                                                    unknown_summary, res_list))
                del unknown_summary
                # 4.2 重构Excel文件
                wb.save(src_excel_path)
-                wb.rebuild(merged_bs_summary, license_summary, skip_img, doc.document_scheme)
+                wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
                wb.save(excel_path)
            except EDMSException as e:
                doc.status = DocStatus.PROCESS_FAILED.value
--- a/src/apps/doc/ocr/wb.py
View file @bffd259
+++ b/src/apps/doc/ocr/wb.py
View file @bffd259
@@ -502,19 +502,19 @@ class BSWorkbook(Workbook):
                    ws.append((write_field, license_dict.get(search_field, '')))
                ws.append((None, ))
-    def skip_img_sheet(self, skip_img):
+    def res_sheet(self, res_list):
-        if skip_img:
+        if res_list:
-            ws = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
+            ws = self.create_sheet(consts.RES_SHEET_NAME)
-            ws.append(consts.SKIP_IMG_SHEET_HEADER)
+            ws.append(consts.RES_SHEET_HEADER)
-            for img_tuple in skip_img:
+            for res_tuple in res_list:
-                ws.append(img_tuple)
+                ws.append(res_tuple)
    def remove_base_sheet(self):
        if len(self.sheetnames) > 1:
            self.remove(self.get_sheet_by_name('Sheet'))
-    def rebuild(self, bs_summary, license_summary, skip_img, document_scheme):
+    def rebuild(self, bs_summary, license_summary, res_list, document_scheme):
        self.bs_rebuild(bs_summary)
        self.license_rebuild(license_summary, document_scheme)
-        self.skip_img_sheet(skip_img)
+        self.res_sheet(res_list)
        self.remove_base_sheet()