fix bug & add skip_img_sheet

周伟奇
Showing 4 changed files with 175 additions and 166 deletions
.gitignore
src/apps/doc/consts.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
--- a/.gitignore
View file @59cbfab
+++ b/.gitignore
View file @59cbfab
@@ -33,4 +33,5 @@ data/*
 # 脚本
 src/*.sh

-test*
\ No newline at end of file
+test*
+ocr_test.py
\ No newline at end of file
--- a/src/apps/doc/consts.py
View file @59cbfab
+++ b/src/apps/doc/consts.py
View file @59cbfab
@@ -60,6 +60,8 @@ TRANS_MAP = {
 }
 TRANS = str.maketrans(TRANS_MAP)
 ERROR_CHARS = {'.', '·', '•'}
+SKIP_IMG_SHEET_NAME = '未处理图片'
+SKIP_IMG_SHEET_HEADER = ('页码', '序号')

 CARD_RATIO = 0.9
 UNKNOWN_CARD = '未知卡号'
--- a/src/apps/doc/management/commands/doc_ocr_process.py
View file @59cbfab
+++ b/src/apps/doc/management/commands/doc_ocr_process.py
View file @59cbfab
@@ -80,19 +80,20 @@ class Command(BaseCommand, LoggerMixin):
            self.log_base, business_type, doc.id, pdf_path))
        return doc_data_path, excel_path, src_excel_path, pdf_path

-    @staticmethod
-    def bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify):
+    def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img):
        sheets = ocr_data.get('data', [])
        if not sheets:
+            skip_img.append(self.parse_img_path(img_path))
            return
        confidence = ocr_data.get('confidence', 1)
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        for i, sheet in enumerate(sheets):
-            sheet_name = '{0}_{1}'.format(img_name, i)
-            ws = wb.create_sheet(sheet_name)
            cells = sheet.get('cells')
            if not cells:
+                skip_img.append(self.parse_img_path(img_path))
                continue
+            sheet_name = '{0}_{1}'.format(img_name, i)
+            ws = wb.create_sheet(sheet_name)
            for cell in cells:
                c1 = cell.get('start_column')
                r1 = cell.get('start_row')
@@ -147,9 +148,10 @@ class Command(BaseCommand, LoggerMixin):
                    ed_list.append(summary[6])

    @staticmethod
-    def license1_process(ocr_data, license_summary, classify):
+    def license1_process(ocr_data, license_summary, classify, skip_img, img_path):
        license_data = ocr_data.get('data', [])
        if not license_data:
+            skip_img.append(img_path)
            return
        for license_dict in license_data:
            res_list = []
@@ -157,8 +159,7 @@ class Command(BaseCommand, LoggerMixin):
                res_list.append((field, value))
            license_summary.setdefault(classify, []).append(res_list)

-    @staticmethod
-    def license2_process(ocr_res_2, license_summary, pid, classify):
+    def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path):
        if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
            if pid == consts.BC_PID:
                # 银行卡
@@ -174,113 +175,16 @@ class Command(BaseCommand, LoggerMixin):
                        res_list.append(
                            (field_dict.get('chn_key', ''), field_dict.get('value', '')))
                    license_summary.setdefault(classify, []).append(res_list)
-
-    async def fetch_ocr_result(self, url, json_data):
-        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
-            async with session.post(url, json=json_data) as response:
-                if response.status == 200:
-                    return await response.json()
-
-    # async def img_2_ocr_2_wb(self, wb, img_path, summary):
-    #     res = await self.fetch_ocr_result(img_path)
-    #     self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
-    #     sheets_list = res.get('result').get('res')
-    #     img_name = os.path.basename(img_path)
-    #     self.append_sheet(wb, sheets_list, img_name, summary)
-
-    async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary):
-        with open(img_path, 'rb') as f:
-            base64_data = base64.b64encode(f.read())
-            # 获取解码后的base64值
-            file_data = base64_data.decode()
-        json_data_1 = {
-            "file": file_data
-        }
-        ocr_res_1 = await self.fetch_ocr_result(self.ocr_url_1, json_data_1)
-        if ocr_res_1 is None:
-            raise Exception('ocr 1 error, img_path={0}'.format(img_path))
        else:
-            self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
-                self.log_base, img_path, ocr_res_1))
-
-            if ocr_res_1.get('code') == 1:
-                ocr_data = ocr_res_1.get('data', {})
-                classify = ocr_data.get('classify')
-                if classify is None:
-                    return
-                elif classify in consts.OTHER_CLASSIFY_SET:  # 其他类
-                    return
-                elif classify in consts.LICENSE_CLASSIFY_SET_1:  # 证件1
-                    self.license1_process(ocr_data, license_summary, classify)
-                elif classify in consts.LICENSE_CLASSIFY_SET_2:  # 证件2
-                    pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
-                    json_data_2 = {
-                        "pid": str(pid),
-                        "key": conf.OCR_KEY,
-                        "secret": conf.OCR_SECRET,
-                        "file": file_data
-                    }
-                    ocr_res_2 = await self.fetch_ocr_result(self.ocr_url_2, json_data_2)
-                    if ocr_res_2 is None:
-                        raise Exception('ocr 2 error, img_path={0}'.format(img_path))
-                    else:
-                        # 识别结果
-                        self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
-                            self.log_base, img_path, ocr_res_2))
-                        self.license2_process(ocr_res_2, license_summary, pid, classify)
-                else:  # 流水处理
-                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify)
+            skip_img.append(self.parse_img_path(img_path))

-    # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary):
-    #     # # 流水
-    #     # res = {
-    #     #     'code': 1,
-    #     #     'msg': 'success',
-    #     #     'data': {
-    #     #         'classify': 0,
-    #     #         'confidence': 0.999,
-    #     #         'data': [
-    #     #             {
-    #     #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
-    #     #                 'cells': []
-    #     #             },
-    #     #             {
-    #     #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
-    #     #                 'cells': []
-    #     #             }
-    #     #         ]
-    #     #     }
-    #     # }
-    #     #
-    #     # # 证件-1
-    #     # res = {
-    #     #     'code': 1,
-    #     #     'msg': 'success',
-    #     #     'data': {
-    #     #         'classify': 0,
-    #     #         'confidence': 0.999,
-    #     #         'data': [
-    #     #             {
-    #     #                 'cn_key': 'value',
-    #     #                 'cn_key': 'value',
-    #     #             },
-    #     #             {
-    #     #                 'cn_key': 'value',
-    #     #                 'cn_key': 'value',
-    #     #             },
-    #     #         ]
-    #     #     }
-    #     # }
-    #     #
-    #     # # 证件-2 or 其他类
-    #     # res = {
-    #     #     'code': 1,
-    #     #     'msg': 'success',
-    #     #     'data': {
-    #     #         'classify': 0,
-    #     #         'confidence': 0.999,
-    #     #     }
-    #     # }
+    # async def fetch_ocr_result(self, url, json_data):
+    #     async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
+    #         async with session.post(url, json=json_data) as response:
+    #             if response.status == 200:
+    #                 return await response.json()
+    #
+    # async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary):
    #     with open(img_path, 'rb') as f:
    #         base64_data = base64.b64encode(f.read())
    #         # 获取解码后的base64值
@@ -288,9 +192,10 @@ class Command(BaseCommand, LoggerMixin):
    #     json_data_1 = {
    #         "file": file_data
    #     }
-    #     response_1 = requests.post(self.ocr_url_1, json=json_data_1)
-    #     if response_1.status_code == 200:
-    #         ocr_res_1 = response_1.json()
+    #     ocr_res_1 = await self.fetch_ocr_result(self.ocr_url_1, json_data_1)
+    #     if ocr_res_1 is None:
+    #         raise Exception('ocr 1 error, img_path={0}'.format(img_path))
+    #     else:
    #         self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
    #             self.log_base, img_path, ocr_res_1))
    #
@@ -311,21 +216,119 @@ class Command(BaseCommand, LoggerMixin):
    #                     "secret": conf.OCR_SECRET,
    #                     "file": file_data
    #                 }
-    #                 response_2 = requests.post(self.ocr_url_2, data=json_data_2)
-    #                 if response_2.status_code == 200:
+    #                 ocr_res_2 = await self.fetch_ocr_result(self.ocr_url_2, json_data_2)
+    #                 if ocr_res_2 is None:
+    #                     raise Exception('ocr 2 error, img_path={0}'.format(img_path))
+    #                 else:
    #                     # 识别结果
-    #                     ocr_res_2 = response_2.json()
    #                     self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
    #                         self.log_base, img_path, ocr_res_2))
    #                     self.license2_process(ocr_res_2, license_summary, pid, classify)
-    #                 else:
-    #                     raise Exception('ocr 2 error, img_path={0}'.format(img_path))
    #             else:  # 流水处理
    #                 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify)
-    #         else:
-    #             pass
-    #     else:
-    #         raise Exception('ocr 1 error, img_path={0}'.format(img_path))
+
+    def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
+        """Send one image through OCR and dispatch the result by its classification.
+
+        The image is base64-encoded and posted to ``self.ocr_url_1``. Based on the
+        ``classify`` value of the response, the data is handed to
+        ``license1_process``, to ``license2_process`` (after a second request to
+        ``self.ocr_url_2``), or to ``bs_process``. Images that produce nothing
+        usable are recorded in ``skip_img``.
+
+        Args:
+            wb: workbook passed through to ``bs_process``.
+            img_path: path of the image file to recognise.
+            bs_summary: accumulator passed through to ``bs_process``.
+            unknown_summary: accumulator passed through to ``bs_process``.
+            license_summary: accumulator passed to the license processors.
+            skip_img: list that collects an entry for every image that was skipped.
+
+        Raises:
+            Exception: if either OCR endpoint answers with a non-200 status.
+        """
+        # Sample responses of the first OCR endpoint, kept for reference.
+        #
+        # # Bank statement
+        # # (summary order: account name, card number, page number, receipt
+        # #  verification code, print time, start time, end time)
+        # res = {
+        #     'code': 1,
+        #     'msg': 'success',
+        #     'data': {
+        #         'classify': 0,
+        #         'confidence': 0.999,
+        #         'data': [
+        #             {
+        #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
+        #                 'cells': []
+        #             },
+        #             {
+        #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
+        #                 'cells': []
+        #             }
+        #         ]
+        #     }
+        # }
+        #
+        # # License type 1
+        # res = {
+        #     'code': 1,
+        #     'msg': 'success',
+        #     'data': {
+        #         'classify': 0,
+        #         'confidence': 0.999,
+        #         'data': [
+        #             {
+        #                 'cn_key': 'value',
+        #                 'cn_key': 'value',
+        #             },
+        #             {
+        #                 'cn_key': 'value',
+        #                 'cn_key': 'value',
+        #             },
+        #         ]
+        #     }
+        # }
+        #
+        # # License type 2 or "other" class
+        # res = {
+        #     'code': 1,
+        #     'msg': 'success',
+        #     'data': {
+        #         'classify': 0,
+        #         'confidence': 0.999,
+        #     }
+        # }
+        with open(img_path, 'rb') as f:
+            base64_data = base64.b64encode(f.read())
+            # text form of the base64-encoded image bytes
+            file_data = base64_data.decode()
+        json_data_1 = {
+            "file": file_data
+        }
+        response_1 = requests.post(self.ocr_url_1, json=json_data_1)
+        if response_1.status_code == 200:
+            ocr_res_1 = response_1.json()
+            self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
+                self.log_base, img_path, ocr_res_1))
+
+            if ocr_res_1.get('code') == 1:
+                ocr_data = ocr_res_1.get('data', {})
+                classify = ocr_data.get('classify')
+                if classify is None:
+                    skip_img.append(self.parse_img_path(img_path))
+                    return
+                elif classify in consts.OTHER_CLASSIFY_SET:  # "other" class
+                    skip_img.append(self.parse_img_path(img_path))
+                    return
+                elif classify in consts.LICENSE_CLASSIFY_SET_1:  # license type 1
+                    # NOTE(review): license1_process appends this raw path to skip_img,
+                    # unlike the (page, index) tuples recorded everywhere else - confirm.
+                    self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
+                elif classify in consts.LICENSE_CLASSIFY_SET_2:  # license type 2
+                    pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
+                    json_data_2 = {
+                        "pid": str(pid),
+                        "key": conf.OCR_KEY,
+                        "secret": conf.OCR_SECRET,
+                        "file": file_data
+                    }
+                    response_2 = requests.post(self.ocr_url_2, data=json_data_2)
+                    if response_2.status_code == 200:
+                        # recognition result
+                        ocr_res_2 = response_2.json()
+                        self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
+                            self.log_base, img_path, ocr_res_2))
+                        self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
+                    else:
+                        raise Exception('ocr 2 error, img_path={0}'.format(img_path))
+                else:  # bank statement processing
+                    # bs_process requires skip_img; leaving it out raises TypeError
+                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
+            else:
+                skip_img.append(self.parse_img_path(img_path))
+        else:
+            raise Exception('ocr 1 error, img_path={0}'.format(img_path))
+
+    @staticmethod
+    def parse_img_path(img_path):
+        """Extract the two labels reported for a skipped image from its file name.
+
+        Args:
+            img_path: path of the image file; only the base name without its
+                extension is inspected.
+
+        Returns:
+            A 2-tuple of strings, written under the skipped-image sheet header
+            (page number, sequence number).
+        """
+        img_name, _ = os.path.splitext(os.path.basename(img_path))
+        # NOTE(review): offsets 5 and 11 line up with a ``page_<p>_img_<i>`` stem -
+        # confirm against the code that writes the images. Splitting keeps
+        # multi-digit values intact instead of returning a single character.
+        parts = img_name.split('_')
+        if len(parts) == 4 and parts[0] == 'page' and parts[2] == 'img':
+            return parts[1], parts[3]
+        # any other naming scheme keeps the original fixed-offset behaviour
+        return img_name[5], img_name[11]

    @staticmethod
    def get_most(value_list):
@@ -425,8 +428,10 @@ class Command(BaseCommand, LoggerMixin):
                    merged_bs_summary[card] = summary
        else:
            # 1卡号
+            one_card = False
            if len(bs_summary) == 1:
                merged_bs_summary = self.prune_bs_summary(bs_summary)
+                one_card = True
            # 多卡号
            else:
                merged_bs_summary = self.merge_card(bs_summary)
@@ -435,7 +440,7 @@ class Command(BaseCommand, LoggerMixin):
                merge_role = []
                classify_summary = unknown_summary.get(card_summary['classify'], {})
                for role, summary in classify_summary.items():
-                    if role in card_summary['role_set']:
+                    if one_card or role in card_summary['role_set']:
                        merge_role.append(role)
                        card_summary['sheet'].extend(summary['sheet'])
                        card_summary['code'].extend(summary['code'])
@@ -503,6 +508,7 @@ class Command(BaseCommand, LoggerMixin):
                bs_summary = {}
                license_summary = {}
                unknown_summary = {}
+                skip_img = []
                interest_keyword = Keywords.objects.filter(
                    type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
                salary_keyword = Keywords.objects.filter(
@@ -515,27 +521,29 @@ class Command(BaseCommand, LoggerMixin):
                # wb = Workbook()

                # 4.1 获取OCR结果
-                loop = asyncio.get_event_loop()
-                tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary)
-                         for img_path in pdf_handler.img_path_list]
-                loop.run_until_complete(asyncio.wait(tasks))
+                # loop = asyncio.get_event_loop()
+                # tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary)
+                #          for img_path in pdf_handler.img_path_list]
+                # loop.run_until_complete(asyncio.wait(tasks))
                # loop.close()

-                # for img_path in pdf_handler.img_path_list:
-                #     self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary)
+                for img_path in pdf_handler.img_path_list:
+                    self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)

-                self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format(
-                    self.log_base, bs_summary, unknown_summary, license_summary))
+                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
+                                      '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
+                                                                     unknown_summary, license_summary))

                merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)

-                self.cronjob_log.info('{0} [merged_bs_summary={1}] [unknown_summary={2}]'.format(
-                    self.log_base, merged_bs_summary, unknown_summary))
+                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
+                                      '[unknown_summary={4}]'.format(self.log_base, business_type, doc.id,
+                                                                     merged_bs_summary, unknown_summary))
                del unknown_summary

                # 4.2 重构Excel文件
                wb.save(src_excel_path)
-                wb.rebuild(merged_bs_summary, license_summary)
+                wb.rebuild(merged_bs_summary, license_summary, skip_img)
                wb.save(excel_path)
            except Exception as e:
                doc.status = DocStatus.PROCESS_FAILED.value
--- a/src/apps/doc/ocr/wb.py
View file @59cbfab
+++ b/src/apps/doc/ocr/wb.py
View file @59cbfab
@@ -141,32 +141,22 @@ class BSWorkbook(Workbook):
                # month_info process
                month_info = month_mapping.setdefault('xxxx-xx', [])
                month_info.append((ws.title, min_row, ws.max_row, 0))
-            elif len(month_list) == 1:
-                # reverse_trend_list process
-                reverse_trend = self.get_reverse_trend(dti.day, idx_list)
-                reverse_trend_list.append(reverse_trend)
-                # month_info process
-                month_info = month_mapping.setdefault(month_list[0], [])
-                day_mean = np.mean(dti.day.dropna())
-                if len(month_info) == 0:
-                    month_info.append((ws.title, min_row, ws.max_row, day_mean))
-                else:
-                    for i, item in enumerate(month_info):
-                        if day_mean <= item[-1]:
-                            month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean))
-                            break
-                    else:
-                        month_info.append((ws.title, min_row, ws.max_row, day_mean))
            else:
                # reverse_trend_list process
                reverse_trend = self.get_reverse_trend(dti.day, idx_list)
                reverse_trend_list.append(reverse_trend)
                # month_info process
-                for i, item in enumerate(month_list[:-1]):
-                    month_mapping.setdefault(item, []).append(
-                        (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN))
-                month_mapping.setdefault(month_list[-1], []).insert(
-                    0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0))
+                day_idx = dti.day
+                idx_list_max_idx = len(idx_list) - 1
+                for i, item in enumerate(month_list):
+                    if i == idx_list_max_idx:
+                        day_mean = np.mean(day_idx[idx_list[i]:].dropna())
+                        month_mapping.setdefault(item, []).append(
+                            (ws.title, idx_list[i] + min_row, ws.max_row, day_mean))
+                    else:
+                        day_mean = np.mean(day_idx[idx_list[i]: idx_list[i + 1]].dropna())
+                        month_mapping.setdefault(item, []).append(
+                            (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean))

    def build_metadata_rows(self, confidence, code, print_time, start_date, end_date):
        if start_date is None or end_date is None:
@@ -259,7 +249,7 @@ class BSWorkbook(Workbook):
                except Exception as e:
                    continue
                else:
-                    over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
+                    over_cell.number_format = numbers.FORMAT_GENERAL

                # 3.4.金额转数值
                try:
@@ -281,7 +271,7 @@ class BSWorkbook(Workbook):
                else:
                    if rows[consts.BORROW_IDX].value in consts.BORROW_OUTLAY_SET:
                        amount_cell.value = -amount_cell.value
-                    amount_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
+                    amount_cell.number_format = numbers.FORMAT_GENERAL
                    same_amount_mapping = amount_mapping.get(date_cell.value, {})
                    fill_rows = same_amount_mapping.get(-amount_cell.value)
                    if fill_rows:
@@ -357,11 +347,11 @@ class BSWorkbook(Workbook):
                                       end_date)

            # 3.创建月份表、提取/高亮关键行
-            is_reverse = False
-            if sum(reverse_trend_list) > 0:  # 倒序处理
-                is_reverse = True
-                for month_list in month_mapping.values():
-                    month_list.sort(key=lambda x: x[-1], reverse=True)
+            # 倒序处理
+            is_reverse = True if sum(reverse_trend_list) > 0 else False
+            for month_list in month_mapping.values():
+                month_list.sort(key=lambda x: x[-1], reverse=is_reverse)
+
            self.build_month_sheet(card, month_mapping, ms, is_reverse)

            # 4.删除原表
@@ -379,6 +369,14 @@ class BSWorkbook(Workbook):
                    ws.append(bl_field)
                ws.append((None, ))

-    def rebuild(self, bs_summary, license_summary):
+    def skip_img_sheet(self, skip_img):
+        """Add a sheet that lists the images which were skipped.
+
+        Nothing is created when ``skip_img`` is empty. Otherwise a sheet named
+        ``consts.SKIP_IMG_SHEET_NAME`` is created, ``consts.SKIP_IMG_SHEET_HEADER``
+        is written as its first row, and one row is appended per entry.
+
+        Args:
+            skip_img: iterable of row tuples; a bare string entry is written as a
+                single-cell row.
+        """
+        if not skip_img:
+            return
+        ws = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
+        ws.append(consts.SKIP_IMG_SHEET_HEADER)
+        for img_tuple in skip_img:
+            # ws.append rejects a bare string with TypeError, and at least one
+            # producer records the raw image path instead of a tuple
+            if isinstance(img_tuple, str):
+                img_tuple = (img_tuple, )
+            ws.append(img_tuple)
+
+    def rebuild(self, bs_summary, license_summary, skip_img):
+        """Run the rebuild steps in order: bs_rebuild, license_rebuild, skip_img_sheet.
+
+        Args:
+            bs_summary: passed unchanged to ``bs_rebuild``.
+            license_summary: passed unchanged to ``license_rebuild``.
+            skip_img: passed unchanged to ``skip_img_sheet``.
+        """
        self.bs_rebuild(bs_summary)
        self.license_rebuild(license_summary)
+        self.skip_img_sheet(skip_img)