add wb rebuild

周伟奇
Showing 2 changed files with 86 additions and 18 deletions
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
--- a/src/apps/doc/management/commands/doc_ocr_process.py
View file @13e30ac
+++ b/src/apps/doc/management/commands/doc_ocr_process.py
View file @13e30ac
@@ -5,14 +5,16 @@ import signal
 import base64
 import asyncio
 import aiohttp
-from openpyxl import Workbook
+# from openpyxl import Workbook
+from apps.doc.ocr.wb import BSWorkbook
 from django.core.management import BaseCommand

 from settings import conf
 from common.mixins import LoggerMixin
 from common.tools.file_tools import write_zip_file
 from common.tools.pdf_to_img import PDFHandler
-from apps.doc.models import DocStatus, HILDoc, AFCDoc
+from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
+from apps.doc.named_enum import KeywordsType
 from apps.doc import consts
 from apps.doc.ocr.edms import EDMS, rh

@@ -86,9 +88,11 @@ class Command(BaseCommand, LoggerMixin):
        return doc_data_path, excel_path, pdf_path

    @staticmethod
-    def append_sheet(wb, sheets_list, img_name):
+    def append_sheet(wb, sheets_list, img_name, role_summary):
        for i, sheet in enumerate(sheets_list):
-            ws = wb.create_sheet('{0}_{1}'.format(img_name, i))
+            sheet_name = '{0}_{1}'.format(img_name, i)
+            role_summary['银行-户名'].append((sheet_name, 1, None, None, None, None, None))
+            ws = wb.create_sheet(sheet_name)
            cells = sheet.get('cells')
            for cell in cells:
                c1 = cell.get('start_column')
@@ -112,12 +116,12 @@ class Command(BaseCommand, LoggerMixin):
            async with session.post(self.ocr_url, json=json_data) as response:
                return await response.json()

-    async def img_ocr_excel(self, wb, img_path):
+    async def img_ocr_excel(self, wb, img_path, role_summary):
        res = await self.fetch_ocr_result(img_path)
        self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
        sheets_list = res.get('result').get('res')
        img_name = os.path.basename(img_path)
-        self.append_sheet(wb, sheets_list, img_name)
+        self.append_sheet(wb, sheets_list, img_name, role_summary)

    # TODO 细化文件状态，不同异常状态采取不同的处理
    # TODO 调用接口重试
@@ -148,13 +152,22 @@ class Command(BaseCommand, LoggerMixin):
                write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

                # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件
-                wb = Workbook()
+                role_summary = {
+                    '银行-户名': []
+                }
+                interest_keyword = Keywords.objects.filter(
+                    type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
+                salary_keyword = Keywords.objects.filter(
+                    type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
+                loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True)
+                wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
                loop = asyncio.get_event_loop()
-                tasks = [self.img_ocr_excel(wb, img_path) for img_path in pdf_handler.img_path_list]
+                tasks = [self.img_ocr_excel(wb, img_path, role_summary) for img_path in pdf_handler.img_path_list]
                loop.run_until_complete(asyncio.wait(tasks))
                # loop.close()

                # 整合excel文件
+                wb.rebuild(role_summary)
                wb.save(excel_path)
            except Exception as e:
                doc.status = DocStatus.PROCESS_FAILED.value
@@ -164,7 +177,8 @@ class Command(BaseCommand, LoggerMixin):
            else:
                try:
                    # 5.上传至EDMS
-                    self.edms.upload(excel_path, doc, business_type)
+                    # self.edms.upload(excel_path, doc, business_type)
+                    print('upload pass')
                except Exception as e:
                    doc.status = DocStatus.UPLOAD_FAILED.value
                    doc.save()
--- a/src/apps/doc/ocr/wb.py
View file @13e30ac
+++ b/src/apps/doc/ocr/wb.py
View file @13e30ac
-import numpy as np
 import locale
+import numpy as np
 from pandas._libs import tslib
+from pandas._libs.tslibs.nattype import NaTType
 from pandas.core.indexes.datetimes import DatetimeIndex
 from openpyxl import Workbook
 from openpyxl.styles import Border, Side, PatternFill, numbers
@@ -43,6 +44,7 @@ class BSWorkbook(Workbook):
        self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
        self.bd = Side(style='thin', color="000000")
        self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)
+        self.MAX_MEAN = 31

    def sheet_prune(self, ws):
        ws.insert_cols(1, amount=self.fixed_col_amount)
@@ -56,7 +58,29 @@ class BSWorkbook(Workbook):
            ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col)
        ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column)

-    def sheet_split(self, ws, month_mapping):
+    @staticmethod  # uses no instance state
+    def month_split(dti, date_list):  # returns (month_list, idx_list); appends first/last usable dates of dti to date_list
+        month_list = []  # one 'YYYY-MM' label per run of consecutive equal months
+        idx_list = []  # position in dti where each run starts (0 for the first run)
+        month_pre = None  # label of the latest non-skipped entry; None until one is seen
+        for idx, month_str in enumerate(dti.strftime('%Y-%m')):  # walk entries as year-month strings
+            if isinstance(month_str, float):  # presumably NaT entries format as float NaN here - TODO confirm
+                continue
+            if month_str != month_pre:  # a new month run begins at this entry
+                month_list.append(month_str)
+                if month_pre is None:  # this is the first usable entry
+                    date_list.append(dti[idx].date())  # record the first usable date in the caller's list
+                    idx = 0  # anchor the first run at position 0 even if earlier entries were skipped
+                idx_list.append(idx)
+                month_pre = month_str
+        for idx in range(len(dti)-1, -1, -1):  # scan backwards for the last usable entry
+            if isinstance(dti[idx], NaTType):  # skip trailing NaT entries
+                continue
+            date_list.append(dti[idx].date())  # record the last usable date
+            break  # only the last usable entry is needed
+        return month_list, idx_list
+
+    def sheet_split(self, ws, month_mapping, date_list):
        for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
            dt_array, tz_parsed = tslib.array_to_datetime(
                np.array(date_tuple, copy=False, dtype=np.object_),
@@ -68,6 +92,31 @@ class BSWorkbook(Workbook):
            )
            dti = DatetimeIndex(dt_array, tz=None, name=None)

+            month_list, idx_list = self.month_split(dti, date_list)
+
+            if len(month_list) == 0:
+                month_info = month_mapping.setdefault('xxxx-xx', [])
+                month_info.append((ws.title, 2, ws.max_row, 0))
+            elif len(month_list) == 1:
+                month_info = month_mapping.setdefault(month_list[0], [])
+                day_mean = np.mean(dti.day.dropna())
+                if len(month_info) == 0:
+                    month_info.append((ws.title, 2, ws.max_row, day_mean))
+                else:
+                    for i, item in enumerate(month_info):
+                        # TODO 倒序处理
+                        if day_mean <= item[-1]:
+                            month_info.insert(i, (ws.title, 2, ws.max_row, day_mean))
+                            break
+                    else:
+                        month_info.append((ws.title, 2, ws.max_row, day_mean))
+            else:
+                for i, item in enumerate(month_list[:-1]):
+                    month_mapping.setdefault(item, []).append(
+                        (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN))
+                month_mapping.setdefault(month_list[-1], []).insert(
+                    0, (ws.title, idx_list[-1] + 2, ws.max_row, 0))
+
    def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval):
        metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header]
        metadata_rows.extend(code_list)
@@ -169,19 +218,24 @@ class BSWorkbook(Workbook):
            confidence_max = 0
            code_list = []
            month_mapping = {}
-            print_time = start_date = end_date = date_interval = None
+            date_list = []
+            start_date = end_date = date_interval = print_time = None
            for summary in summary_list:
-                sheet_name, confidence, page, code, print_time, start_date, end_date = summary
+                sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary
                ws = self.get_sheet_by_name(sheet_name)
                # 1.1.删除多余列、排列
                self.sheet_prune(ws)
-                # 1.2.TODO 按月份分割
-                self.sheet_split(ws, month_mapping)
+                # 1.2.按月份分割
+                self.sheet_split(ws, month_mapping, date_list)
                # 1.3.元数据处理 TODO 时间与日期处理
-                # confidence_max = max(confidence, confidence_max)
-                # if code is not None:
-                #     code_list.append((page, code))
+                confidence_max = max(confidence, confidence_max)
+                if code is not None:
+                    code_list.append((page, code))

+            if len(date_list) > 1:
+                start_date = min(date_list)
+                end_date = max(date_list)
+                date_interval = (end_date - start_date).days
            # 2.元信息提取表
            ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)