add wb rebuild
Showing 2 changed files with 86 additions and 18 deletions
@@ -5,14 +5,16 @@ import signal
 import base64
 import asyncio
 import aiohttp
-from openpyxl import Workbook
+# from openpyxl import Workbook
+from apps.doc.ocr.wb import BSWorkbook
 from django.core.management import BaseCommand

 from settings import conf
 from common.mixins import LoggerMixin
 from common.tools.file_tools import write_zip_file
 from common.tools.pdf_to_img import PDFHandler
-from apps.doc.models import DocStatus, HILDoc, AFCDoc
+from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
+from apps.doc.named_enum import KeywordsType
 from apps.doc import consts
 from apps.doc.ocr.edms import EDMS, rh

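The plain openpyxl Workbook is swapped for the project's BSWorkbook, and the command now pulls keyword lists from a Keywords model filtered by a KeywordsType enum. A minimal sketch of what those two definitions might look like, inferred only from the queries later in this diff (the real KeywordsType lives in apps.doc.named_enum; the field types and enum values here are assumptions):

from enum import Enum

from django.db import models


class KeywordsType(Enum):
    # member names come from the diff; the concrete values are assumed
    INTEREST = 0
    SALARY = 1
    LOAN = 2


class Keywords(models.Model):
    # only `type` and `keyword` are known from the queries; field types are assumed
    type = models.SmallIntegerField()          # stores KeywordsType.<member>.value
    keyword = models.CharField(max_length=64)  # one keyword per row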
@@ -86,9 +88,11 @@ class Command(BaseCommand, LoggerMixin):
         return doc_data_path, excel_path, pdf_path

     @staticmethod
-    def append_sheet(wb, sheets_list, img_name):
+    def append_sheet(wb, sheets_list, img_name, role_summary):
         for i, sheet in enumerate(sheets_list):
-            ws = wb.create_sheet('{0}_{1}'.format(img_name, i))
+            sheet_name = '{0}_{1}'.format(img_name, i)
+            role_summary['银行-户名'].append((sheet_name, 1, None, None, None, None, None))
+            ws = wb.create_sheet(sheet_name)
             cells = sheet.get('cells')
             for cell in cells:
                 c1 = cell.get('start_column')
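Every sheet append_sheet creates is now also registered under the hard-coded '银行-户名' (bank / account-holder) role. The 7-tuple appears to mirror the summary layout that BSWorkbook.rebuild unpacks further down in this change; an illustrative shape after two images (file names are made up):

# (sheet_name, confidence, page, code, print_time, start_date, end_date)
role_summary = {
    '银行-户名': [
        ('scan_001.jpg_0', 1, None, None, None, None, None),
        ('scan_002.jpg_0', 1, None, None, None, None, None),
    ],
}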
@@ -112,12 +116,12 @@ class Command(BaseCommand, LoggerMixin):
             async with session.post(self.ocr_url, json=json_data) as response:
                 return await response.json()

-    async def img_ocr_excel(self, wb, img_path):
+    async def img_ocr_excel(self, wb, img_path, role_summary):
         res = await self.fetch_ocr_result(img_path)
         self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
         sheets_list = res.get('result').get('res')
         img_name = os.path.basename(img_path)
-        self.append_sheet(wb, sheets_list, img_name)
+        self.append_sheet(wb, sheets_list, img_name, role_summary)

     # TODO refine document status handling: treat different error states differently
     # TODO retry failed API calls
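img_ocr_excel simply threads role_summary through to append_sheet after awaiting the OCR response. For reference, a standalone sketch of the per-image round trip; only the session.post(..., json=...) / response.json() pattern comes from the context lines above, while the URL handling and the 'img_base64' payload key are assumptions:

import base64

import aiohttp


async def ocr_one_image(ocr_url, img_path):
    # read and base64-encode the page image (payload key is an assumption)
    with open(img_path, 'rb') as f:
        payload = {'img_base64': base64.b64encode(f.read()).decode('ascii')}
    # POST it to the OCR service and hand back the parsed JSON body
    async with aiohttp.ClientSession() as session:
        async with session.post(ocr_url, json=payload) as response:
            return await response.json()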
@@ -148,13 +152,22 @@ class Command(BaseCommand, LoggerMixin):
                 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

                 # 4. run the detection algorithm on each image to check for bank statements, then OCR the images into an Excel file
-                wb = Workbook()
+                role_summary = {
+                    '银行-户名': []
+                }
+                interest_keyword = Keywords.objects.filter(
+                    type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
+                salary_keyword = Keywords.objects.filter(
+                    type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
+                loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True)
+                wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
                 loop = asyncio.get_event_loop()
-                tasks = [self.img_ocr_excel(wb, img_path) for img_path in pdf_handler.img_path_list]
+                tasks = [self.img_ocr_excel(wb, img_path, role_summary) for img_path in pdf_handler.img_path_list]
                 loop.run_until_complete(asyncio.wait(tasks))
                 # loop.close()

                 # consolidate the Excel file
+                wb.rebuild(role_summary)
                 wb.save(excel_path)
             except Exception as e:
                 doc.status = DocStatus.PROCESS_FAILED.value
@@ -164,7 +177,8 @@ class Command(BaseCommand, LoggerMixin):
             else:
                 try:
                     # 5. upload to EDMS
-                    self.edms.upload(excel_path, doc, business_type)
+                    # self.edms.upload(excel_path, doc, business_type)
+                    print('upload pass')
                 except Exception as e:
                     doc.status = DocStatus.UPLOAD_FAILED.value
                     doc.save()
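Taken together, the command now builds a keyword-aware BSWorkbook per document, OCRs every page image into it concurrently, and runs the new rebuild step before saving; the EDMS upload is temporarily stubbed out with a print, presumably for local testing. A condensed, hedged sketch of that flow, where ocr_sheet stands in for self.img_ocr_excel and img_path_list for pdf_handler.img_path_list:

import asyncio

from apps.doc.models import Keywords
from apps.doc.named_enum import KeywordsType
from apps.doc.ocr.wb import BSWorkbook


def ocr_pdf_to_excel(img_path_list, excel_path, ocr_sheet):
    role_summary = {'银行-户名': []}
    # the keyword lists drive BSWorkbook's sheet classification
    interest = Keywords.objects.filter(type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
    salary = Keywords.objects.filter(type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
    loan = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True)

    wb = BSWorkbook(interest, salary, loan)
    loop = asyncio.get_event_loop()
    tasks = [ocr_sheet(wb, img_path, role_summary) for img_path in img_path_list]
    loop.run_until_complete(asyncio.wait(tasks))

    wb.rebuild(role_summary)  # merge the raw OCR sheets into the final layout
    wb.save(excel_path)

Note that newer Python versions deprecate passing bare coroutines to asyncio.wait; wrapping each coroutine with asyncio.ensure_future keeps the behaviour shown here. The rest of the diff touches the BSWorkbook class itself, imported above from apps.doc.ocr.wb: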
-import numpy as np
 import locale
+import numpy as np
 from pandas._libs import tslib
+from pandas._libs.tslibs.nattype import NaTType
 from pandas.core.indexes.datetimes import DatetimeIndex
 from openpyxl import Workbook
 from openpyxl.styles import Border, Side, PatternFill, numbers
@@ -43,6 +44,7 @@ class BSWorkbook(Workbook):
         self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
         self.bd = Side(style='thin', color="000000")
         self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)
+        self.MAX_MEAN = 31

     def sheet_prune(self, ws):
         ws.insert_cols(1, amount=self.fixed_col_amount)
@@ -56,7 +58,29 @@ class BSWorkbook(Workbook):
             ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col)
         ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column)

-    def sheet_split(self, ws, month_mapping):
+    @staticmethod
+    def month_split(dti, date_list):
+        month_list = []
+        idx_list = []
+        month_pre = None
+        for idx, month_str in enumerate(dti.strftime('%Y-%m')):
+            if isinstance(month_str, float):
+                continue
+            if month_str != month_pre:
+                month_list.append(month_str)
+                if month_pre is None:
+                    date_list.append(dti[idx].date())
+                    idx = 0
+                idx_list.append(idx)
+                month_pre = month_str
+        for idx in range(len(dti)-1, -1, -1):
+            if isinstance(dti[idx], NaTType):
+                continue
+            date_list.append(dti[idx].date())
+            break
+        return month_list, idx_list
+
+    def sheet_split(self, ws, month_mapping, date_list):
         for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
             dt_array, tz_parsed = tslib.array_to_datetime(
                 np.array(date_tuple, copy=False, dtype=np.object_),
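month_split walks the already-parsed date column once: it records each distinct 'YYYY-MM' value together with the row offset where that month starts (the first month is pinned to offset 0 even if leading rows failed to parse), appends the first and last parseable dates to date_list, and skips NaT entries, which strftime renders as float NaN. A small worked example, calling the static method directly and assuming pandas is importable; the expected output is derived from reading the code above:

import pandas as pd

from apps.doc.ocr.wb import BSWorkbook

dti = pd.DatetimeIndex(['2020-01-05', '2020-01-20', pd.NaT, '2020-02-03', '2020-02-28'])
date_list = []
month_list, idx_list = BSWorkbook.month_split(dti, date_list)

print(month_list)  # ['2020-01', '2020-02']
print(idx_list)    # [0, 3] -- row offset where each month starts
print(date_list)   # [datetime.date(2020, 1, 5), datetime.date(2020, 2, 28)]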
@@ -68,6 +92,31 @@ class BSWorkbook(Workbook):
             )
             dti = DatetimeIndex(dt_array, tz=None, name=None)

+            month_list, idx_list = self.month_split(dti, date_list)
+
+            if len(month_list) == 0:
+                month_info = month_mapping.setdefault('xxxx-xx', [])
+                month_info.append((ws.title, 2, ws.max_row, 0))
+            elif len(month_list) == 1:
+                month_info = month_mapping.setdefault(month_list[0], [])
+                day_mean = np.mean(dti.day.dropna())
+                if len(month_info) == 0:
+                    month_info.append((ws.title, 2, ws.max_row, day_mean))
+                else:
+                    for i, item in enumerate(month_info):
+                        # TODO handle reverse ordering
+                        if day_mean <= item[-1]:
+                            month_info.insert(i, (ws.title, 2, ws.max_row, day_mean))
+                            break
+                    else:
+                        month_info.append((ws.title, 2, ws.max_row, day_mean))
+            else:
+                for i, item in enumerate(month_list[:-1]):
+                    month_mapping.setdefault(item, []).append(
+                        (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN))
+                month_mapping.setdefault(month_list[-1], []).insert(
+                    0, (ws.title, idx_list[-1] + 2, ws.max_row, 0))
+
     def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval):
         metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header]
         metadata_rows.extend(code_list)
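month_mapping groups the split sheets by month: keys are 'YYYY-MM' strings (or the 'xxxx-xx' bucket when no date parsed), and each value is a list of (sheet_title, first_row, last_row, day_mean) segments kept roughly chronological via day_mean; 0 and self.MAX_MEAN (31) act as sentinels that pin the spill-over fragments of a sheet spanning several months to the front or back of the relevant list. An illustrative result for two sheets (names and numbers made up):

month_mapping = {
    '2020-01': [
        ('scan_001.jpg_0', 2, 40, 16.2),  # single-month sheet, ordered by its mean day
        ('scan_002.jpg_0', 2, 18, 31),    # January part of a sheet that runs into February
    ],
    '2020-02': [
        ('scan_002.jpg_0', 19, 55, 0),    # February tail of the same sheet, pinned first
    ],
}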
@@ -169,19 +218,24 @@
         confidence_max = 0
         code_list = []
         month_mapping = {}
-        print_time = start_date = end_date = date_interval = None
+        date_list = []
+        start_date = end_date = date_interval = print_time = None
         for summary in summary_list:
-            sheet_name, confidence, page, code, print_time, start_date, end_date = summary
+            sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary
             ws = self.get_sheet_by_name(sheet_name)
             # 1.1. drop redundant columns and reorder
             self.sheet_prune(ws)
-            # 1.2. TODO split by month
-            self.sheet_split(ws, month_mapping)
+            # 1.2. split by month
+            self.sheet_split(ws, month_mapping, date_list)
             # 1.3. metadata processing  TODO time and date handling
-            # confidence_max = max(confidence, confidence_max)
-            # if code is not None:
-            # code_list.append((page, code))
+            confidence_max = max(confidence, confidence_max)
+            if code is not None:
+                code_list.append((page, code))

+        if len(date_list) > 1:
+            start_date = min(date_list)
+            end_date = max(date_list)
+            date_interval = (end_date - start_date).days
         # 2. metadata extraction sheet
         ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)

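rebuild re-enables the confidence and code bookkeeping that used to be commented out, and now derives the statement period from the dates collected during splitting rather than from the per-sheet summary values. The date-range arithmetic in isolation, with illustrative values:

import datetime

# first/last parsed date of each sheet, as filled in by sheet_split -> month_split
date_list = [datetime.date(2020, 1, 5), datetime.date(2020, 2, 28), datetime.date(2020, 1, 20)]

if len(date_list) > 1:
    start_date = min(date_list)                   # 2020-01-05
    end_date = max(date_list)                     # 2020-02-28
    date_interval = (end_date - start_date).days  # 54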