13e30ac5 by 周伟奇

add wb rebuild

1 parent 1526125c
...@@ -5,14 +5,16 @@ import signal ...@@ -5,14 +5,16 @@ import signal
5 import base64 5 import base64
6 import asyncio 6 import asyncio
7 import aiohttp 7 import aiohttp
8 from openpyxl import Workbook 8 # from openpyxl import Workbook
9 from apps.doc.ocr.wb import BSWorkbook
9 from django.core.management import BaseCommand 10 from django.core.management import BaseCommand
10 11
11 from settings import conf 12 from settings import conf
12 from common.mixins import LoggerMixin 13 from common.mixins import LoggerMixin
13 from common.tools.file_tools import write_zip_file 14 from common.tools.file_tools import write_zip_file
14 from common.tools.pdf_to_img import PDFHandler 15 from common.tools.pdf_to_img import PDFHandler
15 from apps.doc.models import DocStatus, HILDoc, AFCDoc 16 from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
17 from apps.doc.named_enum import KeywordsType
16 from apps.doc import consts 18 from apps.doc import consts
17 from apps.doc.ocr.edms import EDMS, rh 19 from apps.doc.ocr.edms import EDMS, rh
18 20
...@@ -86,9 +88,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -86,9 +88,11 @@ class Command(BaseCommand, LoggerMixin):
86 return doc_data_path, excel_path, pdf_path 88 return doc_data_path, excel_path, pdf_path
87 89
88 @staticmethod 90 @staticmethod
89 def append_sheet(wb, sheets_list, img_name): 91 def append_sheet(wb, sheets_list, img_name, role_summary):
90 for i, sheet in enumerate(sheets_list): 92 for i, sheet in enumerate(sheets_list):
91 ws = wb.create_sheet('{0}_{1}'.format(img_name, i)) 93 sheet_name = '{0}_{1}'.format(img_name, i)
94 role_summary['银行-户名'].append((sheet_name, 1, None, None, None, None, None))
95 ws = wb.create_sheet(sheet_name)
92 cells = sheet.get('cells') 96 cells = sheet.get('cells')
93 for cell in cells: 97 for cell in cells:
94 c1 = cell.get('start_column') 98 c1 = cell.get('start_column')
...@@ -112,12 +116,12 @@ class Command(BaseCommand, LoggerMixin): ...@@ -112,12 +116,12 @@ class Command(BaseCommand, LoggerMixin):
112 async with session.post(self.ocr_url, json=json_data) as response: 116 async with session.post(self.ocr_url, json=json_data) as response:
113 return await response.json() 117 return await response.json()
114 118
async def img_ocr_excel(self, wb, img_path, role_summary):
    """OCR one image and append the recognised sheets to the workbook.

    Args:
        wb: workbook object handed through to ``self.append_sheet``.
        img_path: path of the image to send to the OCR service.
        role_summary: summary mapping handed through to
            ``self.append_sheet``.

    The OCR response is expected to hold the sheet list under
    ``['result']['res']``; a response without a ``'result'`` entry raises
    ``AttributeError``.
    """
    ocr_response = await self.fetch_ocr_result(img_path)
    log_line = '{0} [fetch ocr result success] [img={1}] [res={2}]'.format(
        self.log_base, img_path, ocr_response)
    self.cronjob_log.info(log_line)
    result_section = ocr_response.get('result')
    # Sheets are named after the image file, so only the base name is passed.
    self.append_sheet(
        wb,
        result_section.get('res'),
        os.path.basename(img_path),
        role_summary,
    )
121 125
122 # TODO 细化文件状态,不同异常状态采取不同的处理 126 # TODO 细化文件状态,不同异常状态采取不同的处理
123 # TODO 调用接口重试 127 # TODO 调用接口重试
...@@ -148,13 +152,22 @@ class Command(BaseCommand, LoggerMixin): ...@@ -148,13 +152,22 @@ class Command(BaseCommand, LoggerMixin):
148 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) 152 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
149 153
150 # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 154 # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件
151 wb = Workbook() 155 role_summary = {
156 '银行-户名': []
157 }
158 interest_keyword = Keywords.objects.filter(
159 type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
160 salary_keyword = Keywords.objects.filter(
161 type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
162 loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True)
163 wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
152 loop = asyncio.get_event_loop() 164 loop = asyncio.get_event_loop()
153 tasks = [self.img_ocr_excel(wb, img_path) for img_path in pdf_handler.img_path_list] 165 tasks = [self.img_ocr_excel(wb, img_path, role_summary) for img_path in pdf_handler.img_path_list]
154 loop.run_until_complete(asyncio.wait(tasks)) 166 loop.run_until_complete(asyncio.wait(tasks))
155 # loop.close() 167 # loop.close()
156 168
157 # 整合excel文件 169 # 整合excel文件
170 wb.rebuild(role_summary)
158 wb.save(excel_path) 171 wb.save(excel_path)
159 except Exception as e: 172 except Exception as e:
160 doc.status = DocStatus.PROCESS_FAILED.value 173 doc.status = DocStatus.PROCESS_FAILED.value
...@@ -164,7 +177,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -164,7 +177,8 @@ class Command(BaseCommand, LoggerMixin):
164 else: 177 else:
165 try: 178 try:
166 # 5.上传至EDMS 179 # 5.上传至EDMS
167 self.edms.upload(excel_path, doc, business_type) 180 # self.edms.upload(excel_path, doc, business_type)
181 print('upload pass')
168 except Exception as e: 182 except Exception as e:
169 doc.status = DocStatus.UPLOAD_FAILED.value 183 doc.status = DocStatus.UPLOAD_FAILED.value
170 doc.save() 184 doc.save()
......
1 import numpy as np
2 import locale 1 import locale
2 import numpy as np
3 from pandas._libs import tslib 3 from pandas._libs import tslib
4 from pandas._libs.tslibs.nattype import NaTType
4 from pandas.core.indexes.datetimes import DatetimeIndex 5 from pandas.core.indexes.datetimes import DatetimeIndex
5 from openpyxl import Workbook 6 from openpyxl import Workbook
6 from openpyxl.styles import Border, Side, PatternFill, numbers 7 from openpyxl.styles import Border, Side, PatternFill, numbers
...@@ -43,6 +44,7 @@ class BSWorkbook(Workbook): ...@@ -43,6 +44,7 @@ class BSWorkbook(Workbook):
43 self.amount_fill = PatternFill("solid", fgColor="00FFFF00") 44 self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
44 self.bd = Side(style='thin', color="000000") 45 self.bd = Side(style='thin', color="000000")
45 self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd) 46 self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)
47 self.MAX_MEAN = 31
46 48
47 def sheet_prune(self, ws): 49 def sheet_prune(self, ws):
48 ws.insert_cols(1, amount=self.fixed_col_amount) 50 ws.insert_cols(1, amount=self.fixed_col_amount)
...@@ -56,7 +58,29 @@ class BSWorkbook(Workbook): ...@@ -56,7 +58,29 @@ class BSWorkbook(Workbook):
56 ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col) 58 ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col)
57 ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column) 59 ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column)
58 60
59 def sheet_split(self, ws, month_mapping): 61 @staticmethod
def month_split(dti, date_list):
    """Find where each calendar-month run starts inside ``dti``.

    The dates are walked in order, missing entries (``NaT``) are skipped,
    and a new run is recorded whenever the ``'%Y-%m'`` label differs from
    the label of the previous valid entry.

    Args:
        dti: pandas ``DatetimeIndex`` of parsed dates; may contain ``NaT``.
        date_list: list that is mutated in place. The first and the last
            non-missing dates of ``dti`` are appended as ``datetime.date``
            objects (the same date twice when only one entry is valid);
            nothing is appended when every entry is missing.

    Returns:
        A ``(month_list, idx_list)`` tuple: the ``'%Y-%m'`` labels in order
        of appearance and, for each label, the 0-based position in ``dti``
        at which that run begins. The first run is always reported at
        position 0, even when the first valid date sits further down.
    """
    month_list = []
    idx_list = []
    month_pre = None
    last_valid = None
    # Format all labels in one vectorised call, but decide "missing" from
    # the timestamp itself instead of relying on strftime() rendering NaT
    # as a float NaN.
    for pos, (stamp, month_str) in enumerate(zip(dti, dti.strftime('%Y-%m'))):
        if isinstance(stamp, NaTType):
            continue
        last_valid = stamp
        if month_str == month_pre:
            continue
        month_list.append(month_str)
        if month_pre is None:
            date_list.append(stamp.date())
            # Presumably so that leading rows without a usable date stay
            # attached to the first month - confirm with the caller.
            idx_list.append(0)
        else:
            idx_list.append(pos)
        month_pre = month_str
    if last_valid is not None:
        date_list.append(last_valid.date())
    return month_list, idx_list
82
83 def sheet_split(self, ws, month_mapping, date_list):
60 for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True): 84 for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
61 dt_array, tz_parsed = tslib.array_to_datetime( 85 dt_array, tz_parsed = tslib.array_to_datetime(
62 np.array(date_tuple, copy=False, dtype=np.object_), 86 np.array(date_tuple, copy=False, dtype=np.object_),
...@@ -68,6 +92,31 @@ class BSWorkbook(Workbook): ...@@ -68,6 +92,31 @@ class BSWorkbook(Workbook):
68 ) 92 )
69 dti = DatetimeIndex(dt_array, tz=None, name=None) 93 dti = DatetimeIndex(dt_array, tz=None, name=None)
70 94
95 month_list, idx_list = self.month_split(dti, date_list)
96
97 if len(month_list) == 0:
98 month_info = month_mapping.setdefault('xxxx-xx', [])
99 month_info.append((ws.title, 2, ws.max_row, 0))
100 elif len(month_list) == 1:
101 month_info = month_mapping.setdefault(month_list[0], [])
102 day_mean = np.mean(dti.day.dropna())
103 if len(month_info) == 0:
104 month_info.append((ws.title, 2, ws.max_row, day_mean))
105 else:
106 for i, item in enumerate(month_info):
107 # TODO 倒序处理
108 if day_mean <= item[-1]:
109 month_info.insert(i, (ws.title, 2, ws.max_row, day_mean))
110 break
111 else:
112 month_info.append((ws.title, 2, ws.max_row, day_mean))
113 else:
114 for i, item in enumerate(month_list[:-1]):
115 month_mapping.setdefault(item, []).append(
116 (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN))
117 month_mapping.setdefault(month_list[-1], []).insert(
118 0, (ws.title, idx_list[-1] + 2, ws.max_row, 0))
119
71 def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval): 120 def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval):
72 metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header] 121 metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header]
73 metadata_rows.extend(code_list) 122 metadata_rows.extend(code_list)
...@@ -169,19 +218,24 @@ class BSWorkbook(Workbook): ...@@ -169,19 +218,24 @@ class BSWorkbook(Workbook):
169 confidence_max = 0 218 confidence_max = 0
170 code_list = [] 219 code_list = []
171 month_mapping = {} 220 month_mapping = {}
172 print_time = start_date = end_date = date_interval = None 221 date_list = []
222 start_date = end_date = date_interval = print_time = None
173 for summary in summary_list: 223 for summary in summary_list:
174 sheet_name, confidence, page, code, print_time, start_date, end_date = summary 224 sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary
175 ws = self.get_sheet_by_name(sheet_name) 225 ws = self.get_sheet_by_name(sheet_name)
176 # 1.1.删除多余列、排列 226 # 1.1.删除多余列、排列
177 self.sheet_prune(ws) 227 self.sheet_prune(ws)
178 # 1.2.TODO 按月份分割 228 # 1.2.按月份分割
179 self.sheet_split(ws, month_mapping) 229 self.sheet_split(ws, month_mapping, date_list)
180 # 1.3.元数据处理 TODO 时间与日期处理 230 # 1.3.元数据处理 TODO 时间与日期处理
181 # confidence_max = max(confidence, confidence_max) 231 confidence_max = max(confidence, confidence_max)
182 # if code is not None: 232 if code is not None:
183 # code_list.append((page, code)) 233 code_list.append((page, code))
184 234
235 if len(date_list) > 1:
236 start_date = min(date_list)
237 end_date = max(date_list)
238 date_interval = (end_date - start_date).days
185 # 2.元信息提取表 239 # 2.元信息提取表
186 ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval) 240 ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)
187 241
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!