f3d6e429 by 周伟奇

update wb build

1 parent a220590e
...@@ -33,4 +33,5 @@ data/* ...@@ -33,4 +33,5 @@ data/*
33 # 脚本 33 # 脚本
34 src/*.sh 34 src/*.sh
35 35
36 test.py
...\ No newline at end of file ...\ No newline at end of file
36 test.py
37 ocr_test.py
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -67,7 +67,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -67,7 +67,6 @@ class Command(BaseCommand, LoggerMixin):
67 def pdf_download(self, doc, business_type): 67 def pdf_download(self, doc, business_type):
68 if doc is None: 68 if doc is None:
69 return None, None, None, None 69 return None, None, None, None
70 # TODO EDMS下载pdf
71 doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) 70 doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
72 os.makedirs(doc_data_path, exist_ok=True) 71 os.makedirs(doc_data_path, exist_ok=True)
73 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 72 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
...@@ -128,8 +127,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -128,8 +127,10 @@ class Command(BaseCommand, LoggerMixin):
128 img_name = os.path.basename(img_path) 127 img_name = os.path.basename(img_path)
129 self.append_sheet(wb, sheets_list, img_name, role_summary) 128 self.append_sheet(wb, sheets_list, img_name, role_summary)
130 129
131 # TODO 细化文件状态,不同异常状态采取不同的处理 130 # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理
132 # TODO 调用接口重试 131 # TODO 调用接口重试
132 # TODO 异常邮件通知
133 # TODO 数据库断联问题
133 def handle(self, *args, **kwargs): 134 def handle(self, *args, **kwargs):
134 sleep_second = int(conf.SLEEP_SECOND) 135 sleep_second = int(conf.SLEEP_SECOND)
135 max_sleep_second = int(conf.MAX_SLEEP_SECOND) 136 max_sleep_second = int(conf.MAX_SLEEP_SECOND)
......
...@@ -35,6 +35,7 @@ class BSWorkbook(Workbook): ...@@ -35,6 +35,7 @@ class BSWorkbook(Workbook):
35 header_value = ws.cell(1, col).value 35 header_value = ws.cell(1, col).value
36 header_idx = consts.HEADERS_MAPPING.get(header_value) 36 header_idx = consts.HEADERS_MAPPING.get(header_value)
37 # TODO 关键字段再次查找 37 # TODO 关键字段再次查找
38 # TODO 支付宝、微信流水第一行非表头,怎么处理
38 if header_idx is None: 39 if header_idx is None:
39 continue 40 continue
40 letter = get_column_letter(col) 41 letter = get_column_letter(col)
...@@ -63,8 +64,31 @@ class BSWorkbook(Workbook): ...@@ -63,8 +64,31 @@ class BSWorkbook(Workbook):
63 break 64 break
64 return month_list, idx_list 65 return month_list, idx_list
65 66
66 def sheet_split(self, ws, month_mapping, date_list): 67 @staticmethod
67 for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True): 68 def get_reverse_trend(day_idx, idx_list):
69 reverse_trend = 0
70 pre_day = None
71 for idx, day in enumerate(day_idx):
72 if np.isnan(day):
73 continue
74 if idx in idx_list or pre_day is None:
75 pre_day = day
76 continue
77 if day < pre_day:
78 reverse_trend += 1
79 pre_day = day
80 elif day > pre_day:
81 reverse_trend -= 1
82 pre_day = day
83 if reverse_trend > 0:
84 reverse_trend = 1
85 elif reverse_trend < 0:
86 reverse_trend = -1
87 return reverse_trend
88
89 def sheet_split(self, ws, month_mapping, date_list, reverse_trend_list):
90 for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
91 date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src]
68 dt_array, tz_parsed = tslib.array_to_datetime( 92 dt_array, tz_parsed = tslib.array_to_datetime(
69 np.array(date_tuple, copy=False, dtype=np.object_), 93 np.array(date_tuple, copy=False, dtype=np.object_),
70 errors="coerce", 94 errors="coerce",
...@@ -78,22 +102,30 @@ class BSWorkbook(Workbook): ...@@ -78,22 +102,30 @@ class BSWorkbook(Workbook):
78 month_list, idx_list = self.month_split(dti, date_list) 102 month_list, idx_list = self.month_split(dti, date_list)
79 103
80 if len(month_list) == 0: 104 if len(month_list) == 0:
105 # month_info process
81 month_info = month_mapping.setdefault('xxxx-xx', []) 106 month_info = month_mapping.setdefault('xxxx-xx', [])
82 month_info.append((ws.title, 2, ws.max_row, 0)) 107 month_info.append((ws.title, 2, ws.max_row, 0))
83 elif len(month_list) == 1: 108 elif len(month_list) == 1:
109 # reverse_trend_list process
110 reverse_trend = self.get_reverse_trend(dti.day, idx_list)
111 reverse_trend_list.append(reverse_trend)
112 # month_info process
84 month_info = month_mapping.setdefault(month_list[0], []) 113 month_info = month_mapping.setdefault(month_list[0], [])
85 day_mean = np.mean(dti.day.dropna()) 114 day_mean = np.mean(dti.day.dropna())
86 if len(month_info) == 0: 115 if len(month_info) == 0:
87 month_info.append((ws.title, 2, ws.max_row, day_mean)) 116 month_info.append((ws.title, 2, ws.max_row, day_mean))
88 else: 117 else:
89 for i, item in enumerate(month_info): 118 for i, item in enumerate(month_info):
90 # TODO 倒序处理
91 if day_mean <= item[-1]: 119 if day_mean <= item[-1]:
92 month_info.insert(i, (ws.title, 2, ws.max_row, day_mean)) 120 month_info.insert(i, (ws.title, 2, ws.max_row, day_mean))
93 break 121 break
94 else: 122 else:
95 month_info.append((ws.title, 2, ws.max_row, day_mean)) 123 month_info.append((ws.title, 2, ws.max_row, day_mean))
96 else: 124 else:
125 # reverse_trend_list process
126 reverse_trend = self.get_reverse_trend(dti.day, idx_list)
127 reverse_trend_list.append(reverse_trend)
128 # month_info process
97 for i, item in enumerate(month_list[:-1]): 129 for i, item in enumerate(month_list[:-1]):
98 month_mapping.setdefault(item, []).append( 130 month_mapping.setdefault(item, []).append(
99 (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN)) 131 (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN))
...@@ -128,7 +160,7 @@ class BSWorkbook(Workbook): ...@@ -128,7 +160,7 @@ class BSWorkbook(Workbook):
128 ms.append(row) 160 ms.append(row)
129 return ms 161 return ms
130 162
131 def build_month_sheet(self, role, month_mapping, ms): 163 def build_month_sheet(self, role, month_mapping, ms, is_reverse):
132 tmp_ws = self.create_sheet('tmp_ws') 164 tmp_ws = self.create_sheet('tmp_ws')
133 for month in sorted(month_mapping.keys()): 165 for month in sorted(month_mapping.keys()):
134 # 3.1.拷贝数据 166 # 3.1.拷贝数据
...@@ -143,7 +175,6 @@ class BSWorkbook(Workbook): ...@@ -143,7 +175,6 @@ class BSWorkbook(Workbook):
143 amount_mapping = {} 175 amount_mapping = {}
144 amount_fill_row = set() 176 amount_fill_row = set()
145 for rows in new_ws.iter_rows(): 177 for rows in new_ws.iter_rows():
146 is_fill = False
147 summary_cell = rows[5] 178 summary_cell = rows[5]
148 date_cell = rows[0] 179 date_cell = rows[0]
149 # 关键词1提取 180 # 关键词1提取
...@@ -154,11 +185,9 @@ class BSWorkbook(Workbook): ...@@ -154,11 +185,9 @@ class BSWorkbook(Workbook):
154 tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value)) 185 tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value))
155 # 贷款关键词高亮 186 # 贷款关键词高亮
156 elif summary_cell.value in self.loan_keyword: 187 elif summary_cell.value in self.loan_keyword:
157 is_fill = True 188 summary_cell.fill = self.loan_fill
158 for i, cell in enumerate(rows): 189 for i, cell in enumerate(rows):
159 cell.border = self.border 190 cell.border = self.border
160 if is_fill:
161 cell.fill = self.loan_fill
162 if (i == 2 or i == 3) and cell.row > 1: 191 if (i == 2 or i == 3) and cell.row > 1:
163 try: 192 try:
164 # 3.3.金额、余额转数值 193 # 3.3.金额、余额转数值
...@@ -177,16 +206,18 @@ class BSWorkbook(Workbook): ...@@ -177,16 +206,18 @@ class BSWorkbook(Workbook):
177 cell.value, []).append(cell.row) 206 cell.value, []).append(cell.row)
178 # 3.4.核对结果 207 # 3.4.核对结果
179 # TODO 借贷、开支类型银行流水,需要手动添加+-号 208 # TODO 借贷、开支类型银行流水,需要手动添加+-号
180 # TODO 倒序流水需要改变公式
181 if i == 9 and cell.row > 2: 209 if i == 9 and cell.row > 2:
182 cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(cell.row, cell.row - 1, 210 if is_reverse:
183 *self.proof_res) 211 cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
212 cell.row - 1, cell.row, *self.proof_res)
213 else:
214 cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
215 cell.row, cell.row - 1, *self.proof_res)
184 216
185 # 3.5.同一天相同进出账高亮 217 # 3.5.同一天相同进出账高亮
186 del amount_mapping 218 del amount_mapping
187 for row in amount_fill_row: 219 for row in amount_fill_row:
188 for cell in new_ws[row]: 220 new_ws[row][2].fill = self.amount_fill
189 cell.fill = self.amount_fill
190 221
191 # 关键词2信息提取 222 # 关键词2信息提取
192 ms.append(self.blank_row) 223 ms.append(self.blank_row)
...@@ -196,9 +227,10 @@ class BSWorkbook(Workbook): ...@@ -196,9 +227,10 @@ class BSWorkbook(Workbook):
196 self.remove(tmp_ws) 227 self.remove(tmp_ws)
197 228
198 def rebuild(self, role_summary): 229 def rebuild(self, role_summary):
199 # (sheet_name, confidence, page, code, print_time, start_date, end_date) 230 # (sheet_name, confidence, page, code, print_time, start_date, end_date) # TODO 表名简化,+卡号
200 for role, summary_list in role_summary.items(): 231 for role, summary_list in role_summary.items():
201 # 1.原表修剪、排列、按照月份分割 232 # 1.原表修剪、排列、按照月份分割
233 reverse_trend_list = []
202 confidence_max = 0 234 confidence_max = 0
203 code_list = [] 235 code_list = []
204 month_mapping = {} 236 month_mapping = {}
...@@ -210,7 +242,7 @@ class BSWorkbook(Workbook): ...@@ -210,7 +242,7 @@ class BSWorkbook(Workbook):
210 # 1.1.删除多余列、排列 242 # 1.1.删除多余列、排列
211 self.sheet_prune(ws) 243 self.sheet_prune(ws)
212 # 1.2.按月份分割 244 # 1.2.按月份分割
213 self.sheet_split(ws, month_mapping, date_list) 245 self.sheet_split(ws, month_mapping, date_list, reverse_trend_list)
214 # 1.3.元数据处理 TODO 时间与日期处理 246 # 1.3.元数据处理 TODO 时间与日期处理
215 confidence_max = max(confidence, confidence_max) 247 confidence_max = max(confidence, confidence_max)
216 if code is not None: 248 if code is not None:
...@@ -224,7 +256,12 @@ class BSWorkbook(Workbook): ...@@ -224,7 +256,12 @@ class BSWorkbook(Workbook):
224 ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval) 256 ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)
225 257
226 # 3.创建月份表、提取/高亮关键行 258 # 3.创建月份表、提取/高亮关键行
227 self.build_month_sheet(role, month_mapping, ms) 259 is_reverse = False
260 if sum(reverse_trend_list) > 0: # 倒序处理
261 is_reverse = True
262 for month_list in month_mapping.values():
263 month_list.sort(key=lambda x: x[-1], reverse=True)
264 self.build_month_sheet(role, month_mapping, ms, is_reverse)
228 265
229 # 删除原表 266 # 删除原表
230 for summary in summary_list: 267 for summary in summary_list:
......
...@@ -4,8 +4,10 @@ from PIL import Image ...@@ -4,8 +4,10 @@ from PIL import Image
4 from io import BytesIO 4 from io import BytesIO
5 5
6 # 页面保存为png图片参数 6 # 页面保存为png图片参数
7 ZOOM_X = ZOOM_Y = 2.0 7 ZOOM_X_1 = ZOOM_Y_1 = 1.0
8 trans = fitz.Matrix(ZOOM_X, ZOOM_X).preRotate(0) # zoom factor 2 in each dimension 8 ZOOM_X_2 = ZOOM_Y_2 = 2.0
9 trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension
10 trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension
9 11
10 # 特殊filter处理 12 # 特殊filter处理
11 ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} 13 ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
...@@ -30,7 +32,10 @@ class PDFHandler: ...@@ -30,7 +32,10 @@ class PDFHandler:
30 return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)) 32 return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))
31 33
32 def page_to_png(self, page): 34 def page_to_png(self, page):
33 pm = page.getPixmap(matrix=trans, alpha=False) 35 if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
36 pm = page.getPixmap(matrix=trans_1, alpha=False)
37 else:
38 pm = page.getPixmap(matrix=trans_2, alpha=False)
34 img_save_path = self.get_img_save_path(page.number) 39 img_save_path = self.get_img_save_path(page.number)
35 pm.writePNG(img_save_path) 40 pm.writePNG(img_save_path)
36 self.img_path_list.append(img_save_path) 41 self.img_path_list.append(img_save_path)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!