7dfc2ee8 by 周伟奇

merge license

2 parents 1242adb8 e570371a
...@@ -33,6 +33,5 @@ data/* ...@@ -33,6 +33,5 @@ data/*
33 # 脚本 33 # 脚本
34 src/*.sh 34 src/*.sh
35 35
36 test.py 36 test*
37 ocr_test.py 37 ocr_test.py
...\ No newline at end of file ...\ No newline at end of file
38 ocr_test_2.py
...\ No newline at end of file ...\ No newline at end of file
......
1 class EDMSException(Exception):
2 pass
...@@ -141,32 +141,22 @@ class BSWorkbook(Workbook): ...@@ -141,32 +141,22 @@ class BSWorkbook(Workbook):
141 # month_info process 141 # month_info process
142 month_info = month_mapping.setdefault('xxxx-xx', []) 142 month_info = month_mapping.setdefault('xxxx-xx', [])
143 month_info.append((ws.title, min_row, ws.max_row, 0)) 143 month_info.append((ws.title, min_row, ws.max_row, 0))
144 elif len(month_list) == 1:
145 # reverse_trend_list process
146 reverse_trend = self.get_reverse_trend(dti.day, idx_list)
147 reverse_trend_list.append(reverse_trend)
148 # month_info process
149 month_info = month_mapping.setdefault(month_list[0], [])
150 day_mean = np.mean(dti.day.dropna())
151 if len(month_info) == 0:
152 month_info.append((ws.title, min_row, ws.max_row, day_mean))
153 else:
154 for i, item in enumerate(month_info):
155 if day_mean <= item[-1]:
156 month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean))
157 break
158 else:
159 month_info.append((ws.title, min_row, ws.max_row, day_mean))
160 else: 144 else:
161 # reverse_trend_list process 145 # reverse_trend_list process
162 reverse_trend = self.get_reverse_trend(dti.day, idx_list) 146 reverse_trend = self.get_reverse_trend(dti.day, idx_list)
163 reverse_trend_list.append(reverse_trend) 147 reverse_trend_list.append(reverse_trend)
164 # month_info process 148 # month_info process
165 for i, item in enumerate(month_list[:-1]): 149 day_idx = dti.day
150 idx_list_max_idx = len(idx_list) - 1
151 for i, item in enumerate(month_list):
152 if i == idx_list_max_idx:
153 day_mean = np.mean(day_idx[idx_list[i]:].dropna())
166 month_mapping.setdefault(item, []).append( 154 month_mapping.setdefault(item, []).append(
167 (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN)) 155 (ws.title, idx_list[i] + min_row, ws.max_row, day_mean))
168 month_mapping.setdefault(month_list[-1], []).insert( 156 else:
169 0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0)) 157 day_mean = np.mean(day_idx[idx_list[i]: idx_list[i + 1]].dropna())
158 month_mapping.setdefault(item, []).append(
159 (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean))
170 160
171 def build_metadata_rows(self, confidence, code, print_time, start_date, end_date): 161 def build_metadata_rows(self, confidence, code, print_time, start_date, end_date):
172 if start_date is None or end_date is None: 162 if start_date is None or end_date is None:
...@@ -191,9 +181,9 @@ class BSWorkbook(Workbook): ...@@ -191,9 +181,9 @@ class BSWorkbook(Workbook):
191 def create_meta_sheet(self, card): 181 def create_meta_sheet(self, card):
192 if self.worksheets[0].title == 'Sheet': 182 if self.worksheets[0].title == 'Sheet':
193 ms = self.worksheets[0] 183 ms = self.worksheets[0]
194 ms.title = '{0}({1})'.format(self.meta_sheet_title, card) 184 ms.title = '{0}({1})'.format(self.meta_sheet_title, card[-6:])
195 else: 185 else:
196 ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, card)) 186 ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, card[-6:]))
197 return ms 187 return ms
198 188
199 def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date): 189 def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date):
...@@ -203,6 +193,26 @@ class BSWorkbook(Workbook): ...@@ -203,6 +193,26 @@ class BSWorkbook(Workbook):
203 ms.append(row) 193 ms.append(row)
204 return ms 194 return ms
205 195
196 @staticmethod
197 def amount_format(amount_str):
198 if not isinstance(amount_str, str) or amount_str == '':
199 return amount_str
200 # 1.替换
201 res_str = amount_str.translate(consts.TRANS)
202 # 2.删除多余的-
203 res_str = res_str[0] + res_str[1:].replace('-', '')
204 # 3.首字符处理
205 if res_str[0] in consts.ERROR_CHARS:
206 res_str = '-{0}'.format(res_str[1:])
207 # 4.逗号与句号处理
208 if len(res_str) >= 4:
209 period_idx = len(res_str) - 3
210 if res_str[period_idx] == '.' and res_str[period_idx - 1] == ',':
211 res_str = '{0}{1}'.format(res_str[:period_idx - 1], res_str[period_idx:])
212 elif res_str[period_idx] == ',':
213 res_str = '{0}.{1}'.format(res_str[:period_idx], res_str[period_idx + 1:])
214 return res_str
215
206 def build_month_sheet(self, card, month_mapping, ms, is_reverse): 216 def build_month_sheet(self, card, month_mapping, ms, is_reverse):
207 tmp_ws = self.create_sheet('tmp_ws') 217 tmp_ws = self.create_sheet('tmp_ws')
208 for month in sorted(month_mapping.keys()): 218 for month in sorted(month_mapping.keys()):
...@@ -235,29 +245,25 @@ class BSWorkbook(Workbook): ...@@ -235,29 +245,25 @@ class BSWorkbook(Workbook):
235 # 3.3.余额转数值 245 # 3.3.余额转数值
236 over_cell = rows[consts.OVER_IDX] 246 over_cell = rows[consts.OVER_IDX]
237 try: 247 try:
238 if isinstance(over_cell.value, str): 248 over_cell.value = locale.atof(self.amount_format(over_cell.value))
239 over_cell.value = over_cell.value.translate(consts.TRANS)
240 over_cell.value = locale.atof(over_cell.value)
241 except Exception as e: 249 except Exception as e:
242 continue 250 continue
243 else: 251 else:
244 over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 252 over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
245 253
246 # 3.4.额转数值 254 # 3.4.额转数值
247 try: 255 try:
248 try: 256 try:
249 if isinstance(amount_cell.value, str): # TODO 可在转化数字失败后,再替换 257 amount_cell.value = locale.atof(self.amount_format(amount_cell.value))
250 amount_cell.value = amount_cell.value.translate(consts.TRANS)
251 amount_cell.value = locale.atof(amount_cell.value)
252 except Exception as e: 258 except Exception as e:
253 try: 259 try:
254 if isinstance(rows[consts.INCOME_IDX].value, str): 260 amount_cell.value = locale.atof(self.amount_format(rows[consts.INCOME_IDX].value))
255 rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS) 261 if amount_cell.value == 0:
256 amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) 262 raise
263 elif amount_cell.value < 0:
264 amount_cell.value = -amount_cell.value
257 except Exception as e: 265 except Exception as e:
258 if isinstance(rows[consts.OUTLAY_IDX].value, str): 266 amount_cell.value = locale.atof(self.amount_format(rows[consts.OUTLAY_IDX].value))
259 rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS)
260 amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
261 if amount_cell.value > 0: 267 if amount_cell.value > 0:
262 amount_cell.value = -amount_cell.value 268 amount_cell.value = -amount_cell.value
263 except Exception as e: 269 except Exception as e:
...@@ -313,18 +319,18 @@ class BSWorkbook(Workbook): ...@@ -313,18 +319,18 @@ class BSWorkbook(Workbook):
313 # } 319 # }
314 for card, summary in bs_summary.items(): 320 for card, summary in bs_summary.items():
315 # 1.原表修剪、排列、按照月份分割 321 # 1.原表修剪、排列、按照月份分割
316 start_date = summary['start_date'] 322 start_date = summary.get('start_date')
317 end_date = summary['end_date'] 323 end_date = summary.get('end_date')
318 date_statistics = False 324 date_statistics = False
319 if start_date is None or end_date is None: 325 if start_date is None or end_date is None:
320 date_statistics = True 326 date_statistics = True
321 date_list = [] 327 date_list = []
322 month_mapping = {} 328 month_mapping = {}
323 reverse_trend_list = [] 329 reverse_trend_list = []
324 for sheet in summary['sheet']: 330 for sheet in summary.get('sheet', []):
325 ws = self.get_sheet_by_name(sheet) 331 ws = self.get_sheet_by_name(sheet)
326 # 1.1.删除多余列、排列 332 # 1.1.删除多余列、排列
327 min_row = self.sheet_prune(ws, summary['classify']) 333 min_row = self.sheet_prune(ws, summary.get('classify', 0))
328 # 1.2.按月份分割 334 # 1.2.按月份分割
329 self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics) 335 self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics)
330 336
...@@ -334,32 +340,43 @@ class BSWorkbook(Workbook): ...@@ -334,32 +340,43 @@ class BSWorkbook(Workbook):
334 340
335 # 2.元信息提取表 341 # 2.元信息提取表
336 ms = self.build_meta_sheet(card, 342 ms = self.build_meta_sheet(card,
337 summary['confidence'], 343 summary.get('confidence', 1),
338 summary['code'], 344 summary.get('code'),
339 summary['print_time'], 345 summary.get('print_time'),
340 start_date, 346 start_date,
341 end_date) 347 end_date)
342 348
343 # 3.创建月份表、提取/高亮关键行 349 # 3.创建月份表、提取/高亮关键行
344 is_reverse = False 350 # 倒序处理
345 if sum(reverse_trend_list) > 0: # 倒序处理 351 is_reverse = True if sum(reverse_trend_list) > 0 else False
346 is_reverse = True
347 for month_list in month_mapping.values(): 352 for month_list in month_mapping.values():
348 month_list.sort(key=lambda x: x[-1], reverse=True) 353 month_list.sort(key=lambda x: x[-1], reverse=is_reverse)
354
349 self.build_month_sheet(card, month_mapping, ms, is_reverse) 355 self.build_month_sheet(card, month_mapping, ms, is_reverse)
350 356
351 # 4.删除原表 357 # 4.删除原表
352 for sheet in summary['sheet']: 358 for sheet in summary.get('sheet'):
353 self.remove(self.get_sheet_by_name(sheet)) 359 self.remove(self.get_sheet_by_name(sheet))
354 360
355 def license_rebuild(self, license_summary): 361 def license_rebuild(self, license_summary):
356 for en_key, cn_key in consts.LICENSE_ORDER: 362 for classify, (_, name) in consts.LICENSE_ORDER:
357 ws = self.create_sheet(cn_key) 363 res = license_summary.get(classify)
358 for bl in license_summary.get(en_key, []): 364 if res is None:
365 continue
366 ws = self.create_sheet(name)
367 for bl in res:
359 for bl_field in bl: 368 for bl_field in bl:
360 ws.append(bl_field) 369 ws.append(bl_field)
361 ws.append((None, )) 370 ws.append((None, ))
362 371
363 def rebuild(self, bs_summary, license_summary): 372 def skip_img_sheet(self, skip_img):
373 if skip_img:
374 ws = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
375 ws.append(consts.SKIP_IMG_SHEET_HEADER)
376 for img_tuple in skip_img:
377 ws.append(img_tuple)
378
379 def rebuild(self, bs_summary, license_summary, skip_img):
364 self.bs_rebuild(bs_summary) 380 self.bs_rebuild(bs_summary)
365 # self.license_rebuild(license_summary) 381 self.license_rebuild(license_summary)
382 self.skip_img_sheet(skip_img)
......
...@@ -25,7 +25,7 @@ class PDFHandler: ...@@ -25,7 +25,7 @@ class PDFHandler:
25 def __init__(self, path, img_dir_path): 25 def __init__(self, path, img_dir_path):
26 self.path = path 26 self.path = path
27 self.img_dir_path = img_dir_path 27 self.img_dir_path = img_dir_path
28 self.img_info_list = [] 28 self.img_path_list = []
29 self.xref_set = set() 29 self.xref_set = set()
30 30
31 def get_img_save_path(self, pno, img_index=0, ext='png'): 31 def get_img_save_path(self, pno, img_index=0, ext='png'):
...@@ -38,7 +38,7 @@ class PDFHandler: ...@@ -38,7 +38,7 @@ class PDFHandler:
38 pm = page.getPixmap(matrix=trans_2, alpha=False) 38 pm = page.getPixmap(matrix=trans_2, alpha=False)
39 img_save_path = self.get_img_save_path(page.number) 39 img_save_path = self.get_img_save_path(page.number)
40 pm.writePNG(img_save_path) 40 pm.writePNG(img_save_path)
41 self.img_info_list.append((img_save_path, page.number, 0)) 41 self.img_path_list.append(img_save_path)
42 42
43 @staticmethod 43 @staticmethod
44 def getimage(pix): 44 def getimage(pix):
...@@ -88,7 +88,7 @@ class PDFHandler: ...@@ -88,7 +88,7 @@ class PDFHandler:
88 with open(img_save_path, "wb") as f: 88 with open(img_save_path, "wb") as f:
89 f.write(img_data) 89 f.write(img_data)
90 self.xref_set.add(xref) 90 self.xref_set.add(xref)
91 self.img_info_list.append((img_save_path, pno, img_index)) 91 self.img_path_list.append(img_save_path)
92 92
93 @staticmethod 93 @staticmethod
94 def split_il(il): 94 def split_il(il):
...@@ -179,7 +179,7 @@ class PDFHandler: ...@@ -179,7 +179,7 @@ class PDFHandler:
179 img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) 179 img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
180 new_img.save(img_save_path) 180 new_img.save(img_save_path)
181 page_to_png = False 181 page_to_png = False
182 self.img_info_list.append((img_save_path, pno, img_index)) 182 self.img_path_list.append(img_save_path)
183 183
184 # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 184 # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
185 if page_to_png: 185 if page_to_png:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!