3591e645 by 周伟奇

Merge branch 'feature/main' into feature/mssql

2 parents e24808bc 94c1d320
...@@ -296,6 +296,7 @@ HEADERS_MAPPING.update( ...@@ -296,6 +296,7 @@ HEADERS_MAPPING.update(
296 HEADERS_MAPPING.update( 296 HEADERS_MAPPING.update(
297 { 297 {
298 '联机余额': OVER_KEY, 298 '联机余额': OVER_KEY,
299 '联机金额': OVER_KEY,
299 } 300 }
300 ) 301 )
301 # 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 302 # 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
...@@ -519,6 +520,8 @@ OTHER_TUPLE = (None, None, None, None, None, None, None, None, None, None, None, ...@@ -519,6 +520,8 @@ OTHER_TUPLE = (None, None, None, None, None, None, None, None, None, None, None,
519 # "35":"针式打印-部分格线-竖版-邮储银行", 520 # "35":"针式打印-部分格线-竖版-邮储银行",
520 # "36":"针式打印-部分格线-竖版-邮储银行-绿卡", 521 # "36":"针式打印-部分格线-竖版-邮储银行-绿卡",
521 522
523 # "38":"普通打印-无格线-农业银行-整数-特殊",
524
522 CLASSIFY_LIST = [ 525 CLASSIFY_LIST = [
523 ('其他', OTHER_TUPLE), 526 ('其他', OTHER_TUPLE),
524 ('其他', OTHER_TUPLE), 527 ('其他', OTHER_TUPLE),
...@@ -560,6 +563,8 @@ CLASSIFY_LIST = [ ...@@ -560,6 +563,8 @@ CLASSIFY_LIST = [
560 ('针式打印-部分格线-竖版-邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), 563 ('针式打印-部分格线-竖版-邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
561 ('针式打印-部分格线-竖版-邮储银行-绿卡', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), 564 ('针式打印-部分格线-竖版-邮储银行-绿卡', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
562 ('其他', OTHER_TUPLE), 565 ('其他', OTHER_TUPLE),
566
567 ('普通打印-无格线-农业银行-整数-特殊', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
563 ] 568 ]
564 569
565 CLASSIFY_HEADER_LIST = [ 570 CLASSIFY_HEADER_LIST = [
...@@ -603,6 +608,8 @@ CLASSIFY_HEADER_LIST = [ ...@@ -603,6 +608,8 @@ CLASSIFY_HEADER_LIST = [
603 ('序号', '交易日期', '交易渠道', '摘要', '交易金额', '账户余额', '对方账号/卡号/汇票号', '原子账号', '交易机构名称'), 608 ('序号', '交易日期', '交易渠道', '摘要', '交易金额', '账户余额', '对方账号/卡号/汇票号', '原子账号', '交易机构名称'),
604 ('序号', '交易日期', '交易渠道', '摘要', '交易金额', '账户余额', '对方账号/卡号/汇票号', '原子账号', '交易机构名称'), 609 ('序号', '交易日期', '交易渠道', '摘要', '交易金额', '账户余额', '对方账号/卡号/汇票号', '原子账号', '交易机构名称'),
605 OTHER_TUPLE, 610 OTHER_TUPLE,
611
612 ('交易日期', '摘要/附言', '交易金额', '账户余额', '对方账号和户名'),
606 ] 613 ]
607 614
608 # ----------license相关------------------------------------------------------------------------------------------------ 615 # ----------license相关------------------------------------------------------------------------------------------------
...@@ -642,7 +649,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'), ...@@ -642,7 +649,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'),
642 ('出生年月', '出生年月'), 649 ('出生年月', '出生年月'),
643 ('住址', '住址'), 650 ('住址', '住址'),
644 ('性别', '性别'),) 651 ('性别', '性别'),)
645 RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1 652 RP_FIELD_ORDER_1 = (('有效期限', '有效期限'), ('签发机关', '签发机关'), ('通行证号码', '通行证号码'))
646 # 增值税普票 653 # 增值税普票
647 VAT_CN_NAME = 'VAT普票' 654 VAT_CN_NAME = 'VAT普票'
648 VAT_CLASSIFY = 0 655 VAT_CLASSIFY = 0
...@@ -948,6 +955,8 @@ LICENSE_CLASSIFY_SET_1 = {IC_CLASSIFY, VAT_CLASSIFY, MVC_CLASSIFY, MVI_CLASSIFY, ...@@ -948,6 +955,8 @@ LICENSE_CLASSIFY_SET_1 = {IC_CLASSIFY, VAT_CLASSIFY, MVC_CLASSIFY, MVI_CLASSIFY,
948 LICENSE_CLASSIFY_SET_2 = {BL_CLASSIFY, EEP_CLASSIFY, DL_CLASSIFY, PP_CLASSIFY, BC_CLASSIFY} 955 LICENSE_CLASSIFY_SET_2 = {BL_CLASSIFY, EEP_CLASSIFY, DL_CLASSIFY, PP_CLASSIFY, BC_CLASSIFY}
949 956
950 NYYH_CLASSIFY = {17, 18} 957 NYYH_CLASSIFY = {17, 18}
958 NYZS_CLASSIFY = 18
959 SPECIAL_NYZS_CLASSIFY = 38
951 MS_CLASSIFY = 21 960 MS_CLASSIFY = 21
952 MS_ERROR_COL = (5, 6) 961 MS_ERROR_COL = (5, 6)
953 WECHART_CLASSIFY = 12 962 WECHART_CLASSIFY = 12
...@@ -960,12 +969,12 @@ WECHART_HEADERS_MAPPING.update( ...@@ -960,12 +969,12 @@ WECHART_HEADERS_MAPPING.update(
960 } 969 }
961 ) 970 )
962 971
963 PATTERN_LIST = ['交易名称', '收入/支出金额', '收入', '存入', '支出', '支取', '金额', '余额', '发生额', '借贷', '借贷状态', '收支标志', 972 PATTERN_LIST = ['联机金额', '交易名称', '收入/支出金额', '收入', '存入', '支出', '支取', '金额', '余额', '发生额', '借贷',
964 '收/支', '收入金额', '存入金额(贷)', '存入金额(贷)', '支出金额', '支取金额(借)', '支取金额(借)', '记账日期', 973 '借贷状态', '收支标志', '收/支', '收入金额', '存入金额(贷)', '存入金额(贷)', '支出金额', '支取金额(借)',
965 '交易日期', '摘要', '业务摘要', '工作日期', '交易金额', '账户余额', '交易类型', '金额(元)', '金额(元)', '时间', 974 '支取金额(借)', '记账日期', '交易日期', '摘要', '业务摘要', '工作日期', '交易金额', '账户余额', '交易类型',
966 '名称/备注', '摘要/附言', '交易发生额', '交易摘要', '借贷发生额(借:-贷:+)', '借贷发生额(借:-贷:+)', '联机余额', 975 '金额(元)', '金额(元)', '时间', '名称/备注', '摘要/附言', '交易发生额', '交易摘要', '借贷发生额(借:-贷:+)',
967 '交易金额(元)', '交易金额(元)', '账户余额(元)', '账户余额(元)', '会计日期', '摘要代码', '摘要信息', '日期', 976 '借贷发生额(借:-贷:+)', '联机余额', '交易金额(元)', '交易金额(元)', '账户余额(元)', '账户余额(元)', '会计日期',
968 '短摘要', '本次余额', '交易后余额', '交易说明', '帐户余额', '交易日期 记账日期'] 977 '摘要代码', '摘要信息', '日期', '短摘要', '本次余额', '交易后余额', '交易说明', '帐户余额', '交易日期 记账日期']
969 978
970 CN_RE = re.compile(u'[\u4e00-\u9fa5]') 979 CN_RE = re.compile(u'[\u4e00-\u9fa5]')
971 980
......
...@@ -28,6 +28,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -28,6 +28,7 @@ class Command(BaseCommand, LoggerMixin):
28 return 28 return
29 29
30 monthly_wb = Workbook() 30 monthly_wb = Workbook()
31 monthly_ws = monthly_wb.get_sheet_by_name('Sheet')
31 32
32 for d in range(1, monthrange(pre_mouth.year, pre_mouth.month)[1] + 1): 33 for d in range(1, monthrange(pre_mouth.year, pre_mouth.month)[1] + 1):
33 date_str = '{:04d}-{:02d}-{:02d}'.format(pre_mouth.year, pre_mouth.month, d) 34 date_str = '{:04d}-{:02d}-{:02d}'.format(pre_mouth.year, pre_mouth.month, d)
...@@ -36,12 +37,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -36,12 +37,13 @@ class Command(BaseCommand, LoggerMixin):
36 print('daily excel path not exists: {0}'.format(daily_excel_path)) 37 print('daily excel path not exists: {0}'.format(daily_excel_path))
37 continue 38 continue
38 39
39 monthly_ws = monthly_wb.create_sheet(date_str) 40 # monthly_ws = monthly_wb.create_sheet(date_str)
40 daily_wb = load_workbook(daily_excel_path) 41 daily_wb = load_workbook(daily_excel_path)
41 daily_ws = daily_wb.get_sheet_by_name('身份证') 42 daily_ws = daily_wb.get_sheet_by_name('身份证')
42 for row in daily_ws.iter_rows(min_row=1, values_only=True): 43 for row in daily_ws.iter_rows(min_row=1, values_only=True):
43 monthly_ws.append(row) 44 monthly_ws.append(row)
44 45
45 monthly_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(pre_mouth.strftime('%Y-%m'))) 46 monthly_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(pre_mouth.strftime('%Y-%m')))
46 monthly_wb.remove(monthly_wb.get_sheet_by_name('Sheet')) 47 # monthly_wb.remove(monthly_wb.get_sheet_by_name('Sheet'))
48 monthly_ws.title = '身份证'
47 monthly_wb.save(monthly_excel_path) 49 monthly_wb.save(monthly_excel_path)
......
...@@ -291,6 +291,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -291,6 +291,7 @@ class Command(BaseCommand, LoggerMixin):
291 return date_res 291 return date_res
292 292
293 def merge_card(self, bs_summary): 293 def merge_card(self, bs_summary):
294 classify_info = {}
294 merged_bs_summary = {} 295 merged_bs_summary = {}
295 sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True) 296 sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True)
296 for main_card in sorted_card: 297 for main_card in sorted_card:
...@@ -313,10 +314,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -313,10 +314,13 @@ class Command(BaseCommand, LoggerMixin):
313 merge_cards.append(card) 314 merge_cards.append(card)
314 for card in merge_cards: 315 for card in merge_cards:
315 del bs_summary[card] 316 del bs_summary[card]
316 merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify']) 317 most_classify = self.get_most(merged_bs_summary[main_card]['classify'])
318 classify_count = classify_info.get(most_classify, 0)
319 classify_info[most_classify] = classify_count + 1
320 merged_bs_summary[main_card]['classify'] = most_classify
317 merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role']) 321 merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role'])
318 del bs_summary 322 del bs_summary
319 return merged_bs_summary 323 return merged_bs_summary, classify_info
320 324
321 def prune_bs_summary(self, bs_summary): 325 def prune_bs_summary(self, bs_summary):
322 for summary in bs_summary.values(): 326 for summary in bs_summary.values():
...@@ -354,6 +358,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -354,6 +358,11 @@ class Command(BaseCommand, LoggerMixin):
354 # } 358 # }
355 # } 359 # }
356 # } 360 # }
361 # 归为同一份流水的逻辑
362 # 所有图片均无卡号:同一分类同一户名归为同一份流水(如果同一分类下只有一个已知户名,则此分类下其他未知户名归为此户名)
363 # 所有图片只已知1卡号:其他未知卡号流水归为此卡号
364 # 所有图片已知多卡号: 1.根据相似度和图片数目合并相似已知卡号,并整理多数分类和户名集合
365 # 2.遍历所有未知卡号,进行过滤:当未知卡号分类与某已知卡号一致,且此未知卡号户名在此已知卡号户名集合中时,将未知卡号归为已知卡号。剩余未知卡号同一分类同一户名归为同一流水
357 # 无卡号 366 # 无卡号
358 if len(bs_summary) == 0: 367 if len(bs_summary) == 0:
359 del bs_summary 368 del bs_summary
...@@ -383,15 +392,16 @@ class Command(BaseCommand, LoggerMixin): ...@@ -383,15 +392,16 @@ class Command(BaseCommand, LoggerMixin):
383 if len(bs_summary) == 1: 392 if len(bs_summary) == 1:
384 merged_bs_summary = self.prune_bs_summary(bs_summary) 393 merged_bs_summary = self.prune_bs_summary(bs_summary)
385 one_card = True 394 one_card = True
395 classify_info = {}
386 # 多卡号 396 # 多卡号
387 else: 397 else:
388 merged_bs_summary = self.merge_card(bs_summary) 398 merged_bs_summary, classify_info = self.merge_card(bs_summary)
389 399
390 for card_summary in merged_bs_summary.values(): 400 for card_summary in merged_bs_summary.values():
391 merge_role = [] 401 merge_role = []
392 classify_summary = unknown_summary.get(card_summary['classify'], {}) 402 classify_summary = unknown_summary.get(card_summary['classify'], {})
393 for role, summary in classify_summary.items(): 403 for role, summary in classify_summary.items():
394 if one_card or role in card_summary['role_set']: 404 if one_card or classify_info.get(card_summary['classify'], 0) == 1 or role in card_summary['role_set']:
395 merge_role.append(role) 405 merge_role.append(role)
396 # card_summary['confidence'].extend(summary['confidence']) 406 # card_summary['confidence'].extend(summary['confidence'])
397 card_summary['sheet'].extend(summary['sheet']) 407 card_summary['sheet'].extend(summary['sheet'])
......
...@@ -2,6 +2,7 @@ import re ...@@ -2,6 +2,7 @@ import re
2 import random 2 import random
3 import locale 3 import locale
4 import numpy as np 4 import numpy as np
5 from datetime import datetime
5 from pandas._libs import tslib 6 from pandas._libs import tslib
6 from pandas._libs.tslibs.nattype import NaTType 7 from pandas._libs.tslibs.nattype import NaTType
7 from pandas.core.indexes.datetimes import DatetimeIndex 8 from pandas.core.indexes.datetimes import DatetimeIndex
...@@ -126,7 +127,7 @@ class BSWorkbook(Workbook): ...@@ -126,7 +127,7 @@ class BSWorkbook(Workbook):
126 max_column_list.append(ws.max_column) 127 max_column_list.append(ws.max_column)
127 128
128 @staticmethod 129 @staticmethod
129 def header_statistics(sheet_header_info, header_info, classify): 130 def header_statistics(sheet_header_info, header_info, classify, special_nhzs):
130 # statistics_header_info = { 131 # statistics_header_info = {
131 # SUMMARY_KEY: 2, 132 # SUMMARY_KEY: 2,
132 # DATE_KEY: 3, 133 # DATE_KEY: 3,
...@@ -143,6 +144,8 @@ class BSWorkbook(Workbook): ...@@ -143,6 +144,8 @@ class BSWorkbook(Workbook):
143 best_sheet_info = sheet_header_info.get(sheet_order_list[0]) 144 best_sheet_info = sheet_header_info.get(sheet_order_list[0])
144 max_find_count = best_sheet_info.get(consts.FIND_COUNT_KEY, 0) 145 max_find_count = best_sheet_info.get(consts.FIND_COUNT_KEY, 0)
145 if max_find_count == 0: 146 if max_find_count == 0:
147 if special_nhzs:
148 classify = consts.SPECIAL_NYZS_CLASSIFY
146 for key, value in consts.CLASSIFY_MAP.items(): 149 for key, value in consts.CLASSIFY_MAP.items():
147 col = consts.CLASSIFY_LIST[classify][1][value] 150 col = consts.CLASSIFY_LIST[classify][1][value]
148 statistics_header_info[key] = col - 1 if isinstance(col, int) else None 151 statistics_header_info[key] = col - 1 if isinstance(col, int) else None
...@@ -255,7 +258,7 @@ class BSWorkbook(Workbook): ...@@ -255,7 +258,7 @@ class BSWorkbook(Workbook):
255 date_col = date_col + 1 258 date_col = date_col + 1
256 for date_tuple_src in ws.iter_cols(min_col=date_col, max_col=date_col, min_row=min_row, values_only=True): 259 for date_tuple_src in ws.iter_cols(min_col=date_col, max_col=date_col, min_row=min_row, values_only=True):
257 date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src] 260 date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src]
258 dt_array, tz_parsed = tslib.array_to_datetime( 261 dt_array, _ = tslib.array_to_datetime(
259 np.array(date_tuple, copy=False, dtype=np.object_), 262 np.array(date_tuple, copy=False, dtype=np.object_),
260 errors="coerce", 263 errors="coerce",
261 utc=False, 264 utc=False,
...@@ -265,6 +268,22 @@ class BSWorkbook(Workbook): ...@@ -265,6 +268,22 @@ class BSWorkbook(Workbook):
265 ) 268 )
266 dti = DatetimeIndex(dt_array, tz=None, name=None) 269 dti = DatetimeIndex(dt_array, tz=None, name=None)
267 270
271 rebuid = False
272 for idx, d in enumerate(dti):
273 try:
274 if isinstance(d, NaTType) and isinstance(date_tuple[idx], str):
275 match_obj = re.match(r'(\d{4})[7/](\d{2})[7/](\d{2})', date_tuple[idx])
276 if match_obj:
277 dt_array[idx] = np.datetime64(datetime(int(match_obj.group(1)),
278 int(match_obj.group(2)),
279 int(match_obj.group(3))))
280 rebuid = True
281 except Exception as e:
282 continue
283
284 if rebuid:
285 dti = DatetimeIndex(dt_array, tz=None, name=None)
286
268 month_list, idx_list = self.month_split(dti, date_list, date_statistics) 287 month_list, idx_list = self.month_split(dti, date_list, date_statistics)
269 288
270 if len(month_list) == 0: 289 if len(month_list) == 0:
...@@ -555,6 +574,7 @@ class BSWorkbook(Workbook): ...@@ -555,6 +574,7 @@ class BSWorkbook(Workbook):
555 # } 574 # }
556 # } 575 # }
557 for card, summary in bs_summary.items(): 576 for card, summary in bs_summary.items():
577 special_nhzs = False
558 new_card = self.get_new_card(card) 578 new_card = self.get_new_card(card)
559 # 1.原表表头收集、按照月份分割 579 # 1.原表表头收集、按照月份分割
560 # 1.1 总结首行信息 580 # 1.1 总结首行信息
...@@ -563,10 +583,17 @@ class BSWorkbook(Workbook): ...@@ -563,10 +583,17 @@ class BSWorkbook(Workbook):
563 header_info = {} 583 header_info = {}
564 max_column_list = [] 584 max_column_list = []
565 sheets_list = summary.get('sheet', []) 585 sheets_list = summary.get('sheet', [])
586 special_nhzs_max_col = 0
566 for sheet in sheets_list: 587 for sheet in sheets_list:
567 ws = self.get_sheet_by_name(sheet) 588 ws = self.get_sheet_by_name(sheet)
589 if classify == consts.NYZS_CLASSIFY:
590 special_nhzs_max_col += ws.max_column
568 self.header_collect(ws, sheet_header_info, header_info, max_column_list, classify) 591 self.header_collect(ws, sheet_header_info, header_info, max_column_list, classify)
569 statistics_header_info, max_find_count = self.header_statistics(sheet_header_info, header_info, classify) 592 # 农业银行整数表头特殊处理
593 if classify == consts.NYZS_CLASSIFY and round(special_nhzs_max_col / len(sheets_list)) == 5:
594 special_nhzs = True
595 statistics_header_info, max_find_count = self.header_statistics(
596 sheet_header_info, header_info, classify, special_nhzs)
570 max_column = max(max_column_list) 597 max_column = max(max_column_list)
571 598
572 # 1.2.按月份分割 min_row 正文第一行 date_col 日期行 599 # 1.2.按月份分割 min_row 正文第一行 date_col 日期行
......
...@@ -370,4 +370,5 @@ class DocView(GenericView, DocHandler): ...@@ -370,4 +370,5 @@ class DocView(GenericView, DocHandler):
370 self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' 370 self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
371 '[is_priority={3}] [enqueue_res={4}]'.format(args, prefix, doc.id, 371 '[is_priority={3}] [enqueue_res={4}]'.format(args, prefix, doc.id,
372 is_priority, enqueue_res)) 372 is_priority, enqueue_res))
373 return response.ok() 373 data = {'excel_path': os.path.join(save_dir_path, '{0}.xlsx'.format(doc.id))}
374 return response.ok(data=data)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!