3591e645 by 周伟奇

Merge branch 'feature/main' into feature/mssql

2 parents e24808bc 94c1d320
......@@ -296,6 +296,7 @@ HEADERS_MAPPING.update(
HEADERS_MAPPING.update(
{
'联机余额': OVER_KEY,
'联机金额': OVER_KEY,
}
)
# 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
......@@ -519,6 +520,8 @@ OTHER_TUPLE = (None, None, None, None, None, None, None, None, None, None, None,
# "35":"针式打印-部分格线-竖版-邮储银行",
# "36":"针式打印-部分格线-竖版-邮储银行-绿卡",
# "38":"普通打印-无格线-农业银行-整数-特殊",
CLASSIFY_LIST = [
('其他', OTHER_TUPLE),
('其他', OTHER_TUPLE),
......@@ -560,6 +563,8 @@ CLASSIFY_LIST = [
('针式打印-部分格线-竖版-邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
('针式打印-部分格线-竖版-邮储银行-绿卡', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
('其他', OTHER_TUPLE),
('普通打印-无格线-农业银行-整数-特殊', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
]
CLASSIFY_HEADER_LIST = [
......@@ -603,6 +608,8 @@ CLASSIFY_HEADER_LIST = [
('序号', '交易日期', '交易渠道', '摘要', '交易金额', '账户余额', '对方账号/卡号/汇票号', '原子账号', '交易机构名称'),
('序号', '交易日期', '交易渠道', '摘要', '交易金额', '账户余额', '对方账号/卡号/汇票号', '原子账号', '交易机构名称'),
OTHER_TUPLE,
('交易日期', '摘要/附言', '交易金额', '账户余额', '对方账号和户名'),
]
# ----------license相关------------------------------------------------------------------------------------------------
......@@ -642,7 +649,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'),
('出生年月', '出生年月'),
('住址', '住址'),
('性别', '性别'),)
RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1
RP_FIELD_ORDER_1 = (('有效期限', '有效期限'), ('签发机关', '签发机关'), ('通行证号码', '通行证号码'))
# 增值税普票
VAT_CN_NAME = 'VAT普票'
VAT_CLASSIFY = 0
......@@ -948,6 +955,8 @@ LICENSE_CLASSIFY_SET_1 = {IC_CLASSIFY, VAT_CLASSIFY, MVC_CLASSIFY, MVI_CLASSIFY,
LICENSE_CLASSIFY_SET_2 = {BL_CLASSIFY, EEP_CLASSIFY, DL_CLASSIFY, PP_CLASSIFY, BC_CLASSIFY}
NYYH_CLASSIFY = {17, 18}
NYZS_CLASSIFY = 18
SPECIAL_NYZS_CLASSIFY = 38
MS_CLASSIFY = 21
MS_ERROR_COL = (5, 6)
WECHART_CLASSIFY = 12
......@@ -960,12 +969,12 @@ WECHART_HEADERS_MAPPING.update(
}
)
PATTERN_LIST = ['交易名称', '收入/支出金额', '收入', '存入', '支出', '支取', '金额', '余额', '发生额', '借贷', '借贷状态', '收支标志',
'收/支', '收入金额', '存入金额(贷)', '存入金额(贷)', '支出金额', '支取金额(借)', '支取金额(借)', '记账日期',
'交易日期', '摘要', '业务摘要', '工作日期', '交易金额', '账户余额', '交易类型', '金额(元)', '金额(元)', '时间',
'名称/备注', '摘要/附言', '交易发生额', '交易摘要', '借贷发生额(借:-贷:+)', '借贷发生额(借:-贷:+)', '联机余额',
'交易金额(元)', '交易金额(元)', '账户余额(元)', '账户余额(元)', '会计日期', '摘要代码', '摘要信息', '日期',
'短摘要', '本次余额', '交易后余额', '交易说明', '帐户余额', '交易日期 记账日期']
PATTERN_LIST = ['联机金额', '交易名称', '收入/支出金额', '收入', '存入', '支出', '支取', '金额', '余额', '发生额', '借贷',
'借贷状态', '收支标志', '收/支', '收入金额', '存入金额(贷)', '存入金额(贷)', '支出金额', '支取金额(借)',
'支取金额(借)', '记账日期', '交易日期', '摘要', '业务摘要', '工作日期', '交易金额', '账户余额', '交易类型',
'金额(元)', '金额(元)', '时间', '名称/备注', '摘要/附言', '交易发生额', '交易摘要', '借贷发生额(借:-贷:+)',
'借贷发生额(借:-贷:+)', '联机余额', '交易金额(元)', '交易金额(元)', '账户余额(元)', '账户余额(元)', '会计日期',
'摘要代码', '摘要信息', '日期', '短摘要', '本次余额', '交易后余额', '交易说明', '帐户余额', '交易日期 记账日期']
CN_RE = re.compile(u'[\u4e00-\u9fa5]')
......
......@@ -28,6 +28,7 @@ class Command(BaseCommand, LoggerMixin):
return
monthly_wb = Workbook()
monthly_ws = monthly_wb.get_sheet_by_name('Sheet')
for d in range(1, monthrange(pre_mouth.year, pre_mouth.month)[1] + 1):
date_str = '{:04d}-{:02d}-{:02d}'.format(pre_mouth.year, pre_mouth.month, d)
......@@ -36,12 +37,13 @@ class Command(BaseCommand, LoggerMixin):
print('daily excel path not exists: {0}'.format(daily_excel_path))
continue
monthly_ws = monthly_wb.create_sheet(date_str)
# monthly_ws = monthly_wb.create_sheet(date_str)
daily_wb = load_workbook(daily_excel_path)
daily_ws = daily_wb.get_sheet_by_name('身份证')
for row in daily_ws.iter_rows(min_row=1, values_only=True):
monthly_ws.append(row)
monthly_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(pre_mouth.strftime('%Y-%m')))
monthly_wb.remove(monthly_wb.get_sheet_by_name('Sheet'))
# monthly_wb.remove(monthly_wb.get_sheet_by_name('Sheet'))
monthly_ws.title = '身份证'
monthly_wb.save(monthly_excel_path)
......
......@@ -291,6 +291,7 @@ class Command(BaseCommand, LoggerMixin):
return date_res
def merge_card(self, bs_summary):
classify_info = {}
merged_bs_summary = {}
sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True)
for main_card in sorted_card:
......@@ -313,10 +314,13 @@ class Command(BaseCommand, LoggerMixin):
merge_cards.append(card)
for card in merge_cards:
del bs_summary[card]
merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify'])
most_classify = self.get_most(merged_bs_summary[main_card]['classify'])
classify_count = classify_info.get(most_classify, 0)
classify_info[most_classify] = classify_count + 1
merged_bs_summary[main_card]['classify'] = most_classify
merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role'])
del bs_summary
return merged_bs_summary
return merged_bs_summary, classify_info
def prune_bs_summary(self, bs_summary):
for summary in bs_summary.values():
......@@ -354,6 +358,11 @@ class Command(BaseCommand, LoggerMixin):
# }
# }
# }
# 归为同一份流水的逻辑
# 所有图片均无卡号:同一分类同一户名归为同一份流水(如果同一分类下只有一个已知户名,则此分类下其他未知户名归为此户名)
# 所有图片中仅已知1个卡号:其他未知卡号的流水归为此卡号
# 所有图片已知多卡号: 1.根据相似度和图片数目合并相似已知卡号,并整理多数分类和户名集合
# 2.遍历所有未知卡号,进行过滤:当未知卡号分类与某已知卡号一致,且此未知卡号户名在此已知卡号户名集合中时(或该分类下仅有一个已知卡号时),将未知卡号归为已知卡号。剩余未知卡号同一分类同一户名归为同一流水
# 无卡号
if len(bs_summary) == 0:
del bs_summary
......@@ -383,15 +392,16 @@ class Command(BaseCommand, LoggerMixin):
if len(bs_summary) == 1:
merged_bs_summary = self.prune_bs_summary(bs_summary)
one_card = True
classify_info = {}
# 多卡号
else:
merged_bs_summary = self.merge_card(bs_summary)
merged_bs_summary, classify_info = self.merge_card(bs_summary)
for card_summary in merged_bs_summary.values():
merge_role = []
classify_summary = unknown_summary.get(card_summary['classify'], {})
for role, summary in classify_summary.items():
if one_card or role in card_summary['role_set']:
if one_card or classify_info.get(card_summary['classify'], 0) == 1 or role in card_summary['role_set']:
merge_role.append(role)
# card_summary['confidence'].extend(summary['confidence'])
card_summary['sheet'].extend(summary['sheet'])
......
......@@ -2,6 +2,7 @@ import re
import random
import locale
import numpy as np
from datetime import datetime
from pandas._libs import tslib
from pandas._libs.tslibs.nattype import NaTType
from pandas.core.indexes.datetimes import DatetimeIndex
......@@ -126,7 +127,7 @@ class BSWorkbook(Workbook):
max_column_list.append(ws.max_column)
@staticmethod
def header_statistics(sheet_header_info, header_info, classify):
def header_statistics(sheet_header_info, header_info, classify, special_nhzs):
# statistics_header_info = {
# SUMMARY_KEY: 2,
# DATE_KEY: 3,
......@@ -143,6 +144,8 @@ class BSWorkbook(Workbook):
best_sheet_info = sheet_header_info.get(sheet_order_list[0])
max_find_count = best_sheet_info.get(consts.FIND_COUNT_KEY, 0)
if max_find_count == 0:
if special_nhzs:
classify = consts.SPECIAL_NYZS_CLASSIFY
for key, value in consts.CLASSIFY_MAP.items():
col = consts.CLASSIFY_LIST[classify][1][value]
statistics_header_info[key] = col - 1 if isinstance(col, int) else None
......@@ -255,7 +258,7 @@ class BSWorkbook(Workbook):
date_col = date_col + 1
for date_tuple_src in ws.iter_cols(min_col=date_col, max_col=date_col, min_row=min_row, values_only=True):
date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src]
dt_array, tz_parsed = tslib.array_to_datetime(
dt_array, _ = tslib.array_to_datetime(
np.array(date_tuple, copy=False, dtype=np.object_),
errors="coerce",
utc=False,
......@@ -265,6 +268,22 @@ class BSWorkbook(Workbook):
)
dti = DatetimeIndex(dt_array, tz=None, name=None)
rebuid = False
for idx, d in enumerate(dti):
try:
if isinstance(d, NaTType) and isinstance(date_tuple[idx], str):
match_obj = re.match(r'(\d{4})[7/](\d{2})[7/](\d{2})', date_tuple[idx])
if match_obj:
dt_array[idx] = np.datetime64(datetime(int(match_obj.group(1)),
int(match_obj.group(2)),
int(match_obj.group(3))))
rebuid = True
except Exception as e:
continue
if rebuid:
dti = DatetimeIndex(dt_array, tz=None, name=None)
month_list, idx_list = self.month_split(dti, date_list, date_statistics)
if len(month_list) == 0:
......@@ -555,6 +574,7 @@ class BSWorkbook(Workbook):
# }
# }
for card, summary in bs_summary.items():
special_nhzs = False
new_card = self.get_new_card(card)
# 1.原表表头收集、按照月份分割
# 1.1 总结首行信息
......@@ -563,10 +583,17 @@ class BSWorkbook(Workbook):
header_info = {}
max_column_list = []
sheets_list = summary.get('sheet', [])
special_nhzs_max_col = 0
for sheet in sheets_list:
ws = self.get_sheet_by_name(sheet)
if classify == consts.NYZS_CLASSIFY:
special_nhzs_max_col += ws.max_column
self.header_collect(ws, sheet_header_info, header_info, max_column_list, classify)
statistics_header_info, max_find_count = self.header_statistics(sheet_header_info, header_info, classify)
# 农业银行整数表头特殊处理
if classify == consts.NYZS_CLASSIFY and round(special_nhzs_max_col / len(sheets_list)) == 5:
special_nhzs = True
statistics_header_info, max_find_count = self.header_statistics(
sheet_header_info, header_info, classify, special_nhzs)
max_column = max(max_column_list)
# 1.2.按月份分割 min_row 正文第一行 date_col 日期列
......
......@@ -370,4 +370,5 @@ class DocView(GenericView, DocHandler):
self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
'[is_priority={3}] [enqueue_res={4}]'.format(args, prefix, doc.id,
is_priority, enqueue_res))
return response.ok()
data = {'excel_path': os.path.join(save_dir_path, '{0}.xlsx'.format(doc.id))}
return response.ok(data=data)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!