e5726dfc by 周伟奇

comparison part

1 parent 80b6584c
......@@ -1003,3 +1003,4 @@ APPLICANT_TYPE = ['COAPP', 'CUSTR', 'GAUTR1', 'GAUTR2']
# ID type codes.
ID_TYPE = [
    'ITARI', 'ITHKM', 'ITPRC', 'ITPSP',
    'ITRES', 'ITTID', 'ITUSC', 'ITCCU',
]
# Second-ID type codes (same values as ID_TYPE without 'ITUSC' and 'ITCCU').
SECOND_ID_TYPE = [
    'ITARI', 'ITHKM', 'ITPRC',
    'ITPSP', 'ITRES', 'ITTID',
]
# Sub-type codes.
SUB_TYPE = [
    'CSIBM',
    'CSOTH',
    'CSSME',
]
......
# Pagination defaults.
PAGE_DEFAULT = 1
PAGE_SIZE_DEFAULT = 10

FIXED_APPLICATION_ID_PREFIX = 'CH-S'
DOC_SCHEME_LIST = [
    'ACCEPTANCE',
    'SETTLEMENT',
    'CONTRACT MANAGEMENT',
]
DATA_SOURCE_LIST = [
    'POS',
    'EAPP',
    'ECONTRACT',
]

# Business-type prefixes and the separator string used with them.
HIL_PREFIX, AFC_PREFIX = 'HIL', 'AFC'
SPLIT_STR = '_'
BUSINESS_TYPE_LIST = [HIL_PREFIX, AFC_PREFIX]
# Every upper/lower-case spelling of "hil", plus the code 'CO00002'.
HIL_SET = {h + i + l for h in 'Hh' for i in 'Ii' for l in 'Ll'} | {'CO00002'}

# ------- EDMS related ------------------------------------------------------------------------------------------------
SESSION_PREFIX = 'FHLSID'
CUSTOM_CLIENT = 'CustomClient'
FIXED_TOKEN = '00000000-0000-0000-0000-000000000000'
FIXED_FILE_SIZE = 0
DOWNLOAD_ACTION_TYPE = 'Downloaded'
# Document scheme name -> (numeric id, fill-mode string).
# NOTE(review): the exact meaning of the two tuple members is defined by the EDMS client — confirm there.
DOC_SCHEMA_ID_FILL = {
    'ACCEPTANCE': (1, 'DFE-AutoFilingScript'),
    'SETTLEMENT': (20, 'DFE-AutoFilingScript'),
    'CONTRACT MANAGEMENT': (86, 'Schema-Based'),
}
# Business-type prefix -> company code.
BUSINESS_TYPE_DICT = {
    HIL_PREFIX: 'CO00002',
    AFC_PREFIX: 'CO00001',
}
DOC_SCHEMA_TYPE = 'ElectronicRecord'
# Meta-field ids.
APPLICATION_ID_META_FIELD_id = 1
DEALER_CODE_META_FIELD_id = 13
BUSINESS_TYPE_META_FIELD_id = 93
DEALER_CODE = 'ocr_situ_group'
RETRY_TIMES = 3
# --------- Bank-statement template related ---------------------------------------------------------------------------
# (replacement digits, characters replaced by them).
# Presumably these are OCR look-alikes seen in numeric fields — confirm with the callers of TRANS.
_DIGIT_LOOKALIKES = (
    ('0', 'Cc(oOD'),
    ('1', '[]lL'),
    ('4', 'A'),
    ('5', 'sS'),
    ('6', 'b'),
    ('9', 'gE'),
    ('13', 'B'),
)
# Flattened to {character: replacement}; key order follows the groups above.
TRANS_MAP = {
    char: digits
    for digits, chars in _DIGIT_LOOKALIKES
    for char in chars
}
# Translation table for str.translate().
TRANS = str.maketrans(TRANS_MAP)
# Punctuation characters, each listed in its ASCII form and its CJK/full-width
# form. The colon and comma previously appeared twice in ASCII form, which is a
# no-op inside a set literal; the second entry of each pair is restored as the
# full-width character. Escapes are used so that text normalisation cannot
# silently collapse them back to ASCII.
ERROR_CHARS = {
    '.', '。',
    ':', '\uff1a',  # U+FF1A FULLWIDTH COLON
    '•', '·',
    ',', '\uff0c',  # U+FF0C FULLWIDTH COMMA
}
# Name and header row of the sheet listing skipped images.
SKIP_IMG_SHEET_NAME = '未处理图片'
SKIP_IMG_SHEET_HEADER = ('页码', '序号')
CARD_RATIO = 0.9
# Placeholder values for an unknown card number / account-holder name.
UNKNOWN_CARD = '未知卡号'
UNKNOWN_ROLE = '未知户名'
# strptime()-style date patterns.
DATE_FORMAT = ['%Y年%m月%d日', '%Y/%m/%d', '%Y-%m-%d', '%Y%m%d']
# Title of the verification-result column and its two possible values.
PROOF_COL_TITLE = '核对结果'
PROOF_RES = ('对', '错')
META_SHEET_TITLE = '关键信息提取和展示'
# Fixed output columns, in sheet order.
FIXED_HEADERS = (
    '记账日期',
    '记账时间',
    '金额',
    '余额',
    '交易名称',
    '附言',
    '对方账户名',
    '对方卡号/账号',
    '对方开户行',
    '核对结果',
    '借贷',
    '收入',
    '支出',
)
FIXED_COL_AMOUNT = len(FIXED_HEADERS)
# Header label -> 1-based column number.
BASE_HEADERS_MAPPING = dict(zip(FIXED_HEADERS, range(1, FIXED_COL_AMOUNT + 1)))

# 0-based positions inside FIXED_HEADERS.
BORROW_IDX = FIXED_HEADERS.index('借贷')
INCOME_IDX = FIXED_HEADERS.index('收入')
OUTLAY_IDX = FIXED_HEADERS.index('支出')
SUMMARY_IDX = FIXED_HEADERS.index('附言')
DATE_IDX = FIXED_HEADERS.index('记账日期')
AMOUNT_IDX = FIXED_HEADERS.index('金额')
OVER_IDX = FIXED_HEADERS.index('余额')
RESULT_IDX = FIXED_HEADERS.index('核对结果')

# 1-based column numbers of the same headers.
BORROW_HEADER_COL = BORROW_IDX + 1
INCOME_HEADER_COL = INCOME_IDX + 1
OUTLAY_HEADER_COL = OUTLAY_IDX + 1
RESULT_HEADER_COL = RESULT_IDX + 1
# Debit/credit indicator columns seen on statement templates, with their
# (income, outlay) cell values:
#   '借贷':     ('贷', '借')      portrait, no table, China Guangfa Bank
#   '借贷状态': ('贷', '借')      portrait, special, Bank of Communications
#   '收/支':    ('收入', '支出')  landscape, table, Bank of Beijing
BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支'}
# Cell values of an indicator column that mark income / outlay.
BORROW_INCOME_SET = {'贷', '收入'}
BORROW_OUTLAY_SET = {'借', '支出'}
# Header texts of dedicated income / outlay amount columns.
# The parenthesised headers were each listed twice with ASCII parentheses (a
# no-op inside a set literal); the second entry is restored as the full-width
# parenthesis variant (U+FF08 / U+FF09), written as escapes so that text
# normalisation cannot collapse it again.
INCOME_HEADERS_SET = {
    '收入金额',
    '收入',
    '存入',
    '存入金额(贷)',
    '存入金额\uff08贷\uff09',
}
OUTLAY_HEADERS_SET = {
    '支出金额',
    '支出',
    '支取金额(借)',
    '支取金额\uff08借\uff09',
}
# Header aliases -> fixed output columns.
#
# Each pair is (header text found on a statement template, key of
# BASE_HEADERS_MAPPING whose column number it maps to). Pairs are grouped by
# the template they were added for and kept in their original order, so
# HEADERS_MAPPING has the same key order as before.
#
# Three headers ('金额(元)', '交易金额(元)', '账户余额(元)') used to be listed
# twice with ASCII parentheses, which is a no-op in a dict literal. The second
# entry of each pair is restored as the full-width parenthesis variant
# (U+FF08 / U+FF09), written as escapes so that text normalisation cannot
# collapse it again.
_HEADER_ALIASES = (
    # ---------------- Regular print, full grid lines ----------------
    # Landscape, table: Bank of China (irregular)
    ('记账日期', '记账日期'),
    ('记账时间', '记账时间'),
    ('金额', '金额'),
    ('余额', '余额'),
    ('交易名称', '交易名称'),
    ('附言', '附言'),
    ('对方账户名', '对方账户名'),
    ('对方卡号/账号', '对方卡号/账号'),
    ('对方开户行', '对方开户行'),
    # Landscape, table: Agricultural Bank of China, personal account details
    ('交易日期', '记账日期'),
    # disabled: ('存入', '金额'),
    ('对方账号', '对方卡号/账号'),
    ('对方名称', '对方账户名'),
    ('摘要', '附言'),
    # Landscape, table: Bank of Beijing
    ('业务摘要', '附言'),
    ('发生额', '金额'),
    ('对方户名', '对方账户名'),
    # Landscape, table: ICBC debit-card account history list
    # Landscape, table: ICBC, machine-printed verification code, debit-card account history list
    # Landscape, table: ICBC CH-B008802400
    # Landscape, table: ICBC payroll detail list
    # ICBC history details (application no. 20042501303039397888)
    ('收入/支出金额', '金额'),
    ('工作日期', '记账日期'),
    # Landscape, table: CCB personal current-account transaction details
    # Portrait, table: CCB personal current-account transaction details CH-B005832604
    # Portrait, table: CCB payroll bill CH-B008786812
    # Portrait, table: CCB personal current-account transaction details CH-B005832604 (2)
    ('交易金额', '金额'),
    ('账户余额', '余额'),
    ('对方账号与户名', '对方卡号/账号'),
    # WeChat
    ('交易时间', '记账时间'),
    ('交易类型', '附言'),
    ('金额(元)', '金额'),
    ('金额\uff08元\uff09', '金额'),
    ('交易对方', '对方账户名'),
    # Alipay
    ('时间', '记账日期'),
    ('名称/备注', '附言'),
    # ---------------- Regular print, partial grid lines ----------------
    # Portrait, no table: Agricultural Bank of China
    # Portrait, no table: Agricultural Bank of China CH-B008805428
    ('摘要/附言', '附言'),
    ('交易地点/对方账号和户名', '对方卡号/账号'),
    # Agricultural Bank of China, narrow page
    ('交易对手账号', '对方卡号/账号'),
    # Portrait, special: Rural Commercial Bank
    ('交易发生额', '金额'),
    # Landscape, special: CITIC Bank, account transaction details
    ('对方银行', '对方开户行'),
    ('交易摘要', '附言'),
    # Ping An e-statement
    ('借贷发生额(借:-贷:+)', '金额'),
    # ---------------- Regular print, no grid lines ----------------
    # Portrait, no table: China Merchants Bank (slightly skewed)
    # Portrait, no table: China Merchants Bank account history transaction table
    ('联机余额', '余额'),
    # Portrait, no table: PSBC account statement with counterparty name and account
    # Portrait, no table: PSBC account statement
    # Portrait, no table: PSBC account statement with electronic seal
    ('交易金额(元)', '金额'),
    ('交易金额\uff08元\uff09', '金额'),
    ('账户余额(元)', '余额'),
    ('账户余额\uff08元\uff09', '余额'),
    ('对手方户名', '对方账户名'),
    ('对手方账户', '对方卡号/账号'),
    # Landscape, no table: China Guangfa Bank account transaction history --> deprecated
    # Portrait, no table: China Guangfa Bank account transaction history --> deprecated
    ('会计日期', '记账日期'),
    ('对手户名', '对方账户名'),
    ('对手账号', '对方卡号/账号'),
    # China Merchants Bank e-statement. TODO: contains English text, needs testing
    ('对手信息', '对方账户名'),
    ('摘要代码', '附言'),
    # Landscape, no table: Minsheng Bank personal account statement (customer card number)
    # Landscape, no table: Minsheng Bank, untitled (customer account)
    # Landscape, no table: Minsheng Bank
    ('摘要信息', '附言'),
    ('对方行名', '对方开户行'),
    # Portrait, no table: Agricultural Bank of China, integer amounts
    # Portrait, no table: Agricultural Bank of China bank-card transaction detail list
    ('对方账号和户名', '对方卡号/账号'),
    # Portrait, no table: Agricultural Bank of China bank-card/passbook current transaction detail list (pdf)
    # Portrait, no table: Agricultural Bank of China, expanded (pdf)
    # Portrait, no table: Agricultural Bank of China, indented (pdf)
    ('日期', '记账日期'),
    ('短摘要', '附言'),
    ('本次余额', '余额'),
    # Portrait, no table: Agricultural Bank of China, untitled (对手帐号)
    ('交易后余额', '余额'),
    ('对手帐号', '对方卡号/账号'),
    # Portrait, no table: Rural Commercial Bank (non-standard)
    ('交易说明', '附言'),
    # Portrait, no table: ICBC, three-line heading, current-account history list
    ('对方账户', '对方卡号/账号'),
    # ---------------- Dot-matrix print, full grid lines ----------------
    # Portrait, table: CCB current-account transaction details
    # Portrait, table: CCB current-account detail list
    # Portrait, table: CCB private current-account details - (1) (pdf)
    ('帐户余额', '余额'),
    ('对方帐户名称', '对方账户名'),
    # Portrait, special: Bank of Communications retail customer transaction list, transactions over 5000
    ('交易日期 记账日期', '记账日期'),
    # ---------------- Dot-matrix print, partial grid lines ----------------
    # Portrait, special: PSBC passbook / green-card transaction details (customer)
    # Portrait, special: PSBC account transaction details (customer)
    ('对方账号/卡号/汇票号', '对方卡号/账号'),
)
# Alias header text -> 1-based fixed column number.
HEADERS_MAPPING = {
    alias: BASE_HEADERS_MAPPING[label]
    for alias, label in _HEADER_ALIASES
}
# --------------------------------------------------------------------------------------------------------------------
# ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出')
# CLASSIFY_LIST = [
# # --------------普通打印:全格线---------------------------------
# # 中国银行:记账日期 记账时间 币别 金额 余额 交易名称 渠道 网点名称 附言 对方账户名 对方卡号/账号 对方开户行
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则)
#
# # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言
# ('农业银行-10', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)), # 横版-表格-农业银行-中国农业银行个人账户明细
#
# # 农业银行:序号 日期 摘要 交易金额 余额 对方账号 对方名称 交易地点 渠道 附言
# ('农业银行-10-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)),
#
# # 农业银行:交易日期 摘要 交易金额 余额 交易渠道 交易网点 对方账号 对方名称 附言
# ('农业银行-9', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)),
#
# # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 渠道
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 对方户名 对方账号 渠道
# ('工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)),
#
# # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行
# # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行
# ('建设银行-竖版', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)),
# ('建设银行-横版', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
#
# # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号
# ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
#
# # 支付宝:流水号 时间 名称/备注 收入 支出 账户余额 资金渠道
# ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)),
#
# # -----------------普通打印:部分格线--------------------------------
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行-5', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农业银行:日期 地点 摘要 存入 支出 余额 对方账号 对方户名
# ('农业银行-8', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)),
# # 农业银行:日期 摘要 交易金额 余额 地点 交易对手账号 对方户名
# ('农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)),
#
# # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注
# ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)),
#
# # 中信银行:交易日期 交易摘要 收入金额 支出金额 账户余额 对方户名 对方账号 对方银行 交易流水号
# ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
#
# # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
# # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
# ('建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
#
# # -----------------普通打印:无格线-------------------------------------
#
# # 招商银行:记账日期 货币 交易金额 联机余额 冲补账 交易摘要
# ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)),
#
# # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
# # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单
# ('邮储银行-8', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)),
# ('邮储银行-5', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)),
#
# # 工商银行电子版:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
# ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
#
# # 招商银行电子版:记账日期 货币 交易金额 联机余额 交易摘要 对手信息
# ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)),
#
# # 民生银行:凭证类型 凭证号码 摘要信息 交易时间 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名 --> 横版-无表格-民生银行-中国民生银行个人账户对账单(客户卡号)
# # 凭证类型 凭证号码 交易时间 摘要 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名 --> 横版-无表格-民生银行
# ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
#
# # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名
# ('农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农业银行:日期、时间、短摘要、交易金额、本次余额、交易网点、渠道、附言
# # 农业银行:日期、时间、日志号、短摘要、交易金额、本次余额、交易网点、渠道、附言
# ('农业银行', (1, 2, 4, 5, None, 3, None, None, None, None, None, None, None)),
# ('农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额
# ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
#
#
# # ================针式打印:有格线===================
#
# # 建设银行: 摘要、交易日期、交易金额、账户余额、商户/网点号及其名称、对方账号、对方户名 --> 竖版-表格-建设银行-中国建设银行活期账户明细清单
# # 交易日期、摘要、 币种、 钞汇、 交易金额、 帐户余额、对方账号、 对方帐户名称 --> 竖版-表格-建设银行-对私活期账户明细- (1)
# ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)),
#
#
# # ================针式打印:无格线===================
#
# # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
# ]
# All-None placeholder with 13 slots, used for templates without a column layout.
OTHER_TUPLE = (None,) * 13
# {
# "0":"其他",
# "1":"普通打印-全表格-中国农业银行",
# "2":"普通打印-全表格-中国银行",
# "3":"普通打印-全表格-北京银行",
# "4":"普通打印-全表格-工商银行",
# "5":"普通打印-全表格-建设银行",
# "6":"普通打印-全表格-微信账单",
# "7":"普通打印-全表格-支付宝账单",
# "8":"普通打印-无格线-中国邮政储蓄银行",
# "9":"普通打印-无格线-交通银行",
# "10":"普通打印-无格线-农业银行整数",
# "11":"普通打印-无格线-农业银行银行活期扩张缩进",
# "12":"普通打印-无格线-招商银行",
# "13":"普通打印-无格线-招行电子账单",
# "14":"普通打印-无格线-民生银行",
# "15":"普通打印-部分格线-横版-中信银行",
# "16":"普通打印-部分格线-竖版-中国农业银行分账户窄页",
# "17":"普通打印-部分格线-竖版-农业银行",
# "18":"普通打印-部分格线-竖版-农业银行银行卡交易明细",
# "19":"普通打印-部分格线-竖版-平安电子账单",
# "20":"针式打印-全格线-建设银行",
# "21":"针式打印-部分格线-竖版-邮储银行账户交易",
# "22":"针式打印-部分格线-邮储银行一本通绿卡"
# }
# CLASSIFY_LIST = [
# ('其他', OTHER_TUPLE),
# ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)),
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
# ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
# ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)),
#
# ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
# ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
# ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)),
# ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)),
# ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)),
# ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
#
# ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)),
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
# ]
# "4":"普通打印-全表格-中国银行",
# "5":"普通打印-全表格-农业银行-10列",
# "6":"普通打印-全表格-农业银行-10列-1",
# "7":"普通打印-全表格-农业银行-9列",
# "8":"普通打印-全表格-北京银行",
# "9":"普通打印-全表格-工商银行",
# "10":"普通打印-全表格-工商银行-电子账单",
# "11":"普通打印-全表格-建设银行",
# "12":"普通打印-全表格-微信账单",
# "13":"普通打印-全表格-支付宝账单",
# "14":"普通打印-无格线-交通银行",
# "15":"普通打印-无格线-储蓄银行-5列",
# "16":"普通打印-无格线-储蓄银行-8列",
# "17":"普通打印-无格线-农业银行-扩张缩进",
# "18":"普通打印-无格线-农业银行-整数",
# "19":"普通打印-无格线-招商银行",
# "20":"普通打印-无格线-招商银行-电子账单",
# "21":"普通打印-无格线-民生银行",
# "22":"普通打印-部分格线-横版-中信银行",
# "23":"普通打印-部分格线-竖版-农业银行-5列",
# "24":"普通打印-部分格线-竖版-农业银行-8列",
# "25":"普通打印-部分格线-竖版-农业银行-窄页",
# "26":"普通打印-部分格线-竖版-平安电子账单",
# "27":"普通打印-部分格线-竖版-建设银行-电子账单",
# "34":"针式打印-全格线-建设银行",
# "35":"针式打印-部分格线-竖版-邮储银行",
# "36":"针式打印-部分格线-竖版-邮储银行-绿卡",
# Template table: each entry is (template name, 13-slot tuple).
# The names at positions 4-27 and 34-36 match the id -> name table in the
# comment block directly above, so the list index is taken to be the id.
# The remaining positions hold the '其他' (other) placeholder with OTHER_TUPLE.
# Do not reorder or remove entries.
# NOTE(review): the 13 slots appear to follow the 13 fixed headers (记账日期 ... 支出) and to hold the
# 1-based source column of each header, None when absent — confirm against the code that reads this list.
CLASSIFY_LIST = [
('其他', OTHER_TUPLE),  # 0
('其他', OTHER_TUPLE),  # 1
('其他', OTHER_TUPLE),  # 2
('其他', OTHER_TUPLE),  # 3
('普通打印-全表格-中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),  # 4
('普通打印-全表格-农业银行-10列', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)),  # 5
('普通打印-全表格-农业银行-10列-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)),  # 6
('普通打印-全表格-农业银行-9列', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)),  # 7
('普通打印-全表格-北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),  # 8
('普通打印-全表格-工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),  # 9
('普通打印-全表格-工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)),  # 10
('普通打印-全表格-建设银行', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)),  # 11
('普通打印-全表格-微信账单', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),  # 12
('普通打印-全表格-支付宝账单', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)),  # 13
('普通打印-无格线-交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),  # 14
('普通打印-无格线-储蓄银行-5列', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)),  # 15
('普通打印-无格线-储蓄银行-8列', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)),  # 16
('普通打印-无格线-农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)),  # 17
('普通打印-无格线-农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),  # 18
('普通打印-无格线-招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)),  # 19
('普通打印-无格线-招商银行-电子账单', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)),  # 20
('普通打印-无格线-民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),  # 21
('普通打印-部分格线-横版-中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),  # 22
('普通打印-部分格线-竖版-农业银行-5列', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),  # 23
('普通打印-部分格线-竖版-农业银行-8列', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)),  # 24
('普通打印-部分格线-竖版-农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)),  # 25
('普通打印-部分格线-竖版-平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),  # 26
('普通打印-部分格线-竖版-建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),  # 27
('其他', OTHER_TUPLE),  # 28
('其他', OTHER_TUPLE),  # 29
('其他', OTHER_TUPLE),  # 30
('其他', OTHER_TUPLE),  # 31
('其他', OTHER_TUPLE),  # 32
('其他', OTHER_TUPLE),  # 33
('针式打印-全格线-建设银行', OTHER_TUPLE),  # 34
('针式打印-部分格线-竖版-邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),  # 35
('针式打印-部分格线-竖版-邮储银行-绿卡', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),  # 36
('其他', OTHER_TUPLE),  # 37
]
# ----------license相关------------------------------------------------------------------------------------------------
# "0":"AVT Invioce",
# "1":"二手车发票",
# "2":"其他",
# "3":"护照",
# "28":"机动车登记证",
# "29":"机动车销售统一发票",
# "30":"港澳通行证",
# "31":"营业执照",
# "32":"行驶证",
# "33":"身份证",
# "37":"银行卡"
# Other
OTHER_CLASSIFY = 2

# Identity card
IC_CN_NAME = '身份证'
IC_CLASSIFY = 33
# Both members of every pair are the same label here, so the pairs are generated.
IC_FIELD_ORDER_0 = tuple(
    (label, label)
    for label in ('姓名', '公民身份号码', '出生年月', '住址', '性别', '民族')
)
IC_FIELD_ORDER_1 = tuple(
    (label, label)
    for label in ('有效期限', '签发机关')
)

# Residence permit
RP_CN_NAME = '居住证'
RP_CLASSIFY = 10087
RP_FIELD_ORDER_0 = tuple(
    (label, label)
    for label in ('姓名', '公民身份号码', '出生年月', '住址', '性别')
)
# Shares its second field order with the identity card.
RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1
# VAT invoice
VAT_CN_NAME = 'VAT普票'
VAT_CLASSIFY = 0
# Ordered pairs of field labels.
# NOTE(review): which member is the OCR key and which the displayed label is not visible here — confirm with the consumer.
VAT_FIELD_ORDER = (
    ('发票代码', '发票代码'),
    ('发票代码(开具)', '发票代码(开具)'),
    ('发票号码', '发票号码'),
    ('发票号码(开具)', '发票号码(开具)'),
    ('开票日期', '开票日期'),
    ('校验码', '校验码'),
    ('货物或应税劳务、服务名称', '货物或应税劳务、服务名称'),
    ('金额合计', '开具金额合计(不含税)'),
    ('税率', '税率'),
    ('税额合计', '税额合计'),
    ('价税合计小写', '价税合计(小写)'),
    ('价税合计大写', '价税合计(大写)'),
    ('购方名称', '购买方名称'),
    ('购方纳税人识别号', '购买方纳税人识别号'),
    ('购方地址、电话', '购买方地址、电话'),
    ('购方开户行及账号', '购买方开户行及账号'),
    ('销方名称', '销售方名称'),
    ('销方纳税人识别号', '销售方纳税人识别号'),
    ('销方地址、电话', '销售方地址、电话'),
    ('销方开户行及账号', '销售方开户行及账号'),
    ('销售方:(章)', '销售方:(章)'),
    ('备注', '备注'),
)
# Motor vehicle registration certificate
MVC_CN_NAME = '机动车登记证书'
MVC_CLASSIFY = 28
MVC_CLASSIFY_SE = 10086
# Field order for pages 1-2. A pair whose second member is None is a
# blank-row placeholder ('空行占位').
MVC_FIELD_ORDER_1_2 = (
    ('1.机动车所有人/身份证名称/号码', '机动车所有人/身份证明名称/号码'),
    ('3.登记日期', '登记日期'),
    ('9.车辆识别代号/车架号', '车辆识别代号/车架号'),
    ('32.车辆出厂日期', '车辆出厂日期'),
    ('34.发证日期', '发证日期'),
    ('30.使用性质', '使用性质'),
    ('31.车辆获得方式', '车辆获得方式'),
    ('4.机动车登记编号', '机动车登记编号'),
    ('空行占位', None),
    ('5.车辆类型', '车辆类型'),
    ('6.车辆品牌', '车辆品牌'),
    ('7.车辆型号', '车辆型号'),
    ('8.车身颜色', '车身颜色'),
    ('10.国产/进口', '国产/进口'),
    ('11.发动机号', '发动机号'),
    ('12.发动机型号', '发动机型号'),
    ('15.制造厂名称', '制造厂名称'),
    ('2.登记机关', '登记机关'),
    ('编号', '机动车登记证书编号'),
)
# Field order for pages 3-4.
MVC_FIELD_ORDER_3_4 = (
    ('姓名/名称', '姓名/名称'),
    ('身份证明名称/号码', '身份证明名称/号码'),
    ('转移登记日期', '转移登记日期'),
)
# Field orders for the MVC_CLASSIFY_SE variant.
MVC_SE_FIELD_ORDER_1_2 = (
    ('9.车辆识别代号/车架号', '车辆识别代号/车架号'),
    ('1.机动车所有人/身份证名称/号码', '机动车所有人/身份证明名称/号码'),
    ('空行占位', None),
    ('3.登记日期', '登记日期'),
    ('32.车辆出厂日期', '车辆出厂日期'),
    ('34.发证日期', '发证日期'),
    ('30.使用性质', '使用性质'),
    ('31.车辆获得方式', '车辆获得方式'),
    ('5.车辆类型', '车辆类型'),
    ('6.车辆品牌', '车辆品牌'),
    ('7.车辆型号', '车辆型号'),
    ('8.车身颜色', '车身颜色'),
    ('10.国产/进口', '国产/进口'),
    ('11.发动机号', '发动机号'),
    ('12.发动机型号', '发动机型号'),
    ('13.燃料种类', '燃料种类'),
    ('14.排量/功率', '排量/功率'),
    ('15.制造厂名称', '制造厂名称'),
    ('16.转向形式', '转向形式'),
    ('17.轮距', '轮距'),
    ('18.轮胎数', '轮胎数'),
    ('19.轮胎规格', '轮胎规格'),
    ('20.钢板弹簧片数', '钢板弹簧片数'),
    ('21.轴距', '轴距'),
    ('22.轴数', '轴数'),
    ('23.外廓尺寸', '外廓尺寸'),
    ('24.货厢内部尺寸', '货厢内部尺寸'),
    ('25.总质量', '总质量'),
    ('26.核定载质量', '核定载质量'),
    ('27.核定载客', '核定载客'),
    ('28.准牵引总质量', '准牵引总质量'),
    ('29.驾驶室载客', '驾驶室载客'),
    ('2.登记机关', '登记机关'),
    ('4.机动车登记编号', '机动车登记编号'),
    ('编号', '机动车登记证书编号'),
)
# Same three pairs as MVC_FIELD_ORDER_3_4.
MVC_SE_FIELD_ORDER_3_4 = tuple(MVC_FIELD_ORDER_3_4)
# Unified motor vehicle sales invoice
MVI_CN_NAME = '机动车销售统一发票'
MVI_CLASSIFY = 29
# Ordered pairs of field labels; the pair ('', None) is an empty placeholder row.
# The "nodo" markers are carried over from the original comments unchanged.
MVI_FIELD_ORDER = (
    ('发票代码', '发票代码'),
    ('发票号码', '发票号码'),
    ('开票日期', '开票日期'),
    ('不含税价', '不含税价'),
    ('发票类型', '发票联'),
    ('购方名称', '购买方名称'),
    ('购买方身份证号或组织机构代码', '购买方证件号码'),
    ('纳税人识别号', '纳税人识别号'),  # nodo
    ('车辆识别代码', '车架号'),
    ('价税合计小写', '价税合计小写'),
    ('销方名称', '销货单位名称'),
    ('增值税税额', '增值税税额'),
    ('增值税税率', '增值税税率'),  # nodo
    ('发票章有无', '发票章有无'),  # nodo: national invoice supervision seal / seller's seal
    ('价税合计大写', '价税合计大写'),  # nodo
    ('', None),
    ('发动机号码', '发动机号'),
    ('车辆类型', '车辆类型'),  # nodo
    ('厂牌型号', '厂牌型号'),  # nodo
    ('产地', '产地'),  # nodo
    ('合格证号', '合格证号'),  # nodo
    ('进口证明书号', '进口证明书号'),  # nodo
    ('商检单号', '商检单号'),  # nodo
    ('电话', '电话'),  # nodo
    ('销方纳税人识别号', '销货方纳税人识别号'),
    ('账号', '账号'),  # nodo
    ('地址', '地址'),  # nodo
    ('开户银行', '开户银行'),  # nodo
    ('主管税务机关及代码', '主管税务机关及代码'),  # nodo
    ('吨位', '吨位'),  # nodo
    ('限乘人数', '限乘人数'),  # nodo
)
# These four licence types have no PID.
IC_PID = None
VAT_PID = None
MVC_PID = None
MVI_PID = None
# Business licence
BL_CN_NAME = '营业执照'
BL_CLASSIFY = 31
BL_PID = 41
BL_FIELD_ORDER = (
    ('注册号', '统一社会信用代码'),
    ('企业名称', '名称'),
    ('企业类型', '类型'),
    ('经营者姓名', '法定代表人'),
    ('成立日期', '成立日期'),
    ('营业期限', '营业期限'),
    ('注册资本', '注册资本'),
    ('地址', '住所'),
    ('经营范围', '经营范围'),
)

# Used-car invoice
UCI_CN_NAME = '二手车发票'
UCI_CLASSIFY = 1
UCI_PID = 60
# The pair ('空行占位', None) is a blank-row placeholder.
UCI_FIELD_ORDER = (
    ('发票代码', '发票代码'),
    ('发票号码', '发票号码'),
    ('开票日期', '开票日期'),
    ('车价合计', '车价合计小写'),
    ('发票联', '发票联'),
    ('购方单位', '买方单位/个人'),
    ('购方号码', '买方单位代码/身份证号码'),
    ('车架号码', '车架号'),
    ('车价合计大写', '车价合计大写'),
    ('二手车市场', '二手车市场'),
    ('发票章有无', '发票章有无'),
    ('空行占位', None),
    ('车牌照号', '车牌照号'),
    ('登记证号', '登记证号'),
    ('购方地址', '买方单位/住址'),
    ('车辆类型', '车辆类型'),
    ('厂牌型号', '厂牌型号'),
    ('车管所名称', '转入地车辆管理所名称'),
    ('销方名称', '卖方单位/个人'),
    ('销方号码', '卖方单位代码/身份证号码'),
    ('销方地址', '卖方单位/个人住址'),
)

# Hong Kong / Macao / Taiwan travel permit
EEP_CN_NAME = '港澳台通行证'
EEP_CLASSIFY = 30
EEP_PID = 1018
EEP_FIELD_ORDER = (
    ('中文名', '姓名'),  # English name
    ('证件号码', '证件号码'),
    ('签发次数', '换证次数(签发次数)'),
    ('有效期限', '有效期限'),
    ('出生日期', '出生日期'),
    ('性别', '性别'),
    ('签发机关', '签发机关'),
    ('签发地点', '签发地点'),
)
# Vehicle licence
DL_CN_NAME = '行驶证'
DL_CLASSIFY = 32
DL_PID = 5
DL_FIELD_ORDER_0 = (
    ('号牌号码', '1 号牌号码'),
    ('所有人', '3 所有人'),
    ('使用性质', '5 使用性质'),
    ('车辆识别代码', '7 车辆识别代号'),
    ('注册日期', '9 注册日期'),
    ('发证日期', '10 发证日期'),
    ('车辆类型', '2 车辆类型'),
    ('地址', '4 住址'),
    ('品牌型号', '6 品牌型号'),
    ('发动机号', '8 发动机号码'),
)
DL_FIELD_ORDER_1 = (
    ('号牌号码', '1 号牌号码'),
    ('档案编号', '11 档案编号'),
    ('核定载人数', '12 核定载人数'),
    ('总质量', '13 总质量'),
    ('整备质量', '14 整备质量'),
    ('核定载质量', '15 核对载质量'),
    ('外廓尺寸', '16 外廓尺寸'),
    ('准牵引总质量', '17 准牵引总质量'),
)

# Passport
PP_CN_NAME = '护照'
PP_CLASSIFY = 3
PP_PID = 8
PP_FIELD_ORDER = (
    ('类型', '类型/Type'),
    ('英文姓名', '姓名/Name'),
    ('护照号码', '护照号码/Passport No'),
    ('有效期至', '有效期至/Date of expiry'),
    ('签发日期', '签发日期/Date of issue'),
    ('国家码', '国家码/Country Code'),
    ('性别', '性别/Sex'),
    ('国籍', '国籍/Nationality'),
    ('出生日期', '出生日期/Date of birth'),
    ('出生地点', '出生地点/Place of birth'),
    ('签发地点', '签发地点/Place of issue'),
)

# Bank card
BC_CN_NAME = '银行卡'
BC_CLASSIFY = 37
BC_PID = 4
# Earlier, longer field list kept for reference:
# BC_FIELD = (('CardNum', '银行卡号'),
#             ('BankName', '发卡行名称'),
#             ('CardName', '银行卡名称'),
#             ('BankCode', '发卡行代号'),
#             ('CardType', '银行卡类型'),
#             ('Date', '日期'))
BC_FIELD_ORDER = (
    ('BankName', '发卡行名称'),
    ('CardNum', '银行卡号'),
    ('CardType', '银行卡类型'),
)
# Success codes, accepted both as string and as int.
SUCCESS_CODE_SET = {'0', 0}

# Classify id -> (field label, field order 1, field order 0) for licence types
# that have more than one field order.
# NOTE(review): the label looks like the field whose presence selects the first
# order over the second — confirm with the consumer.
FIELD_ORDER_MAP = {
    IC_CLASSIFY: ('有效期限', IC_FIELD_ORDER_1, IC_FIELD_ORDER_0),
    RP_CLASSIFY: ('有效期限', RP_FIELD_ORDER_1, RP_FIELD_ORDER_0),
    DL_CLASSIFY: ('档案编号', DL_FIELD_ORDER_1, DL_FIELD_ORDER_0),
    MVC_CLASSIFY: ('转移登记日期', MVC_FIELD_ORDER_3_4, MVC_FIELD_ORDER_1_2),
    MVC_CLASSIFY_SE: ('转移登记日期', MVC_SE_FIELD_ORDER_3_4, MVC_SE_FIELD_ORDER_1_2),
}

# Ordered (classify id, (pid, Chinese name, field order or None, flag, flag)) pairs.
# In every entry the field order is None exactly when the first flag is True;
# those classify ids (except MVC_CLASSIFY_SE, which is not listed here) are the
# keys of FIELD_ORDER_MAP.
# NOTE(review): the meaning of the two boolean flags is not visible here — confirm with the consumer.
LICENSE_ORDER = (
    (MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, False)),
    (IC_CLASSIFY, (IC_PID, IC_CN_NAME, None, True, False)),
    (RP_CLASSIFY, (None, RP_CN_NAME, None, True, False)),
    (BC_CLASSIFY, (BC_PID, BC_CN_NAME, BC_FIELD_ORDER, False, False)),
    (BL_CLASSIFY, (BL_PID, BL_CN_NAME, BL_FIELD_ORDER, False, False)),
    (UCI_CLASSIFY, (UCI_PID, UCI_CN_NAME, UCI_FIELD_ORDER, False, False)),
    (EEP_CLASSIFY, (EEP_PID, EEP_CN_NAME, EEP_FIELD_ORDER, False, False)),
    (DL_CLASSIFY, (DL_PID, DL_CN_NAME, None, True, False)),
    (PP_CLASSIFY, (PP_PID, PP_CN_NAME, PP_FIELD_ORDER, False, False)),
    (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True)),
    (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False)),
)
# The same data keyed by classify id.
LICENSE_CLASSIFY_MAPPING = {
    classify: license_meta
    for classify, license_meta in LICENSE_ORDER
}

OTHER_CLASSIFY_SET = {OTHER_CLASSIFY}
# Set 1 holds the licence types whose *_PID is None; set 2 holds those with a numeric PID.
LICENSE_CLASSIFY_SET_1 = {
    IC_CLASSIFY,
    VAT_CLASSIFY,
    MVC_CLASSIFY,
    MVI_CLASSIFY,
}
LICENSE_CLASSIFY_SET_2 = {
    BL_CLASSIFY,
    UCI_CLASSIFY,
    EEP_CLASSIFY,
    DL_CLASSIFY,
    PP_CLASSIFY,
    BC_CLASSIFY,
}
......@@ -4,8 +4,6 @@ import json
import shutil
import base64
import signal
import asyncio
import aiohttp
import difflib
import requests
import traceback
......@@ -24,8 +22,9 @@ from apps.doc import consts
from apps.doc.ocr.edms import EDMS, rh
from apps.doc.named_enum import KeywordsType
from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception
from apps.doc.ocr.wb import BSWorkbook, Workbook
from apps.doc.ocr.wb import BSWorkbook
from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
from celery_compare.tasks import compare
class Command(BaseCommand, LoggerMixin):
......@@ -545,7 +544,7 @@ class Command(BaseCommand, LoggerMixin):
self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
self.cronjob_log.error('{0} [process error (db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
error_list.append(1)
return
......@@ -595,27 +594,27 @@ class Command(BaseCommand, LoggerMixin):
except Exception as e:
self.cronjob_log.error('{0} [process error (ocr fetch)] [img_path={1}] [error={2}]'.format(
self.log_base, img_path, traceback.format_exc()))
else:
try:
del json_data_1
# /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg
# AFC_2
path_split = img_path.split('/')
task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3]))
try:
del json_data_1
# /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg
# AFC_2
path_split = img_path.split('/')
task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3]))
with lock:
doc_res_dict = res_dict.setdefault(task_str, {})
doc_res_dict[img_path] = ocr_1_res
res_dict[task_str] = doc_res_dict
todo_count = todo_count_dict.get(task_str)
if todo_count == 1:
finish_queue.put(task_str)
del todo_count_dict[task_str]
else:
todo_count_dict[task_str] = todo_count - 1
except Exception as e:
self.cronjob_log.error('{0} [process error (store ocr res)] [img_path={1}] [error={2}]'.format(
self.log_base, img_path, traceback.format_exc()))
with lock:
doc_res_dict = res_dict.setdefault(task_str, {})
doc_res_dict[img_path] = ocr_1_res
res_dict[task_str] = doc_res_dict
todo_count = todo_count_dict.get(task_str)
if todo_count == 1:
finish_queue.put(task_str)
del todo_count_dict[task_str]
else:
todo_count_dict[task_str] = todo_count - 1
except Exception as e:
self.cronjob_log.error('{0} [process error (store ocr res)] [img_path={1}] [error={2}]'.format(
self.log_base, img_path, traceback.format_exc()))
def res_2_wb(self, res_dict, img_queue, finish_queue, lock, error_list):
while len(error_list) == 0 or not img_queue.empty() or not finish_queue.empty():
......@@ -626,221 +625,220 @@ class Command(BaseCommand, LoggerMixin):
time.sleep(self.sleep_time_task_get)
continue
else:
self.cronjob_log.info('{0} [res_2_wb] [get task] [task={1}]'.format(self.log_base, task_str))
ocr_1_res = res_dict.pop(task_str, {})
business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, doc_id_str)
excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc_id_str))
try:
self.cronjob_log.info('{0} [res_2_wb] [get task] [task={1}]'.format(self.log_base, task_str))
ocr_1_res = res_dict.get(task_str, {})
# self.cronjob_log.info('{0} [res_2_wb] [get task res] [task={1}]'.format(
# self.log_base, task_str))
# 4.OCR结果并且构建excel文件
bs_summary = {}
license_summary = {}
unknown_summary = {}
res_list = []
interest_keyword = Keywords.objects.filter(
type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
salary_keyword = Keywords.objects.filter(
type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
loan_keyword = Keywords.objects.filter(
type=KeywordsType.LOAN.value, on_off=True).values_list('keyword', flat=True)
wechat_keyword = Keywords.objects.filter(
type=KeywordsType.ALI_WECHART.value, on_off=True).values_list('keyword', flat=True)
wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword, wechat_keyword)
for img_path, res in ocr_1_res.items():
pno, ino = self.parse_img_path(img_path)
part_idx = 1
if res.get('code') == 1:
ocr_data_list = res.get('data', [])
if not isinstance(ocr_data_list, list):
res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path))
else:
for part_idx, ocr_data in enumerate(ocr_data_list):
part_idx = part_idx + 1
classify = ocr_data.get('classify')
if classify is None:
res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(
self.log_base, img_path))
continue
elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_OTHER))
continue
elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
self.license1_process(ocr_data, license_summary, classify, res_list,
pno, ino, part_idx, img_path)
elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
file_data = ocr_data.get('section_img')
if file_data is None:
with open(img_path, 'rb') as f:
base64_data = base64.b64encode(f.read())
# 获取解码后的base64值
file_data = base64_data.decode()
json_data_2 = {
"pid": str(pid),
"filedata": file_data
}
for times in range(consts.RETRY_TIMES):
try:
start_time = time.time()
ocr_2_response = requests.post(self.ocr_url_2, data=json_data_2)
if ocr_2_response.status_code != 200:
raise OCR2Exception('ocr_2 status code: {0}'.format(ocr_2_response.status_code))
except Exception as e:
self.cronjob_log.warn(
'{0} [ocr_2 failed] [times={1}] [img_path={2}] [error={3}]'.format(
self.log_base, times, img_path, traceback.format_exc()))
doc = doc_class.objects.filter(id=doc_id).first()
except Exception as e:
self.cronjob_log.error('{0} [process error (db filter)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
else:
try:
# 4.OCR结果并且构建excel文件
bs_summary = {}
unknown_summary = {}
license_summary = {}
res_list = []
interest_keyword = Keywords.objects.filter(
type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
salary_keyword = Keywords.objects.filter(
type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
loan_keyword = Keywords.objects.filter(
type=KeywordsType.LOAN.value, on_off=True).values_list('keyword', flat=True)
wechat_keyword = Keywords.objects.filter(
type=KeywordsType.ALI_WECHART.value, on_off=True).values_list('keyword', flat=True)
wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword, wechat_keyword)
for img_path, res in ocr_1_res.items():
pno, ino = self.parse_img_path(img_path)
part_idx = 1
if res.get('code') == 1:
ocr_data_list = res.get('data', [])
if not isinstance(ocr_data_list, list):
res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path))
else:
for part_idx, ocr_data in enumerate(ocr_data_list):
part_idx = part_idx + 1
classify = ocr_data.get('classify')
if classify is None:
res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(
self.log_base, img_path))
continue
elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_OTHER))
continue
elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
self.license1_process(ocr_data, license_summary, classify, res_list,
pno, ino, part_idx, img_path)
elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
file_data = ocr_data.get('section_img')
if file_data is None:
with open(img_path, 'rb') as f:
base64_data = base64.b64encode(f.read())
# 获取解码后的base64值
file_data = base64_data.decode()
json_data_2 = {
"pid": str(pid),
"filedata": file_data
}
for times in range(consts.RETRY_TIMES):
try:
start_time = time.time()
ocr_2_response = requests.post(self.ocr_url_2, data=json_data_2)
if ocr_2_response.status_code != 200:
raise OCR2Exception('ocr_2 status code: {0}'.format(ocr_2_response.status_code))
except Exception as e:
self.cronjob_log.warn(
'{0} [ocr_2 failed] [times={1}] [img_path={2}] [error={3}]'.format(
self.log_base, times, img_path, traceback.format_exc()))
else:
ocr_2_res = json.loads(ocr_2_response.text)
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.info(
'{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
self.log_base, img_path, speed_time))
if classify == consts.BC_CLASSIFY:
name = '有'
json_data_3 = {
"file": file_data,
'card_res': ocr_2_res
}
card_name_response = requests.post(self.ocr_url_3, json_data_3)
if card_name_response.status_code == 200:
card_name_res = card_name_response.json()
if isinstance(card_name_res, dict) and \
card_name_res.get('data', {}).get('is_exists_name') == 0:
name = '无'
ocr_2_res['Name'] = name
self.license2_process(ocr_2_res, license_summary, pid, classify, res_list, pno, ino, part_idx)
break
else:
ocr_2_res = json.loads(ocr_2_response.text)
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.info(
'{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
self.log_base, img_path, speed_time))
if classify == consts.BC_CLASSIFY:
name = '有'
json_data_3 = {
"file": file_data,
'card_res': ocr_2_res
}
card_name_response = requests.post(self.ocr_url_3, json_data_3)
if card_name_response.status_code == 200:
card_name_res = card_name_response.json()
if isinstance(card_name_res, dict) and \
card_name_res.get('data', {}).get('is_exists_name') == 0:
name = '无'
ocr_2_res['Name'] = name
self.license2_process(ocr_2_res, license_summary, pid, classify, res_list, pno, ino, part_idx)
break
else:
res_list.append((pno, ino, part_idx, consts.RES_FAILED_2))
self.cronjob_log.warn(
'{0} [ocr_2 failed] [img_path={1}]'.format(self.log_base, img_path))
else: # 流水处理
self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx)
else:
res_list.append((pno, ino, part_idx, consts.RES_FAILED_1))
self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path))
with lock:
del res_dict[task_str]
# self.cronjob_log.info('{0} [res_dict record] [res_dict={1}]'.format(
# self.log_base, res_dict))
res_list.append((pno, ino, part_idx, consts.RES_FAILED_2))
self.cronjob_log.warn(
'{0} [ocr_2 failed] [img_path={1}]'.format(self.log_base, img_path))
else: # 流水处理
self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx)
else:
res_list.append((pno, ino, part_idx, consts.RES_FAILED_1))
self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path))
self.cronjob_log.info('{0} [task={1}] [bs_summary={2}] [unknown_summary={3}] '
'[license_summary={4}]'.format(self.log_base, task_str, bs_summary,
unknown_summary, license_summary))
self.cronjob_log.info('{0} [task={1}] [bs_summary={2}] [unknown_summary={3}] '
'[license_summary={4}]'.format(self.log_base, task_str, bs_summary,
unknown_summary, license_summary))
self.license_log.info('[task={0}] [license_summary={1}]'.format(task_str, license_summary))
idcard_list = license_summary.get(consts.IC_CLASSIFY)
if idcard_list:
self.idcard_log.info('[task={0}] [idcard={1}]'.format(task_str, idcard_list))
self.license_log.info('[task={0}] [license_summary={1}]'.format(task_str, license_summary))
idcard_list = license_summary.get(consts.IC_CLASSIFY)
if idcard_list:
self.idcard_log.info('[task={0}] [idcard={1}]'.format(task_str, idcard_list))
merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
self.bs_log.info('[task={0}] [bs_summary={1}]'.format(task_str, merged_bs_summary))
self.bs_log.info('[task={0}] [bs_summary={1}]'.format(task_str, merged_bs_summary))
self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] '
'[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary,
unknown_summary, res_list))
del unknown_summary
self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] '
'[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary,
unknown_summary, res_list))
del unknown_summary
# 4.2 重构Excel文件
# doc, business_type = self.get_doc_object(task_str)
business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
doc = doc_class.objects.filter(id=doc_id).first()
doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
# src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
# wb.save(src_excel_path)
count_list = wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
wb.save(excel_path)
except Exception as e:
try:
with lock:
if task_str in res_dict:
del res_dict[task_str]
# doc, business_type = self.get_doc_object(task_str)
business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
doc = doc_class.objects.filter(id=doc_id).first()
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.warn('{0} [process failed (res to wb)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (wb end)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
try:
doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
img_save_path = os.path.join(doc_data_path, 'img')
shutil.rmtree(img_save_path, ignore_errors=True)
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
os.remove(pdf_path)
except Exception as e:
self.cronjob_log.error('{0} [process error (file remove 1)] [task={1}] [error={2}]'.format(
self.cronjob_log.warn('{0} [process failed (res conformity)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
else:
try:
img_save_path = os.path.join(doc_data_path, 'img')
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
shutil.rmtree(img_save_path, ignore_errors=True)
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
os.remove(pdf_path)
# os.remove(src_excel_path)
except Exception as e:
self.cronjob_log.error('{0} [process error (file remove 2)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
try:
# 5.上传至EDMS
for times in range(consts.RETRY_TIMES):
try:
self.edms.upload(excel_path, doc, business_type)
except Exception as e:
self.cronjob_log.warn(
'{0} [edms upload failed] [times={1}] [task={2}] [error={3}]'.format(
self.log_base, times, task_str, traceback.format_exc()))
edms_exc = str(e)
else:
break
else:
raise EDMSException(edms_exc)
except Exception as e:
try:
doc.status = DocStatus.UPLOAD_FAILED.value
doc.end_time = timezone.now()
doc.duration = min((doc.end_time - doc.start_time).seconds, 32760)
for field, count in count_list:
if hasattr(doc, field):
setattr(doc, field, count)
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.warn('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (edms upload)] [task={1}] [error={2}]'.format(
self.cronjob_log.error('{0} [process error (db save)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
else:
try:
doc.status = DocStatus.COMPLETE.value
doc.end_time = timezone.now()
doc.duration = min((doc.end_time - doc.start_time).seconds, 32760)
for field, count in count_list:
if hasattr(doc, field):
setattr(doc, field, count)
doc.save()
self.cronjob_log.info('{0} [process complete] [task={1}]'.format(self.log_base, task_str))
os.remove(excel_path)
# 重构Excel文件
# src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
# wb.save(src_excel_path)
count_list = wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
wb.save(excel_path)
except Exception as e:
self.cronjob_log.error('{0} [process error (completed)] [task={1}] [error={2}]'.format(
self.cronjob_log.warn('{0} [process failed (wb rebuild)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
except Exception as e:
self.cronjob_log.error('{0} [process error (db save)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
else:
try:
# 上传至EDMS
for times in range(consts.RETRY_TIMES):
try:
self.edms.upload(excel_path, doc, business_type)
except Exception as e:
self.cronjob_log.warn(
'{0} [edms upload failed] [times={1}] [task={2}] [error={3}]'.format(
self.log_base, times, task_str, traceback.format_exc()))
edms_exc = str(e)
else:
break
else:
raise EDMSException(edms_exc)
except Exception as e:
doc.status = DocStatus.UPLOAD_FAILED.value
self.cronjob_log.warn('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
else:
doc.status = DocStatus.COMPLETE.value
self.cronjob_log.info('{0} [edms upload success] [task={1}]'.format(self.log_base, task_str))
finally:
try:
doc.end_time = timezone.now()
doc.duration = min((doc.end_time - doc.start_time).seconds, 32760)
for field, count in count_list:
if hasattr(doc, field):
setattr(doc, field, count)
doc.save()
except Exception as e:
self.cronjob_log.error('{0} [process error (db save)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
else:
self.cronjob_log.info('{0} [process complete] [task={1}]'.format(self.log_base, task_str))
os.remove(excel_path)
finally:
# TODO 识别结果存一张表,方便跑报表
# 更新OCR累计识别结果表
# 触发比对
compare.apply_async((doc.application_id, business_type, None, ocr_res_id), queue='queue_compare')
finally:
try:
img_save_path = os.path.join(doc_data_path, 'img')
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc_id_str)))
shutil.rmtree(img_save_path, ignore_errors=True)
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id_str))
os.remove(pdf_path)
except Exception as e:
self.cronjob_log.error('{0} [process error (pdf & img remove)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
def handle(self, *args, **kwargs):
db.close_old_connections()
lock = Lock()
......
......@@ -165,3 +165,107 @@ class Configs(models.Model):
verbose_name = '配置信息'
verbose_name_plural = verbose_name
# Comparison info table (AFC)
class AFCComparisonInfo(models.Model):
    """Row of the ``afc_comparison_info`` table.

    Stores one comparison-info record keyed by ``application_id`` and
    ``uniq_seq``, with the customer / used-car / corporate payloads kept as
    text columns.
    """
    id = models.BigAutoField(primary_key=True, verbose_name="id")  # primary key
    uniq_seq = models.CharField(max_length=128, verbose_name="唯一序列号")  # unique sequence number; index? (open question from the author)
    application_id = models.CharField(max_length=64, verbose_name="申请id")  # application id; intended to be indexed
    # CUSTOMER_TYPE = ['TCCOR', 'TCDAS', 'TCFRE', 'TCIAS', 'TCIND', 'TCSEP', 'TCURE']
    customer_type = models.CharField(max_length=16, verbose_name="顾客类型")  # customer type
    application_version = models.SmallIntegerField(default=0, verbose_name="应用版本")  # application version
    vehicle_status = models.CharField(max_length=16, verbose_name="车辆状况")  # VEHICLE_STATUS = ['PCUSD', 'PCNEW']
    individual_cus_info = models.TextField(verbose_name="个人信息")  # individual customer info (required)
    usedcar_info = models.TextField(null=True, verbose_name="二手车信息")  # used-car info (nullable)
    corporate_cus_info = models.TextField(null=True, verbose_name="企业信息")  # corporate customer info (nullable)
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')  # refreshed on every save
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')  # set once on creation; intended to be indexed

    class Meta:
        # managed = False: Django does not create or migrate this table.
        managed = False
        db_table = 'afc_comparison_info'
        # NOTE(review): non-standard Meta option; presumably read by the
        # project's database router to pick the AFC database — confirm.
        situ_db_label = 'afc'
# Comparison info table (HIL)
class HILComparisonInfo(models.Model):
    """Row of the ``hil_comparison_info`` table.

    Same column layout as the AFC comparison-info model, bound to the HIL
    table and database label.
    """
    id = models.BigAutoField(primary_key=True, verbose_name="id")  # primary key
    uniq_seq = models.CharField(max_length=128, verbose_name="唯一序列号")  # unique sequence number; index? (open question from the author)
    application_id = models.CharField(max_length=64, verbose_name="申请id")  # application id; intended to be indexed
    # CUSTOMER_TYPE = ['TCCOR', 'TCDAS', 'TCFRE', 'TCIAS', 'TCIND', 'TCSEP', 'TCURE']
    customer_type = models.CharField(max_length=16, verbose_name="顾客类型")  # customer type
    application_version = models.SmallIntegerField(default=0, verbose_name="应用版本")  # application version
    vehicle_status = models.CharField(max_length=16, verbose_name="车辆状况")  # VEHICLE_STATUS = ['PCUSD', 'PCNEW']
    individual_cus_info = models.TextField(verbose_name="个人信息")  # individual customer info (required)
    usedcar_info = models.TextField(null=True, verbose_name="二手车信息")  # used-car info (nullable)
    corporate_cus_info = models.TextField(null=True, verbose_name="企业信息")  # corporate customer info (nullable)
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')  # refreshed on every save
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')  # set once on creation; intended to be indexed

    class Meta:
        # managed = False: Django does not create or migrate this table.
        managed = False
        db_table = 'hil_comparison_info'
        # NOTE(review): non-standard Meta option; presumably read by the
        # project's database router to pick the HIL database — confirm.
        situ_db_label = 'hil'
# Accumulated OCR result table (AFC)
class AFCOCRResult(models.Model):
    """Row of the ``afc_ocr_result`` table.

    One nullable text column per document category, keyed by
    ``application_id``.
    """
    id = models.AutoField(primary_key=True, verbose_name="id")  # primary key
    application_id = models.CharField(max_length=64, verbose_name="申请id")  # application id; intended to be indexed
    bs_ocr = models.TextField(null=True, verbose_name="银行流水")  # bank statement
    mvi_ocr = models.TextField(null=True, verbose_name="机动车销售统一发票")  # motor vehicle sales invoice
    ic_ocr = models.TextField(null=True, verbose_name="身份证")  # ID card
    rp_ocr = models.TextField(null=True, verbose_name="居住证")  # residence permit
    bc_ocr = models.TextField(null=True, verbose_name="银行卡")  # bank card
    bl_ocr = models.TextField(null=True, verbose_name="营业执照")  # business license
    uci_ocr = models.TextField(null=True, verbose_name="二手车发票")  # used-car invoice
    eep_ocr = models.TextField(null=True, verbose_name="港澳台通行证")  # Hong Kong / Macau / Taiwan travel permit
    dl_ocr = models.TextField(null=True, verbose_name="行驶证")  # vehicle license
    pp_ocr = models.TextField(null=True, verbose_name="护照")  # passport
    mvc_ocr = models.TextField(null=True, verbose_name="机动车登记证")  # motor vehicle registration certificate
    vat_ocr = models.TextField(null=True, verbose_name="增值税发票")  # VAT invoice
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')  # refreshed on every save
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')  # set once on creation

    class Meta:
        # managed = False: Django does not create or migrate this table.
        managed = False
        db_table = 'afc_ocr_result'
        # NOTE(review): non-standard Meta option; presumably read by the
        # project's database router to pick the AFC database — confirm.
        situ_db_label = 'afc'
# Accumulated OCR result table (HIL)
class HILOCRResult(models.Model):
    """Row of the ``hil_ocr_result`` table.

    Same column layout as the AFC OCR-result model, bound to the HIL table
    and database label.
    """
    id = models.AutoField(primary_key=True, verbose_name="id")  # primary key
    application_id = models.CharField(max_length=64, verbose_name="申请id")  # application id; intended to be indexed
    bs_ocr = models.TextField(null=True, verbose_name="银行流水")  # bank statement
    mvi_ocr = models.TextField(null=True, verbose_name="机动车销售统一发票")  # motor vehicle sales invoice
    ic_ocr = models.TextField(null=True, verbose_name="身份证")  # ID card
    rp_ocr = models.TextField(null=True, verbose_name="居住证")  # residence permit
    bc_ocr = models.TextField(null=True, verbose_name="银行卡")  # bank card
    bl_ocr = models.TextField(null=True, verbose_name="营业执照")  # business license
    uci_ocr = models.TextField(null=True, verbose_name="二手车发票")  # used-car invoice
    eep_ocr = models.TextField(null=True, verbose_name="港澳台通行证")  # Hong Kong / Macau / Taiwan travel permit
    dl_ocr = models.TextField(null=True, verbose_name="行驶证")  # vehicle license
    pp_ocr = models.TextField(null=True, verbose_name="护照")  # passport
    mvc_ocr = models.TextField(null=True, verbose_name="机动车登记证")  # motor vehicle registration certificate
    vat_ocr = models.TextField(null=True, verbose_name="增值税发票")  # VAT invoice
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')  # refreshed on every save
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')  # set once on creation

    class Meta:
        # managed = False: Django does not create or migrate this table.
        managed = False
        db_table = 'hil_ocr_result'
        # NOTE(review): non-standard Meta option; presumably read by the
        # project's database router to pick the HIL database — confirm.
        situ_db_label = 'hil'
......
import locale
import numpy as np
from pandas._libs import tslib
from pandas._libs.tslibs.nattype import NaTType
from pandas.core.indexes.datetimes import DatetimeIndex
from openpyxl import Workbook
from openpyxl.styles import Border, Side, PatternFill, numbers
from openpyxl.utils import get_column_letter
from apps.doc import consts
class BSWorkbook(Workbook):
def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs):
super().__init__(*args, **kwargs)
locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')
self.meta_sheet_title = '关键信息提取和展示'
self.blank_row = (None,)
self.code_header = ('页数', '电子回单验证码')
self.date_header = ('打印时间', '起始日期', '终止日期', '流水区间结果')
self.keyword_header = ('关键词', '记账日期', '金额')
self.interest_keyword = interest_keyword
self.salary_keyword = salary_keyword
self.loan_keyword = loan_keyword
self.proof_res = ('对', '错')
self.loan_fill = PatternFill("solid", fgColor="00FFCC00")
self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
# self.bd = Side(style='thin', color="000000")
# self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)
self.MAX_MEAN = 31
@staticmethod
def sheet_prune(ws, classify):
ws.insert_cols(1, amount=consts.FIXED_COL_AMOUNT)
moved_col_set = set()
header_col_set = set()
# 根据第一行关键词排列
for col in range(consts.FIXED_COL_AMOUNT + 1, ws.max_column + 1):
header_value = ws.cell(1, col).value
header_col = consts.HEADERS_MAPPING.get(header_value)
if header_col is not None and header_col not in header_col_set:
letter = get_column_letter(col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_col - col)
moved_col_set.add(col)
header_col_set.add(header_col)
elif header_value in consts.BORROW_HEADERS_SET:
letter = get_column_letter(col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.BORROW_HEADER_COL - col)
moved_col_set.add(col)
header_col_set.add(consts.BORROW_HEADER_COL)
elif header_value in consts.INCOME_HEADERS_SET:
letter = get_column_letter(col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.INCOME_HEADER_COL - col)
moved_col_set.add(col)
header_col_set.add(consts.INCOME_HEADER_COL)
elif header_value in consts.OUTLAY_HEADERS_SET:
letter = get_column_letter(col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.OUTLAY_HEADER_COL - col)
moved_col_set.add(col)
header_col_set.add(consts.OUTLAY_HEADER_COL)
# 缺失表头再次查找
for header_col in range(1, consts.FIXED_COL_AMOUNT + 1):
if header_col in header_col_set or header_col == consts.RESULT_HEADER_COL:
continue
fix_col = consts.CLASSIFY_LIST[classify][1][header_col - 1]
if fix_col is None:
continue
fix_col = fix_col + consts.FIXED_COL_AMOUNT
if fix_col in moved_col_set:
break
letter = get_column_letter(fix_col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_col - fix_col)
ws.delete_cols(consts.FIXED_COL_AMOUNT + 1, amount=ws.max_column)
min_row = 1 if len(moved_col_set) == 0 else 2
return min_row
@staticmethod
def month_split(dti, date_list, date_statistics):
month_list = []
idx_list = []
month_pre = None
for idx, month_str in enumerate(dti.strftime('%Y-%m')):
if isinstance(month_str, float):
continue
if month_str != month_pre:
month_list.append(month_str)
if month_pre is None:
if date_statistics:
date_list.append(dti[idx].date())
idx = 0
idx_list.append(idx)
month_pre = month_str
if date_statistics:
for idx in range(len(dti) - 1, -1, -1):
if isinstance(dti[idx], NaTType):
continue
date_list.append(dti[idx].date())
break
return month_list, idx_list
@staticmethod
def get_reverse_trend(day_idx, idx_list):
reverse_trend = 0
pre_day = None
for idx, day in enumerate(day_idx):
if np.isnan(day):
continue
if idx in idx_list or pre_day is None:
pre_day = day
continue
if day < pre_day:
reverse_trend += 1
pre_day = day
elif day > pre_day:
reverse_trend -= 1
pre_day = day
if reverse_trend > 0:
reverse_trend = 1
elif reverse_trend < 0:
reverse_trend = -1
return reverse_trend
def sheet_split(self, ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics):
for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=min_row, values_only=True):
date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src]
dt_array, tz_parsed = tslib.array_to_datetime(
np.array(date_tuple, copy=False, dtype=np.object_),
errors="coerce",
utc=False,
dayfirst=False,
yearfirst=False,
require_iso8601=True,
)
dti = DatetimeIndex(dt_array, tz=None, name=None)
month_list, idx_list = self.month_split(dti, date_list, date_statistics)
if len(month_list) == 0:
# month_info process
month_info = month_mapping.setdefault('xxxx-xx', [])
month_info.append((ws.title, min_row, ws.max_row, 0))
else:
# reverse_trend_list process
reverse_trend = self.get_reverse_trend(dti.day, idx_list)
reverse_trend_list.append(reverse_trend)
# month_info process
day_idx = dti.day
idx_list_max_idx = len(idx_list) - 1
for i, item in enumerate(month_list):
if i == idx_list_max_idx:
day_mean = np.mean(day_idx[idx_list[i]:].dropna())
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + min_row, ws.max_row, day_mean))
else:
day_mean = np.mean(day_idx[idx_list[i]: idx_list[i + 1]].dropna())
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean))
def build_metadata_rows(self, confidence, code, print_time, start_date, end_date):
if start_date is None or end_date is None:
timedelta = None
else:
timedelta = (end_date - start_date).days
metadata_rows = [
('流水识别置信度', confidence),
self.blank_row,
self.code_header,
]
metadata_rows.extend(code)
metadata_rows.extend(
[self.blank_row,
self.date_header,
(print_time, start_date, end_date, timedelta),
self.blank_row,
self.keyword_header]
)
return metadata_rows
def create_meta_sheet(self, card):
if self.worksheets[0].title == 'Sheet':
ms = self.worksheets[0]
ms.title = '{0}({1})'.format(self.meta_sheet_title, card[-6:])
else:
ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, card[-6:]))
return ms
def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date):
metadata_rows = self.build_metadata_rows(confidence, code, print_time, start_date, end_date)
ms = self.create_meta_sheet(card)
for row in metadata_rows:
ms.append(row)
return ms
@staticmethod
def amount_format(amount_str):
if not isinstance(amount_str, str) or amount_str == '':
return amount_str
# 1.替换
res_str = amount_str.translate(consts.TRANS)
# 2.首字符处理
first_char = res_str[0]
if first_char in consts.ERROR_CHARS:
first_char = '-'
# 3.删除多余的-
res_str = first_char + res_str[1:].replace('-', '')
# 4.逗号与句号处理
if len(res_str) >= 4:
period_idx = len(res_str) - 3
if res_str[period_idx] == '.' and res_str[period_idx - 1] == ',':
res_str = '{0}{1}'.format(res_str[:period_idx - 1], res_str[period_idx:])
elif res_str[period_idx] == ',':
res_str = '{0}.{1}'.format(res_str[:period_idx], res_str[period_idx + 1:])
return res_str
def build_month_sheet(self, card, month_mapping, ms, is_reverse):
tmp_ws = self.create_sheet('tmp_ws')
for month in sorted(month_mapping.keys()):
# 3.1.拷贝数据
parts = month_mapping.get(month)
new_ws = self.create_sheet('{0}({1})'.format(month, card[-6:]))
new_ws.append(consts.FIXED_HEADERS)
for part in parts:
ws = self.get_sheet_by_name(part[0])
for row_value in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True):
new_ws.append(row_value)
# 3.2.提取信息、高亮
amount_mapping = {}
amount_fill_row = set()
for rows in new_ws.iter_rows(min_row=2):
summary_cell = rows[consts.SUMMARY_IDX]
date_cell = rows[consts.DATE_IDX]
amount_cell = rows[consts.AMOUNT_IDX]
row = summary_cell.row
# 关键词1提取
if summary_cell.value in self.interest_keyword:
ms.append((summary_cell.value, date_cell.value, amount_cell.value))
# 关键词2提取至临时表
elif summary_cell.value in self.salary_keyword:
tmp_ws.append((summary_cell.value, date_cell.value, amount_cell.value))
# 贷款关键词高亮
elif summary_cell.value in self.loan_keyword:
summary_cell.fill = self.loan_fill
amount_error = False
# 3.3.余额转数值
over_cell = rows[consts.OVER_IDX]
try:
over_cell.value = locale.atof(self.amount_format(over_cell.value))
except Exception as e:
amount_error = True
else:
over_cell.number_format = numbers.FORMAT_NUMBER_00
# 3.4.金额转数值
try:
try:
amount_cell.value = locale.atof(self.amount_format(amount_cell.value))
except Exception as e:
try:
amount_cell.value = locale.atof(self.amount_format(rows[consts.INCOME_IDX].value))
if amount_cell.value == 0:
raise
elif amount_cell.value < 0:
amount_cell.value = -amount_cell.value
except Exception as e:
amount_cell.value = locale.atof(self.amount_format(rows[consts.OUTLAY_IDX].value))
if amount_cell.value > 0:
amount_cell.value = -amount_cell.value
except Exception as e:
amount_error = True
else:
if rows[consts.BORROW_IDX].value in consts.BORROW_OUTLAY_SET:
amount_cell.value = -amount_cell.value
amount_cell.number_format = numbers.FORMAT_NUMBER_00
same_amount_mapping = amount_mapping.get(date_cell.value, {})
fill_rows = same_amount_mapping.get(-amount_cell.value)
if fill_rows:
amount_fill_row.add(row)
amount_fill_row.update(fill_rows)
amount_mapping.setdefault(date_cell.value, {}).setdefault(
amount_cell.value, []).append(row)
# 3.5.核对结果
if row > 2 and not amount_error:
if is_reverse:
rows[consts.RESULT_IDX].value = '=IF(D{0}=ROUND(SUM(D{1},C{0}),2), "{2}", "{3}")'.format(
row - 1, row, *self.proof_res)
else:
rows[consts.RESULT_IDX].value = '=IF(D{0}=ROUND(SUM(D{1},C{0}),2), "{2}", "{3}")'.format(
row, row - 1, *self.proof_res)
# 删除金额辅助列
new_ws.delete_cols(consts.BORROW_HEADER_COL, amount=new_ws.max_column)
# 3.6.同一天相同进出账高亮
del amount_mapping
for row in amount_fill_row:
new_ws[row][consts.AMOUNT_IDX].fill = self.amount_fill
# 关键词2信息提取
ms.append(self.blank_row)
ms.append(self.keyword_header)
for row in tmp_ws.iter_rows(values_only=True):
ms.append(row)
self.remove(tmp_ws)
def bs_rebuild(self, bs_summary):
# bs_summary = {
# '卡号': {
# 'classify': 0,
# 'confidence': 0.9,
# 'role': '柳雪',
# 'code': [('page', 'code')],
# 'print_time': 'datetime',
# 'start_date': 'datetime',
# 'end_date': 'datetime',
# 'sheet': ['sheet_name']
# }
# }
for card, summary in bs_summary.items():
# 1.原表修剪、排列、按照月份分割
start_date = summary.get('start_date')
end_date = summary.get('end_date')
date_statistics = False
if start_date is None or end_date is None:
date_statistics = True
date_list = []
month_mapping = {}
reverse_trend_list = []
for sheet in summary.get('sheet', []):
ws = self.get_sheet_by_name(sheet)
# 1.1.删除多余列、排列
min_row = self.sheet_prune(ws, summary.get('classify', 0))
# 1.2.按月份分割
self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics)
if date_statistics is True and len(date_list) > 1:
start_date = min(date_list) if start_date is None else start_date
end_date = max(date_list) if end_date is None else end_date
# 2.元信息提取表
ms = self.build_meta_sheet(card,
summary.get('confidence', 1),
summary.get('code'),
summary.get('print_time'),
start_date,
end_date)
# 3.创建月份表、提取/高亮关键行
# 倒序处理
is_reverse = True if sum(reverse_trend_list) > 0 else False
for month_list in month_mapping.values():
month_list.sort(key=lambda x: x[-1], reverse=is_reverse)
self.build_month_sheet(card, month_mapping, ms, is_reverse)
# 4.删除原表
for sheet in summary.get('sheet'):
self.remove(self.get_sheet_by_name(sheet))
def license_rebuild(self, license_summary, document_scheme):
    """Write one sheet per license category present in ``license_summary``.

    ``consts.LICENSE_ORDER`` supplies, for each category, the sheet name,
    the default field order and two flags (``side_diff``, ``scheme_diff``).
    Each license dict becomes a run of ``(label, value)`` rows followed by
    a separator row.

    :param license_summary: mapping of category -> list of license dicts.
        Entries re-routed below are appended to the ``consts.RP_CLASSIFY``
        list of this same mapping.
    :param document_scheme: compared against ``consts.DOC_SCHEME_LIST[1]``
        to decide whether the category is swapped for
        ``consts.MVC_CLASSIFY_SE``.
    """
    for classify, (_, sheet_name, default_order, side_diff, scheme_diff) in consts.LICENSE_ORDER:
        records = license_summary.get(classify)
        if not records:
            continue

        sheet = self.create_sheet(sheet_name)

        # Category used for the checks and lookups below.
        lookup_classify = classify
        if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
            lookup_classify = consts.MVC_CLASSIFY_SE

        for record in records:
            if lookup_classify == consts.IC_CLASSIFY and record.get('类别') == '1':
                # Hand this record over to the RP category instead of
                # writing it to the current sheet.
                license_summary.setdefault(consts.RP_CLASSIFY, []).append(record)
                continue

            if side_diff:
                # The field order depends on whether the marker key is
                # present in the record.
                marker, order_with, order_without = consts.FIELD_ORDER_MAP.get(lookup_classify)
                pairs = order_with if marker in record else order_without
            else:
                pairs = default_order

            for search_field, write_field in pairs:
                sheet.append((write_field, record.get(search_field, '')))
            # Separator row between two licenses.
            sheet.append((None, ))
def skip_img_sheet(self, skip_img):
    """Create the sheet that lists skipped images; do nothing when empty.

    :param skip_img: iterable of row tuples, appended below the header
        ``consts.SKIP_IMG_SHEET_HEADER``.
    """
    if not skip_img:
        return

    sheet = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
    sheet.append(consts.SKIP_IMG_SHEET_HEADER)
    for entry in skip_img:
        sheet.append(entry)
def rebuild(self, bs_summary, license_summary, skip_img, document_scheme):
    """Run the three rebuild stages in their fixed order.

    :param bs_summary: forwarded to ``bs_rebuild``.
    :param license_summary: forwarded to ``license_rebuild``.
    :param skip_img: forwarded to ``skip_img_sheet``.
    :param document_scheme: forwarded to ``license_rebuild``.
    """
    # Stage 1: bank-statement sheets.
    self.bs_rebuild(bs_summary)
    # Stage 2: license sheets.
    self.license_rebuild(license_summary, document_scheme)
    # Stage 3: skipped-image sheet.
    self.skip_img_sheet(skip_img)
......@@ -15,11 +15,11 @@ from common import response
from common.mixins import GenericView
from common.tools.file_tools import file_write
from common.redis_cache import redis_handler as rh
from .models import UploadDocRecords, DocStatus, PriorityApplication, GCAPRecords
from .models import UploadDocRecords, DocStatus, PriorityApplication, GCAPRecords, AFCComparisonInfo, HILComparisonInfo
from .mixins import DocHandler
from . import consts
from apps.account.authentication import OAuth2AuthenticationWithUser
from celery_compare.tasks import test
from celery_compare.tasks import compare
# restframework将request.body封装至request.data, webargs从request.data中获取参数
......@@ -312,8 +312,23 @@ class CompareView(GenericView):
# Endpoint through which POS uploads comparison information
@use_args(compare_args, location='data')
def post(self, request, args):
    """Store the uploaded comparison info and trigger the comparison task.

    :param request: the incoming request (not read directly here).
    :param args: the parsed request data produced by ``use_args``.
    :return: ``response.ok()``.
    """
    self.running_log.info('in')

    # Persist the comparison info.
    uniq_seq = args.get('uniqSeq')
    business_type = args.get('applicationEntity')
    application_id = args.get('applicationId')
    # Entities listed in consts.HIL_SET go to the HIL table, all others to AFC.
    comparison_class = HILComparisonInfo if business_type in consts.HIL_SET else AFCComparisonInfo
    comparison_class.objects.create(
        uniq_seq=uniq_seq,
        application_id=application_id,
        customer_type=args.get('customerType'),
        application_version=args.get('applicationVersion'),
        vehicle_status=args.get('vehicleStatus'),
        individual_cus_info=None,
        usedcar_info=None,
        corporate_cus_info=None,
    )

    # Trigger the comparison. The former debug dispatch
    # ``test.apply_async((args, ), ...)`` is superseded by this call and is
    # no longer issued, so each request enqueues exactly one task.
    compare.apply_async((application_id, business_type, uniq_seq, None), queue='queue_compare')
    return response.ok()
post.openapi_doc = '''
......
......@@ -8,4 +8,4 @@ broker = conf.CELERY_BROKER_URL
# Celery application for the comparison workers; tasks are loaded from
# celery_compare.tasks.
app = Celery('celery_compare', broker=broker, include=['celery_compare.tasks'])
# Applied once: the configuration call was duplicated by diff residue.
app.conf.update(worker_max_tasks_per_child=5, timezone='Asia/Shanghai')
......
......@@ -6,7 +6,12 @@ compare_log = logging.getLogger('compare')
@app.task
def test(info):
    """Debug task: log the id of the ``AFCDoc`` row with id 1, then ``info``.

    :param info: arbitrary payload, written to the compare log as-is.
    """
    doc = AFCDoc.objects.filter(id=1).first()
    # ``first()`` yields None when no row matches; log None instead of
    # failing on ``doc.id``.
    compare_log.info(doc.id if doc is not None else None)
    compare_log.info(info)
@app.task
def compare(application_id, application_entity, uniq_seq, ocr_res_id):
    """Comparison task (not implemented yet; currently a no-op).

    Registered as a Celery task because the view dispatches it with
    ``compare.apply_async(...)``.

    Argument conventions noted by the author:

    * POS: ``application_id, application_entity, uniq_seq, None``
    * OCR: ``application_id, business_type (application_entity), None,
      ocr_res_id``
    """
    # Planned steps:
    # 1. Look up the latest comparison info by application_id; stop if none.
    # 2. Analyse the comparison info to find which licenses need comparing.
    # 3. Look up the given license fields in the accumulated OCR results by
    #    application_id; stop if none.
    # 4. Compare the information and send the result to GCAP.
    pass
......
import os
import smtplib
from email import encoders
from email.header import Header
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
# Host and port that MailSender passes to smtplib.SMTP.connect().
MAIL_SERVER_HOST = 'smtp.exmail.qq.com'
MAIL_SERVER_PORT = 25
# Timeout handed to smtplib.SMTP(timeout=...); presumably seconds, as in
# the smtplib documentation.
TIME_OUT = 50
class MailSender:
    """Send HTML e-mails, optionally with attachments, over one SMTP session.

    The connection is opened and authenticated in ``__init__``; call
    ``close()`` when the sender is no longer needed.
    """

    def __init__(self, sender, pwd):
        """Connect to the configured SMTP server and log in.

        :param sender: login name, also used as the ``From`` address.
        :param pwd: password for ``sender``.
        """
        self.sender = sender
        self.pwd = pwd
        self.server = smtplib.SMTP(timeout=TIME_OUT)
        self.server.debuglevel = 0
        self.server.connect(host=MAIL_SERVER_HOST,
                            port=MAIL_SERVER_PORT)
        self.server.login(self.sender, self.pwd)

    def close(self):
        """Close the connection to the SMTP server."""
        self.server.close()

    def send(self, to_addrs, subject, content, file_list=None):
        """Build and send one HTML message.

        :param to_addrs: list of recipient addresses.
        :param subject: subject line (encoded as UTF-8).
        :param content: HTML fragment placed between the fixed greeting and
            the fixed footer.
        :param file_list: optional iterable of file paths to attach;
            ``None`` (the default) or an empty list means no attachments.
        """
        msg = MIMEMultipart()

        # ``None`` replaces the former mutable ``[]`` default.
        for att_file in file_list or ():
            att = MIMEBase('application', 'octet-stream')
            # Close the attachment file as soon as it has been read.
            with open(att_file, 'rb') as att_handle:
                att.set_payload(att_handle.read())
            encoders.encode_base64(att)
            att.add_header('Content-Disposition',
                           'attachment',
                           filename=Header(os.path.basename(att_file), 'utf-8').encode())
            msg.attach(att)

        msg['Subject'] = Header(subject, 'utf-8')
        msg['From'] = self.sender
        msg['To'] = ','.join(to_addrs)

        content = u'Hi:<br><br>' + \
            content + \
            u'<br><br>祝好!<br><br><br>本邮件为系统自动发送,请勿直接回复!<hr>'
        msg.attach(MIMEText(content.encode('utf-8'), 'html', 'utf-8'))

        self.server.sendmail(self.sender, to_addrs, msg.as_string())
# smtplib.SMTPServerDisconnected
# if __name__ == '__main__':
# mail_sender = MailSender()
# mail_sender.send(['1304057458@qq.com'], 'hello', 'world.', [])
# mail_sender.close()
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!