7dfc2ee8 by 周伟奇

merge license

2 parents 1242adb8 e570371a
......@@ -33,6 +33,5 @@ data/*
# 脚本
src/*.sh
test.py
test*
ocr_test.py
\ No newline at end of file
ocr_test_2.py
\ No newline at end of file
......
......@@ -35,9 +35,35 @@ DEALER_CODE_META_FIELD_id = 13
BUSINESS_TYPE_META_FIELD_id = 93
DEALER_CODE = 'ocr_situ_group'
RETRY_TIMES = 3
# ---------Bank-statement template constants---------------------------------------------------------------------------
# OCR look-alike characters mapped to the digit(s) they most likely stand for.
# A value may be longer than one character (e.g. 'B' -> "13").
TRANS_MAP = {
    'C': "0",
    'c': "0",
    '(': "0",
    'o': "0",
    'O': "0",
    'D': "0",
    '[': "1",
    ']': "1",
    'l': "1",
    'L': "1",
    'A': "4",
    's': "5",
    'S': "5",
    'b': "6",
    'g': "9",
    'E': "9",
    'B': "13",
}
# Translation table for str.translate(), built once from TRANS_MAP.
# (A previous positional str.maketrans(...) assignment to TRANS was dead code:
# it was unconditionally overwritten by this one, so it has been dropped.)
TRANS = str.maketrans(TRANS_MAP)
# Characters that are replaced by a minus sign when they lead an amount string.
# NOTE(review): ':' is listed twice - one of them may have been meant to be a
# different (e.g. full-width) colon; confirm against the original file.
ERROR_CHARS = {'.', ':', ':', '•', '·'}
SKIP_IMG_SHEET_NAME = '未处理图片'
SKIP_IMG_SHEET_HEADER = ('页码', '序号')
CARD_RATIO = 0.9
UNKNOWN_CARD = '未知卡号'
......@@ -95,7 +121,7 @@ HEADERS_MAPPING.update(
HEADERS_MAPPING.update(
{
'交易日期': BASE_HEADERS_MAPPING['记账日期'],
'存入': BASE_HEADERS_MAPPING['金额'],
# '存入': BASE_HEADERS_MAPPING['金额'],
'对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'],
'对方名称': BASE_HEADERS_MAPPING['对方账户名'],
'摘要': BASE_HEADERS_MAPPING['附言'],
......@@ -160,6 +186,12 @@ HEADERS_MAPPING.update(
'交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# Agricultural Bank of China - narrow-page layout: its counterparty-account
# header maps onto the same base column as '对方卡号/账号'.
HEADERS_MAPPING['交易对手账号'] = BASE_HEADERS_MAPPING['对方卡号/账号']
# 竖版-特殊-农商行
HEADERS_MAPPING.update(
{
......@@ -299,17 +331,27 @@ HEADERS_MAPPING.update(
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则)
#
# # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言
# ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细
# ('农业银行-10', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)), # 横版-表格-农业银行-中国农业银行个人账户明细
#
# # 农业银行:序号 日期 摘要 交易金额 余额 对方账号 对方名称 交易地点 渠道 附言
# ('农业银行-10-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)),
#
# # 农业银行:交易日期 摘要 交易金额 余额 交易渠道 交易网点 对方账号 对方名称 附言
# ('农业银行-9', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)),
#
# # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 渠道
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 对方户名 对方账号 渠道
# ('工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)),
#
# # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行
# # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行
# ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('建设银行-竖版', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)),
# ('建设银行-横版', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
#
# # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号
# ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
......@@ -320,7 +362,13 @@ HEADERS_MAPPING.update(
# # -----------------普通打印:部分格线--------------------------------
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行-5', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农业银行:日期 地点 摘要 存入 支出 余额 对方账号 对方户名
# ('农业银行-8', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)),
# # 农业银行:日期 摘要 交易金额 余额 地点 交易对手账号 对方户名
# ('农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)),
#
# # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注
# ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)),
......@@ -330,6 +378,9 @@ HEADERS_MAPPING.update(
#
# # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
# # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
# ('建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
#
# # -----------------普通打印:无格线-------------------------------------
#
......@@ -338,7 +389,8 @@ HEADERS_MAPPING.update(
#
# # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
# # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单
# ('邮储银行', (1, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('邮储银行-8', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)),
# ('邮储银行-5', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)),
#
# # 工商银行电子版:交易日期 账号 储种 序号 币种 妙汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
# ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
......@@ -351,13 +403,15 @@ HEADERS_MAPPING.update(
# ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
#
# # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名
# ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
# ('农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言
# ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)),
# # 农业银行:日期、时间、短摘要、交易金额、本次余额、交易网点、渠道、附言
# # 农业银行:日期、时间、日志号、短摘要、交易金额、本次余额、交易网点、渠道、附言
# ('农业银行', (1, 2, 4, 5, None, 3, None, None, None, None, None, None, None)),
# ('农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额
# ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
......@@ -374,11 +428,10 @@ HEADERS_MAPPING.update(
#
# # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
#
# # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
# ('建设银行', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
# ]
OTHER_TUPLE = (None, None, None, None, None, None, None, None, None, None, None, None, None)
# {
# "0":"其他",
# "1":"普通打印-全表格-中国农业银行",
......@@ -408,67 +461,163 @@ HEADERS_MAPPING.update(
# "22":"针式打印-部分格线-邮储银行一本通绿卡"
# }
# CLASSIFY_LIST = [
# ('其他', OTHER_TUPLE),
# ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)),
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
# ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
# ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)),
#
# ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
# ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
# ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)),
# ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)),
# ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)),
# ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
#
# ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)),
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
# ]
# "4":"普通打印-全表格-中国银行",
# "5":"普通打印-全表格-农业银行-10列",
# "6":"普通打印-全表格-农业银行-10列-1",
# "7":"普通打印-全表格-农业银行-9列",
# "8":"普通打印-全表格-北京银行",
# "9":"普通打印-全表格-工商银行",
# "10":"普通打印-全表格-工商银行-电子账单",
# "11":"普通打印-全表格-建设银行",
# "12":"普通打印-全表格-微信账单",
# "13":"普通打印-全表格-支付宝账单",
# "14":"普通打印-无格线-交通银行",
# "15":"普通打印-无格线-储蓄银行-5列",
# "16":"普通打印-无格线-储蓄银行-8列",
# "17":"普通打印-无格线-农业银行-扩张缩进",
# "18":"普通打印-无格线-农业银行-整数",
# "19":"普通打印-无格线-招商银行",
# "20":"普通打印-无格线-招商银行-电子账单",
# "21":"普通打印-无格线-民生银行",
# "22":"普通打印-部分格线-横版-中信银行",
# "23":"普通打印-部分格线-竖版-农业银行-5列",
# "24":"普通打印-部分格线-竖版-农业银行-8列",
# "25":"普通打印-部分格线-竖版-农业银行-窄页",
# "26":"普通打印-部分格线-竖版-平安电子账单",
# "27":"普通打印-部分格线-竖版-建设银行-电子账单",
# "34":"针式打印-全格线-建设银行",
# "35":"针式打印-部分格线-竖版-邮储银行",
# "36":"针式打印-部分格线-竖版-邮储银行-绿卡",
# Indexed by the classifier id returned by the OCR service: entry i describes
# class i as (label, 13-slot column tuple). Ids that are not bank-statement
# layouts (e.g. the license classes) are placeholders using OTHER_TUPLE.
# The obsolete 22-entry table that used to precede these rows has been
# removed: leaving it in place shifted every index, so an id no longer
# selected its own layout.
CLASSIFY_LIST = [
    # 0-3
    ('其他', OTHER_TUPLE),
    ('其他', OTHER_TUPLE),
    ('其他', OTHER_TUPLE),
    ('其他', OTHER_TUPLE),
    # 4-13: regular print, full grid
    ('普通打印-全表格-中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),
    ('普通打印-全表格-农业银行-10列', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)),
    ('普通打印-全表格-农业银行-10列-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)),
    ('普通打印-全表格-农业银行-9列', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)),
    ('普通打印-全表格-北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),
    ('普通打印-全表格-工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
    ('普通打印-全表格-工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)),
    ('普通打印-全表格-建设银行', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)),
    ('普通打印-全表格-微信账单', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
    ('普通打印-全表格-支付宝账单', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)),
    # 14-21: regular print, no grid lines
    ('普通打印-无格线-交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
    ('普通打印-无格线-储蓄银行-5列', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)),
    ('普通打印-无格线-储蓄银行-8列', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)),
    ('普通打印-无格线-农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)),
    ('普通打印-无格线-农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
    ('普通打印-无格线-招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)),
    ('普通打印-无格线-招商银行-电子账单', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)),
    ('普通打印-无格线-民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
    # 22-27: regular print, partial grid lines
    ('普通打印-部分格线-横版-中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
    ('普通打印-部分格线-竖版-农业银行-5列', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
    ('普通打印-部分格线-竖版-农业银行-8列', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)),
    ('普通打印-部分格线-竖版-农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)),
    ('普通打印-部分格线-竖版-平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
    ('普通打印-部分格线-竖版-建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
    # 28-33
    ('其他', OTHER_TUPLE),
    ('其他', OTHER_TUPLE),
    ('其他', OTHER_TUPLE),
    ('其他', OTHER_TUPLE),
    ('其他', OTHER_TUPLE),
    ('其他', OTHER_TUPLE),
    # 34-36: dot-matrix print
    ('针式打印-全格线-建设银行', OTHER_TUPLE),
    ('针式打印-部分格线-竖版-邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
    ('针式打印-部分格线-竖版-邮储银行-绿卡', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
    # 37
    ('其他', OTHER_TUPLE),
]
# ----------license related---------------------------------------------------------------------------------------------
# Classifier id -> label for the non-bank-statement classes (labels quoted as given):
# "0":"AVT Invioce",        (matches VAT_CLASSIFY below)
# "1":"二手车发票",           (used-car invoice)
# "2":"其他",                (other)
# "3":"护照",                (passport)
# "28":"机动车登记证",        (motor vehicle registration certificate)
# "29":"机动车销售统一发票",   (motor vehicle sales invoice)
# "30":"港澳通行证",          (Hong Kong / Macau travel permit)
# "31":"营业执照",            (business license)
# "32":"行驶证",              (vehicle license)
# "33":"身份证",              (ID card)
# "37":"银行卡"               (bank card)
# Other
OTHER_CLASSIFY = 2
# ID card
IC_CN_NAME = '身份证'
IC_CLASSIFY = 33
# VAT invoice
VAT_CN_NAME = '增值税发票'
VAT_CLASSIFY = 0
# Motor vehicle registration certificate
MVC_CN_NAME = '机动车登记证书'
MVC_CLASSIFY = 28
# Motor vehicle sales invoice
MVI_CN_NAME = '机动车销售统一发票'
MVI_CLASSIFY = 29
# These four classes have no pid value.
IC_PID = VAT_PID = MVC_PID = MVI_PID = None
# Business license
BL_KEY = 'bl'
BL_CN_NAME = '营业执照'
BL_CLASSIFY = 31
BL_PID = 41
# Used-car invoice
UCI_KEY = 'uci'
UCI_CN_NAME = '二手车发票'
UCI_CLASSIFY = 1
UCI_PID = 60
# Hong Kong / Macau / Taiwan travel permit
EEP_KEY = 'eep'
EEP_CN_NAME = '港澳台通行证'
EEP_CLASSIFY = 30
EEP_PID = 1018
# Vehicle license
DL_KEY = 'dl'
DL_CN_NAME = '行驶证'
DL_CLASSIFY = 32
DL_PID = 5
# Passport
PP_KEY = 'pp'
PP_CN_NAME = '护照'
PP_CLASSIFY = 3
PP_PID = 8
# Bank card
BC_KEY = 'bc'
# ID card
IC_KEY = 'ic'
# Motor vehicle registration certificate
MVC_KEY = 'mvc'
# Motor vehicle sales invoice
MVI_KEY = 'mvi'
# VAT invoice
VAT_KEY = 'vat'
# Ordered (key, Chinese display name) pairs.
# NOTE(review): LICENSE_ORDER is assigned again further down in this file with
# (classify, (pid, name)) pairs; confirm which definition is meant to remain.
LICENSE_ORDER = ((MVI_KEY, '机动车销售统一发票'),
(IC_KEY, '身份证'),
(BC_KEY, '银行卡'),
(BL_KEY, '营业执照'),
(UCI_KEY, '二手车发票'),
(EEP_KEY, '港澳台通行证'),
(DL_KEY, '行驶证'),
(PP_KEY, '护照'),
(MVC_KEY, '机动车登记证书'),
(VAT_KEY, '增值税发票'))
# Bank card
BC_CN_NAME = '银行卡'
BC_CLASSIFY = 37
BC_PID = 4
BC_FIELD = (('CardNum', '银行卡号'),
('BankName', '发卡行名称'),
('CardName', '银行卡名称'),
......@@ -478,3 +627,19 @@ BC_FIELD = (('CardNum', '银行卡号'),
# Values accepted as success codes; both the string and the integer form of zero are listed.
SUCCESS_CODE_SET = {'0', 0}
# Ordered (classify id, (pid, Chinese display name)) pairs, one per license type.
LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME)),
(IC_CLASSIFY, (IC_PID, IC_CN_NAME)),
(BC_CLASSIFY, (BC_PID, BC_CN_NAME)),
(BL_CLASSIFY, (BL_PID, BL_CN_NAME)),
(UCI_CLASSIFY, (UCI_PID, UCI_CN_NAME)),
(EEP_CLASSIFY, (EEP_PID, EEP_CN_NAME)),
(DL_CLASSIFY, (DL_PID, DL_CN_NAME)),
(PP_CLASSIFY, (PP_PID, PP_CN_NAME)),
(MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME)),
(VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME)))
# classify id -> (pid, Chinese display name), built from LICENSE_ORDER.
LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER)
# The "other" class id.
OTHER_CLASSIFY_SET = {OTHER_CLASSIFY}
# License classes whose pid is None (ID card, VAT invoice, registration certificate, sales invoice).
LICENSE_CLASSIFY_SET_1 = {IC_CLASSIFY, VAT_CLASSIFY, MVC_CLASSIFY, MVI_CLASSIFY}
# License classes that carry a pid (business license, used-car invoice, travel permit,
# vehicle license, passport, bank card).
LICENSE_CLASSIFY_SET_2 = {BL_CLASSIFY, UCI_CLASSIFY, EEP_CLASSIFY, DL_CLASSIFY, PP_CLASSIFY, BC_CLASSIFY}
......
class EDMSException(Exception):
    """Raised when an EDMS download or upload still fails after every retry."""
......@@ -4,6 +4,7 @@ import signal
import asyncio
import aiohttp
import difflib
import base64
import requests
from datetime import datetime, date
from collections import Counter
......@@ -18,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
from apps.doc.named_enum import KeywordsType
from apps.doc import consts
from apps.doc.ocr.edms import EDMS, rh
from apps.doc.exceptions import EDMSException
class Command(BaseCommand, LoggerMixin):
......@@ -30,7 +32,8 @@ class Command(BaseCommand, LoggerMixin):
# 数据目录
self.data_dir = conf.DATA_DIR
# ocr相关
self.ocr_url = conf.OCR_URL
self.ocr_url_1 = conf.OCR_URL_1
self.ocr_url_2 = conf.OCR_URL_2
# EDMS web_service_api
self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD)
# 优雅退出信号:15
......@@ -70,18 +73,44 @@ class Command(BaseCommand, LoggerMixin):
os.makedirs(doc_data_path, exist_ok=True)
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
for times in range(consts.RETRY_TIMES):
try:
self.edms.download(pdf_path, doc.metadata_version_id)
except Exception as e:
self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
'[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
edms_exc = str(e)
else:
break
else:
raise EDMSException(edms_exc)
excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
self.log_base, business_type, doc.id, pdf_path))
return doc_data_path, excel_path, src_excel_path, pdf_path
@staticmethod
def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence):
def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img):
sheets = ocr_data.get('data', [])
if not sheets:
skip_img.append(self.parse_img_path(img_path))
return
confidence = ocr_data.get('confidence', 1)
img_name, _ = os.path.splitext(os.path.basename(img_path))
for i, sheet in enumerate(sheets):
sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, i)
cells = sheet.get('cells')
if not cells:
skip_img.append(self.parse_img_path(img_path))
continue
sheet_name = '{0}_{1}'.format(img_name, i)
ws = wb.create_sheet(sheet_name)
for cell in cells:
c1 = cell.get('start_column')
r1 = cell.get('start_row')
words = cell.get('words')
ws.cell(row=r1 + 1, column=c1 + 1, value=words)
# ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间']
summary = sheet.get('summary')
card = summary[1]
......@@ -129,74 +158,199 @@ class Command(BaseCommand, LoggerMixin):
if summary[6] is not None:
ed_list.append(summary[6])
ws = wb.create_sheet(sheet_name)
cells = sheet.get('cells')
for cell in cells:
c1 = cell.get('start_column')
r1 = cell.get('start_row')
words = cell.get('words')
ws.cell(row=r1+1, column=c1+1, value=words)
def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path):
    """Collect first-stage license OCR results into ``license_summary``.

    Each dict under ``ocr_data['data']`` becomes a list of ``(field, value)``
    pairs appended to ``license_summary[classify]``. When there is no data,
    the parsed image position is appended to ``skip_img`` instead.
    """
    records = ocr_data.get('data', [])
    if not records:
        skip_img.append(self.parse_img_path(img_path))
        return
    for record in records:
        pairs = list(record.items())
        license_summary.setdefault(classify, []).append(pairs)
def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path):
    """Collect second-stage license OCR results into ``license_summary``.

    A response whose ``ErrorCode`` is not in ``consts.SUCCESS_CODE_SET`` marks
    the image as skipped. Bank-card responses (``pid == consts.BC_PID``) are
    read field by field from the top level of the response; every other type
    is read from ``ResultList`` / ``FieldList``.
    """
    if ocr_res_2.get('ErrorCode') not in consts.SUCCESS_CODE_SET:
        skip_img.append(self.parse_img_path(img_path))
        return

    if pid == consts.BC_PID:
        # Bank card: flat response, keyed by the English names in BC_FIELD
        card_fields = [(cn_label, ocr_res_2.get(en_label, ''))
                       for en_label, cn_label in consts.BC_FIELD]
        license_summary.setdefault(classify, []).append(card_fields)
        return

    # Business license, vehicle license, etc.: one record per result entry
    for result in ocr_res_2.get('ResultList', []):
        record = [(item.get('chn_key', ''), item.get('value', ''))
                  for item in result.get('FieldList', [])]
        license_summary.setdefault(classify, []).append(record)
def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary):
# res = {
# 'code': 1,
# 'msg': 'success',
# 'data': {
# 'classify': 0,
# 'confidence': 0.999,
# 'sheets': [
# {
# 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# 'cells': []
# },
# {
# 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# 'cells': []
# }
# ]
# }
# }
data = res.get('data', {})
classify = data.get('classify')
@staticmethod
async def fetch_ocr_1_result(url, json_data):
    """POST ``json_data`` as a JSON body to ``url``.

    Returns the decoded JSON response on HTTP 200, otherwise ``None``.
    """
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.post(url, json=json_data) as response:
            if response.status != 200:
                return None
            return await response.json()
@staticmethod
async def fetch_ocr_2_result(url, json_data):
    """POST ``json_data`` as form data to ``url``.

    Returns the decoded JSON response on HTTP 200, otherwise ``None``.
    """
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.post(url, data=json_data) as response:
            if response.status != 200:
                return None
            return await response.json()
async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
    """Run OCR on one image and route the result by its classification.

    The image is sent base64-encoded to the first OCR service. Depending on
    the returned ``classify`` value the result is skipped, handled as a
    first-stage license, sent on to the second OCR service (license types
    that carry a pid), or handled as a bank statement.

    Raises:
        Exception: when either OCR service yields no result (``None``).
    """
    with open(img_path, 'rb') as f:
        # base64 bytes decoded to str so they can be embedded in the request body
        file_data = base64.b64encode(f.read()).decode()

    ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, {"file": file_data})
    if ocr_res_1 is None:
        raise Exception('ocr 1 error, img_path={0}'.format(img_path))
    self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
        self.log_base, img_path, ocr_res_1))

    if ocr_res_1.get('code') != 1:
        skip_img.append(self.parse_img_path(img_path))
        return

    ocr_data = ocr_res_1.get('data', {})
    classify = ocr_data.get('classify')
    # No classification, or the "other" class: nothing to extract
    if classify is None or classify in consts.OTHER_CLASSIFY_SET:
        skip_img.append(self.parse_img_path(img_path))
        return

    if classify in consts.LICENSE_CLASSIFY_SET_1:  # license, first stage only
        self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
    elif classify in consts.LICENSE_CLASSIFY_SET_2:  # license, needs second OCR service
        pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
        json_data_2 = {
            "pid": str(pid),
            "key": conf.OCR_KEY,
            "secret": conf.OCR_SECRET,
            "file": file_data
        }
        ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
        if ocr_res_2 is None:
            raise Exception('ocr 2 error, img_path={0}'.format(img_path))
        # Recognition result
        self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
            self.log_base, img_path, ocr_res_2))
        self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
    else:  # bank statement
        self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
# def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
# # # 流水
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # 'data': [
# # {
# # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# # 'cells': []
# # },
# # {
# # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# # 'cells': []
# # }
# # ]
# # }
# # }
# #
# # # 证件-1
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # 'data': [
# # {
# # 'cn_key': 'value',
# # 'cn_key': 'value',
# # },
# # {
# # 'cn_key': 'value',
# # 'cn_key': 'value',
# # },
# # ]
# # }
# # }
# #
# # # 证件-2 or 其他类
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # }
# # }
# with open(img_path, 'rb') as f:
# base64_data = base64.b64encode(f.read())
# # 获取解码后的base64值
# file_data = base64_data.decode()
# json_data_1 = {
# "file": file_data
# }
# response_1 = requests.post(self.ocr_url_1, json=json_data_1)
# if response_1.status_code == 200:
# ocr_res_1 = response_1.json()
# self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_1))
#
# async def img_2_ocr_2_wb(self, wb, img_path, summary):
# res = await self.fetch_ocr_result(img_path)
# self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
# sheets_list = res.get('result').get('res')
# img_name = os.path.basename(img_path)
# self.append_sheet(wb, sheets_list, img_name, summary)
def fetch_ocr_result(self, img_path):
    """Upload the image at ``img_path`` to ``self.ocr_url`` and return the decoded JSON reply.

    The image is sent as a multipart file field named ``img``. The file is
    opened in a ``with`` block so the handle is closed once the request has
    been sent (previously it was never closed).
    """
    with open(img_path, 'rb') as img_file:
        files = [
            ('img', img_file)
        ]
        response = requests.request("POST", self.ocr_url, files=files)
    return response.json()
# Synchronous variant: fetch the OCR result for one image, log it and, when
# the response code is 1, hand it to self.ocr_2_wb.
# img_info is indexed positionally: [0] is passed to fetch_ocr_result and
# logged as the image, [1] and [2] are forwarded to ocr_2_wb.
# NOTE(review): an `async def img_2_ocr_2_wb` with a different signature also
# appears earlier in this file; if both live in the same class the later
# definition wins - confirm which one is meant to remain.
def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary):
res = self.fetch_ocr_result(img_info[0])
self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(
self.log_base, img_info[0], res))
# Responses with any other code are ignored here.
if res.get('code') == 1:
self.ocr_2_wb(res, wb, img_info[1], img_info[2], bs_summary, unknown_summary, license_summary)
# if ocr_res_1.get('code') == 1:
# ocr_data = ocr_res_1.get('data', {})
# classify = ocr_data.get('classify')
# if classify is None:
# skip_img.append(self.parse_img_path(img_path))
# return
# elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
# skip_img.append(self.parse_img_path(img_path))
# return
# elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
# self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
# elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
# pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
# json_data_2 = {
# "pid": str(pid),
# "key": conf.OCR_KEY,
# "secret": conf.OCR_SECRET,
# "file": file_data
# }
# response_2 = requests.post(self.ocr_url_2, data=json_data_2)
# if response_2.status_code == 200:
# # 识别结果
# ocr_res_2 = response_2.json()
# self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_2))
# self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
# else:
# raise Exception('ocr 2 error, img_path={0}'.format(img_path))
# else: # 流水处理
# self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
# else:
# skip_img.append(self.parse_img_path(img_path))
# else:
# raise Exception('ocr 1 error, img_path={0}'.format(img_path))
@staticmethod
def parse_img_path(img_path):
img_name, _ = os.path.splitext(os.path.basename(img_path))
return int(img_name[5])+1, int(img_name[11])+1
@staticmethod
def get_most(value_list):
......@@ -255,7 +409,6 @@ class Command(BaseCommand, LoggerMixin):
summary['role'] = self.get_most(summary['role'])
return bs_summary
def rebuild_bs_summary(self, bs_summary, unknown_summary):
# bs_summary = {
# '卡号': {
......@@ -297,8 +450,10 @@ class Command(BaseCommand, LoggerMixin):
merged_bs_summary[card] = summary
else:
# 1卡号
one_card = False
if len(bs_summary) == 1:
merged_bs_summary = self.prune_bs_summary(bs_summary)
one_card = True
# 多卡号
else:
merged_bs_summary = self.merge_card(bs_summary)
......@@ -307,7 +462,7 @@ class Command(BaseCommand, LoggerMixin):
merge_role = []
classify_summary = unknown_summary.get(card_summary['classify'], {})
for role, summary in classify_summary.items():
if role in card_summary['role_set']:
if one_card or role in card_summary['role_set']:
merge_role.append(role)
card_summary['sheet'].extend(summary['sheet'])
card_summary['code'].extend(summary['code'])
......@@ -336,12 +491,13 @@ class Command(BaseCommand, LoggerMixin):
return merged_bs_summary
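# TODO Refine document statuses: for each kind of failure, return the document to the queue and handle the retry differently
# TODO Email notification on failure
#   Recognition failure: ordinary errors, e.g. a broken PDF or an error while building the workbook
#   EDMS failure: download error --> back to the queue --> email; upload error --> re-upload queue --> email
#   Algorithm failure: first-stage error --> recognition failed --> email; second-stage error --> recognition failed --> email
# TODO Send OCR requests asynchronously with coroutines
# TODO Retry interface calls
# TODO Database disconnection handling
# TODO Processing of non-bank-statement licenses
# TODO EDMS API GATEWAY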
def handle(self, *args, **kwargs):
sleep_second = int(conf.SLEEP_SECOND)
max_sleep_second = int(conf.MAX_SLEEP_SECOND)
......@@ -369,61 +525,82 @@ class Command(BaseCommand, LoggerMixin):
pdf_handler.extract_image()
self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format(
self.log_base, business_type, doc.id))
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
# 4.获取OCR结果并且构建excel文件
bs_summary = {}
license_summary = {}
unknown_summary = {}
skip_img = []
interest_keyword = Keywords.objects.filter(
type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
salary_keyword = Keywords.objects.filter(
type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
loan_keyword = Keywords.objects.filter(
type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value]).values_list(
type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list(
'keyword', flat=True)
wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
# wb = Workbook()
# 4.1 获取OCR结果
# loop = asyncio.get_event_loop()
# tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list]
# loop.run_until_complete(asyncio.wait(tasks))
loop = asyncio.get_event_loop()
tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
for img_path in pdf_handler.img_path_list]
loop.run_until_complete(asyncio.wait(tasks))
# loop.close()
for img_info in pdf_handler.img_info_list:
self.img_2_ocr_2_wb(wb, img_info, bs_summary, unknown_summary, license_summary)
# for img_path in pdf_handler.img_path_list:
# self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format(
self.log_base, bs_summary, unknown_summary, license_summary))
self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
'[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
unknown_summary, license_summary))
merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
self.cronjob_log.info('{0} [merged_bs_summary={1}] [unknown_summary={2}]'.format(
self.log_base, merged_bs_summary, unknown_summary))
self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
'[unknown_summary={4}] [skip_img={5}]'.format(self.log_base, business_type,
doc.id, merged_bs_summary,
unknown_summary, skip_img))
del unknown_summary
# 4.2 重构Excel文件
wb.save(src_excel_path)
wb.rebuild(merged_bs_summary, license_summary)
wb.rebuild(merged_bs_summary, license_summary, skip_img)
wb.save(excel_path)
except EDMSException as e:
self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
'[err={3}]'.format(self.log_base, business_type, doc.id, e))
except Exception as e:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
self.log_base, business_type, doc.id, e))
self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
'[err={3}]'.format(self.log_base, business_type, doc.id, e))
else:
try:
# 5.上传至EDMS
for times in range(consts.RETRY_TIMES):
try:
self.edms.upload(excel_path, doc, business_type)
except Exception as e:
self.cronjob_log.warn(
'{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
'[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
edms_exc = str(e)
else:
break
else:
raise EDMSException(edms_exc)
except Exception as e:
doc.status = DocStatus.UPLOAD_FAILED.value
doc.save()
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] '
'[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e))
self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
'[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id,
speed_time, e))
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
else:
doc.status = DocStatus.COMPLETE.value
doc.save()
......@@ -431,5 +608,6 @@ class Command(BaseCommand, LoggerMixin):
speed_time = int(end_time - start_time)
self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] '
'[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time))
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
......
......@@ -141,32 +141,22 @@ class BSWorkbook(Workbook):
# month_info process
month_info = month_mapping.setdefault('xxxx-xx', [])
month_info.append((ws.title, min_row, ws.max_row, 0))
elif len(month_list) == 1:
# reverse_trend_list process
reverse_trend = self.get_reverse_trend(dti.day, idx_list)
reverse_trend_list.append(reverse_trend)
# month_info process
month_info = month_mapping.setdefault(month_list[0], [])
day_mean = np.mean(dti.day.dropna())
if len(month_info) == 0:
month_info.append((ws.title, min_row, ws.max_row, day_mean))
else:
for i, item in enumerate(month_info):
if day_mean <= item[-1]:
month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean))
break
else:
month_info.append((ws.title, min_row, ws.max_row, day_mean))
else:
# reverse_trend_list process
reverse_trend = self.get_reverse_trend(dti.day, idx_list)
reverse_trend_list.append(reverse_trend)
# month_info process
for i, item in enumerate(month_list[:-1]):
day_idx = dti.day
idx_list_max_idx = len(idx_list) - 1
for i, item in enumerate(month_list):
if i == idx_list_max_idx:
day_mean = np.mean(day_idx[idx_list[i]:].dropna())
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN))
month_mapping.setdefault(month_list[-1], []).insert(
0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0))
(ws.title, idx_list[i] + min_row, ws.max_row, day_mean))
else:
day_mean = np.mean(day_idx[idx_list[i]: idx_list[i + 1]].dropna())
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean))
def build_metadata_rows(self, confidence, code, print_time, start_date, end_date):
if start_date is None or end_date is None:
......@@ -191,9 +181,9 @@ class BSWorkbook(Workbook):
def create_meta_sheet(self, card):
    """Return the worksheet that will hold the metadata for ``card``.

    The sheet title is ``"<meta_sheet_title>(<last six characters of card>)"``.
    Only the tail of the card number is used in the title
    (NOTE(review): presumably to keep the title short enough for the
    spreadsheet format -- confirm).

    If the first worksheet still carries the title ``'Sheet'`` it is renamed
    and reused; otherwise a new sheet is created via ``self.create_sheet``.
    NOTE(review): ``'Sheet'`` looks like the placeholder sheet of a freshly
    created workbook -- confirm against the workbook base class.

    :param card: card number; must support slicing (``card[-6:]``).
    :return: the worksheet to be used as the metadata sheet.
    """
    # Build the title once so both branches are guaranteed to agree.
    title = '{0}({1})'.format(self.meta_sheet_title, card[-6:])
    first_sheet = self.worksheets[0]
    if first_sheet.title == 'Sheet':
        first_sheet.title = title
        return first_sheet
    return self.create_sheet(title)
def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date):
......@@ -203,6 +193,26 @@ class BSWorkbook(Workbook):
ms.append(row)
return ms
@staticmethod
def amount_format(amount_str):
    """Normalise an amount string before numeric parsing.

    Non-string values and the empty string are returned unchanged.
    Otherwise the following steps are applied in order:

    1. Characters are substituted through ``consts.TRANS``.
    2. Every ``'-'`` after the first character is removed.
    3. A first character found in ``consts.ERROR_CHARS`` is replaced by ``'-'``.
    4. For results of at least four characters, the character three places
       from the end is inspected: a ``','`` directly followed by ``'.'`` at
       that position is dropped, and a lone ``','`` at that position is
       turned into ``'.'``.

    :param amount_str: raw cell value.
    :return: the cleaned string, or ``amount_str`` itself when it is not a
        non-empty string.
    """
    if not (isinstance(amount_str, str) and amount_str != ''):
        return amount_str

    # 1. Character substitution.
    translated = amount_str.translate(consts.TRANS)

    # 2. Only a leading '-' may survive.
    head = translated[0]
    tail = translated[1:].replace('-', '')

    # 3. A leading error character is treated as a minus sign.
    if head in consts.ERROR_CHARS:
        head = '-'
    cleaned = head + tail

    # 4. Comma / period handling at the third-from-last position.
    if len(cleaned) < 4:
        return cleaned
    sep_pos = len(cleaned) - 3
    sep_char = cleaned[sep_pos]
    if sep_char == '.' and cleaned[sep_pos - 1] == ',':
        # ",." -> "." : drop the comma just before the period.
        return cleaned[:sep_pos - 1] + cleaned[sep_pos:]
    if sep_char == ',':
        # "," in the separator position -> ".".
        return cleaned[:sep_pos] + '.' + cleaned[sep_pos + 1:]
    return cleaned
def build_month_sheet(self, card, month_mapping, ms, is_reverse):
tmp_ws = self.create_sheet('tmp_ws')
for month in sorted(month_mapping.keys()):
......@@ -235,29 +245,25 @@ class BSWorkbook(Workbook):
# 3.3.余额转数值
over_cell = rows[consts.OVER_IDX]
try:
if isinstance(over_cell.value, str):
over_cell.value = over_cell.value.translate(consts.TRANS)
over_cell.value = locale.atof(over_cell.value)
over_cell.value = locale.atof(self.amount_format(over_cell.value))
except Exception as e:
continue
else:
over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
# 3.4.额转数值
# 3.4.额转数值
try:
try:
if isinstance(amount_cell.value, str): # TODO 可在转化数字失败后,再替换
amount_cell.value = amount_cell.value.translate(consts.TRANS)
amount_cell.value = locale.atof(amount_cell.value)
amount_cell.value = locale.atof(self.amount_format(amount_cell.value))
except Exception as e:
try:
if isinstance(rows[consts.INCOME_IDX].value, str):
rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS)
amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
amount_cell.value = locale.atof(self.amount_format(rows[consts.INCOME_IDX].value))
if amount_cell.value == 0:
raise
elif amount_cell.value < 0:
amount_cell.value = -amount_cell.value
except Exception as e:
if isinstance(rows[consts.OUTLAY_IDX].value, str):
rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS)
amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
amount_cell.value = locale.atof(self.amount_format(rows[consts.OUTLAY_IDX].value))
if amount_cell.value > 0:
amount_cell.value = -amount_cell.value
except Exception as e:
......@@ -313,18 +319,18 @@ class BSWorkbook(Workbook):
# }
for card, summary in bs_summary.items():
# 1.原表修剪、排列、按照月份分割
start_date = summary['start_date']
end_date = summary['end_date']
start_date = summary.get('start_date')
end_date = summary.get('end_date')
date_statistics = False
if start_date is None or end_date is None:
date_statistics = True
date_list = []
month_mapping = {}
reverse_trend_list = []
for sheet in summary['sheet']:
for sheet in summary.get('sheet', []):
ws = self.get_sheet_by_name(sheet)
# 1.1.删除多余列、排列
min_row = self.sheet_prune(ws, summary['classify'])
min_row = self.sheet_prune(ws, summary.get('classify', 0))
# 1.2.按月份分割
self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics)
......@@ -334,32 +340,43 @@ class BSWorkbook(Workbook):
# 2.元信息提取表
ms = self.build_meta_sheet(card,
summary['confidence'],
summary['code'],
summary['print_time'],
summary.get('confidence', 1),
summary.get('code'),
summary.get('print_time'),
start_date,
end_date)
# 3.创建月份表、提取/高亮关键行
is_reverse = False
if sum(reverse_trend_list) > 0: # 倒序处理
is_reverse = True
# 倒序处理
is_reverse = True if sum(reverse_trend_list) > 0 else False
for month_list in month_mapping.values():
month_list.sort(key=lambda x: x[-1], reverse=True)
month_list.sort(key=lambda x: x[-1], reverse=is_reverse)
self.build_month_sheet(card, month_mapping, ms, is_reverse)
# 4.删除原表
for sheet in summary['sheet']:
for sheet in summary.get('sheet'):
self.remove(self.get_sheet_by_name(sheet))
def license_rebuild(self, license_summary):
    """Write one worksheet per classification present in ``license_summary``.

    ``consts.LICENSE_ORDER`` is iterated as ``(classify, (_, name))`` pairs.
    For each classification that has an entry in ``license_summary`` a sheet
    titled ``name`` is created; classifications whose lookup returns ``None``
    are skipped so that no empty sheet is produced for them.

    Each element of the entry is itself an iterable of rows: every row is
    appended to the sheet, followed by a ``(None, )`` row after each element.

    :param license_summary: mapping from classification to a list of
        row-lists (must provide ``.get``).
    """
    for classify, (_, name) in consts.LICENSE_ORDER:
        res = license_summary.get(classify)
        if res is None:
            # Nothing recognised for this classification: no sheet at all.
            continue
        ws = self.create_sheet(name)
        for bl in res:
            for bl_field in bl:
                ws.append(bl_field)
            # Separator row between consecutive elements.
            ws.append((None, ))
def rebuild(self, bs_summary, license_summary):
def skip_img_sheet(self, skip_img):
    """Add a worksheet listing the entries of ``skip_img``.

    Nothing is created when ``skip_img`` is falsy. Otherwise a sheet named
    ``consts.SKIP_IMG_SHEET_NAME`` is created, ``consts.SKIP_IMG_SHEET_HEADER``
    is written as its first row, and each element of ``skip_img`` is appended
    as one further row.

    :param skip_img: iterable of row tuples, or a falsy value for "none".
    """
    if not skip_img:
        return
    sheet = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
    sheet.append(consts.SKIP_IMG_SHEET_HEADER)
    for row in skip_img:
        sheet.append(row)
def rebuild(self, bs_summary, license_summary, skip_img=None):
    """Run the three rebuild steps in order.

    Calls, in sequence, ``self.bs_rebuild(bs_summary)``,
    ``self.license_rebuild(license_summary)`` and
    ``self.skip_img_sheet(skip_img)``.

    :param bs_summary: passed through to ``bs_rebuild``.
    :param license_summary: passed through to ``license_rebuild``.
    :param skip_img: passed through to ``skip_img_sheet``. Defaults to
        ``None`` so that two-argument calls remain valid.
    """
    self.bs_rebuild(bs_summary)
    self.license_rebuild(license_summary)
    self.skip_img_sheet(skip_img)
......
......@@ -25,7 +25,7 @@ class PDFHandler:
def __init__(self, path, img_dir_path):
self.path = path
self.img_dir_path = img_dir_path
self.img_info_list = []
self.img_path_list = []
self.xref_set = set()
def get_img_save_path(self, pno, img_index=0, ext='png'):
......@@ -38,7 +38,7 @@ class PDFHandler:
pm = page.getPixmap(matrix=trans_2, alpha=False)
img_save_path = self.get_img_save_path(page.number)
pm.writePNG(img_save_path)
self.img_info_list.append((img_save_path, page.number, 0))
self.img_path_list.append(img_save_path)
@staticmethod
def getimage(pix):
......@@ -88,7 +88,7 @@ class PDFHandler:
with open(img_save_path, "wb") as f:
f.write(img_data)
self.xref_set.add(xref)
self.img_info_list.append((img_save_path, pno, img_index))
self.img_path_list.append(img_save_path)
@staticmethod
def split_il(il):
......@@ -179,7 +179,7 @@ class PDFHandler:
img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
new_img.save(img_save_path)
page_to_png = False
self.img_info_list.append((img_save_path, pno, img_index))
self.img_path_list.append(img_save_path)
# 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
if page_to_png:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!