Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
7dfc2ee8
authored
2020-10-17 23:40:18 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
merge license
2 parents
1242adb8
e570371a
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
575 additions
and
214 deletions
.gitignore
src/apps/doc/consts.py
src/apps/doc/exceptions.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
.gitignore
View file @
7dfc2ee
...
...
@@ -33,6 +33,5 @@ data/*
# 脚本
src/*.sh
test
.py
test
*
ocr_test.py
\ No newline at end of file
ocr_test_2.py
\ No newline at end of file
...
...
src/apps/doc/consts.py
View file @
7dfc2ee
...
...
@@ -35,9 +35,35 @@ DEALER_CODE_META_FIELD_id = 13
BUSINESS_TYPE_META_FIELD_id
=
93
DEALER_CODE
=
'ocr_situ_group'
RETRY_TIMES
=
3
# ---------银行流水模板相关--------------------------------------------------------------------------------------------
TRANS
=
str
.
maketrans
(
'Cc((oODlLmAsSbg'
,
'000000011345569'
)
TRANS_MAP
=
{
'C'
:
"0"
,
'c'
:
"0"
,
'('
:
"0"
,
'o'
:
"0"
,
'O'
:
"0"
,
'D'
:
"0"
,
'['
:
"1"
,
']'
:
"1"
,
'l'
:
"1"
,
'L'
:
"1"
,
'A'
:
"4"
,
's'
:
"5"
,
'S'
:
"5"
,
'b'
:
"6"
,
'g'
:
"9"
,
'E'
:
"9"
,
'B'
:
"13"
,
}
TRANS
=
str
.
maketrans
(
TRANS_MAP
)
ERROR_CHARS
=
{
'.'
,
':'
,
':'
,
'•'
,
'·'
}
SKIP_IMG_SHEET_NAME
=
'未处理图片'
SKIP_IMG_SHEET_HEADER
=
(
'页码'
,
'序号'
)
CARD_RATIO
=
0.9
UNKNOWN_CARD
=
'未知卡号'
...
...
@@ -95,7 +121,7 @@ HEADERS_MAPPING.update(
HEADERS_MAPPING
.
update
(
{
'交易日期'
:
BASE_HEADERS_MAPPING
[
'记账日期'
],
'存入'
:
BASE_HEADERS_MAPPING
[
'金额'
],
#
'存入': BASE_HEADERS_MAPPING['金额'],
'对方账号'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
'对方名称'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
'摘要'
:
BASE_HEADERS_MAPPING
[
'附言'
],
...
...
@@ -160,6 +186,12 @@ HEADERS_MAPPING.update(
'交易地点/对方账号和户名'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 农业银行-窄页
HEADERS_MAPPING
.
update
(
{
'交易对手账号'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 竖版-特殊-农商行
HEADERS_MAPPING
.
update
(
{
...
...
@@ -299,17 +331,27 @@ HEADERS_MAPPING.update(
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则)
#
# # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言
# ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细
# ('农业银行-10', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)), # 横版-表格-农业银行-中国农业银行个人账户明细
#
# # 农业银行:序号 日期 摘要 交易金额 余额 对方账号 对方名称 交易地点 渠道 附言
# ('农业银行-10-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)),
#
# # 农业银行:交易日期 摘要 交易金额 余额 交易渠道 交易网点 对方账号 对方名称 附言
# ('农业银行-9', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)),
#
# # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额
[对方户名 对方账号]
渠道
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 渠道
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 对方户名 对方账号 渠道
# ('工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)),
#
# # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行
# # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行
# ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('建设银行-竖版', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)),
# ('建设银行-横版', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
#
# # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号
# ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
...
...
@@ -320,7 +362,13 @@ HEADERS_MAPPING.update(
# # -----------------普通打印:部分格线--------------------------------
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行-5', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农业银行:日期 地点 摘要 存入 支出 余额 对方账号 对方户名
# ('农业银行-8', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)),
# # 农业银行:日期 摘要 交易金额 余额 地点 交易对手账号 对方户名
# ('农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)),
#
# # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注
# ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)),
...
...
@@ -330,6 +378,9 @@ HEADERS_MAPPING.update(
#
# # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
# # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
# ('建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
#
# # -----------------普通打印:无格线-------------------------------------
#
...
...
@@ -338,7 +389,8 @@ HEADERS_MAPPING.update(
#
# # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
# # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单
# ('邮储银行', (1, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('邮储银行-8', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)),
# ('邮储银行-5', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)),
#
# # 工商银行电子版:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
# ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
...
...
@@ -351,13 +403,15 @@ HEADERS_MAPPING.update(
# ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
#
# # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名
# ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
# ('农业银行
-整数
', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言
# ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)),
# # 农业银行:日期、时间、短摘要、交易金额、本次余额、交易网点、渠道、附言
# # 农业银行:日期、时间、日志号、短摘要、交易金额、本次余额、交易网点、渠道、附言
# ('农业银行', (1, 2, 4, 5, None, 3, None, None, None, None, None, None, None)),
# ('农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额
# ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
...
...
@@ -374,11 +428,10 @@ HEADERS_MAPPING.update(
#
# # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
#
# # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
# ('建设银行', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
# ]
OTHER_TUPLE
=
(
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)
# {
# "0":"其他",
# "1":"普通打印-全表格-中国农业银行",
...
...
@@ -408,67 +461,163 @@ HEADERS_MAPPING.update(
# "22":"针式打印-部分格线-邮储银行一本通绿卡"
# }
# CLASSIFY_LIST = [
# ('其他', OTHER_TUPLE),
# ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)),
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
# ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
# ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)),
#
# ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
# ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
# ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)),
# ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)),
# ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)),
# ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
#
# ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)),
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
# ]
# "4":"普通打印-全表格-中国银行",
# "5":"普通打印-全表格-农业银行-10列",
# "6":"普通打印-全表格-农业银行-10列-1",
# "7":"普通打印-全表格-农业银行-9列",
# "8":"普通打印-全表格-北京银行",
# "9":"普通打印-全表格-工商银行",
# "10":"普通打印-全表格-工商银行-电子账单",
# "11":"普通打印-全表格-建设银行",
# "12":"普通打印-全表格-微信账单",
# "13":"普通打印-全表格-支付宝账单",
# "14":"普通打印-无格线-交通银行",
# "15":"普通打印-无格线-储蓄银行-5列",
# "16":"普通打印-无格线-储蓄银行-8列",
# "17":"普通打印-无格线-农业银行-扩张缩进",
# "18":"普通打印-无格线-农业银行-整数",
# "19":"普通打印-无格线-招商银行",
# "20":"普通打印-无格线-招商银行-电子账单",
# "21":"普通打印-无格线-民生银行",
# "22":"普通打印-部分格线-横版-中信银行",
# "23":"普通打印-部分格线-竖版-农业银行-5列",
# "24":"普通打印-部分格线-竖版-农业银行-8列",
# "25":"普通打印-部分格线-竖版-农业银行-窄页",
# "26":"普通打印-部分格线-竖版-平安电子账单",
# "27":"普通打印-部分格线-竖版-建设银行-电子账单",
# "34":"针式打印-全格线-建设银行",
# "35":"针式打印-部分格线-竖版-邮储银行",
# "36":"针式打印-部分格线-竖版-邮储银行-绿卡",
CLASSIFY_LIST
=
[
(
'其他'
,
(
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'农业银行'
,
(
1
,
None
,
3
,
5
,
None
,
8
,
7
,
6
,
None
,
None
,
None
,
None
,
None
)),
(
'中国银行'
,
(
1
,
2
,
4
,
5
,
6
,
9
,
10
,
11
,
12
,
None
,
None
,
None
,
None
)),
(
'北京银行'
,
(
1
,
None
,
4
,
5
,
None
,
2
,
6
,
7
,
None
,
None
,
3
,
None
,
None
)),
(
'工商银行'
,
(
1
,
None
,
9
,
10
,
None
,
7
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'建设银行'
,
(
None
,
None
,
None
,
None
,
None
,
2
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'微信'
,
(
2
,
None
,
6
,
None
,
None
,
3
,
7
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'支付宝'
,
(
2
,
None
,
None
,
6
,
None
,
3
,
None
,
None
,
None
,
None
,
None
,
4
,
5
)),
(
'交通银行'
,
(
1
,
None
,
5
,
6
,
None
,
3
,
None
,
None
,
None
,
None
,
4
,
None
,
None
)),
(
'农业银行'
,
(
1
,
None
,
3
,
None
,
None
,
2
,
None
,
4
,
None
,
None
,
None
,
None
,
None
)),
(
'农业银行'
,
(
1
,
2
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'招商银行'
,
(
1
,
None
,
3
,
4
,
None
,
6
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'招商银行电子版'
,
(
1
,
None
,
3
,
4
,
None
,
5
,
6
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'民生银行'
,
(
None
,
None
,
5
,
6
,
None
,
None
,
7
,
None
,
8
,
None
,
None
,
None
,
None
)),
(
'中信银行'
,
(
1
,
None
,
None
,
5
,
None
,
2
,
6
,
7
,
8
,
None
,
None
,
3
,
4
)),
(
'农业银行'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'农业银行'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'农业银行'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'平安电子账单'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'建设银行'
,
(
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'邮储银行'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'邮储银行'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'普通打印-全表格-中国银行'
,
(
1
,
2
,
4
,
5
,
6
,
9
,
10
,
11
,
12
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-农业银行-10列'
,
(
1
,
None
,
None
,
5
,
None
,
8
,
7
,
6
,
None
,
None
,
None
,
3
,
4
)),
(
'普通打印-全表格-农业银行-10列-1'
,
(
2
,
None
,
4
,
5
,
None
,
3
,
7
,
6
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-农业银行-9列'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
8
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-北京银行'
,
(
1
,
None
,
4
,
5
,
None
,
2
,
6
,
7
,
None
,
None
,
3
,
None
,
None
)),
(
'普通打印-全表格-工商银行'
,
(
1
,
None
,
9
,
10
,
None
,
7
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-工商银行-电子账单'
,
(
1
,
None
,
9
,
10
,
None
,
7
,
11
,
12
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-建设银行'
,
(
3
,
None
,
4
,
5
,
None
,
2
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-微信账单'
,
(
2
,
None
,
6
,
None
,
None
,
3
,
7
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-支付宝账单'
,
(
2
,
None
,
None
,
6
,
None
,
3
,
None
,
None
,
None
,
None
,
None
,
4
,
5
)),
(
'普通打印-无格线-交通银行'
,
(
1
,
None
,
5
,
6
,
None
,
3
,
None
,
None
,
None
,
None
,
4
,
None
,
None
)),
(
'普通打印-无格线-储蓄银行-5列'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-储蓄银行-8列'
,
(
1
,
None
,
4
,
5
,
None
,
2
,
6
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-农业银行-扩张缩进'
,
(
1
,
2
,
5
,
6
,
None
,
4
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-农业银行-整数'
,
(
1
,
None
,
3
,
None
,
None
,
2
,
None
,
4
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-招商银行'
,
(
1
,
None
,
3
,
4
,
None
,
6
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-招商银行-电子账单'
,
(
1
,
None
,
3
,
4
,
None
,
5
,
6
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-民生银行'
,
(
None
,
None
,
5
,
6
,
None
,
None
,
7
,
None
,
8
,
None
,
None
,
None
,
None
)),
(
'普通打印-部分格线-横版-中信银行'
,
(
1
,
None
,
None
,
5
,
None
,
2
,
6
,
7
,
8
,
None
,
None
,
3
,
4
)),
(
'普通打印-部分格线-竖版-农业银行-5列'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-部分格线-竖版-农业银行-8列'
,
(
1
,
None
,
None
,
6
,
None
,
3
,
8
,
7
,
None
,
None
,
None
,
4
,
5
)),
(
'普通打印-部分格线-竖版-农业银行-窄页'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
7
,
6
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-部分格线-竖版-平安电子账单'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-部分格线-竖版-建设银行-电子账单'
,
(
5
,
None
,
6
,
7
,
None
,
2
,
None
,
9
,
None
,
None
,
None
,
None
,
None
)),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'针式打印-全格线-建设银行'
,
OTHER_TUPLE
),
(
'针式打印-部分格线-竖版-邮储银行'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'针式打印-部分格线-竖版-邮储银行-绿卡'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'其他'
,
OTHER_TUPLE
),
]
# ----------license相关------------------------------------------------------------------------------------------------
# "0":"AVT Invioce",
# "1":"二手车发票",
# "2":"其他",
# "3":"护照",
# "28":"机动车登记证",
# "29":"机动车销售统一发票",
# "30":"港澳通行证",
# "31":"营业执照",
# "32":"行驶证",
# "33":"身份证",
# "37":"银行卡"
# 其他
OTHER_CLASSIFY
=
2
# 身份证
IC_CN_NAME
=
'身份证'
IC_CLASSIFY
=
33
# 增值税发票
VAT_CN_NAME
=
'增值税发票'
VAT_CLASSIFY
=
0
# 机动车登记证书
MVC_CN_NAME
=
'机动车登记证书'
MVC_CLASSIFY
=
28
# 机动车销售统一发票
MVI_CN_NAME
=
'机动车销售统一发票'
MVI_CLASSIFY
=
29
IC_PID
=
VAT_PID
=
MVC_PID
=
MVI_PID
=
None
# 营业执照
BL_KEY
=
'bl'
BL_CN_NAME
=
'营业执照'
BL_CLASSIFY
=
31
BL_PID
=
41
# 二手车发票
UCI_KEY
=
'uci'
UCI_CN_NAME
=
'二手车发票'
UCI_CLASSIFY
=
1
UCI_PID
=
60
# 港澳台通行证
EEP_KEY
=
'eep'
EEP_CN_NAME
=
'港澳台通行证'
EEP_CLASSIFY
=
30
EEP_PID
=
1018
# 行驶证
DL_KEY
=
'dl'
DL_CN_NAME
=
'行驶证'
DL_CLASSIFY
=
32
DL_PID
=
5
# 护照
PP_KEY
=
'pp'
PP_CN_NAME
=
'护照'
PP_CLASSIFY
=
3
PP_PID
=
8
# 银行卡
BC_KEY
=
'bc'
# 身份证
IC_KEY
=
'ic'
# 机动车登记证书
MVC_KEY
=
'mvc'
# 机动车销售统一发票
MVI_KEY
=
'mvi'
# 增值税发票
VAT_KEY
=
'vat'
LICENSE_ORDER
=
((
MVI_KEY
,
'机动车销售统一发票'
),
(
IC_KEY
,
'身份证'
),
(
BC_KEY
,
'银行卡'
),
(
BL_KEY
,
'营业执照'
),
(
UCI_KEY
,
'二手车发票'
),
(
EEP_KEY
,
'港澳台通行证'
),
(
DL_KEY
,
'行驶证'
),
(
PP_KEY
,
'护照'
),
(
MVC_KEY
,
'机动车登记证书'
),
(
VAT_KEY
,
'增值税发票'
))
BC_CN_NAME
=
'银行卡'
BC_CLASSIFY
=
37
BC_PID
=
4
BC_FIELD
=
((
'CardNum'
,
'银行卡号'
),
(
'BankName'
,
'发卡行名称'
),
(
'CardName'
,
'银行卡名称'
),
...
...
@@ -478,3 +627,19 @@ BC_FIELD = (('CardNum', '银行卡号'),
SUCCESS_CODE_SET
=
{
'0'
,
0
}
LICENSE_ORDER
=
((
MVI_CLASSIFY
,
(
MVI_PID
,
MVI_CN_NAME
)),
(
IC_CLASSIFY
,
(
IC_PID
,
IC_CN_NAME
)),
(
BC_CLASSIFY
,
(
BC_PID
,
BC_CN_NAME
)),
(
BL_CLASSIFY
,
(
BL_PID
,
BL_CN_NAME
)),
(
UCI_CLASSIFY
,
(
UCI_PID
,
UCI_CN_NAME
)),
(
EEP_CLASSIFY
,
(
EEP_PID
,
EEP_CN_NAME
)),
(
DL_CLASSIFY
,
(
DL_PID
,
DL_CN_NAME
)),
(
PP_CLASSIFY
,
(
PP_PID
,
PP_CN_NAME
)),
(
MVC_CLASSIFY
,
(
MVC_PID
,
MVC_CN_NAME
)),
(
VAT_CLASSIFY
,
(
VAT_PID
,
VAT_CN_NAME
)))
LICENSE_CLASSIFY_MAPPING
=
dict
(
LICENSE_ORDER
)
OTHER_CLASSIFY_SET
=
{
OTHER_CLASSIFY
}
LICENSE_CLASSIFY_SET_1
=
{
IC_CLASSIFY
,
VAT_CLASSIFY
,
MVC_CLASSIFY
,
MVI_CLASSIFY
}
LICENSE_CLASSIFY_SET_2
=
{
BL_CLASSIFY
,
UCI_CLASSIFY
,
EEP_CLASSIFY
,
DL_CLASSIFY
,
PP_CLASSIFY
,
BC_CLASSIFY
}
...
...
src/apps/doc/exceptions.py
0 → 100644
View file @
7dfc2ee
class EDMSException(Exception):
    """Error raised when an EDMS operation (e.g. a document download) fails."""
src/apps/doc/management/commands/doc_ocr_process.py
View file @
7dfc2ee
...
...
@@ -4,6 +4,7 @@ import signal
import
asyncio
import
aiohttp
import
difflib
import
base64
import
requests
from
datetime
import
datetime
,
date
from
collections
import
Counter
...
...
@@ -18,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
from
apps.doc.named_enum
import
KeywordsType
from
apps.doc
import
consts
from
apps.doc.ocr.edms
import
EDMS
,
rh
from
apps.doc.exceptions
import
EDMSException
class
Command
(
BaseCommand
,
LoggerMixin
):
...
...
@@ -30,7 +32,8 @@ class Command(BaseCommand, LoggerMixin):
# 数据目录
self
.
data_dir
=
conf
.
DATA_DIR
# ocr相关
self
.
ocr_url
=
conf
.
OCR_URL
self
.
ocr_url_1
=
conf
.
OCR_URL_1
self
.
ocr_url_2
=
conf
.
OCR_URL_2
# EDMS web_service_api
self
.
edms
=
EDMS
(
conf
.
EDMS_USER
,
conf
.
EDMS_PWD
)
# 优雅退出信号:15
...
...
@@ -70,18 +73,44 @@ class Command(BaseCommand, LoggerMixin):
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
if
not
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
)
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
'[error={4}]'
.
format
(
self
.
log_base
,
times
,
business_type
,
doc
.
id
,
e
))
edms_exc
=
str
(
e
)
else
:
break
else
:
raise
EDMSException
(
edms_exc
)
excel_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.xlsx'
.
format
(
doc
.
id
))
src_excel_path
=
os
.
path
.
join
(
doc_data_path
,
'src.xlsx'
)
self
.
cronjob_log
.
info
(
'{0} [
pdf
download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'
.
format
(
self
.
cronjob_log
.
info
(
'{0} [
edms
download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
pdf_path
))
return
doc_data_path
,
excel_path
,
src_excel_path
,
pdf_path
@staticmethod
def
append_bs_sheet
(
wb
,
sheets
,
bs_summary
,
unknown_summary
,
pno
,
img_idx
,
classify
,
confidence
):
def
bs_process
(
self
,
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
img_path
,
classify
,
skip_img
):
sheets
=
ocr_data
.
get
(
'data'
,
[])
if
not
sheets
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
return
confidence
=
ocr_data
.
get
(
'confidence'
,
1
)
img_name
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
img_path
))
for
i
,
sheet
in
enumerate
(
sheets
):
sheet_name
=
'page_{0}_img_{1}_{2}'
.
format
(
pno
,
img_idx
,
i
)
cells
=
sheet
.
get
(
'cells'
)
if
not
cells
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
continue
sheet_name
=
'{0}_{1}'
.
format
(
img_name
,
i
)
ws
=
wb
.
create_sheet
(
sheet_name
)
for
cell
in
cells
:
c1
=
cell
.
get
(
'start_column'
)
r1
=
cell
.
get
(
'start_row'
)
words
=
cell
.
get
(
'words'
)
ws
.
cell
(
row
=
r1
+
1
,
column
=
c1
+
1
,
value
=
words
)
# ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间']
summary
=
sheet
.
get
(
'summary'
)
card
=
summary
[
1
]
...
...
@@ -129,74 +158,199 @@ class Command(BaseCommand, LoggerMixin):
if
summary
[
6
]
is
not
None
:
ed_list
.
append
(
summary
[
6
])
ws
=
wb
.
create_sheet
(
sheet_name
)
cells
=
sheet
.
get
(
'cells'
)
for
cell
in
cells
:
c1
=
cell
.
get
(
'start_column'
)
r1
=
cell
.
get
(
'start_row'
)
words
=
cell
.
get
(
'words'
)
ws
.
cell
(
row
=
r1
+
1
,
column
=
c1
+
1
,
value
=
words
)
def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path):
    """Collect the license fields of one image into ``license_summary``.

    Args:
        ocr_data: dict whose ``'data'`` entry is a list of dicts, one per
            recognised license, mapping field name to field value.
        license_summary: dict updated in place; each recognised license is
            appended under ``classify`` as a list of ``(field, value)`` pairs.
        classify: key under which the results are grouped.
        skip_img: list updated in place with the parsed image position when
            ``ocr_data`` carries no license data.
        img_path: path of the image the OCR result belongs to.
    """
    license_data = ocr_data.get('data', [])
    if not license_data:
        # Nothing was recognised: remember the image as skipped.
        skip_img.append(self.parse_img_path(img_path))
        return

    # license_data is non-empty here, so creating the bucket up front is
    # equivalent to creating it lazily inside the loop.
    results = license_summary.setdefault(classify, [])
    for license_dict in license_data:
        results.append(list(license_dict.items()))
def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path):
    """Collect the fields of a second-stage OCR response into ``license_summary``.

    When the response's ``'ErrorCode'`` is not in ``consts.SUCCESS_CODE_SET``
    the image is recorded in ``skip_img`` and nothing else happens. Otherwise
    every recognised document is appended under ``classify`` as a list of
    ``(chinese field name, value)`` pairs.
    """
    if ocr_res_2.get('ErrorCode') not in consts.SUCCESS_CODE_SET:
        skip_img.append(self.parse_img_path(img_path))
        return

    if pid == consts.BC_PID:
        # Bank card: the fields sit at the top level of the response.
        card_fields = [
            (chn_key, ocr_res_2.get(en_key, ''))
            for en_key, chn_key in consts.BC_FIELD
        ]
        license_summary.setdefault(classify, []).append(card_fields)
        return

    # Business license, driving license, etc.: one entry per result.
    for result_dict in ocr_res_2.get('ResultList', []):
        fields = [
            (field_dict.get('chn_key', ''), field_dict.get('value', ''))
            for field_dict in result_dict.get('FieldList', [])
        ]
        license_summary.setdefault(classify, []).append(fields)
def
ocr_2_wb
(
self
,
res
,
wb
,
pno
,
img_idx
,
bs_summary
,
unknown_summary
,
license_summary
):
# res = {
# 'code': 1,
# 'msg': 'success',
# 'data': {
# 'classify': 0,
# 'confidence': 0.999,
# 'sheets': [
# {
# 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# 'cells': []
# },
# {
# 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# 'cells': []
# }
# ]
# }
# }
data
=
res
.
get
(
'data'
,
{})
classify
=
data
.
get
(
'classify'
)
@staticmethod
async def fetch_ocr_1_result(url, json_data):
    """POST ``json_data`` as a JSON body to ``url`` and return the decoded reply.

    Returns ``None`` when the HTTP status is anything other than 200.
    """
    # ssl=False is forwarded to the connector exactly as before.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.post(url, json=json_data) as response:
            if response.status != 200:
                return None
            return await response.json()
@staticmethod
async def fetch_ocr_2_result(url, json_data):
    """POST ``json_data`` as the request's ``data`` payload and return the decoded reply.

    Returns ``None`` when the HTTP status is anything other than 200.
    """
    # ssl=False is forwarded to the connector exactly as before.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.post(url, data=json_data) as response:
            if response.status != 200:
                return None
            return await response.json()
async
def
img_2_ocr_2_wb
(
self
,
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
skip_img
):
with
open
(
img_path
,
'rb'
)
as
f
:
base64_data
=
base64
.
b64encode
(
f
.
read
())
# 获取解码后的base64值
file_data
=
base64_data
.
decode
()
json_data_1
=
{
"file"
:
file_data
}
ocr_res_1
=
await
self
.
fetch_ocr_1_result
(
self
.
ocr_url_1
,
json_data_1
)
if
ocr_res_1
is
None
:
raise
Exception
(
'ocr 1 error, img_path={0}'
.
format
(
img_path
))
else
:
self
.
cronjob_log
.
info
(
'{0} [ocr_1 result] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_1
))
if
ocr_res_1
.
get
(
'code'
)
==
1
:
ocr_data
=
ocr_res_1
.
get
(
'data'
,
{})
classify
=
ocr_data
.
get
(
'classify'
)
if
classify
is
None
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
return
# if classify in
sheets
=
data
.
get
(
'sheets'
,
[])
if
not
sheets
:
elif
classify
in
consts
.
OTHER_CLASSIFY_SET
:
# 其他类
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
return
confidence
=
data
.
get
(
'confidence'
,
1
)
self
.
append_bs_sheet
(
wb
,
sheets
,
bs_summary
,
unknown_summary
,
pno
,
img_idx
,
classify
,
confidence
)
# else:
# pass
# async def fetch_ocr_result(self, img_path):
# async with aiohttp.ClientSession(
# headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
# ) as session:
# json_data = self.get_ocr_json(img_path)
# async with session.post(self.ocr_url, json=json_data) as response:
# return await response.json()
elif
classify
in
consts
.
LICENSE_CLASSIFY_SET_1
:
# 证件1
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
,
skip_img
,
img_path
)
elif
classify
in
consts
.
LICENSE_CLASSIFY_SET_2
:
# 证件2
pid
,
_
=
consts
.
LICENSE_CLASSIFY_MAPPING
.
get
(
classify
)
json_data_2
=
{
"pid"
:
str
(
pid
),
"key"
:
conf
.
OCR_KEY
,
"secret"
:
conf
.
OCR_SECRET
,
"file"
:
file_data
}
ocr_res_2
=
await
self
.
fetch_ocr_2_result
(
self
.
ocr_url_2
,
json_data_2
)
if
ocr_res_2
is
None
:
raise
Exception
(
'ocr 2 error, img_path={0}'
.
format
(
img_path
))
else
:
# 识别结果
self
.
cronjob_log
.
info
(
'{0} [ocr_2 result] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_2
))
self
.
license2_process
(
ocr_res_2
,
license_summary
,
pid
,
classify
,
skip_img
,
img_path
)
else
:
# 流水处理
self
.
bs_process
(
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
img_path
,
classify
,
skip_img
)
else
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
# def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
# # # 流水
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # 'data': [
# # {
# # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# # 'cells': []
# # },
# # {
# # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# # 'cells': []
# # }
# # ]
# # }
# # }
# #
# # # 证件-1
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # 'data': [
# # {
# # 'cn_key': 'value',
# # 'cn_key': 'value',
# # },
# # {
# # 'cn_key': 'value',
# # 'cn_key': 'value',
# # },
# # ]
# # }
# # }
# #
# # # 证件-2 or 其他类
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # }
# # }
# with open(img_path, 'rb') as f:
# base64_data = base64.b64encode(f.read())
# # 获取解码后的base64值
# file_data = base64_data.decode()
# json_data_1 = {
# "file": file_data
# }
# response_1 = requests.post(self.ocr_url_1, json=json_data_1)
# if response_1.status_code == 200:
# ocr_res_1 = response_1.json()
# self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_1))
#
# async def img_2_ocr_2_wb(self, wb, img_path, summary):
# res = await self.fetch_ocr_result(img_path)
# self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
# sheets_list = res.get('result').get('res')
# img_name = os.path.basename(img_path)
# self.append_sheet(wb, sheets_list, img_name, summary)
def fetch_ocr_result(self, img_path):
    """Upload the image at ``img_path`` to ``self.ocr_url`` and return the JSON reply.

    The image is sent as the multipart file field ``'img'``.
    """
    # Open the image in a context manager so the handle is closed as soon as
    # the request has been sent; the original left it open.
    with open(img_path, 'rb') as img_file:
        response = requests.request("POST", self.ocr_url, files=[('img', img_file)])
    return response.json()
def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary):
    """Run OCR on one image and, on success, write its result into ``wb``.

    ``img_info`` is indexed as ``[0]`` image path, ``[1]`` and ``[2]`` the two
    values forwarded to ``ocr_2_wb`` as ``pno`` and ``img_idx``.
    """
    img_path = img_info[0]
    res = self.fetch_ocr_result(img_path)
    log_line = '{0} [fetch ocr result success] [img={1}] [res={2}]'.format(
        self.log_base, img_path, res)
    self.cronjob_log.info(log_line)

    # Only a response with code 1 is written to the workbook.
    if res.get('code') != 1:
        return
    self.ocr_2_wb(res, wb, img_info[1], img_info[2], bs_summary, unknown_summary, license_summary)
# if ocr_res_1.get('code') == 1:
# ocr_data = ocr_res_1.get('data', {})
# classify = ocr_data.get('classify')
# if classify is None:
# skip_img.append(self.parse_img_path(img_path))
# return
# elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
# skip_img.append(self.parse_img_path(img_path))
# return
# elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
# self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
# elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
# pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
# json_data_2 = {
# "pid": str(pid),
# "key": conf.OCR_KEY,
# "secret": conf.OCR_SECRET,
# "file": file_data
# }
# response_2 = requests.post(self.ocr_url_2, data=json_data_2)
# if response_2.status_code == 200:
# # 识别结果
# ocr_res_2 = response_2.json()
# self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_2))
# self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
# else:
# raise Exception('ocr 2 error, img_path={0}'.format(img_path))
# else: # 流水处理
# self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
# else:
# skip_img.append(self.parse_img_path(img_path))
# else:
# raise Exception('ocr 1 error, img_path={0}'.format(img_path))
@staticmethod
def
parse_img_path
(
img_path
):
img_name
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
img_path
))
return
int
(
img_name
[
5
])
+
1
,
int
(
img_name
[
11
])
+
1
@staticmethod
def
get_most
(
value_list
):
...
...
@@ -255,7 +409,6 @@ class Command(BaseCommand, LoggerMixin):
summary
[
'role'
]
=
self
.
get_most
(
summary
[
'role'
])
return
bs_summary
def
rebuild_bs_summary
(
self
,
bs_summary
,
unknown_summary
):
# bs_summary = {
# '卡号': {
...
...
@@ -297,8 +450,10 @@ class Command(BaseCommand, LoggerMixin):
merged_bs_summary
[
card
]
=
summary
else
:
# 1卡号
one_card
=
False
if
len
(
bs_summary
)
==
1
:
merged_bs_summary
=
self
.
prune_bs_summary
(
bs_summary
)
one_card
=
True
# 多卡号
else
:
merged_bs_summary
=
self
.
merge_card
(
bs_summary
)
...
...
@@ -307,7 +462,7 @@ class Command(BaseCommand, LoggerMixin):
merge_role
=
[]
classify_summary
=
unknown_summary
.
get
(
card_summary
[
'classify'
],
{})
for
role
,
summary
in
classify_summary
.
items
():
if
role
in
card_summary
[
'role_set'
]:
if
one_card
or
role
in
card_summary
[
'role_set'
]:
merge_role
.
append
(
role
)
card_summary
[
'sheet'
]
.
extend
(
summary
[
'sheet'
])
card_summary
[
'code'
]
.
extend
(
summary
[
'code'
])
...
...
@@ -336,12 +491,13 @@ class Command(BaseCommand, LoggerMixin):
return
merged_bs_summary
# TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理
# TODO 调用接口重试
# TODO 协程异步发送OCR请求
# TODO 异常邮件通知
# 识别失败:普通异常,如PDF异常、构建过程异常
# EDMS异常:下载异常-->回队列-->邮件;上传异常-->重新上传队列-->邮件
# 算法异常:第一道异常-->识别失败-->邮件;第二道异常-->识别失败-->邮件
# TODO 协程异步发送OCR请求
# TODO 调用接口重试
# TODO 数据库断联问题
# TODO 非流水证件处理
# TODO EDMS API GATEWAY
def
handle
(
self
,
*
args
,
**
kwargs
):
sleep_second
=
int
(
conf
.
SLEEP_SECOND
)
max_sleep_second
=
int
(
conf
.
MAX_SLEEP_SECOND
)
...
...
@@ -369,61 +525,82 @@ class Command(BaseCommand, LoggerMixin):
pdf_handler
.
extract_image
()
self
.
cronjob_log
.
info
(
'{0} [pdf to img end] [business_type={1}] [doc_id={2}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
))
write_zip_file
(
img_save_path
,
os
.
path
.
join
(
doc_data_path
,
'{0}_img.zip'
.
format
(
doc
.
id
)))
# 4.获取OCR结果并且构建excel文件
bs_summary
=
{}
license_summary
=
{}
unknown_summary
=
{}
skip_img
=
[]
interest_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
INTEREST
.
value
)
.
values_list
(
'keyword'
,
flat
=
True
)
type
=
KeywordsType
.
INTEREST
.
value
,
on_off
=
True
)
.
values_list
(
'keyword'
,
flat
=
True
)
salary_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
SALARY
.
value
)
.
values_list
(
'keyword'
,
flat
=
True
)
type
=
KeywordsType
.
SALARY
.
value
,
on_off
=
True
)
.
values_list
(
'keyword'
,
flat
=
True
)
loan_keyword
=
Keywords
.
objects
.
filter
(
type__in
=
[
KeywordsType
.
LOAN
.
value
,
KeywordsType
.
ALI_WECHART
.
value
])
.
values_list
(
type__in
=
[
KeywordsType
.
LOAN
.
value
,
KeywordsType
.
ALI_WECHART
.
value
]
,
on_off
=
True
)
.
values_list
(
'keyword'
,
flat
=
True
)
wb
=
BSWorkbook
(
interest_keyword
,
salary_keyword
,
loan_keyword
)
# wb = Workbook()
# 4.1 获取OCR结果
# loop = asyncio.get_event_loop()
# tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list]
# loop.run_until_complete(asyncio.wait(tasks))
loop
=
asyncio
.
get_event_loop
()
tasks
=
[
self
.
img_2_ocr_2_wb
(
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
skip_img
)
for
img_path
in
pdf_handler
.
img_path_list
]
loop
.
run_until_complete
(
asyncio
.
wait
(
tasks
))
# loop.close()
for
img_info
in
pdf_handler
.
img_info
_list
:
self
.
img_2_ocr_2_wb
(
wb
,
img_info
,
bs_summary
,
unknown_summary
,
license_summary
)
# for img_path in pdf_handler.img_path
_list:
# self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img
)
self
.
cronjob_log
.
info
(
'{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'
.
format
(
self
.
log_base
,
bs_summary
,
unknown_summary
,
license_summary
))
self
.
cronjob_log
.
info
(
'{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
'[license_summary={5}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
bs_summary
,
unknown_summary
,
license_summary
))
merged_bs_summary
=
self
.
rebuild_bs_summary
(
bs_summary
,
unknown_summary
)
self
.
cronjob_log
.
info
(
'{0} [merged_bs_summary={1}] [unknown_summary={2}]'
.
format
(
self
.
log_base
,
merged_bs_summary
,
unknown_summary
))
self
.
cronjob_log
.
info
(
'{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
'[unknown_summary={4}] [skip_img={5}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
merged_bs_summary
,
unknown_summary
,
skip_img
))
del
unknown_summary
# 4.2 重构Excel文件
wb
.
save
(
src_excel_path
)
wb
.
rebuild
(
merged_bs_summary
,
license_summary
)
wb
.
rebuild
(
merged_bs_summary
,
license_summary
,
skip_img
)
wb
.
save
(
excel_path
)
except
EDMSException
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
'[err={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
e
))
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed
] [business_type={1}] [doc_id={2}] [err={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
e
))
self
.
cronjob_log
.
error
(
'{0} [process failed
(program)] [business_type={1}] [doc_id={2}] '
'[err={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
e
))
else
:
try
:
# 5.上传至EDMS
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
self
.
edms
.
upload
(
excel_path
,
doc
,
business_type
)
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
'[error={4}]'
.
format
(
self
.
log_base
,
times
,
business_type
,
doc
.
id
,
e
))
edms_exc
=
str
(
e
)
else
:
break
else
:
raise
EDMSException
(
edms_exc
)
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
UPLOAD_FAILED
.
value
doc
.
save
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
error
(
'{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] '
'[err={4}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
speed_time
,
e
))
self
.
cronjob_log
.
error
(
'{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
'[speed_time={3}] [err={4}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
speed_time
,
e
))
write_zip_file
(
img_save_path
,
os
.
path
.
join
(
doc_data_path
,
'{0}_img.zip'
.
format
(
doc
.
id
)))
else
:
doc
.
status
=
DocStatus
.
COMPLETE
.
value
doc
.
save
()
...
...
@@ -431,5 +608,6 @@ class Command(BaseCommand, LoggerMixin):
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
info
(
'{0} [process complete] [business_type={1}] [doc_id={2}] '
'[speed_time={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
speed_time
))
write_zip_file
(
img_save_path
,
os
.
path
.
join
(
doc_data_path
,
'{0}_img.zip'
.
format
(
doc
.
id
)))
self
.
cronjob_log
.
info
(
'{0} [stop safely]'
.
format
(
self
.
log_base
))
...
...
src/apps/doc/ocr/wb.py
View file @
7dfc2ee
...
...
@@ -141,32 +141,22 @@ class BSWorkbook(Workbook):
# month_info process
month_info
=
month_mapping
.
setdefault
(
'xxxx-xx'
,
[])
month_info
.
append
((
ws
.
title
,
min_row
,
ws
.
max_row
,
0
))
elif
len
(
month_list
)
==
1
:
# reverse_trend_list process
reverse_trend
=
self
.
get_reverse_trend
(
dti
.
day
,
idx_list
)
reverse_trend_list
.
append
(
reverse_trend
)
# month_info process
month_info
=
month_mapping
.
setdefault
(
month_list
[
0
],
[])
day_mean
=
np
.
mean
(
dti
.
day
.
dropna
())
if
len
(
month_info
)
==
0
:
month_info
.
append
((
ws
.
title
,
min_row
,
ws
.
max_row
,
day_mean
))
else
:
for
i
,
item
in
enumerate
(
month_info
):
if
day_mean
<=
item
[
-
1
]:
month_info
.
insert
(
i
,
(
ws
.
title
,
min_row
,
ws
.
max_row
,
day_mean
))
break
else
:
month_info
.
append
((
ws
.
title
,
min_row
,
ws
.
max_row
,
day_mean
))
else
:
# reverse_trend_list process
reverse_trend
=
self
.
get_reverse_trend
(
dti
.
day
,
idx_list
)
reverse_trend_list
.
append
(
reverse_trend
)
# month_info process
for
i
,
item
in
enumerate
(
month_list
[:
-
1
]):
day_idx
=
dti
.
day
idx_list_max_idx
=
len
(
idx_list
)
-
1
for
i
,
item
in
enumerate
(
month_list
):
if
i
==
idx_list_max_idx
:
day_mean
=
np
.
mean
(
day_idx
[
idx_list
[
i
]:]
.
dropna
())
month_mapping
.
setdefault
(
item
,
[])
.
append
(
(
ws
.
title
,
idx_list
[
i
]
+
min_row
,
idx_list
[
i
+
1
]
+
min_row
-
1
,
self
.
MAX_MEAN
))
month_mapping
.
setdefault
(
month_list
[
-
1
],
[])
.
insert
(
0
,
(
ws
.
title
,
idx_list
[
-
1
]
+
min_row
,
ws
.
max_row
,
0
))
(
ws
.
title
,
idx_list
[
i
]
+
min_row
,
ws
.
max_row
,
day_mean
))
else
:
day_mean
=
np
.
mean
(
day_idx
[
idx_list
[
i
]:
idx_list
[
i
+
1
]]
.
dropna
())
month_mapping
.
setdefault
(
item
,
[])
.
append
(
(
ws
.
title
,
idx_list
[
i
]
+
min_row
,
idx_list
[
i
+
1
]
+
min_row
-
1
,
day_mean
))
def
build_metadata_rows
(
self
,
confidence
,
code
,
print_time
,
start_date
,
end_date
):
if
start_date
is
None
or
end_date
is
None
:
...
...
@@ -191,9 +181,9 @@ class BSWorkbook(Workbook):
def
create_meta_sheet
(
self
,
card
):
if
self
.
worksheets
[
0
]
.
title
==
'Sheet'
:
ms
=
self
.
worksheets
[
0
]
ms
.
title
=
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
card
)
ms
.
title
=
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
card
[
-
6
:]
)
else
:
ms
=
self
.
create_sheet
(
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
card
))
ms
=
self
.
create_sheet
(
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
card
[
-
6
:]
))
return
ms
def
build_meta_sheet
(
self
,
card
,
confidence
,
code
,
print_time
,
start_date
,
end_date
):
...
...
@@ -203,6 +193,26 @@ class BSWorkbook(Workbook):
ms
.
append
(
row
)
return
ms
@staticmethod
def
amount_format
(
amount_str
):
if
not
isinstance
(
amount_str
,
str
)
or
amount_str
==
''
:
return
amount_str
# 1.替换
res_str
=
amount_str
.
translate
(
consts
.
TRANS
)
# 2.删除多余的-
res_str
=
res_str
[
0
]
+
res_str
[
1
:]
.
replace
(
'-'
,
''
)
# 3.首字符处理
if
res_str
[
0
]
in
consts
.
ERROR_CHARS
:
res_str
=
'-{0}'
.
format
(
res_str
[
1
:])
# 4.逗号与句号处理
if
len
(
res_str
)
>=
4
:
period_idx
=
len
(
res_str
)
-
3
if
res_str
[
period_idx
]
==
'.'
and
res_str
[
period_idx
-
1
]
==
','
:
res_str
=
'{0}{1}'
.
format
(
res_str
[:
period_idx
-
1
],
res_str
[
period_idx
:])
elif
res_str
[
period_idx
]
==
','
:
res_str
=
'{0}.{1}'
.
format
(
res_str
[:
period_idx
],
res_str
[
period_idx
+
1
:])
return
res_str
def
build_month_sheet
(
self
,
card
,
month_mapping
,
ms
,
is_reverse
):
tmp_ws
=
self
.
create_sheet
(
'tmp_ws'
)
for
month
in
sorted
(
month_mapping
.
keys
()):
...
...
@@ -235,29 +245,25 @@ class BSWorkbook(Workbook):
# 3.3.余额转数值
over_cell
=
rows
[
consts
.
OVER_IDX
]
try
:
if
isinstance
(
over_cell
.
value
,
str
):
over_cell
.
value
=
over_cell
.
value
.
translate
(
consts
.
TRANS
)
over_cell
.
value
=
locale
.
atof
(
over_cell
.
value
)
over_cell
.
value
=
locale
.
atof
(
self
.
amount_format
(
over_cell
.
value
))
except
Exception
as
e
:
continue
else
:
over_cell
.
number_format
=
numbers
.
FORMAT_NUMBER_COMMA_SEPARATED1
# 3.4.
余
额转数值
# 3.4.
金
额转数值
try
:
try
:
if
isinstance
(
amount_cell
.
value
,
str
):
# TODO 可在转化数字失败后,再替换
amount_cell
.
value
=
amount_cell
.
value
.
translate
(
consts
.
TRANS
)
amount_cell
.
value
=
locale
.
atof
(
amount_cell
.
value
)
amount_cell
.
value
=
locale
.
atof
(
self
.
amount_format
(
amount_cell
.
value
))
except
Exception
as
e
:
try
:
if
isinstance
(
rows
[
consts
.
INCOME_IDX
]
.
value
,
str
):
rows
[
consts
.
OUTLAY_IDX
]
.
value
=
rows
[
consts
.
INCOME_IDX
]
.
value
.
translate
(
consts
.
TRANS
)
amount_cell
.
value
=
locale
.
atof
(
rows
[
consts
.
OUTLAY_IDX
]
.
value
)
amount_cell
.
value
=
locale
.
atof
(
self
.
amount_format
(
rows
[
consts
.
INCOME_IDX
]
.
value
))
if
amount_cell
.
value
==
0
:
raise
elif
amount_cell
.
value
<
0
:
amount_cell
.
value
=
-
amount_cell
.
value
except
Exception
as
e
:
if
isinstance
(
rows
[
consts
.
OUTLAY_IDX
]
.
value
,
str
):
rows
[
consts
.
OUTLAY_IDX
]
.
value
=
rows
[
consts
.
OUTLAY_IDX
]
.
value
.
translate
(
consts
.
TRANS
)
amount_cell
.
value
=
locale
.
atof
(
rows
[
consts
.
OUTLAY_IDX
]
.
value
)
amount_cell
.
value
=
locale
.
atof
(
self
.
amount_format
(
rows
[
consts
.
OUTLAY_IDX
]
.
value
))
if
amount_cell
.
value
>
0
:
amount_cell
.
value
=
-
amount_cell
.
value
except
Exception
as
e
:
...
...
@@ -313,18 +319,18 @@ class BSWorkbook(Workbook):
# }
for
card
,
summary
in
bs_summary
.
items
():
# 1.原表修剪、排列、按照月份分割
start_date
=
summary
[
'start_date'
]
end_date
=
summary
[
'end_date'
]
start_date
=
summary
.
get
(
'start_date'
)
end_date
=
summary
.
get
(
'end_date'
)
date_statistics
=
False
if
start_date
is
None
or
end_date
is
None
:
date_statistics
=
True
date_list
=
[]
month_mapping
=
{}
reverse_trend_list
=
[]
for
sheet
in
summary
[
'sheet'
]
:
for
sheet
in
summary
.
get
(
'sheet'
,
[])
:
ws
=
self
.
get_sheet_by_name
(
sheet
)
# 1.1.删除多余列、排列
min_row
=
self
.
sheet_prune
(
ws
,
summary
[
'classify'
]
)
min_row
=
self
.
sheet_prune
(
ws
,
summary
.
get
(
'classify'
,
0
)
)
# 1.2.按月份分割
self
.
sheet_split
(
ws
,
month_mapping
,
reverse_trend_list
,
min_row
,
date_list
,
date_statistics
)
...
...
@@ -334,32 +340,43 @@ class BSWorkbook(Workbook):
# 2.元信息提取表
ms
=
self
.
build_meta_sheet
(
card
,
summary
[
'confidence'
]
,
summary
[
'code'
]
,
summary
[
'print_time'
]
,
summary
.
get
(
'confidence'
,
1
)
,
summary
.
get
(
'code'
)
,
summary
.
get
(
'print_time'
)
,
start_date
,
end_date
)
# 3.创建月份表、提取/高亮关键行
is_reverse
=
False
if
sum
(
reverse_trend_list
)
>
0
:
# 倒序处理
is_reverse
=
True
# 倒序处理
is_reverse
=
True
if
sum
(
reverse_trend_list
)
>
0
else
False
for
month_list
in
month_mapping
.
values
():
month_list
.
sort
(
key
=
lambda
x
:
x
[
-
1
],
reverse
=
True
)
month_list
.
sort
(
key
=
lambda
x
:
x
[
-
1
],
reverse
=
is_reverse
)
self
.
build_month_sheet
(
card
,
month_mapping
,
ms
,
is_reverse
)
# 4.删除原表
for
sheet
in
summary
[
'sheet'
]
:
for
sheet
in
summary
.
get
(
'sheet'
)
:
self
.
remove
(
self
.
get_sheet_by_name
(
sheet
))
def
license_rebuild
(
self
,
license_summary
):
for
en_key
,
cn_key
in
consts
.
LICENSE_ORDER
:
ws
=
self
.
create_sheet
(
cn_key
)
for
bl
in
license_summary
.
get
(
en_key
,
[]):
for
classify
,
(
_
,
name
)
in
consts
.
LICENSE_ORDER
:
res
=
license_summary
.
get
(
classify
)
if
res
is
None
:
continue
ws
=
self
.
create_sheet
(
name
)
for
bl
in
res
:
for
bl_field
in
bl
:
ws
.
append
(
bl_field
)
ws
.
append
((
None
,
))
def
rebuild
(
self
,
bs_summary
,
license_summary
):
def skip_img_sheet(self, skip_img):
    """Add a sheet listing the skipped images, if there are any.

    When ``skip_img`` is empty (or otherwise falsy) nothing is created.
    Otherwise a sheet named ``consts.SKIP_IMG_SHEET_NAME`` is created, the
    header row ``consts.SKIP_IMG_SHEET_HEADER`` is written, and each entry
    of ``skip_img`` is appended as one row.

    :param skip_img: iterable of row tuples, one per skipped image
    """
    if not skip_img:
        return
    sheet = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
    rows = [consts.SKIP_IMG_SHEET_HEADER]
    rows.extend(skip_img)
    for row in rows:
        sheet.append(row)
def
rebuild
(
self
,
bs_summary
,
license_summary
,
skip_img
):
self
.
bs_rebuild
(
bs_summary
)
# self.license_rebuild(license_summary)
self
.
license_rebuild
(
license_summary
)
self
.
skip_img_sheet
(
skip_img
)
...
...
src/common/tools/pdf_to_img.py
View file @
7dfc2ee
...
...
@@ -25,7 +25,7 @@ class PDFHandler:
def
__init__
(
self
,
path
,
img_dir_path
):
self
.
path
=
path
self
.
img_dir_path
=
img_dir_path
self
.
img_
info
_list
=
[]
self
.
img_
path
_list
=
[]
self
.
xref_set
=
set
()
def
get_img_save_path
(
self
,
pno
,
img_index
=
0
,
ext
=
'png'
):
...
...
@@ -38,7 +38,7 @@ class PDFHandler:
pm
=
page
.
getPixmap
(
matrix
=
trans_2
,
alpha
=
False
)
img_save_path
=
self
.
get_img_save_path
(
page
.
number
)
pm
.
writePNG
(
img_save_path
)
self
.
img_
info_list
.
append
((
img_save_path
,
page
.
number
,
0
)
)
self
.
img_
path_list
.
append
(
img_save_path
)
@staticmethod
def
getimage
(
pix
):
...
...
@@ -88,7 +88,7 @@ class PDFHandler:
with
open
(
img_save_path
,
"wb"
)
as
f
:
f
.
write
(
img_data
)
self
.
xref_set
.
add
(
xref
)
self
.
img_
info_list
.
append
((
img_save_path
,
pno
,
img_index
)
)
self
.
img_
path_list
.
append
(
img_save_path
)
@staticmethod
def
split_il
(
il
):
...
...
@@ -179,7 +179,7 @@ class PDFHandler:
img_save_path
=
self
.
get_img_save_path
(
pno
,
img_index
,
im_list
[
0
][
2
])
new_img
.
save
(
img_save_path
)
page_to_png
=
False
self
.
img_
info_list
.
append
((
img_save_path
,
pno
,
img_index
)
)
self
.
img_
path_list
.
append
(
img_save_path
)
# 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
if
page_to_png
:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment