Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
1c6d880f
authored
2020-09-27 18:40:22 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add license
1 parent
c1c49a8e
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
306 additions
and
166 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
src/apps/doc/consts.py
View file @
1c6d880
...
...
@@ -95,7 +95,7 @@ HEADERS_MAPPING.update(
HEADERS_MAPPING
.
update
(
{
'交易日期'
:
BASE_HEADERS_MAPPING
[
'记账日期'
],
'存入'
:
BASE_HEADERS_MAPPING
[
'金额'
],
#
'存入': BASE_HEADERS_MAPPING['金额'],
'对方账号'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
'对方名称'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
'摘要'
:
BASE_HEADERS_MAPPING
[
'附言'
],
...
...
@@ -160,6 +160,12 @@ HEADERS_MAPPING.update(
'交易地点/对方账号和户名'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 农业银行-窄页
HEADERS_MAPPING
.
update
(
{
'交易对手账号'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 竖版-特殊-农商行
HEADERS_MAPPING
.
update
(
{
...
...
@@ -299,17 +305,27 @@ HEADERS_MAPPING.update(
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则)
#
# # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言
# ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细
# ('农业银行-10', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)), # 横版-表格-农业银行-中国农业银行个人账户明细
#
# # 农业银行:序号 日期 摘要 交易金额 余额 对方账号 对方名称 交易地点 渠道 附言
# ('农业银行-10-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)),
#
# # 农业银行:交易日期 摘要 交易金额 余额 交易渠道 交易网点 对方账号 对方名称 附言
# ('农业银行-9', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)),
#
# # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额
[对方户名 对方账号]
渠道
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 渠道
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 对方户名 对方账号 渠道
# ('工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)),
#
# # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行
# # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行
# ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('建设银行-竖版', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)),
# ('建设银行-横版', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
#
# # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号
# ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
...
...
@@ -320,7 +336,13 @@ HEADERS_MAPPING.update(
# # -----------------普通打印:部分格线--------------------------------
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行-5', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农业银行:日期 地点 摘要 存入 支出 余额 对方账号 对方户名
# ('农业银行-8', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)),
# # 农业银行:日期 摘要 交易金额 余额 地点 交易对手账号 对方户名
# ('农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)),
#
# # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注
# ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)),
...
...
@@ -330,6 +352,9 @@ HEADERS_MAPPING.update(
#
# # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
# # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
# ('建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
#
# # -----------------普通打印:无格线-------------------------------------
#
...
...
@@ -338,7 +363,8 @@ HEADERS_MAPPING.update(
#
# # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
# # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单
# ('邮储银行', (1, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('邮储银行-8', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)),
# ('邮储银行-5', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)),
#
# # 工商银行电子版:交易日期 账号 储种 序号 币种 妙汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
# ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
...
...
@@ -351,13 +377,15 @@ HEADERS_MAPPING.update(
# ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
#
# # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名
# ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
# ('农业银行
-整数
', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言
# ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)),
# # 农业银行:日期、时间、短摘要、交易金额、本次余额、交易网点、渠道、附言
# # 农业银行:日期、时间、日志号、短摘要、交易金额、本次余额、交易网点、渠道、附言
# ('农业银行', (1, 2, 4, 5, None, 3, None, None, None, None, None, None, None)),
# ('农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额
# ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
...
...
@@ -374,11 +402,10 @@ HEADERS_MAPPING.update(
#
# # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
#
# # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
# ('建设银行', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
# ]
OTHER_TUPLE
=
(
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)
# {
# "0":"其他",
# "1":"普通打印-全表格-中国农业银行",
...
...
@@ -408,67 +435,163 @@ HEADERS_MAPPING.update(
# "22":"针式打印-部分格线-邮储银行一本通绿卡"
# }
# CLASSIFY_LIST = [
# ('其他', OTHER_TUPLE),
# ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)),
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
# ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
# ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
# ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)),
#
# ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
# ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
# ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)),
# ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)),
# ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)),
# ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
#
# ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)),
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
# ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
# ]
# "4":"普通打印-全表格-中国银行",
# "5":"普通打印-全表格-农业银行-10列",
# "6":"普通打印-全表格-农业银行-10列-1",
# "7":"普通打印-全表格-农业银行-9列",
# "8":"普通打印-全表格-北京银行",
# "9":"普通打印-全表格-工商银行",
# "10":"普通打印-全表格-工商银行-电子账单",
# "11":"普通打印-全表格-建设银行",
# "12":"普通打印-全表格-微信账单",
# "13":"普通打印-全表格-支付宝账单",
# "14":"普通打印-无格线-交通银行",
# "15":"普通打印-无格线-储蓄银行-5列",
# "16":"普通打印-无格线-储蓄银行-8列",
# "17":"普通打印-无格线-农业银行-扩张缩进",
# "18":"普通打印-无格线-农业银行-整数",
# "19":"普通打印-无格线-招商银行",
# "20":"普通打印-无格线-招商银行-电子账单",
# "21":"普通打印-无格线-民生银行",
# "22":"普通打印-部分格线-横版-中信银行",
# "23":"普通打印-部分格线-竖版-农业银行-5列",
# "24":"普通打印-部分格线-竖版-农业银行-8列",
# "25":"普通打印-部分格线-竖版-农业银行-窄页",
# "26":"普通打印-部分格线-竖版-平安电子账单",
# "27":"普通打印-部分格线-竖版-建设银行-电子账单",
# "34":"针式打印-全格线-建设银行",
# "35":"针式打印-部分格线-竖版-邮储银行",
# "36":"针式打印-部分格线-竖版-邮储银行-绿卡",
CLASSIFY_LIST
=
[
(
'其他'
,
(
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'农业银行'
,
(
1
,
None
,
3
,
5
,
None
,
8
,
7
,
6
,
None
,
None
,
None
,
None
,
None
)),
(
'中国银行'
,
(
1
,
2
,
4
,
5
,
6
,
9
,
10
,
11
,
12
,
None
,
None
,
None
,
None
)),
(
'北京银行'
,
(
1
,
None
,
4
,
5
,
None
,
2
,
6
,
7
,
None
,
None
,
3
,
None
,
None
)),
(
'工商银行'
,
(
1
,
None
,
9
,
10
,
None
,
7
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'建设银行'
,
(
None
,
None
,
None
,
None
,
None
,
2
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'微信'
,
(
2
,
None
,
6
,
None
,
None
,
3
,
7
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'支付宝'
,
(
2
,
None
,
None
,
6
,
None
,
3
,
None
,
None
,
None
,
None
,
None
,
4
,
5
)),
(
'交通银行'
,
(
1
,
None
,
5
,
6
,
None
,
3
,
None
,
None
,
None
,
None
,
4
,
None
,
None
)),
(
'农业银行'
,
(
1
,
None
,
3
,
None
,
None
,
2
,
None
,
4
,
None
,
None
,
None
,
None
,
None
)),
(
'农业银行'
,
(
1
,
2
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'招商银行'
,
(
1
,
None
,
3
,
4
,
None
,
6
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'招商银行电子版'
,
(
1
,
None
,
3
,
4
,
None
,
5
,
6
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'民生银行'
,
(
None
,
None
,
5
,
6
,
None
,
None
,
7
,
None
,
8
,
None
,
None
,
None
,
None
)),
(
'中信银行'
,
(
1
,
None
,
None
,
5
,
None
,
2
,
6
,
7
,
8
,
None
,
None
,
3
,
4
)),
(
'农业银行'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'农业银行'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'农业银行'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'平安电子账单'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'建设银行'
,
(
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'邮储银行'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'邮储银行'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'普通打印-全表格-中国银行'
,
(
1
,
2
,
4
,
5
,
6
,
9
,
10
,
11
,
12
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-农业银行-10列'
,
(
1
,
None
,
None
,
5
,
None
,
8
,
7
,
6
,
None
,
None
,
None
,
3
,
4
)),
(
'普通打印-全表格-农业银行-10列-1'
,
(
2
,
None
,
4
,
5
,
None
,
3
,
7
,
6
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-农业银行-9列'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
8
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-北京银行'
,
(
1
,
None
,
4
,
5
,
None
,
2
,
6
,
7
,
None
,
None
,
3
,
None
,
None
)),
(
'普通打印-全表格-工商银行'
,
(
1
,
None
,
9
,
10
,
None
,
7
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-工商银行-电子账单'
,
(
1
,
None
,
9
,
10
,
None
,
7
,
11
,
12
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-建设银行'
,
(
3
,
None
,
4
,
5
,
None
,
2
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-微信账单'
,
(
2
,
None
,
6
,
None
,
None
,
3
,
7
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-全表格-支付宝账单'
,
(
2
,
None
,
None
,
6
,
None
,
3
,
None
,
None
,
None
,
None
,
None
,
4
,
5
)),
(
'普通打印-无格线-交通银行'
,
(
1
,
None
,
5
,
6
,
None
,
3
,
None
,
None
,
None
,
None
,
4
,
None
,
None
)),
(
'普通打印-无格线-储蓄银行-5列'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-储蓄银行-8列'
,
(
1
,
None
,
4
,
5
,
None
,
2
,
6
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-农业银行-扩张缩进'
,
(
1
,
2
,
5
,
6
,
None
,
4
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-农业银行-整数'
,
(
1
,
None
,
3
,
None
,
None
,
2
,
None
,
4
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-招商银行'
,
(
1
,
None
,
3
,
4
,
None
,
6
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-招商银行-电子账单'
,
(
1
,
None
,
3
,
4
,
None
,
5
,
6
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-无格线-民生银行'
,
(
None
,
None
,
5
,
6
,
None
,
None
,
7
,
None
,
8
,
None
,
None
,
None
,
None
)),
(
'普通打印-部分格线-横版-中信银行'
,
(
1
,
None
,
None
,
5
,
None
,
2
,
6
,
7
,
8
,
None
,
None
,
3
,
4
)),
(
'普通打印-部分格线-竖版-农业银行-5列'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-部分格线-竖版-农业银行-8列'
,
(
1
,
None
,
None
,
6
,
None
,
3
,
8
,
7
,
None
,
None
,
None
,
4
,
5
)),
(
'普通打印-部分格线-竖版-农业银行-窄页'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
7
,
6
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-部分格线-竖版-平安电子账单'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'普通打印-部分格线-竖版-建设银行-电子账单'
,
(
5
,
None
,
6
,
7
,
None
,
2
,
None
,
9
,
None
,
None
,
None
,
None
,
None
)),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
(
'针式打印-全格线-建设银行'
,
OTHER_TUPLE
),
(
'针式打印-部分格线-竖版-邮储银行'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'针式打印-部分格线-竖版-邮储银行-绿卡'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'其他'
,
OTHER_TUPLE
),
]
# ----------license相关------------------------------------------------------------------------------------------------
# "0":"AVT Invioce",
# "1":"二手车发票",
# "2":"其他",
# "3":"护照",
# "28":"机动车登记证",
# "29":"机动车销售统一发票",
# "30":"港澳通行证",
# "31":"营业执照",
# "32":"行驶证",
# "33":"身份证",
# "37":"银行卡"
# 其他
OTHER_CLASSIFY
=
2
# 身份证
IC_CN_NAME
=
'身份证'
IC_CLASSIFY
=
33
# 增值税发票
VAT_CN_NAME
=
'增值税发票'
VAT_CLASSIFY
=
0
# 机动车登记证书
MVC_CN_NAME
=
'机动车登记证书'
MVC_CLASSIFY
=
28
# 机动车销售统一发票
MVI_CN_NAME
=
'机动车销售统一发票'
MVI_CLASSIFY
=
29
IC_PID
=
VAT_PID
=
MVC_PID
=
MVI_PID
=
None
# 营业执照
BL_KEY
=
'bl'
BL_CN_NAME
=
'营业执照'
BL_CLASSIFY
=
31
BL_PID
=
41
# 二手车发票
UCI_KEY
=
'uci'
UCI_CN_NAME
=
'二手车发票'
UCI_CLASSIFY
=
1
UCI_PID
=
60
# 港澳台通行证
EEP_KEY
=
'eep'
EEP_CN_NAME
=
'港澳台通行证'
EEP_CLASSIFY
=
30
EEP_PID
=
1018
# 行驶证
DL_KEY
=
'dl'
DL_CN_NAME
=
'行驶证'
DL_CLASSIFY
=
32
DL_PID
=
5
# 护照
PP_KEY
=
'pp'
PP_CN_NAME
=
'护照'
PP_CLASSIFY
=
3
PP_PID
=
8
# 银行卡
BC_KEY
=
'bc'
# 身份证
IC_KEY
=
'ic'
# 机动车登记证书
MVC_KEY
=
'mvc'
# 机动车销售统一发票
MVI_KEY
=
'mvi'
# 增值税发票
VAT_KEY
=
'vat'
LICENSE_ORDER
=
((
MVI_KEY
,
'机动车销售统一发票'
),
(
IC_KEY
,
'身份证'
),
(
BC_KEY
,
'银行卡'
),
(
BL_KEY
,
'营业执照'
),
(
UCI_KEY
,
'二手车发票'
),
(
EEP_KEY
,
'港澳台通行证'
),
(
DL_KEY
,
'行驶证'
),
(
PP_KEY
,
'护照'
),
(
MVC_KEY
,
'机动车登记证书'
),
(
VAT_KEY
,
'增值税发票'
))
BC_CN_NAME
=
'银行卡'
BC_CLASSIFY
=
37
BC_PID
=
4
BC_FIELD
=
((
'CardNum'
,
'银行卡号'
),
(
'BankName'
,
'发卡行名称'
),
(
'CardName'
,
'银行卡名称'
),
...
...
@@ -478,14 +601,19 @@ BC_FIELD = (('CardNum', '银行卡号'),
SUCCESS_CODE_SET
=
{
'0'
,
0
}
BC_PID
=
4
OTHER_SET
=
{
0
,
1
,
2
}
BS_SET
=
{
10
,
11
,
12
}
LICENSE_SET_1
=
{
110
,
111
,
112
}
LICENSE_SET_2
=
{
1110
,
1111
,
1112
}
CLASSIFY_PID_DICT
=
{
0
:
(
4
,
BC_KEY
)
# 银行卡
}
LICENSE_ORDER
=
((
MVI_CLASSIFY
,
(
MVI_PID
,
MVI_CN_NAME
)),
(
IC_CLASSIFY
,
(
IC_PID
,
IC_CN_NAME
)),
(
BC_CLASSIFY
,
(
BC_PID
,
BC_CN_NAME
)),
(
BL_CLASSIFY
,
(
BL_PID
,
BL_CN_NAME
)),
(
UCI_CLASSIFY
,
(
UCI_PID
,
UCI_CN_NAME
)),
(
EEP_CLASSIFY
,
(
EEP_PID
,
EEP_CN_NAME
)),
(
DL_CLASSIFY
,
(
DL_PID
,
DL_CN_NAME
)),
(
PP_CLASSIFY
,
(
PP_PID
,
PP_CN_NAME
)),
(
MVC_CLASSIFY
,
(
MVC_PID
,
MVC_CN_NAME
)),
(
VAT_CLASSIFY
,
(
VAT_PID
,
VAT_CN_NAME
)))
LICENSE_CLASSIFY_MAPPING
=
dict
(
LICENSE_ORDER
)
OTHER_CLASSIFY_SET
=
{
OTHER_CLASSIFY
}
LICENSE_CLASSIFY_SET_1
=
{
IC_CLASSIFY
,
VAT_CLASSIFY
,
MVC_CLASSIFY
,
MVI_CLASSIFY
}
LICENSE_CLASSIFY_SET_2
=
{
BL_CLASSIFY
,
UCI_CLASSIFY
,
EEP_CLASSIFY
,
DL_CLASSIFY
,
PP_CLASSIFY
,
BC_CLASSIFY
}
...
...
src/apps/doc/management/commands/doc_ocr_process.py
View file @
1c6d880
...
...
@@ -81,9 +81,14 @@ class Command(BaseCommand, LoggerMixin):
return
doc_data_path
,
excel_path
,
src_excel_path
,
pdf_path
@staticmethod
def
bs_process
(
wb
,
sheets
,
bs_summary
,
unknown_summary
,
pno
,
img_idx
,
classify
,
confidence
):
def
bs_process
(
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
img_path
,
classify
):
sheets
=
ocr_data
.
get
(
'data'
,
[])
if
not
sheets
:
return
confidence
=
ocr_data
.
get
(
'confidence'
,
1
)
img_name
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
img_path
))
for
i
,
sheet
in
enumerate
(
sheets
):
sheet_name
=
'
page_{0}_img_{1}_{2}'
.
format
(
pno
,
img_idx
,
i
)
sheet_name
=
'
{0}_{1}'
.
format
(
img_name
,
i
)
# ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间']
summary
=
sheet
.
get
(
'summary'
)
card
=
summary
[
1
]
...
...
@@ -139,38 +144,52 @@ class Command(BaseCommand, LoggerMixin):
words
=
cell
.
get
(
'words'
)
ws
.
cell
(
row
=
r1
+
1
,
column
=
c1
+
1
,
value
=
words
)
def
license2_process
(
self
,
img_path
,
license_summary
,
pid
,
license_key
):
with
open
(
img_path
,
'rb'
)
as
f
:
base64_data
=
base64
.
b64encode
(
f
.
read
())
# 获取解码后的base64值
filedata
=
base64_data
.
decode
()
# pid 产品的pid, key, secret 登录之后能够查看到
datas
=
{
"pid"
:
str
(
pid
),
"key"
:
conf
.
OCR_KEY
,
"secret"
:
conf
.
OCR_SECRET
,
"file"
:
filedata
}
r
=
requests
.
post
(
self
.
ocr_url_2
,
data
=
datas
)
if
r
.
status_code
==
200
:
# 识别结果
response
=
r
.
json
()
if
response
.
get
(
'ErrorCode'
)
in
consts
.
SUCCESS_CODE_SET
:
@staticmethod
def
license1_process
(
ocr_data
,
license_summary
,
classify
):
license_data
=
ocr_data
.
get
(
'data'
,
[])
if
not
license_data
:
return
_
,
license_key
=
consts
.
CLASSIFY_PID_DICT
.
get
(
classify
)
for
license_dict
in
license_data
:
res_list
=
[]
for
field
,
value
in
license_dict
.
items
():
res_list
.
append
((
field
,
value
))
license_summary
.
setdefault
(
license_key
,
[])
.
append
(
res_list
)
@staticmethod
def
license2_process
(
ocr_res_2
,
license_summary
,
pid
,
classify
):
if
ocr_res_2
.
get
(
'ErrorCode'
)
in
consts
.
SUCCESS_CODE_SET
:
if
pid
==
consts
.
BC_PID
:
# 银行卡
res_list
=
[]
for
en_key
,
chn_key
in
consts
.
BC_FIELD
:
res_list
.
append
((
chn_key
,
response
.
get
(
en_key
,
''
)))
license_summary
.
setdefault
(
license_ke
y
,
[])
.
append
(
res_list
)
res_list
.
append
((
chn_key
,
ocr_res_2
.
get
(
en_key
,
''
)))
license_summary
.
setdefault
(
classif
y
,
[])
.
append
(
res_list
)
else
:
# 营业执照、行驶证等
for
result_dict
in
response
.
get
(
'ResultList'
,
[]):
for
result_dict
in
ocr_res_2
.
get
(
'ResultList'
,
[]):
res_list
=
[]
for
field_dict
in
result_dict
.
get
(
'FieldList'
,
[]):
res_list
.
append
((
field_dict
.
get
(
'chn_key'
,
''
),
field_dict
.
get
(
'value'
,
''
)))
license_summary
.
setdefault
(
license_key
,
[])
.
append
(
res_list
)
res_list
.
append
(
(
field_dict
.
get
(
'chn_key'
,
''
),
field_dict
.
get
(
'value'
,
''
)))
license_summary
.
setdefault
(
classify
,
[])
.
append
(
res_list
)
# async def fetch_ocr_result(self, img_path):
# async with aiohttp.ClientSession(
# headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
# ) as session:
# json_data = self.get_ocr_json(img_path)
# async with session.post(self.ocr_url, json=json_data) as response:
# return await response.json()
#
# async def img_2_ocr_2_wb(self, wb, img_path, summary):
# res = await self.fetch_ocr_result(img_path)
# self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
# sheets_list = res.get('result').get('res')
# img_name = os.path.basename(img_path)
# self.append_sheet(wb, sheets_list, img_name, summary)
def
ocr_2_wb
(
self
,
res
,
wb
,
pno
,
img_idx
,
bs_summary
,
unknown_summary
,
license_summary
):
def
img_2_ocr_2_wb
(
self
,
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
):
# # 流水
# res = {
# 'code': 1,
...
...
@@ -178,7 +197,7 @@ class Command(BaseCommand, LoggerMixin):
# 'data': {
# 'classify': 0,
# 'confidence': 0.999,
# '
sheets
': [
# '
data
': [
# {
# 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# 'cells': []
...
...
@@ -220,55 +239,52 @@ class Command(BaseCommand, LoggerMixin):
# 'confidence': 0.999,
# }
# }
data
=
res
.
get
(
'data'
,
{})
classify
=
data
.
get
(
'classify'
)
with
open
(
img_path
,
'rb'
)
as
f
:
base64_data
=
base64
.
b64encode
(
f
.
read
())
# 获取解码后的base64值
file_data
=
base64_data
.
decode
()
json_data_1
=
{
"file"
:
file_data
}
response_1
=
requests
.
post
(
self
.
ocr_url_1
,
data
=
json_data_1
)
if
response_1
.
status_code
==
200
:
ocr_res_1
=
response_1
.
json
()
self
.
cronjob_log
.
info
(
'{0} [ocr_1 result] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_1
))
if
ocr_res_1
.
get
(
'code'
)
==
1
:
ocr_data
=
ocr_res_1
.
get
(
'data'
,
{})
classify
=
ocr_data
.
get
(
'classify'
)
if
classify
is
None
:
return
elif
classify
in
consts
.
OTHER_SET
:
# 其他类
return
elif
classify
in
consts
.
BS_SET
:
# 流水处理
sheets
=
data
.
get
(
'sheets'
,
[])
if
not
sheets
:
elif
classify
in
consts
.
OTHER_CLASSIFY_SET
:
# 其他类
return
confidence
=
data
.
get
(
'confidence'
,
1
)
self
.
bs_process
(
wb
,
sheets
,
bs_summary
,
unknown_summary
,
pno
,
img_idx
,
classify
,
confidence
)
elif
classify
in
consts
.
LICENSE_SET_1
:
# 证件1
# self.license1_process() # TODO license1
elif
classify
in
consts
.
LICENSE_CLASSIFY_SET_1
:
# 证件1
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
)
elif
classify
in
consts
.
LICENSE_CLASSIFY_SET_2
:
# 证件2
pid
,
_
=
consts
.
LICENSE_CLASSIFY_MAPPING
.
get
(
classify
)
json_data_2
=
{
"pid"
:
str
(
pid
),
"key"
:
conf
.
OCR_KEY
,
"secret"
:
conf
.
OCR_SECRET
,
"file"
:
file_data
}
response_2
=
requests
.
post
(
self
.
ocr_url_2
,
data
=
json_data_2
)
if
response_2
.
status_code
==
200
:
# 识别结果
ocr_res_2
=
response_2
.
json
()
self
.
cronjob_log
.
info
(
'{0} [ocr_2 result] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_2
))
self
.
license2_process
(
ocr_res_2
,
license_summary
,
pid
,
classify
)
else
:
raise
Exception
(
'ocr 2 error, img_path={0}'
.
format
(
img_path
))
else
:
# 流水处理
self
.
bs_process
(
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
img_path
,
classify
)
else
:
pass
elif
classify
in
consts
.
LICENSE_SET_2
:
# 证件2
pid
,
license_key
=
consts
.
CLASSIFY_PID_DICT
.
get
(
classify
)
self
.
license2_process
(
license_summary
,
pid
,
license_key
)
# TODO reuse img data?
# async def fetch_ocr_result(self, img_path):
# async with aiohttp.ClientSession(
# headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
# ) as session:
# json_data = self.get_ocr_json(img_path)
# async with session.post(self.ocr_url, json=json_data) as response:
# return await response.json()
#
# async def img_2_ocr_2_wb(self, wb, img_path, summary):
# res = await self.fetch_ocr_result(img_path)
# self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
# sheets_list = res.get('result').get('res')
# img_name = os.path.basename(img_path)
# self.append_sheet(wb, sheets_list, img_name, summary)
else
:
raise
Exception
(
'ocr 1 error, img_path={0}'
.
format
(
img_path
))
def
fetch_ocr_result
(
self
,
img_path
):
files
=
[
(
'img'
,
open
(
img_path
,
'rb'
))
]
response
=
requests
.
request
(
"POST"
,
self
.
ocr_url_1
,
files
=
files
)
if
response
.
status_code
==
200
:
return
response
.
json
()
def
img_2_ocr_2_wb
(
self
,
wb
,
img_info
,
bs_summary
,
unknown_summary
,
license_summary
):
res
=
self
.
fetch_ocr_result
(
img_info
[
0
])
self
.
cronjob_log
.
info
(
'{0} [fetch ocr result success] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_info
[
0
],
res
))
if
res
.
get
(
'code'
)
==
1
:
self
.
ocr_2_wb
(
res
,
wb
,
img_info
[
1
],
img_info
[
2
],
bs_summary
,
unknown_summary
,
license_summary
)
@staticmethod
def
get_most
(
value_list
):
...
...
@@ -414,7 +430,6 @@ class Command(BaseCommand, LoggerMixin):
# EDMS异常:下载异常-->回队列-->邮件;上传异常-->重新上传队列-->邮件
# 算法异常:第一道异常-->识别失败-->邮件;第二道异常-->识别失败-->邮件
# TODO 数据库断联问题
# TODO 非流水证件处理
# TODO EDMS API GATEWAY
def
handle
(
self
,
*
args
,
**
kwargs
):
sleep_second
=
int
(
conf
.
SLEEP_SECOND
)
...
...
@@ -466,8 +481,8 @@ class Command(BaseCommand, LoggerMixin):
# loop.run_until_complete(asyncio.wait(tasks))
# loop.close()
for
img_
info
in
pdf_handler
.
img_info
_list
:
self
.
img_2_ocr_2_wb
(
wb
,
img_
info
,
bs_summary
,
unknown_summary
,
license_summary
)
for
img_
path
in
pdf_handler
.
img_path
_list
:
self
.
img_2_ocr_2_wb
(
wb
,
img_
path
,
bs_summary
,
unknown_summary
,
license_summary
)
self
.
cronjob_log
.
info
(
'{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'
.
format
(
self
.
log_base
,
bs_summary
,
unknown_summary
,
license_summary
))
...
...
src/apps/doc/ocr/wb.py
View file @
1c6d880
...
...
@@ -168,7 +168,7 @@ class BSWorkbook(Workbook):
month_mapping
.
setdefault
(
month_list
[
-
1
],
[])
.
insert
(
0
,
(
ws
.
title
,
idx_list
[
-
1
]
+
min_row
,
ws
.
max_row
,
0
))
def
build_metadata_rows
(
self
,
c
lassify
,
confidence
,
rol
e
,
code
,
print_time
,
start_date
,
end_date
):
def
build_metadata_rows
(
self
,
c
onfidenc
e
,
code
,
print_time
,
start_date
,
end_date
):
if
start_date
is
None
or
end_date
is
None
:
timedelta
=
None
else
:
...
...
@@ -176,10 +176,6 @@ class BSWorkbook(Workbook):
metadata_rows
=
[
(
'流水识别置信度'
,
confidence
),
self
.
blank_row
,
(
'分类结果'
,
classify
),
self
.
blank_row
,
(
'户名'
,
role
),
self
.
blank_row
,
self
.
code_header
,
]
metadata_rows
.
extend
(
code
)
...
...
@@ -200,19 +196,19 @@ class BSWorkbook(Workbook):
ms
=
self
.
create_sheet
(
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
card
))
return
ms
def
build_meta_sheet
(
self
,
card
,
c
lassify
,
confidence
,
rol
e
,
code
,
print_time
,
start_date
,
end_date
):
metadata_rows
=
self
.
build_metadata_rows
(
c
lassify
,
confidence
,
rol
e
,
code
,
print_time
,
start_date
,
end_date
)
def
build_meta_sheet
(
self
,
card
,
c
onfidenc
e
,
code
,
print_time
,
start_date
,
end_date
):
metadata_rows
=
self
.
build_metadata_rows
(
c
onfidenc
e
,
code
,
print_time
,
start_date
,
end_date
)
ms
=
self
.
create_meta_sheet
(
card
)
for
row
in
metadata_rows
:
ms
.
append
(
row
)
return
ms
def
build_month_sheet
(
self
,
role
,
month_mapping
,
ms
,
is_reverse
):
def
build_month_sheet
(
self
,
card
,
month_mapping
,
ms
,
is_reverse
):
tmp_ws
=
self
.
create_sheet
(
'tmp_ws'
)
for
month
in
sorted
(
month_mapping
.
keys
()):
# 3.1.拷贝数据
parts
=
month_mapping
.
get
(
month
)
new_ws
=
self
.
create_sheet
(
'{0}({1})'
.
format
(
month
,
role
))
new_ws
=
self
.
create_sheet
(
'{0}({1})'
.
format
(
month
,
card
[
-
6
:]
))
new_ws
.
append
(
consts
.
FIXED_HEADERS
)
for
part
in
parts
:
ws
=
self
.
get_sheet_by_name
(
part
[
0
])
...
...
@@ -338,9 +334,7 @@ class BSWorkbook(Workbook):
# 2.元信息提取表
ms
=
self
.
build_meta_sheet
(
card
,
summary
[
'classify'
],
summary
[
'confidence'
],
summary
[
'role'
],
summary
[
'code'
],
summary
[
'print_time'
],
start_date
,
...
...
@@ -359,13 +353,16 @@ class BSWorkbook(Workbook):
self
.
remove
(
self
.
get_sheet_by_name
(
sheet
))
def
license_rebuild
(
self
,
license_summary
):
for
en_key
,
cn_key
in
consts
.
LICENSE_ORDER
:
ws
=
self
.
create_sheet
(
cn_key
)
for
bl
in
license_summary
.
get
(
en_key
,
[]):
for
classify
,
(
_
,
name
)
in
consts
.
LICENSE_ORDER
:
res
=
license_summary
.
get
(
classify
)
if
res
is
None
:
continue
ws
=
self
.
create_sheet
(
name
)
for
bl
in
res
:
for
bl_field
in
bl
:
ws
.
append
(
bl_field
)
ws
.
append
((
None
,
))
def
rebuild
(
self
,
bs_summary
,
license_summary
):
self
.
bs_rebuild
(
bs_summary
)
#
self.license_rebuild(license_summary)
self
.
license_rebuild
(
license_summary
)
...
...
src/common/tools/pdf_to_img.py
View file @
1c6d880
...
...
@@ -25,7 +25,7 @@ class PDFHandler:
def
__init__
(
self
,
path
,
img_dir_path
):
self
.
path
=
path
self
.
img_dir_path
=
img_dir_path
self
.
img_
info
_list
=
[]
self
.
img_
path
_list
=
[]
self
.
xref_set
=
set
()
def
get_img_save_path
(
self
,
pno
,
img_index
=
0
,
ext
=
'png'
):
...
...
@@ -38,7 +38,7 @@ class PDFHandler:
pm
=
page
.
getPixmap
(
matrix
=
trans_2
,
alpha
=
False
)
img_save_path
=
self
.
get_img_save_path
(
page
.
number
)
pm
.
writePNG
(
img_save_path
)
self
.
img_
info_list
.
append
((
img_save_path
,
page
.
number
,
0
)
)
self
.
img_
path_list
.
append
(
img_save_path
)
@staticmethod
def
getimage
(
pix
):
...
...
@@ -88,7 +88,7 @@ class PDFHandler:
with
open
(
img_save_path
,
"wb"
)
as
f
:
f
.
write
(
img_data
)
self
.
xref_set
.
add
(
xref
)
self
.
img_
info_list
.
append
((
img_save_path
,
pno
,
img_index
)
)
self
.
img_
path_list
.
append
(
img_save_path
)
@staticmethod
def
split_il
(
il
):
...
...
@@ -179,7 +179,7 @@ class PDFHandler:
img_save_path
=
self
.
get_img_save_path
(
pno
,
img_index
,
im_list
[
0
][
2
])
new_img
.
save
(
img_save_path
)
page_to_png
=
False
self
.
img_
info_list
.
append
((
img_save_path
,
pno
,
img_index
)
)
self
.
img_
path_list
.
append
(
img_save_path
)
# 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
if
page_to_png
:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment