Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
1529291b
authored
2020-10-20 11:18:22 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add field order & add RP
1 parent
cc0dc16d
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
236 additions
and
31 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
src/apps/doc/consts.py
View file @
1529291
...
...
@@ -79,7 +79,7 @@ META_SHEET_TITLE = '关键信息提取和展示'
# Fixed header labels, listed in column order.
FIXED_HEADERS = (
    '记账日期',
    '记账时间',
    '金额',
    '余额',
    '交易名称',
    '附言',
    '对方账户名',
    '对方卡号/账号',
    '对方开户行',
    '核对结果',
    '借贷',
    '收入',
    '支出',
)
# Number of fixed headers.
FIXED_COL_AMOUNT = len(FIXED_HEADERS)
# Header label -> 1-based position of that label in FIXED_HEADERS.
# Defined once: the source carried two token-identical copies of this mapping.
BASE_HEADERS_MAPPING = {
    label: position for position, label in enumerate(FIXED_HEADERS, start=1)
}
# 1-based positions of the three headers that are looked up by name.
BORROW_HEADER_COL = BASE_HEADERS_MAPPING['借贷']
INCOME_HEADER_COL = BASE_HEADERS_MAPPING['收入']
OUTLAY_HEADER_COL = BASE_HEADERS_MAPPING['支出']
...
...
@@ -583,47 +583,238 @@ OTHER_CLASSIFY = 2
# ID card
IC_CN_NAME = '身份证'
IC_CLASSIFY = 33
# Empty legacy field order; the per-side orders below carry the real layout.
IC_FIELD_ORDER = ()
# Field orders are tuples of (search_field, write_field) pairs. For the ID
# card every field is written under the same label it is searched by.
IC_FIELD_ORDER_0 = tuple(
    (label, label)
    for label in ('姓名', '公民身份号码', '出生年月', '住址', '性别', '民族')
)
IC_FIELD_ORDER_1 = tuple(
    (label, label)
    for label in ('有效期限', '签发机关')
)
# Residence permit
# The name is passed to create_sheet() as the worksheet title (through
# LICENSE_ORDER), so it must differ from the ID-card name; the previous value
# was a copy of IC_CN_NAME ('身份证').
RP_CN_NAME = '居住证'
RP_CLASSIFY = 10087
# Residence-permit field orders as (search_field, write_field) pairs; each
# field is written under the label it is searched by.
RP_FIELD_ORDER_0 = tuple(
    (label, label)
    for label in ('姓名', '公民身份号码', '出生年月', '住址', '性别')
)
# This order is shared with the ID card (same tuple object).
RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1
# VAT invoice
# Only the effective values are kept: the source assigned VAT_CN_NAME and
# VAT_FIELD_ORDER twice in a row, and the first value of each was overwritten
# before anything could read it.
VAT_CN_NAME = 'VAT普票'
VAT_CLASSIFY = 0
# (search_field, write_field) pairs in output order.
VAT_FIELD_ORDER = (
    ('发票代码', '发票代码'),
    ('发票代码(开具)', '发票代码(开具)'),
    ('发票号码', '发票号码'),
    ('发票号码(开具)', '发票号码(开具)'),
    ('开票日期', '开票日期'),
    ('校验码', '校验码'),
    ('货物或应税劳务、服务名称', '货物或应税劳务、服务名称'),
    ('金额合计', '开具金额合计(不含税)'),
    ('税率', '税率'),
    ('税额合计', '税额合计'),
    ('价税合计小写', '价税合计(小写)'),
    ('价税合计大写', '价税合计(大写)'),
    ('购方名称', '购买方名称'),
    ('购方纳税人识别号', '购买方纳税人识别号'),
    ('购方地址、电话', '购买方地址、电话'),
    ('购方开户行及账号', '购买方开户行及账号'),
    ('销方名称', '销售方名称'),
    ('销方纳税人识别号', '销售方纳税人识别号'),
    ('销方地址、电话', '销售方地址、电话'),
    ('销方开户行及账号', '销售方开户行及账号'),
    ('销售方:(章)', '销售方:(章)'),
    ('备注', '备注'),
)
# Motor vehicle registration certificate
MVC_CN_NAME = '机动车登记证书'
MVC_CLASSIFY = 28
# Empty legacy field orders.
MVC_FIELD_ORDER = ()
MVC_SE_FIELD_ORDER = ()
# Separate classify key under which the SE field orders are registered.
MVC_CLASSIFY_SE = 10086
# (search_field, write_field) pairs in output order.
MVC_FIELD_ORDER_1_2 = (
    ('1.机动车所有人/身份证名称/号码', '机动车所有人/身份证明名称/号码'),
    ('3.登记日期', '登记日期'),
    ('9.车辆识别代号/车架号', '车辆识别代号/车架号'),
    ('32.车辆出厂日期', '车辆出厂日期'),
    ('34.发证日期', '发证日期'),
    ('30.使用性质', '使用性质'),
    ('31.车辆获得方式', '车辆获得方式'),
    ('4.机动车登记编号', '机动车登记编号'),
    # Placeholder entry with no label; presumably yields a blank row.
    ('空行占位', None),
    ('5.车辆类型', '车辆类型'),
    ('6.车辆品牌', '车辆品牌'),
    ('7.车辆型号', '车辆型号'),
    ('8.车身颜色', '车身颜色'),
    ('10.国产/进口', '国产/进口'),
    ('11.发动机号', '发动机号'),
    ('12.发动机型号', '发动机型号'),
    ('15.制造厂名称', '制造厂名称'),
    ('2.登记机关', '登记机关'),
    ('编号', '机动车登记证书编号'),
)
# Every field here is written under the label it is searched by.
MVC_FIELD_ORDER_3_4 = tuple(
    (label, label)
    for label in ('姓名/名称', '身份证明名称/号码', '转移登记日期')
)
# SE variant of the registration-certificate field order, as
# (search_field, write_field) pairs in output order.
MVC_SE_FIELD_ORDER_1_2 = (
    ('9.车辆识别代号/车架号', '车辆识别代号/车架号'),
    ('1.机动车所有人/身份证名称/号码', '机动车所有人/身份证明名称/号码'),
    # Placeholder entry with no label; presumably yields a blank row.
    ('空行占位', None),
    ('3.登记日期', '登记日期'),
    ('32.车辆出厂日期', '车辆出厂日期'),
    ('34.发证日期', '发证日期'),
    ('30.使用性质', '使用性质'),
    ('31.车辆获得方式', '车辆获得方式'),
    ('5.车辆类型', '车辆类型'),
    ('6.车辆品牌', '车辆品牌'),
    ('7.车辆型号', '车辆型号'),
    ('8.车身颜色', '车身颜色'),
    ('10.国产/进口', '国产/进口'),
    ('11.发动机号', '发动机号'),
    ('12.发动机型号', '发动机型号'),
    ('13.燃料种类', '燃料种类'),
    ('14.排量/功率', '排量/功率'),
    ('15.制造厂名称', '制造厂名称'),
    ('16.转向形式', '转向形式'),
    ('17.轮距', '轮距'),
    ('18.轮胎数', '轮胎数'),
    ('19.轮胎规格', '轮胎规格'),
    ('20.钢板弹簧片数', '钢板弹簧片数'),
    ('21.轴距', '轴距'),
    ('22.轴数', '轴数'),
    # NOTE(review): items 23-29 have an empty write label, unlike every other
    # entry. Confirm whether this is intentional or the labels are missing.
    ('23.外廓尺寸', ''),
    ('24.货厢内部尺寸', ''),
    ('25.总质量', ''),
    ('26.核定载质量', ''),
    ('27.核定载客', ''),
    ('28.准牵引总质量', ''),
    ('29.驾驶室载客', ''),
    ('2.登记机关', '登记机关'),
    ('4.机动车登记编号', '机动车登记编号'),
    ('编号', '机动车登记证书编号'),
)
# Every field here is written under the label it is searched by.
MVC_SE_FIELD_ORDER_3_4 = tuple(
    (label, label)
    for label in ('姓名/名称', '身份证明名称/号码', '转移登记日期')
)
# Unified motor vehicle sales invoice
MVI_CN_NAME = '机动车销售统一发票'
MVI_CLASSIFY = 29
# (search_field, write_field) pairs in output order. The source first assigned
# an empty tuple and overwrote it immediately; only the effective value is
# kept. The trailing "nodo" tags are carried over from the original; their
# meaning is not evident from this file.
MVI_FIELD_ORDER = (
    ('发票代码', '发票代码'),
    ('发票号码', '发票号码'),
    ('开票日期', '开票日期'),
    ('不含税价', '不含税价'),
    ('发票类型', '发票联'),
    ('购方名称', '购买方名称'),
    ('购买方身份证号或组织机构代码', '购买方证件号码'),
    ('纳税人识别号', '纳税人识别号'),  # nodo
    ('车辆识别代码', '车架号'),
    ('价税合计小写', '价税合计小写'),
    ('销方名称', '销货单位名称'),
    ('增值税税额', '增值税税额'),
    ('增值税税率', '增值税税率'),  # nodo
    ('发票章有无', '发票章有无'),  # nodo: national unified invoice supervision seal / seller's seal
    ('价税合计大写', '价税合计大写'),  # nodo
    # Unlabelled placeholder entry; presumably yields a blank row.
    ('', None),
    ('发动机号码', '发动机号'),
    ('车辆类型', '车辆类型'),  # nodo
    ('厂牌型号', '厂牌型号'),  # nodo
    ('产地', '产地'),  # nodo
    ('合格证号', '合格证号'),  # nodo
    ('进口证明书号', '进口证明书号'),  # nodo
    ('商检单号', '商检单号'),  # nodo
    ('电话', '电话'),  # nodo
    ('销方纳税人识别号', '销货方纳税人识别号'),
    ('账号', '账号'),  # nodo
    ('地址', '地址'),  # nodo
    ('开户银行', '开户银行'),  # nodo
    ('主管税务机关及代码', '主管税务机关及代码'),  # nodo
    ('吨位', '吨位'),  # nodo
    ('限乘人数', '限乘人数'),  # nodo
)
# These four licence types all have their PID set to None.
IC_PID = None
VAT_PID = None
MVC_PID = None
MVI_PID = None
# Business licence
BL_CN_NAME = '营业执照'
BL_CLASSIFY = 31
BL_PID = 41
# (search_field, write_field) pairs in output order. The source first assigned
# an empty tuple and overwrote it immediately; only the effective value is kept.
BL_FIELD_ORDER = (
    ('注册号', '统一社会信用代码'),
    ('企业名称', '名称'),
    ('企业类型', '类型'),
    ('经营者姓名', '法定代表人'),
    ('成立日期', '成立日期'),
    ('营业期限', '营业期限'),
    ('注册资本', '注册资本'),
    ('地址', '住所'),
    ('经营范围', '经营范围'),
)
# Used-car invoice
UCI_CN_NAME = '二手车发票'
UCI_CLASSIFY = 1
UCI_PID = 60
# (search_field, write_field) pairs in output order. The source first assigned
# an empty tuple and overwrote it immediately; only the effective value is kept.
UCI_FIELD_ORDER = (
    ('发票代码', '发票代码'),
    ('发票号码', '发票号码'),
    ('开票日期', '开票日期'),
    ('车价合计', '车价合计小写'),
    ('发票联', '发票联'),
    ('购方单位', '买方单位/个人'),
    ('购方号码', '买方单位代码/身份证号码'),
    ('车架号码', '车架号'),
    ('车价合计大写', '车价合计大写'),
    ('二手车市场', '二手车市场'),
    ('发票章有无', '发票章有无'),
    # Placeholder entry with no label; presumably yields a blank row.
    ('空行占位', None),
    ('车牌照号', '车牌照号'),
    ('登记证号', '登记证号'),
    ('购方地址', '买方单位/住址'),
    ('车辆类型', '车辆类型'),
    ('厂牌型号', '厂牌型号'),
    ('车管所名称', '转入地车辆管理所名称'),
    ('销方名称', '卖方单位/个人'),
    ('销方号码', '卖方单位代码/身份证号码'),
    ('销方地址', '卖方单位/个人住址'),
)
# Exit-entry permit for Hong Kong, Macao and Taiwan
EEP_CN_NAME = '港澳台通行证'
EEP_CLASSIFY = 30
EEP_PID = 1018
# (search_field, write_field) pairs in output order. The source first assigned
# an empty tuple and overwrote it immediately; only the effective value is kept.
EEP_FIELD_ORDER = (
    ('中文名', '姓名'),  # original note here reads "English name"
    ('证件号码', '证件号码'),
    ('签发次数', '换证次数(签发次数)'),
    ('有效期限', '有效期限'),
    ('出生日期', '出生日期'),
    ('性别', '性别'),
    ('签发机关', '签发机关'),
    ('签发地点', '签发地点'),
)
# Vehicle licence
DL_CN_NAME = '行驶证'
DL_CLASSIFY = 32
DL_PID = 5
# Empty legacy field order; the two orders below carry the real layout.
DL_FIELD_ORDER = ()
# (search_field, write_field) pairs in output order; each write label is
# prefixed with an item number.
DL_FIELD_ORDER_0 = (
    ('号牌号码', '1 号牌号码'),
    ('所有人', '3 所有人'),
    ('使用性质', '5 使用性质'),
    ('车辆识别代码', '7 车辆识别代号'),
    ('注册日期', '9 注册日期'),
    ('发证日期', '10 发证日期'),
    ('车辆类型', '2 车辆类型'),
    ('地址', '4 住址'),
    ('品牌型号', '6 品牌型号'),
    ('发动机号', '8 发动机号码'),
)
DL_FIELD_ORDER_1 = (
    ('号牌号码', '1 号牌号码'),
    ('档案编号', '11 档案编号'),
    ('核定载人数', '12 核定载人数'),
    ('总质量', '13 总质量'),
    ('整备质量', '14 整备质量'),
    # Label corrected from '核对载质量' to match its own search key
    # '核定载质量'.
    ('核定载质量', '15 核定载质量'),
    ('外廓尺寸', '16 外廓尺寸'),
    ('准牵引总质量', '17 准牵引总质量'),
)
# Passport
PP_CN_NAME = '护照'
PP_CLASSIFY = 3
PP_PID = 8
# (search_field, write_field) pairs in output order; write labels are
# bilingual. The source first assigned an empty tuple and overwrote it
# immediately; only the effective value is kept.
PP_FIELD_ORDER = (
    ('类型', '类型/Type'),
    ('英文姓名', '姓名/Name'),
    ('护照号码', '护照号码/Passport No'),
    ('有效期至', '有效期至/Date of expiry'),
    ('签发日期', '签发日期/Date of issue'),
    ('国家码', '国家码/Country Code'),
    ('性别', '性别/Sex'),
    ('国籍', '国籍/Nationality'),
    ('出生日期', '出生日期/Date of birth'),
    ('出生地点', '出生地点/Place of birth'),
    ('签发地点', '签发地点/Place of issue'),
)
# Bank card
BC_CN_NAME = '银行卡'
BC_CLASSIFY = 37
...
...
@@ -640,16 +831,25 @@ BC_FIELD_ORDER = (('BankName', '发卡行名称'),
# Success codes, accepted both as the string '0' and as the integer 0.
SUCCESS_CODE_SET = {0, '0'}
# The 3-field LICENSE_ORDER that used to be assigned here has been removed:
# LICENSE_ORDER is reassigned further down before anything reads it, so the
# earlier value was dead.

# classify -> (key, field_order_yes, field_order_no).
# license_rebuild picks field_order_yes when `key` is present in a licence
# dict and field_order_no otherwise.
FIELD_ORDER_MAP = {
    IC_CLASSIFY: ('有效期限', IC_FIELD_ORDER_1, IC_FIELD_ORDER_0),
    RP_CLASSIFY: ('有效期限', RP_FIELD_ORDER_1, RP_FIELD_ORDER_0),
    DL_CLASSIFY: ('档案编号', DL_FIELD_ORDER_1, DL_FIELD_ORDER_0),
    MVC_CLASSIFY: ('转移登记日期', MVC_FIELD_ORDER_3_4, MVC_FIELD_ORDER_1_2),
    MVC_CLASSIFY_SE: ('转移登记日期', MVC_SE_FIELD_ORDER_3_4, MVC_SE_FIELD_ORDER_1_2),
}
# Ordered (classify, (pid, name, field_order, side_diff, scheme_diff)) entries.
#   name        - passed to create_sheet() by license_rebuild
#   field_order - (search_field, write_field) pairs, or None when the order is
#                 taken from FIELD_ORDER_MAP instead
#   side_diff   - when true, license_rebuild resolves the order per licence
#                 through FIELD_ORDER_MAP
#   scheme_diff - when true, license_rebuild switches to MVC_CLASSIFY_SE for
#                 the second document scheme
# The residence-permit entry must stay after the ID-card entry:
# license_rebuild moves ID-card records whose '类别' is '1' into the
# residence-permit list while it iterates this sequence.
LICENSE_ORDER = (
    (MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, False)),
    (IC_CLASSIFY, (IC_PID, IC_CN_NAME, None, True, False)),
    (RP_CLASSIFY, (None, RP_CN_NAME, None, True, False)),
    (BC_CLASSIFY, (BC_PID, BC_CN_NAME, BC_FIELD_ORDER, False, False)),
    (BL_CLASSIFY, (BL_PID, BL_CN_NAME, BL_FIELD_ORDER, False, False)),
    (UCI_CLASSIFY, (UCI_PID, UCI_CN_NAME, UCI_FIELD_ORDER, False, False)),
    (EEP_CLASSIFY, (EEP_PID, EEP_CN_NAME, EEP_FIELD_ORDER, False, False)),
    (DL_CLASSIFY, (DL_PID, DL_CN_NAME, None, True, False)),
    (PP_CLASSIFY, (PP_PID, PP_CN_NAME, PP_FIELD_ORDER, False, False)),
    (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True)),
    (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False)),
)
# The same entries as a dict: classify -> 5-field tuple.
LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER)
...
...
src/apps/doc/management/commands/doc_ocr_process.py
View file @
1529291
...
...
@@ -159,6 +159,7 @@ class Command(BaseCommand, LoggerMixin):
ed_list
.
append
(
summary
[
6
])
def
license1_process
(
self
,
ocr_data
,
license_summary
,
classify
,
skip_img
,
img_path
):
# 类别:'0'身份证, '1'居住证
license_data
=
ocr_data
.
get
(
'data'
,
[])
if
not
license_data
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
...
...
@@ -174,7 +175,7 @@ class Command(BaseCommand, LoggerMixin):
# res_dict[chn_key] = ocr_res_2.get(en_key, '')
license_summary
.
setdefault
(
classify
,
[])
.
append
(
ocr_res_2
)
else
:
# 营业执照
、行驶证
等
# 营业执照等
for
result_dict
in
ocr_res_2
.
get
(
'ResultList'
,
[]):
res_dict
=
{}
for
field_dict
in
result_dict
.
get
(
'FieldList'
,
[]):
...
...
@@ -224,7 +225,7 @@ class Command(BaseCommand, LoggerMixin):
elif
classify
in
consts
.
LICENSE_CLASSIFY_SET_1
:
# 证件1
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
,
skip_img
,
img_path
)
elif
classify
in
consts
.
LICENSE_CLASSIFY_SET_2
:
# 证件2
pid
,
_
,
_
=
consts
.
LICENSE_CLASSIFY_MAPPING
.
get
(
classify
)
pid
,
_
,
_
,
_
,
_
=
consts
.
LICENSE_CLASSIFY_MAPPING
.
get
(
classify
)
json_data_2
=
{
"pid"
:
str
(
pid
),
"key"
:
conf
.
OCR_KEY
,
...
...
@@ -490,8 +491,7 @@ class Command(BaseCommand, LoggerMixin):
# 识别失败:普通异常,如PDF异常、构建过程异常
# EDMS异常:下载异常-->回队列-->邮件;上传异常-->重新上传队列-->邮件
# 算法异常:第一道异常-->识别失败-->邮件;第二道异常-->识别失败-->邮件
# TODO 协程异步发送OCR请求
# TODO 调用接口重试
# TODO OCR接口调用重试
# TODO 数据库断联问题
def
handle
(
self
,
*
args
,
**
kwargs
):
sleep_second
=
int
(
conf
.
SLEEP_SECOND
)
...
...
src/apps/doc/ocr/wb.py
View file @
1529291
...
...
@@ -360,15 +360,20 @@ class BSWorkbook(Workbook):
self
.
remove
(
self
.
get_sheet_by_name
(
sheet
))
def
license_rebuild
(
self
,
license_summary
,
document_scheme
):
for
classify
,
(
_
,
name
,
field_order
)
in
consts
.
LICENSE_ORDER
:
# 机动车登记证:CA和SE不同顺序
if
classify
==
consts
.
MVC_CLASSIFY
and
document_scheme
==
consts
.
DOC_SCHEME_LIST
[
1
]:
field_order
=
consts
.
MVC_SE_FIELD_ORDER
for
classify
,
(
_
,
name
,
field_order
,
side_diff
,
scheme_diff
)
in
consts
.
LICENSE_ORDER
:
license_list
=
license_summary
.
get
(
classify
)
if
license_list
is
None
:
if
not
license_list
:
continue
ws
=
self
.
create_sheet
(
name
)
if
scheme_diff
and
document_scheme
==
consts
.
DOC_SCHEME_LIST
[
1
]:
classify
=
consts
.
MVC_CLASSIFY_SE
for
license_dict
in
license_list
:
if
classify
==
consts
.
IC_CLASSIFY
and
license_dict
.
get
(
'类别'
)
==
'1'
:
license_summary
.
setdefault
(
consts
.
RP_CLASSIFY
,
[])
.
append
(
license_dict
)
continue
if
side_diff
:
key
,
field_order_yes
,
field_order_no
=
consts
.
FIELD_ORDER_MAP
.
get
(
classify
)
field_order
=
field_order_yes
if
key
in
license_dict
else
field_order_no
for
search_field
,
write_field
in
field_order
:
ws
.
append
((
write_field
,
license_dict
.
get
(
search_field
,
''
)))
ws
.
append
((
None
,
))
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment