Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
3b1e6657
authored
2021-12-23 11:23:04 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
fix folder
1 parent
6e9f7b32
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
120 additions
and
64 deletions
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/management/commands/folder_wsc_process.py
src/celery_compare/tasks.py
src/apps/doc/management/commands/folder_ocr_process.py
View file @
3b1e665
...
...
@@ -61,72 +61,114 @@ class Command(BaseCommand, LoggerMixin):
def
signal_handler
(
self
,
sig
,
frame
):
self
.
switch
=
False
# 停止处理文件
def
license1_process
(
self
,
ocr_data
,
license_summary
,
classify
,
img_path
):
def
license1_process
(
self
,
ocr_data
,
license_summary
,
classify
):
# 类别:'0'身份证, '1'居住证
license_data
=
ocr_data
.
get
(
'data'
,
[]
)
license_data
=
ocr_data
.
get
(
'data'
)
if
not
license_data
:
return
if
classify
==
consts
.
MVC_CLASSIFY
:
# 车辆登记证 3/4页结果整合
for
mvc_dict
in
license_data
:
try
:
mvc_page
=
mvc_dict
.
pop
(
'page'
)
except
Exception
as
e
:
pass
else
:
if
isinstance
(
license_data
,
dict
):
license_data
.
pop
(
'base64_img'
,
''
)
# 保单
if
classify
==
consts
.
INSURANCE_CLASSIFY
:
product_result
=
[
''
,
''
,
''
]
for
product
in
license_data
.
get
(
'result'
,
{})
.
get
(
'productList'
,
[]):
name
=
product
.
get
(
'name'
,
{})
.
get
(
'words'
,
''
)
if
name
.
find
(
'机动车损失'
)
!=
-
1
:
product_result
[
0
]
=
product
.
get
(
'coverage'
,
{})
.
get
(
'words'
,
''
)
product_result
[
2
]
=
product
.
get
(
'deductible_franchise'
,
{})
.
get
(
'words'
,
''
)
elif
name
.
find
(
'第三者责任'
)
!=
-
1
:
product_result
[
1
]
=
product
.
get
(
'coverage'
,
{})
.
get
(
'words'
,
''
)
special_str
=
license_data
.
get
(
'result'
,
{})
.
get
(
'1stBeneficiary'
,
{})
.
get
(
'words'
,
''
)
special
=
'无'
if
special_str
.
find
(
'宝马'
)
!=
-
1
or
special_str
.
find
(
'先锋国际融资租赁有限公司'
)
!=
-
1
:
special
=
'有'
insurance_ocr_result
=
{
'被保险人姓名'
:
license_data
.
get
(
'result'
,
{})
.
get
(
'insured'
,
{})
.
get
(
'name'
,
{})
.
get
(
'words'
,
''
),
'被保险人证件号码'
:
license_data
.
get
(
'result'
,
{})
.
get
(
'insured'
,
{})
.
get
(
'certiCode'
,
{})
.
get
(
'words'
,
''
),
'车架号'
:
license_data
.
get
(
'result'
,
{})
.
get
(
'vehicle'
,
{})
.
get
(
'VIN'
,
{})
.
get
(
'words'
,
''
),
'机动车损失保险金额'
:
product_result
[
0
],
'机动车第三者责任保险金额'
:
product_result
[
1
],
'机动车损失保险绝对免赔率/绝对免赔额'
:
product_result
[
2
],
'保险费合计'
:
license_data
.
get
(
'result'
,
{})
.
get
(
'premiumSum'
,
{})
.
get
(
'words'
,
''
),
'保险起始日期'
:
license_data
.
get
(
'result'
,
{})
.
get
(
'startDate'
,
{})
.
get
(
'words'
,
''
),
'保险截止日期'
:
license_data
.
get
(
'result'
,
{})
.
get
(
'endDate'
,
{})
.
get
(
'words'
,
''
),
'保单章'
:
license_data
.
get
(
'result'
,
{})
.
get
(
'seal'
,
{})
.
get
(
'words'
,
''
),
'特别约定第一受益人'
:
special
,
}
license_summary
.
setdefault
(
classify
,
[])
.
append
(
insurance_ocr_result
)
# DDA
elif
classify
==
consts
.
DDA_CLASSIFY
:
pro
=
ocr_data
.
get
(
'confidence'
,
0
)
if
pro
<
consts
.
DDA_PRO_MIN
:
return
dda_ocr_result
=
{}
for
key
,
value
in
license_data
.
get
(
'result'
,
{})
.
items
():
dda_ocr_result
[
key
]
=
value
.
get
(
'words'
,
''
)
dda_ocr_result
[
consts
.
DDA_PRO
]
=
pro
license_summary
.
setdefault
(
classify
,
[])
.
append
(
dda_ocr_result
)
# 抵押登记豁免函
elif
classify
==
consts
.
HMH_CLASSIFY
:
hmh_ocr_result
=
{}
for
key
,
value
in
license_data
.
get
(
'words_result'
,
{})
.
items
():
hmh_ocr_result
[
key
]
=
value
.
get
(
'words'
,
''
)
license_summary
.
setdefault
(
classify
,
[])
.
append
(
hmh_ocr_result
)
# 二手车交易凭证
elif
classify
==
consts
.
JYPZ_CLASSIFY
:
jypz_ocr_result
=
{}
for
key
,
value
in
license_data
.
get
(
'result'
,
{})
.
items
():
jypz_ocr_result
[
key
]
=
value
.
get
(
'words'
,
''
)
license_summary
.
setdefault
(
classify
,
[])
.
append
(
jypz_ocr_result
)
# 车辆登记证 3/4页结果整合
elif
classify
==
consts
.
MVC_CLASSIFY
:
rebuild_data_dict
=
{}
mvc_page
=
license_data
.
pop
(
'page'
,
'VehicleRCI'
)
mvc_res
=
license_data
.
pop
(
'results'
,
{})
if
mvc_page
==
'VehicleRegArea'
:
mvc_res
=
mvc_dict
.
pop
(
'results'
,
{})
mvc_dict
[
'机动车登记证书编号'
]
=
mvc_res
.
get
(
'register_no'
,
{})
.
get
(
'words'
,
''
)
for
register_info
in
mvc_res
.
get
(
'register_info'
,
[]):
for
detail_dict
in
register_info
.
get
(
'details'
,
{})
.
values
():
mvc_dict
.
setdefault
(
detail_dict
.
get
(
'chinese_key'
,
'未知'
),
[])
.
append
(
rebuild_data_dict
[
'机动车登记证书编号'
]
=
mvc_res
.
get
(
'机动车登记证书编号'
,
{})
.
get
(
'words'
,
''
)
for
register_info
in
mvc_res
.
get
(
'登记信息'
,
[]):
register_info
.
pop
(
'register_type'
,
None
)
register_info
.
pop
(
'register_type_name'
,
None
)
for
cn_key
,
detail_dict
in
register_info
.
items
():
rebuild_data_dict
.
setdefault
(
cn_key
,
[])
.
append
(
detail_dict
.
get
(
'words'
,
''
))
del
mvc_res
if
classify
==
consts
.
IC_CLASSIFY
:
for
id_card_dict
in
license_data
:
try
:
base64_img
=
id_card_dict
.
pop
(
'base64_img'
)
except
Exception
as
e
:
continue
else
:
card_type
=
-
1
json_data_4
=
{
'mode'
:
1
,
'user_info'
:
{
'image_content'
:
base64_img
,
},
'options'
:
{
'distinguish_type'
:
1
,
'auto_rotate'
:
True
,
},
}
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
start_time
=
time
.
time
()
ocr_4_response
=
requests
.
post
(
self
.
ocr_url_4
,
json
=
json_data_4
)
if
ocr_4_response
.
status_code
!=
200
:
raise
OCR4Exception
(
'ocr_4 status code: {0}'
.
format
(
ocr_4_response
.
status_code
))
except
Exception
as
e
:
self
.
folder_log
.
warn
(
'{0} [ocr_4 failed] [times={1}] [img_path={2}] [error={3}]'
.
format
(
self
.
log_base
,
times
,
img_path
,
traceback
.
format_exc
()))
else
:
ocr_4_res
=
ocr_4_response
.
json
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
if
ocr_4_res
.
get
(
'code'
)
==
0
and
ocr_4_res
.
get
(
'result'
,
{})
.
get
(
'rtn'
)
==
0
:
card_type
=
ocr_4_res
.
get
(
'result'
,
{})
.
get
(
'idcard_distinguish_result'
,
{})
.
get
(
'result'
,
-
1
)
for
cn_key
,
detail_dict
in
mvc_res
.
items
():
rebuild_data_dict
[
cn_key
]
=
detail_dict
.
get
(
'words'
,
''
)
del
mvc_res
license_summary
.
setdefault
(
classify
,
[])
.
append
(
rebuild_data_dict
)
self
.
folder_log
.
info
(
'{0} [ocr_4 success] [img_path={1}] [speed_time={2}]'
.
format
(
self
.
log_base
,
img_path
,
speed_time
))
break
# 身份证真伪
elif
classify
==
consts
.
IC_CLASSIFY
:
id_card_dict
=
{}
card_type
=
license_data
.
get
(
'type'
,
''
)
is_ic
=
card_type
.
startswith
(
'身份证'
)
is_info_side
=
card_type
.
endswith
(
'信息面'
)
id_card_dict
[
'类别'
]
=
'0'
if
is_ic
else
'1'
if
is_ic
:
field_map
=
consts
.
IC_MAP_0
if
is_info_side
else
consts
.
IC_MAP_1
else
:
self
.
folder_log
.
warn
(
'{0} [ocr_4 failed] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
field_map
=
consts
.
RP_MAP_0
if
is_info_side
else
consts
.
RP_MAP_1
for
write_field
,
search_field
in
field_map
:
id_card_dict
[
write_field
]
=
license_data
.
get
(
'words_result'
,
{})
.
get
(
search_field
,
{})
.
get
(
'words'
,
''
)
if
not
is_info_side
:
start_time
=
license_data
.
get
(
'words_result'
,
{})
.
get
(
'签发日期'
,
{})
.
get
(
'words'
,
''
)
end_time
=
license_data
.
get
(
'words_result'
,
{})
.
get
(
'失效日期'
,
{})
.
get
(
'words'
,
''
)
id_card_dict
[
'有效期限'
]
=
'{0}-{1}'
.
format
(
start_time
,
end_time
)
id_card_dict
[
consts
.
IC_TURE_OR_FALSE
]
=
consts
.
IC_RES_MAPPING
.
get
(
card_type
)
id_card_dict
[
consts
.
IC_TURE_OR_FALSE
]
=
consts
.
IC_RES_MAPPING
.
get
(
1
)
license_summary
.
setdefault
(
classify
,
[])
.
append
(
id_card_dict
)
# 购车发票 & 二手车发票
elif
classify
==
consts
.
MVI_CLASSIFY
or
classify
==
consts
.
UCI_CLASSIFY
:
rebuild_data_dict
=
{}
mvi_res
=
license_data
.
pop
(
'result'
,
{})
for
en_key
,
detail_dict
in
mvi_res
.
items
():
rebuild_data_dict
[
detail_dict
.
get
(
'chinese_key'
,
''
)]
=
detail_dict
.
get
(
'words'
,
''
)
rebuild_data_dict
[
'新旧版式'
]
=
license_data
.
get
(
'layout'
,
''
)
license_summary
.
setdefault
(
classify
,
[])
.
append
(
rebuild_data_dict
)
# 其他
else
:
license_summary
.
setdefault
(
classify
,
[])
.
extend
(
license_data
)
@staticmethod
...
...
@@ -166,7 +208,7 @@ class Command(BaseCommand, LoggerMixin):
if
isinstance
(
data_list
,
list
):
for
ocr_data
in
data_list
:
# part_idx = part_idx + 1
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
,
img_path
)
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
)
wb
=
BSWorkbook
(
set
(),
set
(),
set
(),
set
(),
set
())
wb
.
simple_license_rebuild
(
license_summary
,
consts
.
DOC_SCHEME_LIST
[
0
])
...
...
src/apps/doc/management/commands/folder_wsc_process.py
View file @
3b1e665
...
...
@@ -48,6 +48,7 @@ class Finder:
"其他约定与条件英文"
:
""
,
"其他约定与条件中文"
:
""
,
}
def
get_line
(
self
,
ocr_results
,
key_string
):
# 根据指定关键词, 找出与关键词同处一行的字符
top
,
bottom
=
-
1
,
-
1
...
...
@@ -69,6 +70,7 @@ class Finder:
line_text
=
sorted
(
line_text
,
key
=
lambda
x
:
x
[
0
][
0
],
reverse
=
False
)
lines
=
''
.
join
([
i
[
1
]
for
i
in
line_text
])
return
lines
def
page_predict
(
self
,
ocr_results
,
page_template
):
classes
=
[]
for
pno
in
ocr_results
:
...
...
@@ -82,6 +84,7 @@ class Finder:
classes
.
append
([
pno
,
score
])
pred
=
sorted
(
classes
,
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)[
0
]
return
pred
def
get_top_key
(
self
,
ocr_results
,
key_string
):
# 加入过滤词典
"""找到与 key_string 最匹配的字段的 key
"""
...
...
@@ -90,6 +93,7 @@ class Finder:
ratio_list
=
[[
fuzz
.
ratio
(
key_string
,
ocr_results
[
key
][
1
]),
key
]
for
key
in
ocr_results
]
top_key
=
sorted
(
ratio_list
,
key
=
lambda
x
:
x
[
0
])[
-
1
]
return
top_key
def
get_top_iou
(
self
,
ocr_results
,
poly
):
"""求最大IoU
"""
...
...
@@ -108,6 +112,7 @@ class Finder:
return
-
1
,
-
1
top_iou
=
sorted
(
iou_list
,
key
=
lambda
x
:
x
[
0
])[
-
1
]
return
top_iou
def
get_key_value
(
self
,
ocr_results
,
key_string
):
"""根据 key 查找 value
"""
...
...
@@ -139,6 +144,7 @@ class Finder:
else
:
value
=
words
return
value
def
get_contract_No
(
self
):
"""提取左上角的合同编号字段
"""
...
...
@@ -153,6 +159,7 @@ class Finder:
# TODO!!!
contract_No_list
.
append
(
contract_No
)
return
contract_No_list
def
get_info_in_page_3
(
self
):
"""提取第三页上的经销商名称,和经销商统一社会信用代码或公司注册号
"""
...
...
@@ -178,6 +185,7 @@ class Finder:
words
=
self
.
get_key_value
(
self
.
ocr_results
[
pno
],
'统一社会信用代码或公司注册号'
)
dealer_No
=
words
.
replace
(
'O'
,
'0'
)
return
dealer_name
,
dealer_No
def
get_info_in_page_38
(
self
):
"""提取第38页上的经销商名称
"""
...
...
@@ -195,6 +203,7 @@ class Finder:
words
=
re
.
sub
(
r'[(())盖章《]'
,
""
,
words
)
dealer_name
=
words
return
dealer_name
def
get_guarantor
(
self
):
"""提取第10页上保证人段落,所见即所得
"""
...
...
@@ -210,6 +219,7 @@ class Finder:
words
=
words
.
replace
(
'【'
,
'['
)
.
replace
(
'】'
,
']'
)
.
replace
(
','
,
','
)
.
replace
(
'('
,
'('
)
.
replace
(
')'
,
')'
)
guarantor
=
words
return
guarantor
def
get_info_in_page_39
(
self
):
"""提取综合授信合同上的一些字段
"""
...
...
@@ -291,6 +301,7 @@ class Finder:
deposit_chn
=
f
'{words}
%
'
return
amount_eng
,
amount_chn
,
term_start_eng
,
term_end_eng
,
\
term_start_chn
,
term_end_chn
,
deposit_eng
,
deposit_chn
def
get_other_arrangements_and_conditions
(
self
):
"""获取其它约定与条件文本段落
"""
...
...
@@ -311,6 +322,7 @@ class Finder:
words
=
searchObj
.
group
(
1
)
other_arrangements_and_conditions_chn
=
words
return
other_arrangements_and_conditions_eng
,
other_arrangements_and_conditions_chn
def
get_info
(
self
):
# 按照文档页码返回一个合同编号列表,依次表示每一页上识别到的合同编号
contract_No_list
=
self
.
get_contract_No
()
...
...
@@ -337,6 +349,7 @@ class Finder:
self
.
init_result
[
"其他约定与条件中文"
]
=
words_chn
return
self
.
init_result
class
TIFFHandler
:
def
__init__
(
self
,
path
,
img_save_path
):
...
...
@@ -568,6 +581,7 @@ class Command(BaseCommand, LoggerMixin):
if
len
(
true_file_set
)
==
0
and
len
(
os_error_filename_set
)
>
0
:
true_file_set
.
add
(
os_error_filename_set
.
pop
())
for
name
in
true_file_set
:
time
.
sleep
(
10
)
# 防止文件较大时,读取到不完整文件
path
=
os
.
path
.
join
(
input_dir
,
name
)
try
:
...
...
src/celery_compare/tasks.py
View file @
3b1e665
...
...
@@ -989,7 +989,7 @@ def get_se_cms_compare_info_auto(last_obj, application_entity):
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
0
],
hmh_name
))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
1
],
hmh_id
))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
2
],
first_submission_date
))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
3
],
consts
.
SE_STAMP_VALUE
))
#
vehicle_field_input.append((consts.SE_NEW_ADD_FIELD[3], consts.SE_STAMP_VALUE))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
4
],
consts
.
SE_FPL_VALUE
))
bhsj
=
float
(
amount
)
/
1.13
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
5
],
consts
.
SPLIT_STR
.
join
([
...
...
@@ -1464,7 +1464,7 @@ def get_se_cms_compare_info(last_obj, application_entity, detect_list):
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
0
],
hmh_name
))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
1
],
hmh_id
))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
2
],
first_submission_date
))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
3
],
consts
.
SE_STAMP_VALUE
))
#
vehicle_field_input.append((consts.SE_NEW_ADD_FIELD[3], consts.SE_STAMP_VALUE))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
4
],
consts
.
SE_FPL_VALUE
))
bhsj
=
float
(
amount
)
/
1.13
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
5
],
consts
.
SPLIT_STR
.
join
([
...
...
@@ -1499,7 +1499,7 @@ def get_se_cms_compare_info(last_obj, application_entity, detect_list):
gb34_field_input
.
append
((
consts
.
SE_GB_USED_FIELD
[
1
],
main_num
))
gb34_field_input
.
append
((
consts
.
SE_GB_USED_FIELD
[
2
],
first_submission_date
))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
2
],
first_submission_date
))
vehicle_field_input
.
append
((
consts
.
SE_NEW_ADD_FIELD
[
3
],
consts
.
SE_STAMP_VALUE
))
#
vehicle_field_input.append((consts.SE_NEW_ADD_FIELD[3], consts.SE_STAMP_VALUE))
jypz_field_input
.
append
((
'vinNo'
,
vin_no
))
jypz_field_input
.
append
((
'vehicleTransactionAmount'
,
amount
))
jypz_field_input
.
append
((
consts
.
SE_GB_USED_FIELD
[
-
1
],
first_submission_date
))
...
...
@@ -2683,9 +2683,9 @@ def se_compare_process(compare_info, ocr_res_dict, is_gsyh):
for
i
in
cn_reason_list
:
if
i
in
tmp_set
:
continue
elif
i
in
consts
.
BS_REASON
:
tmp_set
.
add
(
i
)
bs_cn_reason_list
.
append
(
i
)
#
elif i in consts.BS_REASON:
#
tmp_set.add(i)
#
bs_cn_reason_list.append(i)
else
:
tmp_set
.
add
(
i
)
last_cn_reason_list
.
append
(
i
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment