Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
3586d37a
authored
2022-01-19 11:39:47 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add ltgt
1 parent
634fd497
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
118 additions
and
24 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/folder_ltgt_process.py
src/celery_compare/tasks.py
src/apps/doc/consts.py
View file @
3586d37
...
...
@@ -832,6 +832,16 @@ MVC_SE_FIELD_ORDER_3_4 = (
(
'解除抵押日期'
,
'解除抵押日期'
),
)
# LTGT variants of the MVC field-order tables.
# Every entry is a 2-tuple of strings. The LTGT folder command walks these
# tables with ``for a, _ in ...`` and uses the first element of each pair as
# a column header / result lookup key.
MVC_SE_FIELD_ORDER_1_2_LTGT = (
    ('1.机动车所有人/身份证名称/号码', '机动车所有人/身份证明名称/号码'),
    ('编号', '机动车登记证书编号'),
)
MVC_SE_FIELD_ORDER_3_4_LTGT = (
    ('身份证名称/号码', '身份证明名称/号码'),
    ('机动车登记证书编号', '机动车登记证书编号'),
    ('抵押登记日期', '抵押登记日期'),
)

# Motor vehicle sales unified invoice
MVI_CN_NAME = '机动车销售统一发票'
MVI_CLASSIFY = 29
...
...
@@ -1421,6 +1431,23 @@ SE_DDA_FIELD = ['applicationId(1)', 'applicationId(2)', 'bankName', 'companyName
ASP_KEY = 'is_asp'

# LTGT contract summary map.
# Key: output field name. Value: a 4-tuple that ``all_res_add_contract``
# unpacks as ``(pno_not_asp, pno_asp, key1, key2)``:
#   - pno_not_asp / pno_asp: page number looked up (as ``str``) in the
#     per-page contract result, depending on the ASP flag;
#   - key1: field key read from that page;
#   - key2: optional sub-key read from the ``key1`` value (``None`` means the
#     ``key1`` value itself is used).
AFC_CON_MAP_LTGT = {
    '合同编号': (1, 1, '合同编号', None),
    '借款人姓名': (2, 2, '借款人及抵押人', 'name'),
    '共借人姓名': (2, 2, '共同借款人及共同抵押人', 'name'),
    '保证人姓名1': (2, 2, '保证人1', 'name'),
    '保证人姓名2': (2, 2, '保证人2', 'name'),
}

# Column order for the LTGT contract sheet; both elements of every pair are
# the same field name.
AFC_CON_FIELD_ORDER_LTGT = (
    ('合同编号', '合同编号'),
    ('借款人姓名', '借款人姓名'),
    ('共借人姓名', '共借人姓名'),
    ('保证人姓名1', '保证人姓名1'),
    ('保证人姓名2', '保证人姓名2'),
)
SE_AFC_CON_MAP
=
{
'合同编号-每页'
:
(
None
,
None
,
'合同编号'
,
None
),
'所购车辆价格-小写-重要条款'
:
(
1
,
1
,
'所购车辆价格'
,
None
),
...
...
src/apps/doc/management/commands/folder_ltgt_process.py
View file @
3586d37
...
...
@@ -58,10 +58,11 @@ class Command(BaseCommand, LoggerMixin):
130
:
'民事调解书'
}
self
.
sheet_content
=
{
128
:
(
'执行裁定书'
,
(
'承办法院'
,
'案号/标号'
,
'被执行人'
,
'债权金额'
,
'诉讼时间'
))
,
129
:
(
'民事判决书'
,
(
'承办法院'
,
'案号/标号'
,
'被告'
,
'判决结果: 贷款本金'
,
'判决结果: 罚息'
,
'判决结果: 律师费'
,
'判决结果: 案件受理费'
,
'诉讼时间'
))
,
130
:
(
'民事调解书'
,
(
'承办法院'
,
'案号/标号'
,
'被告'
,
'协议内容: 支付金额'
,
'协议内容: 案件受理费'
,
'诉讼时间'
))
,
128
:
[
'执行裁定书'
,
[
'承办法院'
,
'案号/标号'
,
'被执行人'
,
'债权金额'
,
'诉讼时间'
]]
,
129
:
[
'民事判决书'
,
[
'承办法院'
,
'案号/标号'
,
'被告'
,
'判决结果: 贷款本金'
,
'判决结果: 罚息'
,
'判决结果: 律师费'
,
'判决结果: 案件受理费'
,
'诉讼时间'
]]
,
130
:
[
'民事调解书'
,
[
'承办法院'
,
'案号/标号'
,
'被告'
,
'协议内容: 支付金额'
,
'协议内容: 案件受理费'
,
'诉讼时间'
]]
,
}
self
.
FILE_KEY
=
'file'
self
.
DATE_KEY
=
'date'
self
.
CLASSIFY_KEY
=
'classify'
self
.
RESULT_KEY
=
'result'
...
...
@@ -84,6 +85,14 @@ class Command(BaseCommand, LoggerMixin):
consts
.
IC_CLASSIFY
:
(
consts
.
IC_CN_NAME
,
'有效期限'
,
consts
.
IC_FIELD_ORDER_3
,
consts
.
IC_FIELD_ORDER_2
),
consts
.
MVC_CLASSIFY
:
(
consts
.
MVC_CN_NAME
,
'机动车登记证书编号'
,
consts
.
MVC_SE_FIELD_ORDER_3_4
,
consts
.
MVC_SE_FIELD_ORDER_1_2
),
}
self
.
field_map_2
=
{
# sheet_name, key_field, side_field_order, src_field_order
consts
.
CONTRACT_CLASSIFY
:
(
consts
.
CONTRACT_CN_NAME
,
None
,
None
,
consts
.
AFC_CON_FIELD_ORDER_LTGT
),
consts
.
VAT_CLASSIFY
:
(
consts
.
VAT_CN_NAME
,
None
,
None
,
consts
.
VATS_FIELD_ORDER
),
consts
.
IC_CLASSIFY
:
(
consts
.
IC_CN_NAME
,
'有效期限'
,
consts
.
IC_FIELD_ORDER_3
,
consts
.
IC_FIELD_ORDER_2
),
consts
.
MVC_CLASSIFY
:
(
consts
.
MVC_CN_NAME
,
'机动车登记证书编号'
,
consts
.
MVC_SE_FIELD_ORDER_3_4_LTGT
,
consts
.
MVC_SE_FIELD_ORDER_1_2_LTGT
),
}
# ocr相关
self
.
ocr_url
=
conf
.
OCR_URL_FOLDER
self
.
ocr_url_2
=
conf
.
OCR2_URL_FOLDER
...
...
@@ -95,7 +104,7 @@ class Command(BaseCommand, LoggerMixin):
def signal_handler(self, sig, frame):
    """Signal callback that clears ``self.switch``.

    ``sig`` and ``frame`` are accepted so the method matches the handler
    call signature; neither is read.
    """
    # Stop processing files.
    self.switch = False
def
contract_process
(
self
,
ocr_data
,
contract_result
,
classify
):
def
contract_process
(
self
,
ocr_data
,
contract_result
,
classify
,
rebuild_contract_result
):
contract_dict
=
ocr_data
.
get
(
'data'
)
if
not
contract_dict
or
contract_dict
.
get
(
'page_num'
)
is
None
or
contract_dict
.
get
(
'page_info'
)
is
None
:
return
...
...
@@ -135,6 +144,26 @@ class Command(BaseCommand, LoggerMixin):
contract_result
.
setdefault
(
classify
,
dict
())
.
setdefault
(
page_num_only
,
[])
.
append
(
rebuild_page_info
)
page_compare_dict
=
{}
for
key
,
value
in
contract_dict
.
get
(
'page_info'
,
{})
.
items
():
if
not
isinstance
(
value
,
dict
):
continue
elif
text_key
in
value
:
if
value
[
text_key
]
is
None
:
page_compare_dict
[
key
]
=
''
elif
isinstance
(
value
[
text_key
],
str
):
page_compare_dict
[
key
]
=
value
[
text_key
]
elif
isinstance
(
value
[
text_key
],
list
):
page_compare_dict
[
key
]
=
value
[
text_key
]
else
:
page_compare_dict
[
key
]
=
{}
for
sub_key
,
sub_value
in
value
.
items
():
if
sub_value
[
text_key
]
is
None
:
page_compare_dict
[
key
][
sub_key
]
=
''
elif
isinstance
(
sub_value
[
text_key
],
str
):
page_compare_dict
[
key
][
sub_key
]
=
sub_value
[
text_key
]
rebuild_contract_result
.
setdefault
(
classify
,
dict
())[
page_num_only
]
=
page_compare_dict
def
license1_process
(
self
,
ocr_data
,
all_res
,
classify
):
# 类别:'0'身份证, '1'居住证
license_data
=
ocr_data
.
get
(
'data'
)
...
...
@@ -253,7 +282,31 @@ class Command(BaseCommand, LoggerMixin):
seperate_path_map
[
c
]
=
os
.
path
.
join
(
seperate_dir
,
new_name
)
return
img_save_path
,
excel_path
,
pdf_save_path
,
seperate_path_map
def
res_process
(
self
,
all_res
,
excel_path
,
classify
,
contract_result
):
@staticmethod
def all_res_add_contract(all_res, rebuild_contract_result):
    """Collapse per-page contract text into one summary row per classify.

    For each classify in ``rebuild_contract_result`` a summary dict is built
    by walking ``consts.AFC_CON_MAP_LTGT`` and is stored in ``all_res`` as a
    one-element list, overwriting any existing entry for that classify.

    Args:
        all_res: dict mutated in place; ``all_res[classify]`` becomes
            ``[summary]``.
        rebuild_contract_result: mapping ``classify -> {page number as str:
            page field dict}``.

    Returns:
        None.
    """
    # The ASP flag is fixed to False here, so the non-ASP page numbers and
    # the shorter page range are always the ones used.
    asp_layout = False
    page_range_stop = 9 if asp_layout else 8

    for classify, pages in rebuild_contract_result.items():
        summary = {}
        for out_field, page_spec in consts.AFC_CON_MAP_LTGT.items():
            page_no_plain, page_no_asp, field_key, sub_key = page_spec
            page_no = page_no_asp if asp_layout else page_no_plain

            if page_no is None:
                # No fixed page for this layout: skip the field when the
                # other layout has one, otherwise collect it from every page.
                if isinstance(page_no_asp, int):
                    continue
                for page_idx in range(1, page_range_stop):
                    per_page_value = pages.get(str(page_idx), {}).get(field_key, '')
                    summary.setdefault(out_field, []).append(per_page_value)
                continue

            page = pages.get(str(page_no), {})
            if sub_key is None:
                summary[out_field] = page.get(field_key, '')
            else:
                summary[out_field] = page.get(field_key, {}).get(sub_key, '')
            # Remember which image the value came from, keyed by field.
            image_paths = summary.setdefault(consts.IMG_PATH_KEY, {})
            image_paths[out_field] = page.get(consts.IMG_PATH_KEY, '')

        all_res[classify] = [summary]
def
res_process
(
self
,
all_res
,
excel_path
,
classify
,
contract_result
,
rebuild_contract_result
):
try
:
wb
=
BSWorkbook
(
set
(),
set
(),
set
(),
set
(),
set
())
for
c
,
res_list
in
all_res
.
items
():
...
...
@@ -274,6 +327,8 @@ class Command(BaseCommand, LoggerMixin):
wb
.
contract_rebuild
(
contract_result
)
wb
.
remove_base_sheet
()
wb
.
save
(
excel_path
)
self
.
all_res_add_contract
(
all_res
,
rebuild_contract_result
)
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [wb build error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
excel_path
,
traceback
.
format_exc
()))
...
...
@@ -284,7 +339,7 @@ class Command(BaseCommand, LoggerMixin):
sep
=
os
.
path
.
sep
+
(
os
.
path
.
altsep
or
''
)
return
os
.
path
.
basename
(
path
.
rstrip
(
sep
))
def
ocr_process
(
self
,
img_path
,
classify
,
all_res
,
seperate_path_map
,
contract_result
):
def
ocr_process
(
self
,
img_path
,
classify
,
all_res
,
seperate_path_map
,
contract_result
,
rebuild_contract_result
):
if
os
.
path
.
exists
(
img_path
):
# TODO 图片验证
with
open
(
img_path
,
'rb'
)
as
f
:
...
...
@@ -332,7 +387,7 @@ class Command(BaseCommand, LoggerMixin):
elif
new_classify
in
consts
.
LICENSE_CLASSIFY_SET_2
:
self
.
license2_process
(
ocr_data
,
all_res
,
new_classify
,
img_path
)
elif
new_classify
in
consts
.
CONTRACT_SET
:
self
.
contract_process
(
ocr_data
,
contract_result
,
new_classify
)
self
.
contract_process
(
ocr_data
,
contract_result
,
new_classify
,
rebuild_contract_result
)
break
else
:
self
.
folder_log
.
warn
(
'{0} [ocr failed] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
...
...
@@ -395,15 +450,15 @@ class Command(BaseCommand, LoggerMixin):
def
images_process
(
self
,
img_path_list
,
classify
,
excel_path
,
seperate_path_map
):
all_res
=
dict
()
contract_result
=
dict
()
rebuild_contract_result
=
dict
()
for
img_path
in
img_path_list
:
self
.
ocr_process
(
img_path
,
classify
,
all_res
,
seperate_path_map
,
contract_result
)
self
.
ocr_process
(
img_path
,
classify
,
all_res
,
seperate_path_map
,
contract_result
,
rebuild_contract_result
)
# if len(all_res) > 0:
self
.
res_process
(
all_res
,
excel_path
,
classify
,
contract_result
)
self
.
res_process
(
all_res
,
excel_path
,
classify
,
contract_result
,
rebuild_contract_result
)
return
all_res
def
pdf_process
(
self
,
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
,
seperate_dir_map
):
if
os
.
path
.
exists
(
path
):
rebuild_res
=
None
img_save_path
,
excel_path
,
pdf_save_path
,
seperate_path_map
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
,
seperate_dir_map
)
pdf_handler
=
PDFHandler
(
path
,
img_save_path
)
...
...
@@ -420,7 +475,9 @@ class Command(BaseCommand, LoggerMixin):
else
:
ocr_result
=
afc_predict
(
pdf_handler
.
pdf_info
)
contract_result
=
dict
()
rebuild_contract_result
=
dict
()
page_res
=
{}
all_res
=
dict
()
for
page_num
,
page_info
in
ocr_result
.
get
(
'page_info'
,
{})
.
items
():
if
isinstance
(
page_num
,
str
)
and
page_num
.
startswith
(
'page_'
):
page_res
[
page_num
]
=
{
...
...
@@ -435,9 +492,10 @@ class Command(BaseCommand, LoggerMixin):
'classify'
:
page_res
[
page_key
]
.
pop
(
'classify'
,
consts
.
OTHER_CLASSIFY
),
'data'
:
page_res
[
page_key
]
}
self
.
contract_process
(
ocr_data
,
contract_result
,
classify
)
self
.
res_process
(
{},
excel_path
,
classify
,
contract_result
)
self
.
contract_process
(
ocr_data
,
contract_result
,
classify
,
rebuild_contract_result
)
self
.
res_process
(
all_res
,
excel_path
,
classify
,
contract_result
,
rebuild_contract_result
)
shutil
.
move
(
path
,
pdf_save_path
)
return
all_res
else
:
try
:
self
.
folder_log
.
info
(
'{0} [pdf to img start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
...
...
@@ -465,7 +523,6 @@ class Command(BaseCommand, LoggerMixin):
def
tif_process
(
self
,
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
,
seperate_dir_map
):
if
os
.
path
.
exists
(
path
):
rebuild_res
=
None
try
:
img_save_path
,
excel_path
,
tiff_save_path
,
seperate_path_map
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
,
seperate_dir_map
)
...
...
@@ -490,7 +547,6 @@ class Command(BaseCommand, LoggerMixin):
return
rebuild_res
def
img_process
(
self
,
name
,
path
,
classify
,
wb_output_dir
,
img_output_dir
,
pdf_output_dir
,
seperate_dir_map
):
rebuild_res
=
None
try
:
img_save_path
,
excel_path
,
_
,
seperate_path_map
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
,
seperate_dir_map
)
...
...
@@ -539,19 +595,27 @@ class Command(BaseCommand, LoggerMixin):
try
:
if
result
[
self
.
CLASSIFY_KEY
]
in
self
.
sheet_content
:
sheet_name
,
head_fields
=
self
.
sheet_content
[
result
[
self
.
CLASSIFY_KEY
]]
first_head_row
=
head_fields
else
:
sheet_name
,
key_field
,
side_field_order
,
field_order
=
self
.
field_map_2
[
result
[
self
.
CLASSIFY_KEY
]]
if
key_field
is
not
None
and
len
(
side_field_order
)
>
len
(
field_order
):
first_head_row
=
[]
for
a
,
_
in
side_field_order
:
first_head_row
.
append
(
a
)
else
:
sheet_name
,
key_field
,
side_field_order
,
field_order
=
self
.
field_map
[
result
[
self
.
CLASSIFY_KEY
]]
first_head_row
=
[]
for
a
,
_
in
field_order
:
first_head_row
.
append
(
a
)
if
key_field
is
not
None
and
key_field
in
result
[
self
.
RESULT_KEY
]:
head_fields
=
[]
for
a
,
b
in
side_field_order
:
if
isinstance
(
b
,
str
):
for
a
,
_
in
side_field_order
:
head_fields
.
append
(
a
)
else
:
head_fields
=
[]
for
a
,
b
in
field_order
:
if
isinstance
(
b
,
str
):
for
a
,
_
in
field_order
:
head_fields
.
append
(
a
)
row
=
[]
row
=
[
result
[
self
.
FILE_KEY
]
]
for
field
in
head_fields
:
ocr_str_or_list
=
result
[
self
.
RESULT_KEY
]
.
get
(
field
,
''
)
if
isinstance
(
ocr_str_or_list
,
list
):
...
...
@@ -563,7 +627,8 @@ class Command(BaseCommand, LoggerMixin):
ws
=
wb
.
get_sheet_by_name
(
sheet_name
)
else
:
ws
=
wb
.
create_sheet
(
sheet_name
)
ws
.
append
(
head_fields
)
first_head_row
.
insert
(
0
,
'文件名'
)
ws
.
append
(
first_head_row
)
ws
.
append
(
row
)
except
Exception
as
e
:
self
.
folder_log
.
info
(
'{0} [daily wb failed] [result={1}] [error={2}]'
.
format
(
...
...
@@ -617,6 +682,7 @@ class Command(BaseCommand, LoggerMixin):
if
len
(
true_file_set
)
==
0
and
len
(
os_error_filename_set
)
>
0
:
true_file_set
.
add
(
os_error_filename_set
.
pop
())
for
name
in
true_file_set
:
time
.
sleep
(
5
)
path
=
os
.
path
.
join
(
input_dir
,
name
)
try
:
...
...
@@ -664,7 +730,8 @@ class Command(BaseCommand, LoggerMixin):
{
self
.
CLASSIFY_KEY
:
c
,
self
.
RESULT_KEY
:
res
,
self
.
DATE_KEY
:
date_str
self
.
DATE_KEY
:
date_str
,
self
.
FILE_KEY
:
name
,
}
)
...
...
src/celery_compare/tasks.py
View file @
3586d37
...
...
@@ -1067,8 +1067,8 @@ def get_se_cms_compare_info_auto(last_obj, application_entity):
# ('accountNo', account_no),
# ]
# bank_info[consts.DDA_EN] = dda_field_input
# if len(bank_info) > 0:
#     compare_info['bankInfo'] = bank_info
if
len
(
bank_info
)
>
0
:
compare_info
[
'bankInfo'
]
=
bank_info
# 银行流水 --------------------------------------------------------------------
if
cms_info
.
get
(
'autoApprovedDetails'
,
{})
.
get
(
'aaType'
,
''
)
in
[
'CAA1'
,
'CAA2'
]:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment