Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
87525e99
authored
2021-11-11 17:39:59 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
fix afc e-contract
1 parent
9bab1769
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
90 additions
and
86 deletions
src/common/electronic_afc_contract/afc_contract_ocr.py
src/common/electronic_afc_contract/get_char.py
src/common/electronic_afc_contract/afc_contract_ocr.py
View file @
87525e9
...
...
@@ -9,8 +9,23 @@ from .get_char import Finder
def predict(pdf_info):
    """Extract the contract fields from the parsed PDF information.

    Every text span found in ``pdf_info`` is first converted into an
    OCR-style entry ``[polygon, text]`` where ``polygon`` is the span's
    bounding box expanded to four corner points
    (``[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]``) and ``text`` is
    the span text with spaces removed. The entries are grouped per page and
    handed to ``Finder`` together with the raw ``pdf_info``.

    Args:
        pdf_info: mapping of page key -> page dict. Each page dict has a
            ``'blocks'`` list; blocks whose ``'type'`` is not ``0`` are
            skipped, the others are read through ``'lines'`` ->
            ``'spans'`` -> ``'bbox'`` / ``'text'``.
            (presumably PyMuPDF ``get_text("dict")`` output - TODO confirm)

    Returns:
        Whatever ``Finder.get_info()`` returns for this document.
    """
    ocr_results = {}
    for pno in pdf_info:
        ocr_results[pno] = {}
        for key, block in enumerate(pdf_info[pno]['blocks']):
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    xmin, ymin, xmax, ymax = bbox
                    # Expand the rectangle into a 4-corner polygon.
                    polygon = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
                    # NOTE(review): as written the first replace maps ":" to
                    # itself; it presumably normalised a full-width colon
                    # originally - confirm against the repository.
                    text = text.replace(":", ":").replace(" ", "")
                    # NOTE(review): the key is the block index, so when a
                    # block holds several spans only the last one is kept -
                    # confirm this is intended.
                    ocr_results[pno][key] = [polygon, text]

    # The Finder works on the information of the whole PDF. It is built
    # exactly once, with the OCR-style results its constructor requires.
    f = Finder(pdf_info, ocr_results=ocr_results)
    results = f.get_info()
    return results
...
...
src/common/electronic_afc_contract/get_char.py
View file @
87525e9
...
...
@@ -11,14 +11,13 @@ from fuzzywuzzy import fuzz
class
Finder
:
def
__init__
(
self
,
pdf_info
):
def
__init__
(
self
,
pdf_info
,
ocr_results
):
self
.
pdf_info
=
pdf_info
self
.
ocr_results
=
ocr_results
self
.
is_asp
=
False
self
.
item
=
{
"words"
:
None
,
"position"
:
None
,
}
def
gen_init_result
(
self
,
is_asp
):
# 格式化算法输出
self
.
init_result
=
{
"page_1"
:
{
"合同编号"
:
self
.
item
,
...
...
@@ -109,8 +108,10 @@ class Finder:
"日期"
:
self
.
item
,
},
}
def poly_to_rectangle(self, poly):
    """Convert a flat polygon into an axis-aligned bounding box.

    Args:
        poly: flat sequence of coordinates ``[x0, y0, x1, y1, ...]``
            (the caller in this file passes the four corners of a
            rectangle, i.e. eight numbers).

    Returns:
        list: ``[xmin, ymin, xmax, ymax]`` enclosing all points.

    Raises:
        ValueError: if ``poly`` is empty.
    """
    # Take the extremes instead of unpacking into repeated names: the
    # result then no longer depends on the order the corners are listed in.
    xs = poly[0::2]
    ys = poly[1::2]
    bbox = [min(xs), min(ys), max(xs), max(ys)]
    return bbox
def
get_contract_no
(
self
,
page_num
):
"""传入页码,查看该页码右上角的编号
...
...
@@ -121,47 +122,41 @@ class Finder:
sting:
"""
contract_no
=
self
.
item
.
copy
()
# contract_no['words'] = ''
# contract_no['position'] = [-1, -1, -1, -1]
# 只看第一页
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'合同编号:'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
contract_no
[
'position'
]
=
bbox
contract_no
[
'words'
]
=
words
for
key
in
self
.
ocr_results
[
page_num
]:
bbox
,
text
=
self
.
ocr_results
[
page_num
][
key
]
if
'合同编号:'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
location
=
self
.
poly_to_rectangle
(
bbox
)
contract_no
[
'words'
]
=
words
contract_no
[
'position'
]
=
location
return
contract_no
def
get_vehicle_price
(
self
,
page_num
=
'0'
):
vehicle_price
=
self
.
item
.
copy
()
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'所购车辆价格为人民币'
in
text
:
words
=
text
.
split
(
'币'
)[
-
1
]
vehicle_price
[
'position'
]
=
bbox
vehicle_price
[
'words'
]
=
words
# vehicle_price['words'] = ''
# vehicle_price['position'] = [-1, -1, -1, -1]
for
key
in
self
.
ocr_results
[
page_num
]:
bbox
,
text
=
self
.
ocr_results
[
page_num
][
key
]
if
'所购车辆价格为人民币'
in
text
:
words
=
text
.
split
(
'币'
)[
-
1
]
location
=
self
.
poly_to_rectangle
(
bbox
)
vehicle_price
[
'words'
]
=
words
vehicle_price
[
'position'
]
=
location
return
vehicle_price
def
get_vin
(
self
,
page_num
=
'0'
):
vin
=
self
.
item
.
copy
()
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'车架号:'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
vin
[
'position'
]
=
bbox
vin
[
'words'
]
=
words
# vin['words'] = ''
# vin['position'] = [-1, -1, -1, -1]
for
key
in
self
.
ocr_results
[
page_num
]:
bbox
,
text
=
self
.
ocr_results
[
page_num
][
key
]
if
'车架号:'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
location
=
self
.
poly_to_rectangle
(
bbox
)
vin
[
'words'
]
=
words
vin
[
'position'
]
=
location
return
vin
def
get_loan_principal
(
self
,
page_num
=
'0'
):
chinese_keywords
=
[
'壹'
,
'贰'
,
'叁'
,
'肆'
,
'伍'
,
'陆'
,
'柒'
,
'捌'
,
'玖'
,
'拾'
,
'佰'
,
'仟'
,
'万'
,
'亿'
,
'元'
,
'角'
,
'分'
,
'零'
,
'整'
]
...
...
@@ -202,7 +197,6 @@ class Finder:
asp_2
[
'position'
]
=
bbox
asp_2
[
'words'
]
=
words
return
upper
,
lower
,
asp_1
,
asp_2
def
get_loan_term
(
self
,
page_num
=
'0'
):
loan_term
=
self
.
item
.
copy
()
all_text
=
''
...
...
@@ -226,10 +220,20 @@ class Finder:
loan_term
[
'position'
]
=
bbox
loan_term
[
'words'
]
=
words
return
loan_term
def mergelist(self, text_list):
    """Glue table cells that were split after a '所购' entry.

    An element containing '所购' absorbs the element that follows it as
    long as that following element contains at least one Chinese
    character; the merged element can in turn absorb the next one.
    Elements without any Chinese character (e.g. amounts) are never merged.

    Args:
        text_list: list of strings in reading order.

    Returns:
        list: a new list with the merged strings; ``text_list`` itself is
        not modified.
    """
    # Matches every character that is NOT a Chinese character, so that
    # substituting it away leaves only the Chinese part of a string.
    pattern = re.compile("[^\u4e00-\u9fa5]")

    merged = []
    for text in text_list:
        # Single left-to-right pass: only look back at the previous
        # (possibly already merged) element, so the last element can
        # never trigger an out-of-range lookup.
        if merged and '所购' in merged[-1] and pattern.sub('', text):
            merged[-1] = merged[-1] + text
        else:
            merged.append(text)
    return merged
def
get_asp_details
(
self
,
page_num
):
asp_details_table_term
=
self
.
item
.
copy
()
asp_details_table
=
[]
asp_details_text_list
=
[]
table
=
False
...
...
@@ -244,26 +248,22 @@ class Finder:
if
'第二条'
in
text
or
'征信管理'
in
text
:
table
=
False
if
table
==
True
:
# print(text)
asp_details_text_list
.
append
(
text
)
asp_details_text_list
=
self
.
mergelist
(
asp_details_text_list
)
for
i
in
range
((
len
(
asp_details_text_list
)
+
2
)
//
3
):
line
=
[]
if
i
==
0
:
line
=
[
asp_details_text_list
[
0
]]
else
:
for
j
in
range
(
3
):
line
.
append
(
asp_details_text_list
[
i
*
3
-
2
+
j
])
asp_details_table
.
append
(
line
)
if
len
(
asp_details_table
)
>
0
:
asp_details_table_term
[
'words'
]
=
asp_details_table
return
asp_details_table_term
def
get_signature
(
self
):
signature
=
self
.
item
.
copy
()
for
block
in
self
.
pdf_info
[
'0'
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
...
...
@@ -275,7 +275,6 @@ class Finder:
signature
[
'words'
]
=
words
signature
[
'position'
]
=
bbox
return
signature
def
get_somebody
(
self
,
top
,
bottom
):
# 指定上下边界后,返回上下边界内的客户信息
_name
=
self
.
item
.
copy
()
...
...
@@ -300,6 +299,7 @@ class Finder:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
y_top
<
bbox
[
3
]
<
y_bottom
:
# print(top, bottom, text)
if
'姓名/名称'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
_name
[
'position'
]
=
bbox
...
...
@@ -309,7 +309,6 @@ class Finder:
_id
[
'position'
]
=
bbox
_id
[
'words'
]
=
words
return
_name
,
_id
def
get_seller
(
self
):
seller
=
self
.
item
.
copy
()
# 先找到 key
...
...
@@ -336,7 +335,6 @@ class Finder:
seller
[
'position'
]
=
bbox
seller
[
'words'
]
=
text
return
seller
def
get_payback_account
(
self
):
account
=
self
.
item
.
copy
()
account_name
=
self
.
item
.
copy
()
...
...
@@ -389,7 +387,6 @@ class Finder:
account_bank
[
'position'
]
=
bbox
account_bank
[
'words'
]
=
words
return
account
,
account_name
,
account_bank
def
get_repayment_schedule
(
self
):
repayment_schedule
=
self
.
item
.
copy
()
# 只看第二页
...
...
@@ -408,23 +405,17 @@ class Finder:
table
=
False
if
table
==
True
:
repayment_schedule_text_list
.
append
(
text
)
for
i
in
range
(
len
(
repayment_schedule_text_list
)
//
5
):
line
=
[]
# 5表示5列的意思
for
j
in
range
(
5
):
line
.
append
(
repayment_schedule_text_list
[
i
*
5
+
j
])
if
str
(
i
+
1
)
==
line
[
1
]:
break
repayment_schedule_table
.
append
(
line
)
if
len
(
repayment_schedule_table
)
>
0
:
repayment_schedule
[
'words'
]
=
repayment_schedule_table
return
repayment_schedule
def
get_signature_role_1
(
self
):
signature_role_1
=
self
.
init_item
.
copy
()
# 先定位签字区域
...
...
@@ -459,7 +450,6 @@ class Finder:
signature_role_1
[
'position'
]
=
position
signature_role_1
[
'words'
]
=
words
return
signature_role_1
def
get_signature_role_2
(
self
):
signature_role_2
=
self
.
init_item
.
copy
()
# 先定位签字区域
...
...
@@ -494,7 +484,6 @@ class Finder:
signature_role_2
[
'position'
]
=
position
signature_role_2
[
'words'
]
=
words
return
signature_role_2
def
get_signature_role_3
(
self
):
signature_role_3
=
self
.
init_item
.
copy
()
# 先定位签字区域
...
...
@@ -529,7 +518,6 @@ class Finder:
signature_role_3
[
'position'
]
=
position
signature_role_3
[
'words'
]
=
words
return
signature_role_3
def
get_signature_role_4
(
self
):
signature_role_4
=
self
.
init_item
.
copy
()
# 先定位签字区域
...
...
@@ -564,7 +552,6 @@ class Finder:
signature_role_4
[
'position'
]
=
position
signature_role_4
[
'words'
]
=
words
return
signature_role_4
def
get_signature_role_5
(
self
):
signature_role_5
=
self
.
init_item
.
copy
()
# 先定位签字区域
...
...
@@ -600,7 +587,6 @@ class Finder:
signature_role_5
[
'position'
]
=
position
signature_role_5
[
'words'
]
=
words
return
signature_role_5
def
get_last_page_signature
(
self
,
page_num
,
top
,
bottom
):
signature_name
=
self
.
item
.
copy
()
signature_date
=
self
.
item
.
copy
()
...
...
@@ -616,6 +602,7 @@ class Finder:
anchor_top
=
bbox
[
1
]
if
bottom
in
text
:
anchor_bottom
=
bbox
[
1
]
# print(top, anchor_top, anchor_bottom)
if
anchor_top
is
not
None
and
anchor_bottom
is
not
None
:
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
...
...
@@ -629,9 +616,8 @@ class Finder:
signature_name
[
'words'
]
=
name
signature_name
[
'position'
]
=
bbox
signature_date
[
'words'
]
=
date
signature_
nam
e
[
'position'
]
=
bbox
signature_
dat
e
[
'position'
]
=
bbox
return
signature_name
,
signature_date
def
get_info
(
self
):
"""
block['type'] == 0 : 表示该元素为图片
...
...
@@ -639,21 +625,22 @@ class Finder:
Returns:
dict: Description
"""
# 先判断是否为 ASP 产品
# 只看第一页,判断是否有 '附加产品融资贷款本金总金额' 这一句话,若有则为 ASP 产品
# print(self.pdf_info['0']['blocks'])
for
block
in
self
.
pdf_info
[
'0'
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'附加产品融资贷款本金总金额'
==
text
:
self
.
is_asp
=
True
# for block in self.pdf_info['0']['blocks']:
# if block['type'] != 0:
# continue
# for line in block['lines']:
# for span in line['spans']:
# bbox, text = span['bbox'], span['text']
# if '附加产品融资贷款本金总金额' == text:
# self.is_asp = True
for
key
in
self
.
ocr_results
[
'0'
]:
bbox
,
text
=
self
.
ocr_results
[
'0'
][
key
]
if
'附加产品融资贷款本金总金额'
in
text
:
self
.
is_asp
=
True
self
.
gen_init_result
(
self
.
is_asp
)
# Page 1
# 找合同编号
contract_no
=
self
.
get_contract_no
(
page_num
=
'0'
)
...
...
@@ -663,7 +650,7 @@ class Finder:
self
.
init_result
[
'page_1'
][
'所购车辆价格'
]
=
vehicle_price
# 车架号
vin
=
self
.
get_vin
()
self
.
init_result
[
'page_1'
][
'车架号'
]
=
v
ehicle_price
self
.
init_result
[
'page_1'
][
'车架号'
]
=
v
in
# 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
upper
,
lower
,
asp_1
,
asp_2
=
self
.
get_loan_principal
()
self
.
init_result
[
'page_1'
][
'贷款本金金额'
][
'大写'
]
=
upper
...
...
@@ -685,11 +672,14 @@ class Finder:
contract_no
=
self
.
get_contract_no
(
page_num
=
'0'
)
self
.
init_result
[
'page_2'
][
'合同编号'
]
=
contract_no
# 找借款人及抵押人(地址字段原本有空格)
borrower_name
,
borrower_id
=
self
.
get_somebody
(
top
=
'借款人及抵押人:'
,
bottom
=
'共同借款人及共同抵押人:'
)
borrower_name
,
borrower_id
=
self
.
get_somebody
(
top
=
'借款人及抵押人:'
,
bottom
=
'共同借款人:'
)
# 这是为了同时兼容 8.1 版本
if
borrower_name
[
'words'
]
==
None
:
borrower_name
,
borrower_id
=
self
.
get_somebody
(
top
=
'借款人及抵押人:'
,
bottom
=
'共同借款人及共同抵押人:'
)
self
.
init_result
[
'page_2'
][
'借款人及抵押人'
][
'name'
]
=
borrower_name
self
.
init_result
[
'page_2'
][
'借款人及抵押人'
][
'id'
]
=
borrower_id
# 找共同借款人及共同抵押人
co_borrower_name
,
co_borrower_id
=
self
.
get_somebody
(
top
=
'共同借款人
及共同抵押人
:'
,
bottom
=
'保证人1:'
)
co_borrower_name
,
co_borrower_id
=
self
.
get_somebody
(
top
=
'共同借款人:'
,
bottom
=
'保证人1:'
)
self
.
init_result
[
'page_2'
][
'共同借款人及共同抵押人'
][
'name'
]
=
co_borrower_name
self
.
init_result
[
'page_2'
][
'共同借款人及共同抵押人'
][
'id'
]
=
co_borrower_id
# 保证人1
...
...
@@ -755,11 +745,11 @@ class Finder:
contract_no
=
self
.
get_contract_no
(
page_num
=
'6'
)
self
.
init_result
[
'page_7'
][
'合同编号'
]
=
contract_no
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'6'
,
top
=
'
借款人(抵押人)'
,
bottom
=
'共同借款人(共同抵押人)
'
)
top
=
'
合同编号'
,
bottom
=
'共同借款人
'
)
self
.
init_result
[
'page_7'
][
'主借人签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_7'
][
'主借人签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'6'
,
top
=
'共同借款人
(共同抵押人)
'
,
bottom
=
'保证人1'
)
top
=
'共同借款人'
,
bottom
=
'保证人1'
)
self
.
init_result
[
'page_7'
][
'共借人签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_7'
][
'共借人签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'6'
,
...
...
@@ -771,7 +761,7 @@ class Finder:
self
.
init_result
[
'page_7'
][
'保证人2签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_7'
][
'保证人2签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'6'
,
top
=
'在本人面前亲笔签署本合同'
,
bottom
=
'
(以下无正文)
'
)
top
=
'在本人面前亲笔签署本合同'
,
bottom
=
'
以下无正文
'
)
self
.
init_result
[
'page_7'
][
'见证人签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_7'
][
'见证人签字'
][
'日期'
]
=
signature_date
else
:
...
...
@@ -784,11 +774,11 @@ class Finder:
contract_no
=
self
.
get_contract_no
(
page_num
=
'7'
)
self
.
init_result
[
'page_8'
][
'合同编号'
]
=
contract_no
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'
借款人(抵押人)'
,
bottom
=
'共同借款人(共同抵押人)
'
)
top
=
'
合同编号'
,
bottom
=
'共同借款人
'
)
self
.
init_result
[
'page_8'
][
'主借人签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_8'
][
'主借人签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'共同借款人
(共同抵押人)
'
,
bottom
=
'保证人1'
)
top
=
'共同借款人'
,
bottom
=
'保证人1'
)
self
.
init_result
[
'page_8'
][
'共借人签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_8'
][
'共借人签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
...
...
@@ -800,10 +790,9 @@ class Finder:
self
.
init_result
[
'page_8'
][
'保证人2签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_8'
][
'保证人2签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'在本人面前亲笔签署本合同'
,
bottom
=
'
(以下无正文)
'
)
top
=
'在本人面前亲笔签署本合同'
,
bottom
=
'
以下无正文
'
)
self
.
init_result
[
'page_8'
][
'见证人签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_8'
][
'见证人签字'
][
'日期'
]
=
signature_date
# 重新定制输出
new_results
=
{
"is_asp"
:
self
.
is_asp
,
"page_info"
:
self
.
init_result
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment