Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
d3de42e6
authored
2021-11-18 14:19:43 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
fix HIL contract
1 parent
a7933381
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
199 additions
and
76 deletions
src/common/electronic_hil_contract/get_char.py
src/common/electronic_hil_contract/hil_contract_ocr.py
src/common/electronic_hil_contract/get_char.py
View file @
d3de42e
...
...
@@ -6,14 +6,11 @@
# @Description :
import
re
import
cv2
import
base64
import
numpy
as
np
from
fuzzywuzzy
import
fuzz
class
Finder
:
def
__init__
(
self
,
pdf_info
):
self
.
pdf_info
=
pdf_info
self
.
item
=
{
"words"
:
None
,
...
...
@@ -25,6 +22,9 @@ class Finder:
"承租人-姓名"
:
self
.
item
,
"承租人-证件号码"
:
self
.
item
,
"承租人-法定代表人或授权代表"
:
self
.
item
,
"共同承租人-姓名"
:
self
.
item
,
"共同承租人-证件号码"
:
self
.
item
,
"共同承租人-法定代表人或授权代表"
:
self
.
item
,
"保证人1-姓名"
:
self
.
item
,
"保证人1-证件号码"
:
self
.
item
,
"保证人1-法定代表人或授权代表"
:
self
.
item
,
...
...
@@ -47,6 +47,8 @@ class Finder:
"银行账户-开户行"
:
self
.
item
,
"签字页-承租人姓名"
:
self
.
item
,
"签字页-承租人签章"
:
self
.
item
,
"签字页-共同承租人姓名"
:
self
.
item
,
"签字页-共同承租人签章"
:
self
.
item
,
"签字页-保证人1姓名"
:
self
.
item
,
"签字页-保证人1签章"
:
self
.
item
,
"签字页-保证人2姓名"
:
self
.
item
,
...
...
@@ -54,7 +56,6 @@ class Finder:
"签字页-保证人3姓名"
:
self
.
item
,
"签字页-保证人3签章"
:
self
.
item
,
}
# 格式化输出 车辆处置协议 要是别的字段
self
.
init_result_1
=
{
"合同编号"
:
self
.
item
,
"承租人-姓名"
:
self
.
item
,
...
...
@@ -66,9 +67,7 @@ class Finder:
"签字页-承租人签章"
:
self
.
item
,
"签字页-销售经销商"
:
self
.
item
,
"签字页-销售经销商签章"
:
self
.
item
,
}
# 格式化输出 车辆租赁抵押合同
self
.
init_result_2
=
{
"合同编号"
:
self
.
item
,
"合同编号(正文)"
:
self
.
item
,
...
...
@@ -150,23 +149,24 @@ class Finder:
words
=
matchObj
.
group
(
1
)
contract_no
[
'position'
]
=
None
contract_no
[
'page'
]
=
pno
contract_no
[
'words'
]
=
words
# contract_no['words'] = words
contract_no
[
'words'
]
=
re
.
sub
(
"
\
s"
,
""
,
words
)
.
replace
(
")"
,
""
)
return
contract_no
matchObj
=
re
.
search
(
r'编号为(.*?)的'
,
all_text
)
if
matchObj
:
words
=
matchObj
.
group
(
1
)
.
strip
()
contract_no
[
'position'
]
=
None
contract_no
[
'page'
]
=
pno
contract_no
[
'words'
]
=
words
# contract_no['words'] = words
contract_no
[
'words'
]
=
re
.
sub
(
"
\
s"
,
""
,
words
)
.
replace
(
")"
,
""
)
return
contract_no
matchObj
=
re
.
search
(
r'编号为(.*?))的'
,
all_text
)
if
matchObj
:
words
=
matchObj
.
group
(
1
)
.
strip
()
contract_no
[
'position'
]
=
None
contract_no
[
'page'
]
=
pno
contract_no
[
'words'
]
=
words
# contract_no['words'] = words
contract_no
[
'words'
]
=
re
.
sub
(
"
\
s"
,
""
,
words
)
return
contract_no
def
get_key_value
(
self
,
key
,
page_num
=
None
):
...
...
@@ -180,10 +180,11 @@ class Finder:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
key
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
words
=
text
.
split
(
':'
)[
-
1
]
.
replace
(
"。"
,
""
)
value
[
'position'
]
=
bbox
value
[
'page'
]
=
pno
value
[
'words'
]
=
words
# value['words'] = words
value
[
'words'
]
=
re
.
sub
(
"
\
s"
,
""
,
words
)
else
:
for
pno
in
self
.
pdf_info
:
for
block
in
self
.
pdf_info
[
pno
][
'blocks'
]:
...
...
@@ -194,10 +195,11 @@ class Finder:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
key
in
text
:
# print(self.pdf_info[pno])
words
=
text
.
split
(
':'
)[
-
1
]
words
=
text
.
split
(
':'
)[
-
1
]
.
replace
(
"。"
,
""
)
value
[
'position'
]
=
bbox
value
[
'page'
]
=
pno
value
[
'words'
]
=
words
# value['words'] = words
value
[
'words'
]
=
re
.
sub
(
"
\
s"
,
""
,
words
)
return
value
def
get_loan_principal
(
self
,
page_num
=
'0'
):
...
...
@@ -267,7 +269,6 @@ class Finder:
def
get_asp_details
(
self
,
page_num
):
asp_details_table_term
=
self
.
item
.
copy
()
asp_details_table
=
[]
asp_details_text_list
=
[]
table
=
False
...
...
@@ -283,25 +284,20 @@ class Finder:
table
=
False
if
table
==
True
:
asp_details_text_list
.
append
(
text
)
for
i
in
range
((
len
(
asp_details_text_list
)
+
2
)
//
3
):
for
i
in
range
((
len
(
asp_details_text_list
)
+
2
)
//
3
):
line
=
[]
if
i
==
0
:
line
=
[
asp_details_text_list
[
0
]]
else
:
for
j
in
range
(
3
):
line
.
append
(
asp_details_text_list
[
i
*
3
-
2
+
j
])
line
.
append
(
asp_details_text_list
[
i
*
3
-
2
+
j
])
asp_details_table
.
append
(
line
)
if
len
(
asp_details_table
)
>
0
:
asp_details_table_term
[
'words'
]
=
asp_details_table
return
asp_details_table_term
def
get_signature
(
self
):
signature
=
self
.
item
.
copy
()
for
block
in
self
.
pdf_info
[
'0'
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
...
...
@@ -369,8 +365,8 @@ class Finder:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
anchor_bbox
[
2
]
<
np
.
mean
(
bbox
[::
2
])
<
half_width
and
\
anchor_bbox
[
1
]
<
np
.
mean
(
bbox
[
1
::
2
])
<
anchor_bbox
[
3
]:
if
anchor_bbox
[
2
]
<
np
.
mean
(
bbox
[::
2
])
<
half_width
and
\
anchor_bbox
[
1
]
<
np
.
mean
(
bbox
[
1
::
2
])
<
anchor_bbox
[
3
]:
seller
[
'position'
]
=
bbox
seller
[
'words'
]
=
text
return
seller
...
...
@@ -430,7 +426,6 @@ class Finder:
def
get_repayment_schedule
(
self
):
repayment_schedule
=
self
.
item
.
copy
()
repayment_schedule_text_list
=
[]
table
=
False
page
=
None
...
...
@@ -444,20 +439,25 @@ class Finder:
if
'以上表格中所列序号'
in
text
:
table
=
False
if
table
==
True
:
# 过滤汉字
if
re
.
compile
(
r'[\u4e00-\u9fff]'
)
.
search
(
text
):
continue
# 过滤 1. - 61. 这些标题
if
re
.
findall
(
"
\
d+"
,
text
):
if
len
(
re
.
findall
(
"
\
d+"
,
text
))
==
1
:
continue
repayment_schedule_text_list
.
append
(
text
)
if
'61.'
in
text
:
page
=
pno
table
=
True
# print("repayment_schedule_text_list = ", repayment_schedule_text_list)
repayment_schedule_table
=
[[
'序号'
,
'融资租赁成本'
,
'融资租赁费用'
,
'租金'
,
'剩余融资租赁成本'
]]
for
i
in
range
(
len
(
repayment_schedule_text_list
)
//
4
):
line
=
[
f
'{i
+
1}.'
]
for
i
in
range
(
len
(
repayment_schedule_text_list
)
//
4
):
line
=
[
f
'{i
+
1}.'
]
# 4表示4列的意思
for
j
in
range
(
4
):
line
.
append
(
repayment_schedule_text_list
[
i
*
4
+
j
])
line
.
append
(
repayment_schedule_text_list
[
i
*
4
+
j
])
repayment_schedule_table
.
append
(
line
)
repayment_schedule
[
'words'
]
=
repayment_schedule_table
repayment_schedule
[
'page'
]
=
page
return
repayment_schedule
...
...
@@ -506,7 +506,7 @@ class Finder:
else
:
words
=
'无'
boxes
=
np
.
array
(
boxes
)
.
reshape
((
-
1
,
2
))
position
=
[
min
(
boxes
[:,
0
]),
min
(
boxes
[:,
1
]),
max
(
boxes
[:,
0
]),
max
(
boxes
[:,
1
])]
position
=
[
min
(
boxes
[:,
0
]),
min
(
boxes
[:,
1
]),
max
(
boxes
[:,
0
]),
max
(
boxes
[:,
1
])]
signature_role_2
[
'page_num'
]
=
page_num
signature_role_2
[
'position'
]
=
position
signature_role_2
[
'words'
]
=
words
...
...
@@ -541,7 +541,7 @@ class Finder:
else
:
words
=
'无'
boxes
=
np
.
array
(
boxes
)
.
reshape
((
-
1
,
2
))
position
=
[
min
(
boxes
[:,
0
]),
min
(
boxes
[:,
1
]),
max
(
boxes
[:,
0
]),
max
(
boxes
[:,
1
])]
position
=
[
min
(
boxes
[:,
0
]),
min
(
boxes
[:,
1
]),
max
(
boxes
[:,
0
]),
max
(
boxes
[:,
1
])]
signature_role_3
[
'page_num'
]
=
page_num
signature_role_3
[
'position'
]
=
position
signature_role_3
[
'words'
]
=
words
...
...
@@ -576,7 +576,7 @@ class Finder:
else
:
words
=
'无'
boxes
=
np
.
array
(
boxes
)
.
reshape
((
-
1
,
2
))
position
=
[
min
(
boxes
[:,
0
]),
min
(
boxes
[:,
1
]),
max
(
boxes
[:,
0
]),
max
(
boxes
[:,
1
])]
position
=
[
min
(
boxes
[:,
0
]),
min
(
boxes
[:,
1
]),
max
(
boxes
[:,
0
]),
max
(
boxes
[:,
1
])]
signature_role_4
[
'page_num'
]
=
page_num
signature_role_4
[
'position'
]
=
position
signature_role_4
[
'words'
]
=
words
...
...
@@ -612,7 +612,7 @@ class Finder:
else
:
words
=
'无'
boxes
=
np
.
array
(
boxes
)
.
reshape
((
-
1
,
2
))
position
=
[
min
(
boxes
[:,
0
]),
min
(
boxes
[:,
1
]),
max
(
boxes
[:,
0
]),
max
(
boxes
[:,
1
])]
position
=
[
min
(
boxes
[:,
0
]),
min
(
boxes
[:,
1
]),
max
(
boxes
[:,
0
]),
max
(
boxes
[:,
1
])]
signature_role_5
[
'page_num'
]
=
page_num
signature_role_5
[
'position'
]
=
position
signature_role_5
[
'words'
]
=
words
...
...
@@ -640,7 +640,7 @@ class Finder:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'签署日期'
in
text
and
int
(
anchor_top
)
<
np
.
mean
(
bbox
[
1
::
2
])
<
int
(
anchor_bottom
):
if
'签署日期'
in
text
and
int
(
anchor_top
)
<
np
.
mean
(
bbox
[
1
::
2
])
<
int
(
anchor_bottom
):
name
=
text
.
split
(
' '
)[
0
]
date
=
text
.
split
(
':'
)[
-
1
]
signature_name
[
'words'
]
=
name
...
...
@@ -663,7 +663,7 @@ class Finder:
if
top
in
text
:
anchor_top
=
bbox
[
1
]
if
bottom
in
text
:
anchor_bottom
=
bbox
[
1
]
anchor_bottom
=
bbox
[
3
]
if
anchor_top
is
not
None
and
anchor_bottom
is
not
None
:
for
pno
in
self
.
pdf_info
:
for
block
in
self
.
pdf_info
[
pno
][
'blocks'
]:
...
...
@@ -672,7 +672,9 @@ class Finder:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'签署日期'
in
text
and
int
(
anchor_top
)
<
np
.
mean
(
bbox
[
1
::
2
])
<
int
(
anchor_bottom
):
# ------------ #
# print("--text = ", text)
if
'签署日期'
in
text
and
int
(
anchor_top
)
<
np
.
mean
(
bbox
[
1
::
2
])
<
int
(
anchor_bottom
):
words
=
text
signature
[
'words'
]
=
words
signature
[
'page'
]
=
pno
...
...
@@ -683,7 +685,6 @@ class Finder:
name
=
self
.
item
.
copy
()
id_num
=
self
.
item
.
copy
()
representative
=
self
.
item
.
copy
()
# 以保证人3 的左上角为定位点
anchor
=
None
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
...
...
@@ -695,7 +696,6 @@ class Finder:
# 找到角色姓名
if
re
.
match
(
'保证人3'
,
text
)
is
not
None
:
anchor
=
[
bbox
[
0
],
bbox
[
1
]]
if
anchor
is
not
None
:
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
...
...
@@ -711,52 +711,60 @@ class Finder:
name
[
'position'
]
=
bbox
if
role_key
==
'承租人:'
:
# 找到证件号码且确定位置
if
re
.
match
(
'证件号码:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
<
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
<
anchor
[
1
]:
if
re
.
match
(
'证件号码:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
<
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
<
anchor
[
1
]:
words
=
text
.
split
(
':'
)[
-
1
]
id_num
[
'words'
]
=
words
id_num
[
'page'
]
=
page_num
id_num
[
'position'
]
=
bbox
# 找到法人代表且确定位置
if
re
.
match
(
'法定代表人或授权代表:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
<
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
<
anchor
[
1
]:
if
re
.
match
(
'法定代表人或授权代表:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
<
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
<
anchor
[
1
]:
words
=
text
.
split
(
':'
)[
-
1
]
representative
[
'words'
]
=
words
representative
[
'page'
]
=
page_num
representative
[
'position'
]
=
bbox
if
role_key
==
'保证人1:'
:
# 找到证件号码且确定位置
if
re
.
match
(
'证件号码:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
<
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
>
anchor
[
1
]:
if
re
.
match
(
'证件号码:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
<
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
>
anchor
[
1
]:
words
=
text
.
split
(
':'
)[
-
1
]
id_num
[
'words'
]
=
words
id_num
[
'page'
]
=
page_num
id_num
[
'position'
]
=
bbox
# 找到法人代表且确定位置
if
re
.
match
(
'法定代表人或授权代表:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
<
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
>
anchor
[
1
]:
if
re
.
match
(
'法定代表人或授权代表:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
<
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
>
anchor
[
1
]:
words
=
text
.
split
(
':'
)[
-
1
]
representative
[
'words'
]
=
words
representative
[
'page'
]
=
page_num
representative
[
'position'
]
=
bbox
if
role_key
==
'保证人2:'
:
# 找到证件号码且确定位置
if
re
.
match
(
'证件号码:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
>
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
<
anchor
[
1
]:
if
re
.
match
(
'证件号码:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
>
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
<
anchor
[
1
]:
words
=
text
.
split
(
':'
)[
-
1
]
id_num
[
'words'
]
=
words
id_num
[
'page'
]
=
page_num
id_num
[
'position'
]
=
bbox
# 找到法人代表且确定位置
if
re
.
match
(
'法定代表人或授权代表:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
>
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
<
anchor
[
1
]:
if
re
.
match
(
'法定代表人或授权代表:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
>
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
<
anchor
[
1
]:
words
=
text
.
split
(
':'
)[
-
1
]
representative
[
'words'
]
=
words
representative
[
'page'
]
=
page_num
representative
[
'position'
]
=
bbox
if
role_key
==
'保证人3:'
:
# 找到证件号码且确定位置
if
re
.
match
(
'证件号码:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
>
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
>
anchor
[
1
]:
if
re
.
match
(
'证件号码:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
>
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
>
anchor
[
1
]:
words
=
text
.
split
(
':'
)[
-
1
]
id_num
[
'words'
]
=
words
id_num
[
'page'
]
=
page_num
id_num
[
'position'
]
=
bbox
# 找到法人代表且确定位置
if
re
.
match
(
'法定代表人或授权代表:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
>
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
>
anchor
[
1
]:
if
re
.
match
(
'法定代表人或授权代表:'
,
text
)
is
not
None
and
np
.
mean
(
bbox
[::
2
])
>
anchor
[
0
]
and
np
.
mean
(
bbox
[
1
::
2
])
>
anchor
[
1
]:
words
=
text
.
split
(
':'
)[
-
1
]
representative
[
'words'
]
=
words
representative
[
'page'
]
=
page_num
...
...
@@ -783,12 +791,10 @@ class Finder:
start
=
False
if
start
==
True
:
items
.
append
(
text
)
lines
=
[[
'项目'
,
'购买价格'
,
'实际融资金额'
]]
for
i
in
range
(
len
(
items
)
//
3
):
line
=
[
items
[
2
+
i
*
3
+
0
],
items
[
2
+
i
*
3
+
1
],
items
[
2
+
i
*
3
+
2
]]
for
i
in
range
(
len
(
items
)
//
3
):
line
=
[
items
[
2
+
i
*
3
+
0
],
items
[
2
+
i
*
3
+
1
],
items
[
2
+
i
*
3
+
2
]]
lines
.
append
(
line
)
if
len
(
items
)
>
0
:
lines
.
append
([
items
[
0
],
''
,
items
[
1
]])
...
...
@@ -800,7 +806,6 @@ class Finder:
def
get_contract_no_dy
(
self
):
# 查找抵押合同编号
contract_no
=
self
.
item
.
copy
()
key_box
=
None
for
pno
in
self
.
pdf_info
:
for
block
in
self
.
pdf_info
[
pno
][
'blocks'
]:
...
...
@@ -811,7 +816,6 @@ class Finder:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'抵押合同编号'
in
text
:
key_box
=
bbox
if
key_box
is
not
None
:
for
pno
in
self
.
pdf_info
:
for
block
in
self
.
pdf_info
[
pno
][
'blocks'
]:
...
...
@@ -829,7 +833,6 @@ class Finder:
def
get_dyr_name_id
(
self
):
name
=
self
.
item
.
copy
()
_id
=
self
.
item
.
copy
()
key_box
=
None
for
pno
in
self
.
pdf_info
:
for
block
in
self
.
pdf_info
[
pno
][
'blocks'
]:
...
...
@@ -842,7 +845,7 @@ class Finder:
key_box
=
bbox
if
key_box
is
not
None
:
rh
=
abs
(
key_box
[
1
]
-
key_box
[
3
])
rh
=
abs
(
key_box
[
1
]
-
key_box
[
3
])
for
pno
in
self
.
pdf_info
:
for
block
in
self
.
pdf_info
[
pno
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
...
...
@@ -850,12 +853,12 @@ class Finder:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
key_box
[
1
]
<
np
.
mean
(
bbox
[
1
::
2
])
<
key_box
[
3
]
+
rh
*
3
and
'姓名'
in
text
:
if
key_box
[
1
]
<
np
.
mean
(
bbox
[
1
::
2
])
<
key_box
[
3
]
+
rh
*
3
and
'姓名'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
name
[
'position'
]
=
bbox
name
[
'page'
]
=
pno
name
[
'words'
]
=
words
if
key_box
[
1
]
<
np
.
mean
(
bbox
[
1
::
2
])
<
key_box
[
3
]
+
rh
*
3
and
'证件号码'
in
text
:
if
key_box
[
1
]
<
np
.
mean
(
bbox
[
1
::
2
])
<
key_box
[
3
]
+
rh
*
3
and
'证件号码'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
_id
[
'position'
]
=
bbox
_id
[
'page'
]
=
pno
...
...
@@ -864,7 +867,6 @@ class Finder:
def
get_key_value_position
(
self
,
key
):
value
=
self
.
item
.
copy
()
key_box
=
None
for
pno
in
self
.
pdf_info
:
for
block
in
self
.
pdf_info
[
pno
][
'blocks'
]:
...
...
@@ -875,9 +877,8 @@ class Finder:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
text
==
key
:
key_box
=
bbox
if
key_box
is
not
None
:
rh
=
abs
(
key_box
[
1
]
-
key_box
[
3
])
rh
=
abs
(
key_box
[
1
]
-
key_box
[
3
])
for
pno
in
self
.
pdf_info
:
for
block
in
self
.
pdf_info
[
pno
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
...
...
@@ -885,13 +886,104 @@ class Finder:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
key_box
[
1
]
<
np
.
mean
(
bbox
[
1
::
2
])
<
key_box
[
3
]
and
key_box
[
0
]
<
bbox
[
0
]
and
abs
(
key_box
[
2
]
-
bbox
[
0
])
<
rh
*
10
:
if
key_box
[
1
]
<
np
.
mean
(
bbox
[
1
::
2
])
<
key_box
[
3
]
and
key_box
[
0
]
<
bbox
[
0
]
and
abs
(
key_box
[
2
]
-
bbox
[
0
])
<
rh
*
10
:
words
=
text
value
[
'position'
]
=
bbox
value
[
'page'
]
=
pno
value
[
'words'
]
=
words
return
value
def get_role_info_3_3(self, role_key, page_num='0'):
    """Find one party's name, ID number and representative on a "3_3" page.

    The top-left corner of the span whose text starts with '保证人2' is used
    as an anchor point.  The centre of a candidate span's bbox is compared
    with that anchor, and every supported role is searched in one quadrant:

        '承租人一:'    centre x < anchor x and centre y < anchor y
        '共同承租人:'  centre x < anchor x and centre y > anchor y
        '保证人1:'     centre x > anchor x and centre y < anchor y
        '保证人2:'     centre x > anchor x and centre y > anchor y

    Args:
        role_key: Label that introduces the role's name.  It is handed to
            ``re.match`` as a pattern, so it has to match at the start of
            the span text.  For a role that is not listed above only the
            name is looked up.
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        A tuple ``(name, id_num, representative)``.  Each element is a copy
        of ``self.item`` whose 'words', 'page' and 'position' entries are
        filled in when a matching span exists.  All three copies are
        returned untouched when the page contains no anchor span.

    Raises:
        KeyError: if ``page_num`` is not a key of ``self.pdf_info``.
    """
    # role label -> (centre x must be < anchor x, centre y must be < anchor y).
    # False selects the strict opposite comparison (>), so a centre lying
    # exactly on the anchor coordinate never matches.
    quadrant_by_role = {
        '承租人一:': (True, True),
        '共同承租人:': (True, False),
        '保证人1:': (False, True),
        '保证人2:': (False, False),
    }

    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    # Flatten the page once; only blocks whose type is 0 carry the
    # 'lines'/'spans' structure that is read here.
    # NOTE(review): the layout looks like PyMuPDF's page.get_text("dict")
    # output -- confirm against the caller that builds pdf_info.
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Use the top-left corner of the '保证人2' span as the anchor point.
    # When several spans match, the last one wins.
    anchor = None
    for bbox, text in spans:
        if re.match('保证人2', text) is not None:
            anchor = [bbox[0], bbox[1]]
    if anchor is None:
        return name, id_num, representative

    quadrant = quadrant_by_role.get(role_key)
    labelled_fields = (
        ('证件号码:', id_num),
        ('法定代表人或授权代表:', representative),
    )

    for bbox, text in spans:
        # Role name: the value is whatever follows the last ':' of the span.
        if re.match(role_key, text) is not None:
            name['words'] = text.split(':')[-1]
            name['page'] = page_num
            name['position'] = bbox

        if quadrant is None:
            continue
        smaller_x, smaller_y = quadrant

        # ID number / representative: the label must start the span and the
        # span centre must fall into the quadrant reserved for this role.
        for label, target in labelled_fields:
            if re.match(label, text) is None:
                continue
            centre_x = np.mean(bbox[::2])
            x_ok = centre_x < anchor[0] if smaller_x else centre_x > anchor[0]
            if not x_ok:
                continue
            centre_y = np.mean(bbox[1::2])
            y_ok = centre_y < anchor[1] if smaller_y else centre_y > anchor[1]
            if not y_ok:
                continue
            target['words'] = text.split(':')[-1]
            target['page'] = page_num
            target['position'] = bbox

    return name, id_num, representative
def
get_info
(
self
):
"""
block['type'] == 0 : 表示该元素为图片
...
...
@@ -905,6 +997,8 @@ class Finder:
self
.
init_result
[
'合同编号'
]
=
contract_no
# 从第一页上取四个角色的姓名和证件号码
name
,
id_num
,
representative
=
self
.
get_role_info
(
role_key
=
'承租人:'
,
page_num
=
'0'
)
if
name
[
"words"
]
==
None
:
name
,
id_num
,
representative
=
self
.
get_role_info_3_3
(
role_key
=
'承租人一:'
,
page_num
=
'0'
)
self
.
init_result
[
'承租人-姓名'
]
=
name
self
.
init_result
[
'承租人-证件号码'
]
=
id_num
self
.
init_result
[
'承租人-法定代表人或授权代表'
]
=
representative
...
...
@@ -912,14 +1006,31 @@ class Finder:
self
.
init_result
[
'保证人1-姓名'
]
=
name
self
.
init_result
[
'保证人1-证件号码'
]
=
id_num
self
.
init_result
[
'保证人1-法定代表人或授权代表'
]
=
representative
# if条件判别 对应3_3版本
if
name
[
"words"
]
==
None
:
name
,
id_num
,
representative
=
self
.
get_role_info_3_3
(
role_key
=
'共同承租人:'
,
page_num
=
'0'
)
self
.
init_result
[
'共同承租人-姓名'
]
=
name
self
.
init_result
[
'共同承租人-证件号码'
]
=
id_num
self
.
init_result
[
'共同承租人-法定代表人或授权代表'
]
=
representative
name
,
id_num
,
representative
=
self
.
get_role_info
(
role_key
=
'保证人2:'
,
page_num
=
'0'
)
self
.
init_result
[
'保证人2-姓名'
]
=
name
self
.
init_result
[
'保证人2-证件号码'
]
=
id_num
self
.
init_result
[
'保证人2-法定代表人或授权代表'
]
=
representative
# if条件判别 对应3_3版本
if
name
[
"words"
]
==
None
:
name
,
id_num
,
representative
=
self
.
get_role_info_3_3
(
role_key
=
'保证人1:'
,
page_num
=
'0'
)
self
.
init_result
[
'保证人2-姓名'
]
=
name
self
.
init_result
[
'保证人2-证件号码'
]
=
id_num
self
.
init_result
[
'保证人2-法定代表人或授权代表'
]
=
representative
name
,
id_num
,
representative
=
self
.
get_role_info
(
role_key
=
'保证人3:'
,
page_num
=
'0'
)
self
.
init_result
[
'保证人3-姓名'
]
=
name
self
.
init_result
[
'保证人3-证件号码'
]
=
id_num
self
.
init_result
[
'保证人3-法定代表人或授权代表'
]
=
representative
if
name
[
"words"
]
==
None
:
name
,
id_num
,
representative
=
self
.
get_role_info_3_3
(
role_key
=
'保证人2:'
,
page_num
=
'0'
)
self
.
init_result
[
'保证人3-姓名'
]
=
name
self
.
init_result
[
'保证人3-证件号码'
]
=
id_num
self
.
init_result
[
'保证人3-法定代表人或授权代表'
]
=
representative
# 在所有页面中找正文中(第二部分 融资租赁主要条款及付款计划)的那个编号,因为存在换行的情况所以暂时不带位置输出
contract_no
=
self
.
get_contract_no_one
()
self
.
init_result
[
'合同编号(正文)'
]
=
contract_no
...
...
@@ -955,6 +1066,9 @@ class Finder:
# 承租人姓名、签章
name
=
self
.
get_key_value
(
key
=
'承租人姓名:'
)
electronic_signature
=
self
.
get_electronic_signature
(
top
=
'承租人姓名:'
,
bottom
=
'保证人1姓名:'
)
if
name
[
"words"
]
==
None
:
name
=
self
.
get_key_value
(
key
=
'承租人一姓名:'
)
electronic_signature
=
self
.
get_electronic_signature
(
top
=
'承租人一姓名:'
,
bottom
=
'共同承租人名称:'
)
self
.
init_result
[
'签字页-承租人姓名'
]
=
name
self
.
init_result
[
'签字页-承租人签章'
]
=
electronic_signature
# 保证人1姓名、签章
...
...
@@ -962,19 +1076,35 @@ class Finder:
electronic_signature
=
self
.
get_electronic_signature
(
top
=
'保证人1姓名:'
,
bottom
=
'保证人2姓名:'
)
self
.
init_result
[
'签字页-保证人1姓名'
]
=
name
self
.
init_result
[
'签字页-保证人1签章'
]
=
electronic_signature
# 这里用的是 name["words"] == ""
if
name
[
"words"
]
==
""
:
name
=
self
.
get_key_value
(
key
=
'共同承租人名称:'
)
electronic_signature
=
self
.
get_electronic_signature
(
top
=
'共同承租人名称:'
,
bottom
=
'保证人1姓名:'
)
self
.
init_result
[
'签字页-共同承租人姓名'
]
=
name
self
.
init_result
[
'签字页-共同承租人签章'
]
=
electronic_signature
# 保证人2姓名、签章
name
=
self
.
get_key_value
(
key
=
'保证人2姓名:'
)
electronic_signature
=
self
.
get_electronic_signature
(
top
=
'保证人2姓名:'
,
bottom
=
'保证人3姓名:'
)
self
.
init_result
[
'签字页-保证人2姓名'
]
=
name
self
.
init_result
[
'签字页-保证人2签章'
]
=
electronic_signature
# 保证人2姓名、签章
# if判断条件对应3_3版本
if
name
[
"words"
]
==
""
:
name
=
self
.
get_key_value
(
key
=
'保证人1姓名:'
)
electronic_signature
=
self
.
get_electronic_signature
(
top
=
'保证人1姓名:'
,
bottom
=
'保证人2姓名:'
)
self
.
init_result
[
'签字页-保证人1姓名'
]
=
name
self
.
init_result
[
'签字页-保证人1签章'
]
=
electronic_signature
# 保证人3姓名、签章
name
=
self
.
get_key_value
(
key
=
'保证人3姓名:'
)
electronic_signature
=
self
.
get_electronic_signature
(
top
=
'保证人3姓名:'
,
bottom
=
'日期:'
)
self
.
init_result
[
'签字页-保证人3姓名'
]
=
name
self
.
init_result
[
'签字页-保证人3签章'
]
=
electronic_signature
# if判断条件对应3_3版本
if
name
[
"words"
]
==
None
:
name
=
self
.
get_key_value
(
key
=
'保证人2姓名:'
)
electronic_signature
=
self
.
get_electronic_signature
(
top
=
'保证人2姓名:'
,
bottom
=
'日期:'
)
self
.
init_result
[
'签字页-保证人2姓名'
]
=
name
self
.
init_result
[
'签字页-保证人2签章'
]
=
electronic_signature
return
self
.
init_result
# results['is_shhz_contract'] = True
# results['pdf_info'] = self.init_result
...
...
src/common/electronic_hil_contract/hil_contract_ocr.py
View file @
d3de42e
...
...
@@ -18,7 +18,6 @@ def predict(pdf_info, file_cls):
Returns:
TYPE: Description
"""
# 0: 售后回租合同
pdf_info_0
=
[]
for
pno
in
pdf_info
:
...
...
@@ -30,7 +29,6 @@ def predict(pdf_info, file_cls):
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'售后回租合同_'
in
text
:
pdf_info_0
.
append
(
pdf_info
[
pno
])
# 1: 车辆处置协议
pdf_info_1
=
[]
for
pno
in
pdf_info
:
...
...
@@ -42,7 +40,6 @@ def predict(pdf_info, file_cls):
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'售后回租合同附件一'
in
text
:
pdf_info_1
.
append
(
pdf_info
[
pno
])
# 2: 车辆租赁抵押合同
pdf_info_2
=
[]
for
pno
in
pdf_info
:
...
...
@@ -54,7 +51,6 @@ def predict(pdf_info, file_cls):
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'车辆租赁抵押合同_'
in
text
:
pdf_info_2
.
append
(
pdf_info
[
pno
])
is_clczxy
=
False
# 如果 pdf_info_1 == 4 页,则说明此时输入包含了车辆处置协议
if
len
(
pdf_info_1
)
==
4
and
file_cls
==
1
and
len
(
pdf_info_0
)
!=
0
:
...
...
@@ -62,7 +58,6 @@ def predict(pdf_info, file_cls):
pdf_info
=
dict
()
for
pno
,
page_info
in
enumerate
(
pdf_info_1
):
pdf_info
[
str
(
pno
)]
=
page_info
f
=
Finder
(
pdf_info
)
if
file_cls
==
0
:
results
=
f
.
get_info
()
...
...
@@ -72,13 +67,11 @@ def predict(pdf_info, file_cls):
if
file_cls
==
2
:
# 提取信息 ———— 车辆租赁抵押合同
results
=
f
.
get_info_2
()
if
is_clczxy
==
True
:
if
is_clczxy
is
True
:
for
key
in
results
:
if
results
[
key
][
'page'
]
is
not
None
:
results
[
key
][
'page'
]
=
str
(
int
(
results
[
key
][
'page'
])
+
6
)
results
[
key
][
'page'
]
=
str
(
int
(
results
[
key
][
'page'
])
+
6
)
for
key
in
results
:
if
results
[
key
][
'page'
]
is
not
None
:
results
[
key
][
'page'
]
=
'page_'
+
str
(
int
(
results
[
key
][
'page'
])
+
1
)
results
[
key
][
'page'
]
=
'page_'
+
str
(
int
(
results
[
key
][
'page'
])
+
1
)
return
results
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment