Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
8d595a3e
authored
2022-12-27 15:28:55 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add FSM AFC/HIL Contract
1 parent
a9ba395a
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
2395 additions
and
15 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/named_enum.py
src/apps/doc/views.py
src/common/electronic_afc_contract/afc_contract_ocr.py
src/common/electronic_afc_contract/get_char_fsm.py
src/common/electronic_hil_contract/get_char_fsm.py
src/common/electronic_hil_contract/hil_contract_ocr.py
src/apps/doc/consts.py
View file @
8d595a3
...
...
@@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10
FIXED_APPLICATION_ID_PREFIX
=
'CH-S'
DOC_SCHEME_LIST
=
[
'ACCEPTANCE'
,
'SETTLEMENT'
,
'CONTRACTMANAGEMENT'
]
DATA_SOURCE_LIST
=
[
'POS'
,
'EAPP'
,
'ECONTRACT'
]
DATA_SOURCE_LIST
=
[
'POS'
,
'EAPP'
,
'ECONTRACT'
,
'OVP'
]
COMPARE_DOC_SCHEME_LIST
=
[
'CA'
,
'SE'
]
HIL_PREFIX
=
'HIL'
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
8d595a3
...
...
@@ -1476,7 +1476,8 @@ class Command(BaseCommand, LoggerMixin):
# AFC合同
if
classify_1_str
==
str
(
consts
.
CONTRACT_CLASSIFY
):
ocr_result
=
afc_predict
(
pdf_handler
.
pdf_info
)
is_fsm
=
doc
.
data_source
==
consts
.
DATA_SOURCE_LIST
[
3
]
ocr_result
=
afc_predict
(
pdf_handler
.
pdf_info
,
is_fsm
=
is_fsm
)
page_res
=
{}
for
page_num
,
page_info
in
ocr_result
.
get
(
'page_info'
,
{})
.
items
():
if
isinstance
(
page_num
,
str
)
and
page_num
.
startswith
(
'page_'
):
...
...
@@ -1499,8 +1500,9 @@ class Command(BaseCommand, LoggerMixin):
}
# HIL合同
elif
classify_1_str
in
consts
.
HIL_CONTRACT_TYPE_MAP
:
is_fsm
=
doc
.
data_source
==
consts
.
DATA_SOURCE_LIST
[
3
]
file_type_1
=
consts
.
HIL_CONTRACT_TYPE_MAP
.
get
(
classify_1_str
)
ocr_result_1
=
hil_predict
(
pdf_handler
.
pdf_info
,
file_type_1
)
ocr_result_1
=
hil_predict
(
pdf_handler
.
pdf_info
,
file_type_1
,
is_fsm
=
is_fsm
)
rebuild_res_1
=
{}
page_res
=
{}
for
field_name
,
field_info
in
ocr_result_1
.
items
():
...
...
@@ -1526,8 +1528,8 @@ class Command(BaseCommand, LoggerMixin):
'page_info'
:
page_info
}
# hmh
else
:
pass
#
else:
#
pass
contract_res
=
{}
...
...
src/apps/doc/named_enum.py
View file @
8d595a3
...
...
@@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum):
DOCUPLOAD
=
(
3
,
'Document Upload'
)
SUBMITING
=
(
4
,
'Submiting'
)
UPLOADING
=
(
5
,
'Uploading'
)
OVP
=
(
6
,
'OVP'
)
class
FailureReason
(
NamedEnum
):
...
...
src/apps/doc/views.py
View file @
8d595a3
...
...
@@ -590,12 +590,13 @@ class UploadDocView(GenericView, DocHandler):
is_zip
=
False
classify_1
=
0
# 电子合同
if
data_source
==
consts
.
DATA_SOURCE_LIST
[
-
1
]
and
document_scheme
==
consts
.
DOC_SCHEME_LIST
[
1
]:
for
keyword
,
classify_1_tmp
in
consts
.
ECONTRACT_KEYWORDS_MAP
.
get
(
prefix
):
if
keyword
in
document_name
:
classify_1
=
classify_1_tmp
break
# 电子合同 Econtract or OVP(FSM)
if
data_source
==
consts
.
DATA_SOURCE_LIST
[
2
]
or
data_source
==
consts
.
DATA_SOURCE_LIST
[
3
]:
if
document_scheme
==
consts
.
DOC_SCHEME_LIST
[
1
]:
for
keyword
,
classify_1_tmp
in
consts
.
ECONTRACT_KEYWORDS_MAP
.
get
(
prefix
):
if
keyword
in
document_name
:
classify_1
=
classify_1_tmp
break
# FSM合同:WEP/MSI/SC
elif
data_source
==
consts
.
DATA_SOURCE_LIST
[
0
]
and
document_scheme
==
consts
.
DOC_SCHEME_LIST
[
0
]:
for
keyword
,
classify_1_tmp
in
consts
.
FSM_ECONTRACT_KEYWORDS_MAP
.
get
(
prefix
):
...
...
src/common/electronic_afc_contract/afc_contract_ocr.py
View file @
8d595a3
...
...
@@ -6,6 +6,7 @@
# @Description :
from
.get_char
import
Finder
from
.get_char_fsm
import
Finder
as
FSMFinder
import
numpy
as
np
...
...
@@ -23,7 +24,7 @@ def extract_info(ocr_results):
return
{
'page_1'
:
{
'合同编号'
:
contract_no
}}
def
predict
(
pdf_info
,
is_qrs
=
False
):
def
predict
(
pdf_info
,
is_qrs
=
False
,
is_fsm
=
False
):
ocr_results
=
{}
for
pno
in
pdf_info
:
ocr_results
[
pno
]
=
{}
...
...
@@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False):
results
=
extract_info
(
ocr_results
)
else
:
# 输入是整个 PDF 中的信息
f
=
Finder
(
pdf_info
,
ocr_results
=
ocr_results
)
if
is_fsm
:
f
=
FSMFinder
(
pdf_info
,
ocr_results
=
ocr_results
)
else
:
f
=
Finder
(
pdf_info
,
ocr_results
=
ocr_results
)
results
=
f
.
get_info
()
return
results
...
...
src/common/electronic_afc_contract/get_char_fsm.py
0 → 100644
View file @
8d595a3
import
re
import
numpy
as
np
from
fuzzywuzzy
import
fuzz
from
shapely.geometry
import
Polygon
class
Finder
:
def
__init__
(
self
,
pdf_info
,
ocr_results
):
self
.
pdf_info
=
pdf_info
self
.
ocr_results
=
ocr_results
self
.
is_asp
=
False
self
.
item
=
{
"words"
:
None
,
"position"
:
None
,
}
def gen_init_result(self, is_asp):
    """Create ``self.init_result``, the empty result skeleton for pages 1-8.

    Every leaf of the skeleton is the very same ``self.item`` placeholder
    object (not a copy); callers are expected to overwrite leaves with
    freshly extracted items.

    Args:
        is_asp: Accepted for interface compatibility; not used here.
    """
    blank = self.item

    def group(*field_names):
        # A sub-dictionary whose listed fields all start as the placeholder.
        return dict.fromkeys(field_names, blank)

    def principal():
        return group("大写", "小写", "车辆贷款本金金额", "附加产品融资贷款本金总金额")

    def person():
        return group("name", "id")

    def bank_account():
        return group("账号", "户名", "开户行")

    def signed():
        return group("签字", "日期")

    skeleton = {}
    skeleton["page_1"] = {
        "合同编号": blank,
        "所购车辆价格": blank,
        "车架号": blank,
        "贷款本金金额": principal(),
        "贷款期限": blank,
        "附加产品融资贷款本金总金额明细": blank,
        "借款人签字及时间": blank,
    }
    skeleton["page_2"] = {
        "合同编号": blank,
        "借款人及抵押人": person(),
        "共同借款人及共同抵押人": person(),
        "保证人1": person(),
        "保证人2": person(),
        "所购车辆价格": blank,
        "车架号": blank,
        "经销商": blank,
        "贷款本金金额": principal(),
        "贷款期限": blank,
        "标准利率": blank,
        "借款人收款账户": bank_account(),
        "还款账户": bank_account(),
    }
    skeleton["page_3"] = {"合同编号": blank, "还款计划表": blank}
    skeleton["page_4"] = {"合同编号": blank, "附加产品融资贷款本金总金额明细": blank}
    for page_key in ("page_5", "page_6", "page_7"):
        skeleton[page_key] = {"合同编号": blank}
    skeleton["page_8"] = {
        "合同编号": blank,
        "主借人签字": signed(),
        "共借人签字": signed(),
        "保证人1签字": signed(),
        "保证人2签字": signed(),
        "见证人签字": signed(),
    }
    self.init_result = skeleton
def get_top_iou(self, poly, ocr_result):
    """Find the OCR entry whose box overlaps a query polygon the most.

    Args:
        poly: Flat sequence of polygon coordinates; it is reshaped to
            (-1, 2) points before use.
        ocr_result: Mapping of key -> ``(bbox, text)`` where ``bbox`` is a
            flat coordinate sequence like ``poly``.

    Returns:
        ``[iou, key]`` for the entry with the highest intersection-over-union
        (the last one wins on ties), or the tuple ``(-1, -1)`` when no entry
        could be compared (empty input or only invalid polygons).
    """
    if not ocr_result:
        return -1, -1

    # The query polygon does not depend on the entry, so build and validate
    # it once instead of once per OCR box.
    query = Polygon(np.array(poly).reshape((-1, 2)))
    if not query.is_valid:
        return -1, -1

    best = None
    for key in ocr_result:
        bbox, _text = ocr_result[key]
        candidate = Polygon(np.array(bbox).reshape((-1, 2)))
        if not candidate.is_valid:
            continue
        inter = candidate.intersection(query).area
        union = candidate.area + query.area - inter
        if union <= 0:
            # Degenerate pair; an IoU is undefined, so skip it.
            continue
        iou = inter / union
        # ">=" keeps the last of several equally good entries, which is what
        # taking the final element of a stable ascending sort produced.
        if best is None or iou >= best[0]:
            best = [iou, key]

    if best is None:
        return -1, -1
    return best
def poly_to_rectangle(self, poly):
    """Convert an 8-value quadrilateral into its axis-aligned bounding box.

    The previous implementation unpacked the eight values into only four
    distinct names, so half of the coordinates were silently discarded and
    the result was a real bounding box only for perfectly axis-aligned quads
    listed top-left first. Taking min/max over all vertices gives the same
    answer for those quads and a correct box for skewed or reordered ones.

    Args:
        poly: Eight coordinates ``x1, y1, x2, y2, x3, y3, x4, y4``.

    Returns:
        ``[xmin, ymin, xmax, ymax]``.

    Raises:
        ValueError: If ``poly`` does not contain exactly eight values.
    """
    coords = list(poly)
    if len(coords) != 8:
        raise ValueError(
            'poly must contain exactly 8 values, got %d' % len(coords))
    xs, ys = coords[0::2], coords[1::2]
    return [min(xs), min(ys), max(xs), max(ys)]
def get_contract_no(self, page_num):
    """Extract the contract number from the OCR text of one page.

    Args:
        page_num (str): Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item; when an OCR line contains the
        contract-number label, 'words' holds the text after the last colon
        and 'position' the rectangle of that line. The last matching line
        wins.
    """
    marker = '合同编号:'
    found = self.item.copy()
    entries = self.ocr_results[page_num]
    for key in entries:
        bbox, text = entries[key]
        if marker not in text:
            continue
        number = text.split(':')[-1]
        rect = self.poly_to_rectangle(bbox)
        found['words'], found['position'] = number, rect
    return found
def get_vehicle_price(self, page_num='0'):
    """Extract the purchased-vehicle price from the OCR text of one page.

    Args:
        page_num (str): Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item; for an OCR line containing the price
        label, 'words' is whatever follows the last '币' character and
        'position' the rectangle of that line. The last matching line wins.
    """
    marker = '所购车辆价格为人民币'
    price = self.item.copy()
    entries = self.ocr_results[page_num]
    for key in entries:
        bbox, text = entries[key]
        if marker in text:
            amount = text.split('币')[-1]
            rect = self.poly_to_rectangle(bbox)
            price['words'], price['position'] = amount, rect
    return price
def get_vin(self, page_num='0'):
    """Extract the vehicle identification number from one page's OCR text.

    Args:
        page_num (str): Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item; for an OCR line containing the VIN
        label, 'words' is the text after the last colon and 'position' the
        rectangle of that line. The last matching line wins.
    """
    marker = '车架号:'
    vin = self.item.copy()
    entries = self.ocr_results[page_num]
    for key in entries:
        bbox, text = entries[key]
        if marker not in text:
            continue
        value = text.split(':')[-1]
        rect = self.poly_to_rectangle(bbox)
        vin['words'], vin['position'] = value, rect
    return vin
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from the PDF spans of one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of the blank
        item, filled in when found:
          * upper - amount written in Chinese capital numerals,
          * lower - amount following the '小写:¥' label,
          * asp_1 - bracketed amount on a '人民币:小写:' span located above
            the '附加产品融资贷款本金总金额' anchor span,
          * asp_2 - the same kind of amount located below that anchor.
    """
    # Characters that make up an amount written in Chinese capital numerals;
    # spans are fuzzy-matched against this string to spot such amounts.
    amount_charset = '壹贰叁肆伍陆柒捌玖拾佰仟万亿元角分零整'
    bracketed_amount = re.compile(r'人民币:小写:\[(.*)\]')

    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()

    def text_spans():
        # Only blocks whose 'type' is 0 carry the 'lines'/'spans' read here
        # (presumably the text blocks of the PDF parser - confirm).
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    anchor_bbox = None
    for bbox, text in text_spans():
        if fuzz.ratio(amount_charset, text) > 15:
            # NOTE: the trimmed text is deliberately carried into the checks
            # below, exactly as the original code did.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in text_spans():
            if '人民币:小写:' not in text:
                continue
            found = bracketed_amount.search(text)
            if found is None:
                # Label present but no "[amount]" part: skip the span
                # instead of failing with an IndexError.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = found.group(1)
            elif span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = found.group(1)

    return upper, lower, asp_1, asp_2
def get_loan_term(self, page_num='0'):
    """Extract the loan term (number of months) from one page of the PDF.

    The texts of all spans are concatenated so the label and the number may
    sit in different spans; the span that actually contains "<n>个月" then
    supplies the position.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of the blank item; 'words' is the month count as a string
        and 'position' the bbox of the last span containing it, when found.
    """
    term = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    page_text = ''.join(text for _bbox, text in spans)

    hit = re.search(r'贷款期限(\d+)个月', page_text)
    if hit is None:
        return term

    months = hit.group(1)
    needle = f'{months}个月'
    for bbox, text in spans:
        if needle in text:
            term['position'] = bbox
            term['words'] = months
    return term
def get_standard_rate(self, page_num='0'):
    """Extract the contract's current standard interest rate from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of the blank item; 'words' is the rate captured before
        '%/年' and 'position' the bbox of the span it was found in. The last
        matching span wins.
    """
    rate_pattern = r'本合同当期的标准利率为(\S+)%/年'
    rate = self.item.copy()
    text_blocks = (
        block for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
    )
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                hit = re.search(rate_pattern, text)
                if not hit:
                    continue
                rate['position'] = bbox
                rate['words'] = hit.group(1)
    return rate
def mergelist(self, text_list):
    """Glue item descriptions that were split over two list elements.

    An element containing '所购' is merged with its successor whenever that
    successor still contains at least one Chinese character (i.e. it is text
    rather than a pure number). Each pass merges the last such pair; passes
    repeat until nothing is left to merge.

    Args:
        text_list (list[str]): Texts in reading order.

    Returns:
        list[str]: ``text_list`` itself when nothing had to be merged,
        otherwise a new list with the merged elements.
    """
    non_chinese = re.compile("[^\u4e00-\u9fa5]")  # anything that is not a Chinese character

    merged = text_list
    while True:
        merge_at = -1
        last_index = len(merged) - 1
        for index, item in enumerate(merged):
            # A trailing '所购' element has no successor to merge with; the
            # old code raised IndexError here.
            if '所购' not in item or index == last_index:
                continue
            if non_chinese.sub('', merged[index + 1]):
                merge_at = index
        if merge_at == -1:
            return merged
        merged = (
            merged[:merge_at]
            + [merged[merge_at] + merged[merge_at + 1]]
            + merged[merge_at + 2:]
        )
def get_asp_details(self, page_num):
    """Rebuild the add-on product (ASP) financing table from OCR boxes.

    The three column headers are located first. Starting from the first
    column header, the box is shifted down by about 1.4 row heights to find
    the next row; for each row the cells of the other two columns are looked
    up by combining the column's x coordinates with the row's y coordinates.
    At most ten rows are scanned.

    Compared with the previous version, missing column headers, failed box
    lookups and unexpected cell text no longer raise; the scan simply stops
    (or the cell is left as read) and the rows collected so far are returned.

    Args:
        page_num (str): Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item whose 'words' is the table as a list of
        rows (title row, header row, then data rows and, when found, a total
        row). 'position' is left untouched.
    """
    result = self.item.copy()
    table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]
    page_ocr = self.ocr_results[page_num]

    def column_anchor(column_bbox, row_bbox):
        # x coordinates from the column header, y coordinates from the row.
        return [column_bbox[0], row_bbox[1], column_bbox[2], row_bbox[3],
                column_bbox[4], row_bbox[5], column_bbox[6], row_bbox[7]]

    bbox_xm = bbox_ytzje = bbox_dkbj = bbox_total = None
    for key in page_ocr:
        bbox, text = page_ocr[key]
        if text == '项目1':
            bbox_xm = bbox
        if text == '用途总金额2':
            bbox_ytzje = bbox
        if text == '贷款本金3':
            bbox_dkbj = bbox
        if text in ['附加产品融资贷款本', '附加产品融资贷款本金', '附加产品融资贷']:
            bbox_total = bbox

    if bbox_xm:
        for _ in range(10):
            row_height = abs(bbox_xm[1] - bbox_xm[-1])
            anchor = np.array(bbox_xm).reshape((-1, 2))
            anchor[:, 1] += int(row_height * 1.4)
            _iou, _key = self.get_top_iou(poly=anchor, ocr_result=page_ocr)
            if _iou <= 0:
                break
            bbox, xm_text = page_ocr[_key]
            bbox_xm = bbox

            # An item name may wrap onto a second line: append the
            # continuation to the first cell of the previous row.
            if '所购' not in xm_text:
                table[-1][0] += xm_text
                continue

            if bbox_ytzje is None or bbox_dkbj is None:
                # Without both column headers the cells cannot be located.
                break

            _iou, _key = self.get_top_iou(
                poly=column_anchor(bbox_ytzje, bbox), ocr_result=page_ocr)
            if _iou < 0:
                # (-1, -1) means no box could be compared; -1 is not a key.
                break
            bbox, ytzje_text = page_ocr[_key]

            # The third column is aligned with the box just found for the
            # second column, as in the original implementation.
            _iou, _key = self.get_top_iou(
                poly=column_anchor(bbox_dkbj, bbox), ocr_result=page_ocr)
            if _iou < 0:
                break
            bbox, dkbj_text = page_ocr[_key]

            if xm_text == ytzje_text:
                # OCR merged the first two cells into one box; split them
                # only when that yields exactly two parts.
                parts = xm_text.split(' ')
                if len(parts) == 2:
                    xm_text, ytzje_text = parts

            table.append([xm_text, ytzje_text, dkbj_text])

    if bbox_total and bbox_dkbj is not None:
        _iou, _key = self.get_top_iou(
            poly=column_anchor(bbox_dkbj, bbox_total), ocr_result=page_ocr)
        if _iou >= 0:
            bbox, total_text = page_ocr[_key]
            table.append(['附加产品融资贷款本金总金额:', '', total_text])

    result['words'] = table
    return result
def get_signature(self):
    """Find the borrower's signing line on page '0' of the PDF.

    Returns:
        dict: Copy of the blank item; 'words' is the full text of the last
        span containing '签署日期' and 'position' that span's bbox.
    """
    signature = self.item.copy()
    spans = (
        span
        for block in self.pdf_info['0']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        bbox, text = span['bbox'], span['text']
        if '签署日期' not in text:
            continue
        signature['words'] = text
        signature['position'] = bbox
    return signature
def get_somebody(self, top, bottom):
    """Read one party's name and ID number from page '1' of the PDF.

    The party's section is the vertical band between the span containing
    ``top`` and the span containing ``bottom``; both bounds are taken from
    the fourth bbox value of those spans and default to 0 when not found.

    Args:
        top (str): Text marking the start of the section.
        bottom (str): Text marking the start of the following section.

    Returns:
        tuple: ``(name_item, id_item)``, copies of the blank item filled from
        the name and ID-number spans inside the band, when present.
    """
    name_item, id_item = self.item.copy(), self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Pass 1: locate the band (the last matching span wins for each bound).
    band_start = band_end = 0
    for bbox, text in spans:
        if top in text:
            band_start = bbox[3]
        if bottom in text:
            band_end = bbox[3]

    # Pass 2: pick the labelled values that lie strictly inside the band.
    labelled = (
        ('姓名/名称', name_item),
        ('自然人身份证件号码/法人执照号码', id_item),
    )
    for bbox, text in spans:
        if not band_start < bbox[3] < band_end:
            continue
        for label, target in labelled:
            if label in text:
                target['position'] = bbox
                target['words'] = text.split(':')[-1]
    return name_item, id_item
def get_seller(self):
    """Find the dealer / vehicle seller name on page '1' of the PDF.

    The label span ('经销商' or '车辆销售方') is located first; the value is
    the span whose horizontal centre lies between the label's right edge and
    the middle of the page and whose vertical centre lies within the label's
    vertical extent.

    Returns:
        dict: Copy of the blank item filled with the last span satisfying
        those conditions, when the label was found.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']

    def text_spans():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    # Step 1: find the label (the last occurrence wins).
    label_bbox = None
    for bbox, text in text_spans():
        if text in ['经销商', '车辆销售方']:
            label_bbox = bbox

    if not label_bbox:
        return seller

    # Step 2: find the value to the right of the label on the same row.
    page_middle = page['width'] * 0.5
    for bbox, text in text_spans():
        right_of_label = label_bbox[2] < np.mean(bbox[::2]) < page_middle
        if right_of_label and label_bbox[1] < np.mean(bbox[1::2]) < label_bbox[3]:
            seller['position'] = bbox
            seller['words'] = text
    return seller
def get_borrower_collection_account(self):
    """Extract the borrower's receiving account from page '1' of the PDF.

    The span texts of the page are concatenated and, when the section title
    is present, stripped of spaces; account number, account name and bank
    are then cut out from between their labels. Each value's position is the
    bbox of the last span whose (unmodified) text contains the value.

    Returns:
        tuple: ``(account, account_name, account_bank)``, copies of the blank
        item that are filled in when the value and a containing span exist.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    def text_spans():
        for block in self.pdf_info['1']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    def fill(target, pattern, blob):
        # Take the first regex capture and attach the bbox of the last span
        # that contains it.
        captures = re.findall(pattern, blob)
        if not captures:
            return
        value = captures[0]
        for bbox, text in text_spans():
            if f'{value}' in text:
                target['position'] = bbox
                target['words'] = value

    page_text = ''.join(text for _bbox, text in text_spans())

    # Only the layout that prints the account details is handled; the
    # "notified separately" variant is left blank.
    if '借款人收款账户' in page_text:
        # NOTE(review): both replace() calls appear to target the same
        # character in this view; upstream may strip two different
        # whitespace characters - confirm.
        page_text = page_text.replace(' ', '').replace(' ', '')
        fill(account, r'账号:(.*?)户名', page_text)
        fill(account_name, r'户名:(.*?)开户行', page_text)
        fill(account_bank, r'开户行:(.*?)借款人', page_text)

    return account, account_name, account_bank
def get_payback_account(self):
    """Extract the repayment account from page '1' of the PDF.

    Only the text after the last '(13) 还款账户' heading is examined. After
    spaces are removed, account number, account name and bank are cut out
    from between their labels. The number and name are positioned on the
    last span whose text contains the value; the bank is positioned on the
    last span whose space-stripped text contains '开户行:<bank>;'.

    Returns:
        tuple: ``(account, account_name, account_bank)``, copies of the blank
        item that are filled in when the value and a matching span exist.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    def text_spans():
        for block in self.pdf_info['1']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    def fill(target, pattern, blob, span_matches):
        # First regex capture -> value; the last span accepted by
        # span_matches(value, text) supplies the position.
        captures = re.findall(pattern, blob)
        if not captures:
            return
        value = captures[0]
        for bbox, text in text_spans():
            if span_matches(value, text):
                target['position'] = bbox
                target['words'] = value

    def contains_value(value, text):
        return f'{value}' in text

    def contains_bank_clause(value, text):
        return f'开户行:{value};' in text.replace(' ', '')

    page_text = ''.join(text for _bbox, text in text_spans())

    # Only the layout that prints the account details is handled; the
    # "notified separately" variant is left blank.
    heading = '(13) 还款账户'
    if heading in page_text:
        section = page_text.split(heading)[-1]
        # NOTE(review): both replace() calls appear to target the same
        # character in this view; upstream may strip two different
        # whitespace characters - confirm.
        section = section.replace(' ', '').replace(' ', '')
        fill(account, r'账号:(.*?)户名', section, contains_value)
        fill(account_name, r'户名:(.*?)开户行', section, contains_value)
        fill(account_bank, r'开户行:(.*?);', section, contains_bank_clause)

    return account, account_name, account_bank
def get_repayment_schedule(self):
    """Rebuild the repayment schedule table from page '2' of the PDF.

    Span texts are collected from the span that is exactly '序号' up to (not
    including) the span containing the closing remark, then cut into rows of
    five cells. Row building stops at the first row whose second cell equals
    the row's 1-based position.

    Returns:
        dict: Copy of the blank item; 'words' is the list of rows when at
        least one row was built. 'position' is left untouched.
    """
    columns = 5  # the schedule table has five columns
    schedule = self.item.copy()

    cells = []
    inside_table = False
    for block in self.pdf_info['2']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if text == '序号':
                    inside_table = True
                if '以上表格中所列的序号并非还款期数' in text:
                    inside_table = False
                if inside_table:
                    cells.append(text)

    rows = []
    for row_index in range(len(cells) // columns):
        start = row_index * columns
        row = cells[start:start + columns]
        if row[1] == str(row_index + 1):
            break
        rows.append(row)

    if rows:
        schedule['words'] = rows
    return schedule
def _locate_signature_region(self, start_keyword, end_keyword='日期', skip_first_page=False):
    """Collect the spans between a start and an end keyword across all pages.

    Shared implementation of get_signature_role_1 .. get_signature_role_5.
    For every span, in page order: the region opens when the span contains
    ``start_keyword``, closes when it contains ``end_keyword`` (checked
    second, so a span containing both closes it), and the span is collected
    while the region is open. The open/closed state carries over from one
    page to the next.

    Args:
        start_keyword (str): Text that opens the region.
        end_keyword (str): Text that closes the region.
        skip_first_page (bool): When True, ``start_keyword`` is ignored on
            the page whose key converts to integer 0.

    Returns:
        dict: Copy of ``self.init_item`` if the instance defines it,
        otherwise of ``self.item``, extended with
          * 'page_num' - key of the page of the last collected span, or None,
          * 'position' - ``[xmin, ymin, xmax, ymax]`` over all collected
            bboxes, or None when nothing was collected,
          * 'words' - '有' when more than four spans were collected, else '无'.
    """
    # The constructor in this file only defines ``item``; fall back to it so
    # the lookup cannot fail with AttributeError.
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    result = template.copy()

    texts, boxes = [], []
    page_num = None
    in_region = False
    for page_key in list(self.pdf_info.keys()):
        for block in self.pdf_info[page_key]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if start_keyword in text and (
                            not skip_first_page or int(page_key) != 0):
                        in_region = True
                    if end_keyword in text:
                        in_region = False
                    if in_region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)

    position = None
    if boxes:
        # min()/max() over an empty array would raise, hence the guard.
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]

    result['page_num'] = page_num
    result['position'] = position
    # More than four collected spans is taken to mean that something was
    # written in the signature area.
    result['words'] = '有' if len(texts) > 4 else '无'
    return result

def get_signature_role_1(self):
    """Check the signature area of the borrower (mortgagor).

    Returns:
        dict: See ``_locate_signature_region``.
    """
    return self._locate_signature_region('借款人(抵押人)')

def get_signature_role_2(self):
    """Check the signature area of the co-borrower (co-mortgagor).

    Returns:
        dict: See ``_locate_signature_region``.
    """
    return self._locate_signature_region('共同借款人(共同抵押人)')

def get_signature_role_3(self):
    """Check the signature area of guarantor 1 (never opened on page 0).

    Returns:
        dict: See ``_locate_signature_region``.
    """
    return self._locate_signature_region('保证人1', skip_first_page=True)

def get_signature_role_4(self):
    """Check the signature area of guarantor 2 (never opened on page 0).

    Returns:
        dict: See ``_locate_signature_region``.
    """
    return self._locate_signature_region('保证人2', skip_first_page=True)

def get_signature_role_5(self):
    """Check the witness signature area (never opened on page 0).

    Unlike the other roles the area ends at the first span containing '年'.

    Returns:
        dict: See ``_locate_signature_region``.
    """
    return self._locate_signature_region(
        '见证人签字', end_keyword='年', skip_first_page=True)
def get_last_page_signature(self, page_num, top, bottom):
    """Read a signer's name and signing date from a signature page.

    The signature line is the span containing '签署日期' whose vertical
    centre lies strictly between the (integer-truncated) tops of the spans
    containing ``top`` and ``bottom``.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.
        top (str): Text of the span that bounds the area from above.
        bottom (str): Text of the span that bounds the area from below.

    Returns:
        tuple: ``(signature_name, signature_date)``, copies of the blank
        item. The name is the text before the first space, the date the text
        after the last colon; both share the bbox of the signature span. The
        last matching span wins.
    """
    signature_name, signature_date = self.item.copy(), self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    upper_y = lower_y = None
    for bbox, text in spans:
        if top in text:
            upper_y = bbox[1]
        if bottom in text:
            lower_y = bbox[1]

    # Both bounding spans must exist, otherwise the area is undefined.
    if upper_y is None or lower_y is None:
        return signature_name, signature_date

    for bbox, text in spans:
        if '签署日期' not in text:
            continue
        if not int(upper_y) < np.mean(bbox[1::2]) < int(lower_y):
            continue
        signature_name['words'] = text.split(' ')[0]
        signature_name['position'] = bbox
        signature_date['words'] = text.split(':')[-1]
        signature_date['position'] = bbox
    return signature_name, signature_date
def
get_info
(
self
):
"""
block['type'] == 0 : 表示该元素为图片
Returns:
dict: Description
"""
# 先判断是否为 ASP 产品
# 只看第一页,判断是否有 '附加产品融资贷款本金总金额' 这一句话,若有则为 ASP 产品
# print(self.pdf_info['0']['blocks'])
# for block in self.pdf_info['0']['blocks']:
# if block['type'] != 0:
# continue
# for line in block['lines']:
# for span in line['spans']:
# bbox, text = span['bbox'], span['text']
# if '附加产品融资贷款本金总金额' == text:
# self.is_asp = True
for
key
in
self
.
ocr_results
[
'0'
]:
bbox
,
text
=
self
.
ocr_results
[
'0'
][
key
]
if
'附加产品融资贷款本金总金额'
in
text
:
self
.
is_asp
=
True
self
.
gen_init_result
(
self
.
is_asp
)
if
len
(
list
(
self
.
ocr_results
.
keys
()))
<=
8
:
# 8.5 版本客户提供的样本出现串页的情况,暂时无法识别
# Page 1
# 找合同编号
contract_no
=
self
.
get_contract_no
(
page_num
=
'0'
)
# print(contract_no)
self
.
init_result
[
'page_1'
][
'合同编号'
]
=
contract_no
# 所购车辆价格
vehicle_price
=
self
.
get_vehicle_price
()
# print(vehicle_price)
self
.
init_result
[
'page_1'
][
'所购车辆价格'
]
=
vehicle_price
# 车架号
vin
=
self
.
get_vin
()
# print(vin)
self
.
init_result
[
'page_1'
][
'车架号'
]
=
vin
# 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
upper
,
lower
,
asp_1
,
asp_2
=
self
.
get_loan_principal
()
# print(upper, lower, asp_1, asp_2)
self
.
init_result
[
'page_1'
][
'贷款本金金额'
][
'大写'
]
=
upper
self
.
init_result
[
'page_1'
][
'贷款本金金额'
][
'小写'
]
=
lower
self
.
init_result
[
'page_1'
][
'贷款本金金额'
][
'车辆贷款本金金额'
]
=
asp_1
self
.
init_result
[
'page_1'
][
'贷款本金金额'
][
'附加产品融资贷款本金总金额'
]
=
asp_2
# 贷款期限
loan_term
=
self
.
get_loan_term
()
# print(loan_term)
self
.
init_result
[
'page_1'
][
'贷款期限'
]
=
loan_term
# 附加产品融资贷款本金总金额明细(ASP-表格)
asp_details_table
=
self
.
get_asp_details
(
page_num
=
'0'
)
# print(asp_details_table)
self
.
init_result
[
'page_1'
][
'附加产品融资贷款本金总金额明细'
]
=
asp_details_table
# 借款人签字及时间
signature
=
self
.
get_signature
()
# print(signature)
self
.
init_result
[
'page_1'
][
'借款人签字及时间'
]
=
signature
#######################################
# Page 2
# 找合同编号
contract_no
=
self
.
get_contract_no
(
page_num
=
'0'
)
# print(contract_no)
self
.
init_result
[
'page_2'
][
'合同编号'
]
=
contract_no
# 找借款人及抵押人(地址字段原本有空格)
borrower_name
,
borrower_id
=
self
.
get_somebody
(
top
=
'借款人及抵押人:'
,
bottom
=
'共同借款人:'
)
# 这是为了同时兼容 8.1 版本
if
borrower_name
[
'words'
]
==
None
:
borrower_name
,
borrower_id
=
self
.
get_somebody
(
top
=
'借款人及抵押人:'
,
bottom
=
'共同借款人及共同抵押人:'
)
# 这是为了兼容车贷分离版本
if
borrower_name
[
'words'
]
==
None
:
borrower_name
,
borrower_id
=
self
.
get_somebody
(
top
=
'借款人:'
,
bottom
=
'共同借款人及抵押人:'
)
# print(borrower_name, borrower_id)
self
.
init_result
[
'page_2'
][
'借款人及抵押人'
][
'name'
]
=
borrower_name
self
.
init_result
[
'page_2'
][
'借款人及抵押人'
][
'id'
]
=
borrower_id
# 找共同借款人及共同抵押人
co_borrower_name
,
co_borrower_id
=
self
.
get_somebody
(
top
=
'共同借款人:'
,
bottom
=
'保证人1:'
)
# print(co_borrower_name, co_borrower_id)
self
.
init_result
[
'page_2'
][
'共同借款人及共同抵押人'
][
'name'
]
=
co_borrower_name
self
.
init_result
[
'page_2'
][
'共同借款人及共同抵押人'
][
'id'
]
=
co_borrower_id
# 保证人1
first_guarantor_name
,
first_guarantor_id
=
self
.
get_somebody
(
top
=
'保证人1:'
,
bottom
=
'保证人2:'
)
self
.
init_result
[
'page_2'
][
'保证人1'
][
'name'
]
=
first_guarantor_name
self
.
init_result
[
'page_2'
][
'保证人1'
][
'id'
]
=
first_guarantor_id
# 保证人2
second_guarantor_name
,
second_guarantor_id
=
self
.
get_somebody
(
top
=
'保证人2:'
,
bottom
=
'第一章'
)
self
.
init_result
[
'page_2'
][
'保证人2'
][
'name'
]
=
second_guarantor_name
self
.
init_result
[
'page_2'
][
'保证人2'
][
'id'
]
=
second_guarantor_id
# 所购车辆价格
vehicle_price
=
self
.
get_vehicle_price
(
page_num
=
'1'
)
# print(vehicle_price)
self
.
init_result
[
'page_2'
][
'所购车辆价格'
]
=
vehicle_price
# 车架号
vin
=
self
.
get_vin
(
page_num
=
'1'
)
# print(vin)
self
.
init_result
[
'page_2'
][
'车架号'
]
=
vin
# 经销商
seller
=
self
.
get_seller
()
# print(seller)
self
.
init_result
[
'page_2'
][
'经销商'
]
=
seller
# 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
upper
,
lower
,
asp_1
,
asp_2
=
self
.
get_loan_principal
(
page_num
=
'1'
)
# print(upper, lower, asp_1, asp_2)
self
.
init_result
[
'page_2'
][
'贷款本金金额'
][
'大写'
]
=
upper
self
.
init_result
[
'page_2'
][
'贷款本金金额'
][
'小写'
]
=
lower
self
.
init_result
[
'page_2'
][
'贷款本金金额'
][
'车辆贷款本金金额'
]
=
asp_1
self
.
init_result
[
'page_2'
][
'贷款本金金额'
][
'附加产品融资贷款本金总金额'
]
=
asp_2
# 贷款期限
loan_term
=
self
.
get_loan_term
(
page_num
=
'1'
)
# print(loan_term)
self
.
init_result
[
'page_2'
][
'贷款期限'
]
=
loan_term
# 本合同当期的标准利率
standard_rate
=
self
.
get_standard_rate
(
page_num
=
'1'
)
# print(standard_rate)
self
.
init_result
[
'page_2'
][
'标准利率'
]
=
standard_rate
# 202212 release 新增借款人收款账户
account
,
account_name
,
account_bank
=
self
.
get_borrower_collection_account
()
# print(account, account_name, account_bank)
self
.
init_result
[
'page_2'
][
'借款人收款账户'
][
'账号'
]
=
account
self
.
init_result
[
'page_2'
][
'借款人收款账户'
][
'户名'
]
=
account_name
self
.
init_result
[
'page_2'
][
'借款人收款账户'
][
'开户行'
]
=
account_bank
# 还款账户
account
,
account_name
,
account_bank
=
self
.
get_payback_account
()
# print(account, account_name, account_bank)
self
.
init_result
[
'page_2'
][
'还款账户'
][
'账号'
]
=
account
self
.
init_result
[
'page_2'
][
'还款账户'
][
'户名'
]
=
account_name
self
.
init_result
[
'page_2'
][
'还款账户'
][
'开户行'
]
=
account_bank
#######################################
# Page 3
# 找合同编号
contract_no
=
self
.
get_contract_no
(
page_num
=
'2'
)
self
.
init_result
[
'page_3'
][
'合同编号'
]
=
contract_no
# 还款计划表(表格)
repayment_schedule_table
=
self
.
get_repayment_schedule
()
# print(repayment_schedule_table)
self
.
init_result
[
'page_3'
][
'还款计划表'
]
=
repayment_schedule_table
#######################################
# Page 4
# 找合同编号
contract_no
=
self
.
get_contract_no
(
page_num
=
'3'
)
# print(contract_no)
self
.
init_result
[
'page_4'
][
'合同编号'
]
=
contract_no
# 附加产品融资贷款本金总金额明细(ASP-表格)
asp_details_table
=
self
.
get_asp_details
(
page_num
=
'3'
)
# print(asp_details_table)
self
.
init_result
[
'page_4'
][
'附加产品融资贷款本金总金额明细'
]
=
asp_details_table
#######################################
# Page 5
# 找合同编号
contract_no
=
self
.
get_contract_no
(
page_num
=
'4'
)
# print(contract_no)
self
.
init_result
[
'page_5'
][
'合同编号'
]
=
contract_no
#######################################
# Page 6
# 找合同编号
contract_no
=
self
.
get_contract_no
(
page_num
=
'5'
)
# print(contract_no)
self
.
init_result
[
'page_6'
][
'合同编号'
]
=
contract_no
# Page 7
# 找合同编号
contract_no
=
self
.
get_contract_no
(
page_num
=
'6'
)
self
.
init_result
[
'page_7'
][
'合同编号'
]
=
contract_no
# Page 8
# 找合同编号
contract_no
=
self
.
get_contract_no
(
page_num
=
'7'
)
self
.
init_result
[
'page_8'
][
'合同编号'
]
=
contract_no
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'合同编号'
,
bottom
=
'共同借款人'
)
if
signature_name
[
'words'
]
==
None
:
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'合同编号'
,
bottom
=
'共同借款人(抵押人)'
)
# print(signature_name, signature_date)
self
.
init_result
[
'page_8'
][
'主借人签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_8'
][
'主借人签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'共同借款人'
,
bottom
=
'保证人1'
)
if
signature_name
[
'words'
]
==
None
:
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'共同借款人(抵押人)'
,
bottom
=
'保证人1'
)
# print(signature_name, signature_date)
self
.
init_result
[
'page_8'
][
'共借人签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_8'
][
'共借人签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'保证人1'
,
bottom
=
'保证人2'
)
self
.
init_result
[
'page_8'
][
'保证人1签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_8'
][
'保证人1签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'保证人2'
,
bottom
=
'在本人面前亲笔签署本合同'
)
self
.
init_result
[
'page_8'
][
'保证人2签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_8'
][
'保证人2签字'
][
'日期'
]
=
signature_date
signature_name
,
signature_date
=
self
.
get_last_page_signature
(
page_num
=
'7'
,
top
=
'在本人面前亲笔签署本合同'
,
bottom
=
'以下无正文'
)
# print(signature_name, signature_date)
self
.
init_result
[
'page_8'
][
'见证人签字'
][
'签字'
]
=
signature_name
self
.
init_result
[
'page_8'
][
'见证人签字'
][
'日期'
]
=
signature_date
# 重新定制输出
new_results
=
{
"is_asp"
:
self
.
is_asp
,
"page_info"
:
self
.
init_result
}
return
new_results
\ No newline at end of file
src/common/electronic_hil_contract/get_char_fsm.py
0 → 100644
View file @
8d595a3
import
re
import
numpy
as
np
from
fuzzywuzzy
import
fuzz
from
shapely.geometry
import
Polygon
def caculate_iou(g, p):
    """Return the intersection-over-union (IoU) of two polygons.

    Args:
        g: Coordinates of the first polygon in any layout that
            ``np.array(g).reshape((-1, 2))`` turns into ``(x, y)`` rows,
            e.g. a flat ``[x0, y0, x1, y1, ...]`` list.
        p: Coordinates of the second polygon, same layout as ``g``.

    Returns:
        ``intersection_area / union_area``. ``0.0`` is returned when the
        union has no area (both polygons degenerate) instead of raising
        ``ZeroDivisionError``.
    """
    poly_g = Polygon(np.array(g).reshape((-1, 2)))
    poly_p = Polygon(np.array(p).reshape((-1, 2)))
    inter = poly_g.intersection(poly_p).area
    union = poly_g.area + poly_p.area - inter
    if union <= 0:
        # Two zero-area boxes: there is nothing to overlap.
        return 0.0
    return inter / union
def get_table_info(bbox_1, bbox_2, ocr_result):
    """Look up the text found where one box's rows meet another box's columns.

    A probe quadrilateral is built from eight flat coordinates: values at
    even indices come from ``bbox_2`` and values at odd indices come from
    ``bbox_1``.

    Args:
        bbox_1: Eight-value box supplying the odd-index coordinates.
        bbox_2: Eight-value box supplying the even-index coordinates.
        ocr_result: Iterable of entries whose item ``0`` is a box and whose
            item ``1`` is the associated text.

    Returns:
        Item ``1`` of the last entry whose box has an IoU greater than 0
        with the probe, or ``''`` when no entry overlaps.
    """
    probe = [bbox_2[k] if k % 2 == 0 else bbox_1[k] for k in range(8)]
    cell_text = ''
    for entry in ocr_result:
        # No early exit: the last overlapping entry wins.
        if caculate_iou(probe, entry[0]) > 0:
            cell_text = entry[1]
    return cell_text
class
Finder
:
def
__init__
(
self
,
pdf_info
):
self
.
pdf_info
=
pdf_info
self
.
item
=
{
"words"
:
None
,
"page"
:
None
,
"position"
:
None
,
}
# 格式化算法输出
self
.
init_result
=
{
"合同编号"
:
self
.
item
,
"承租人-姓名"
:
self
.
item
,
"承租人-证件号码"
:
self
.
item
,
"承租人-法定代表人或授权代表"
:
self
.
item
,
"共同承租人-姓名"
:
self
.
item
,
"共同承租人-证件号码"
:
self
.
item
,
"共同承租人-法定代表人或授权代表"
:
self
.
item
,
"保证人1-姓名"
:
self
.
item
,
"保证人1-证件号码"
:
self
.
item
,
"保证人1-法定代表人或授权代表"
:
self
.
item
,
"保证人2-姓名"
:
self
.
item
,
"保证人2-证件号码"
:
self
.
item
,
"保证人2-法定代表人或授权代表"
:
self
.
item
,
"保证人3-姓名"
:
self
.
item
,
"保证人3-证件号码"
:
self
.
item
,
"保证人3-法定代表人或授权代表"
:
self
.
item
,
"合同编号(正文)"
:
self
.
item
,
"车辆识别代码"
:
self
.
item
,
"车辆卖方(经销商)"
:
self
.
item
,
"车辆原始销售价格(《机动车销售统一发票》所列金额)"
:
self
.
item
,
"车辆附加产品明细表"
:
self
.
item
,
"融资成本总额"
:
self
.
item
,
"租期"
:
self
.
item
,
"付款计划表"
:
self
.
item
,
"承租人收款账户-户名"
:
self
.
item
,
"承租人收款账户-银行账号"
:
self
.
item
,
"承租人收款账户-开户行"
:
self
.
item
,
"承租人扣款账户-户名"
:
self
.
item
,
"承租人扣款账户-银行账号"
:
self
.
item
,
"承租人扣款账户-开户行"
:
self
.
item
,
"签字页-承租人姓名"
:
self
.
item
,
"签字页-承租人签章"
:
self
.
item
,
"签字页-共同承租人姓名"
:
self
.
item
,
"签字页-共同承租人签章"
:
self
.
item
,
"签字页-保证人1姓名"
:
self
.
item
,
"签字页-保证人1签章"
:
self
.
item
,
"签字页-保证人2姓名"
:
self
.
item
,
"签字页-保证人2签章"
:
self
.
item
,
"签字页-保证人3姓名"
:
self
.
item
,
"签字页-保证人3签章"
:
self
.
item
,
}
# 格式化输出 车辆处置协议 要是别的字段
self
.
init_result_1
=
{
"合同编号"
:
self
.
item
,
"承租人-姓名"
:
self
.
item
,
"承租人-证件号码"
:
self
.
item
,
"销售经销商"
:
self
.
item
,
"合同编号(正文)"
:
self
.
item
,
"签字页-承租人姓名"
:
self
.
item
,
"签字页-承租人证件号码"
:
self
.
item
,
"签字页-承租人签章"
:
self
.
item
,
"签字页-销售经销商"
:
self
.
item
,
"签字页-销售经销商签章"
:
self
.
item
,
}
# 格式化输出 车辆租赁抵押合同
self
.
init_result_2
=
{
"合同编号"
:
self
.
item
,
"合同编号(正文)"
:
self
.
item
,
"抵押人姓名/名称"
:
self
.
item
,
"抵押人证件号码"
:
self
.
item
,
"抵押人配偶姓名/名称"
:
self
.
item
,
"抵押人配偶证件号码"
:
self
.
item
,
"车辆识别代码"
:
self
.
item
,
"租金总额"
:
self
.
item
,
"融资租赁期限"
:
self
.
item
,
"签字页-抵押人姓名"
:
self
.
item
,
"签字页-抵押人签章"
:
self
.
item
,
"签字页-抵押人配偶姓名"
:
self
.
item
,
"签字页-抵押人配偶签章"
:
self
.
item
,
}
def get_contract_no(self, page_num):
    """Extract the contract number labelled '合同编号:' on one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``. When a labelled span is found,
        ``words`` holds the text after the last ':' and ``position`` /
        ``page`` describe that span. If the label is present but nothing
        follows it, a span containing 'CH' whose top edge lies above the
        label's bottom edge is used instead.
    """
    def _text_spans():
        # Walk every span of the text blocks (type 0) of the page.
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    result = self.item.copy()
    for box, content in _text_spans():
        if '合同编号:' in content:
            result['position'] = box
            result['page'] = page_num
            result['words'] = content.split(':')[-1]

    if result['words'] != '':
        return result

    # The label was found with an empty value: the number sits in a separate
    # span. Every hit moves the reference position, as in the first pass the
    # last hit wins.
    for box, content in _text_spans():
        if box[1] < result['position'][3] and 'CH' in content:
            result['position'] = box
            result['page'] = page_num
            result['words'] = content
    return result
def get_vehicle_price(self, page_num='0'):
    """Extract the vehicle price that follows '所购车辆价格为人民币'.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``; for the last matching span ``words`` is
        the text after the last '币' and ``position`` is the span box.
        ``page`` is left untouched.
    """
    price = self.item.copy()
    text_blocks = (
        blk for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
    )
    for blk in text_blocks:
        for row in blk['lines']:
            for piece in row['spans']:
                box, content = piece['bbox'], piece['text']
                if '所购车辆价格为人民币' not in content:
                    continue
                price['position'] = box
                price['words'] = content.split('币')[-1]
    return price
def get_contract_no_one(self):
    """Find the contract number quoted in the body text of the document.

    The text of each page is concatenated (so a number broken across spans
    is rejoined) and stripped of spaces, then searched page by page:

    1. ``合同编号:[...]`` - the whole matched expression, label included, is
       used (group 1 of the original pattern).
       NOTE(review): group 2 would be the bare number - confirm which one
       downstream consumers expect before changing it.
    2. ``编号为...的`` - the captured text between the two markers.

    The original third pattern (``编号为...)的``) could never be reached: any
    text it matches is already matched by pattern 2, whose result has the
    trailing ')' removed anyway, so it has been dropped.

    Returns:
        dict: Copy of ``self.item``. On the first page with a match,
        ``words`` is the match with whitespace and ')' removed, ``page`` is
        the page key and ``position`` is ``None``. When no page matches the
        blank record is returned (previously the method returned ``None``,
        unlike every sibling getter).
    """
    contract_no = self.item.copy()
    for pno in self.pdf_info:
        parts = []
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    parts.append(span['text'])
        all_text = ''.join(parts).replace(' ', '')

        match_obj = re.search(r'(合同编号:\[(.*?)\])', all_text)
        if match_obj:
            words = match_obj.group(1)
        else:
            match_obj = re.search(r'编号为(.*?)的', all_text)
            if not match_obj:
                continue
            words = match_obj.group(1).strip()

        contract_no['position'] = None
        contract_no['page'] = pno
        contract_no['words'] = re.sub(r"\s", "", words).replace(")", "")
        return contract_no
    return contract_no
def get_key_value(self, key, page_num=None):
    """Return the value printed after ``key`` inside a text span.

    Args:
        key (str): Text that must occur in the span.
        page_num: Page key to search; ``None`` searches every page of
            ``self.pdf_info`` in iteration order.

    Returns:
        dict: Copy of ``self.item``. For the last matching span ``words`` is
        the text after the last ':' with '。' and all whitespace removed,
        ``page`` is the page key and ``position`` the span box.
    """
    value = self.item.copy()
    # One scan serves both the single-page and the whole-document case.
    pages = self.pdf_info if page_num is None else [page_num]
    for pno in pages:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if key not in text:
                        continue
                    words = text.split(':')[-1].replace("。", "")
                    value['position'] = bbox
                    value['page'] = pno
                    value['words'] = re.sub(r"\s", "", words)
    return value
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``
        with ``words`` / ``position`` filled when found:

        * ``upper`` - span whose fuzzy ratio against the capital-numeral
          characters exceeds 15; text after the last ':' (stripped).
        * ``lower`` - span containing '小写:¥'; text after the last '¥'.
        * ``asp_1`` / ``asp_2`` - bracketed amounts of '人民币:小写:[...]'
          spans whose vertical centre is smaller / larger than that of the
          '附加产品融资贷款本金总金额' span. Only looked up when that span
          exists.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    # Loop invariant: build the comparison string once.
    keyword_string = ''.join(chinese_keywords)

    def _text_spans():
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None
    for bbox, text in _text_spans():
        if fuzz.ratio(keyword_string, text) > 15:
            # ``text`` is deliberately rebound: the checks below see the
            # trimmed value, as in the original code.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in _text_spans():
            if '人民币:小写:' not in text:
                continue
            amounts = re.findall(r'人民币:小写:\[(.*)\]', text)
            if not amounts:
                # Label without a bracketed amount: skip it instead of
                # failing with IndexError on ``[0]``.
                continue
            center_y = np.mean(bbox[1::2])
            if center_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = amounts[0]
            if center_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = amounts[0]
    return upper, lower, asp_1, asp_2
def get_loan_term(self, page_num='0'):
    """Extract the loan term in months from one page.

    The page text is concatenated and searched for '贷款期限<digits>个月';
    the span that contains '<digits>个月' then provides the position.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``; ``words`` is the digit string and
        ``position`` the box of the last span containing it. Blank when the
        pattern is absent or no single span holds '<digits>个月'.
    """
    def _text_spans():
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    term = self.item.copy()
    page_text = ''.join(content for _, content in _text_spans())
    found = re.search(r'贷款期限(\d+)个月', page_text)
    if not found:
        return term

    months = found.group(1)
    needle = f'{months}个月'
    for box, content in _text_spans():
        if needle in content:
            term['position'] = box
            term['words'] = months
    return term
def get_asp_details(self, page_num):
    """Collect the '附加产品融资贷款本金总金额明细' table of one page.

    Span texts are gathered from the span equal to the table title up to
    (not including) the first span containing '第二条' or '征信管理'.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``. ``words`` is a list of rows: the first
        row holds only the first collected text, every following row holds
        the next three texts; an incomplete trailing row is dropped.
        ``words`` stays ``None`` when nothing was collected.
    """
    term = self.item.copy()
    cells = []
    collecting = False
    for blk in self.pdf_info[page_num]['blocks']:
        if blk['type'] != 0:
            continue
        for row in blk['lines']:
            for piece in row['spans']:
                content = piece['text']
                if content == '附加产品融资贷款本金总金额明细':
                    collecting = True
                if '第二条' in content or '征信管理' in content:
                    collecting = False
                if collecting:
                    cells.append(content)

    row_count = (len(cells) + 2) // 3
    table = []
    if row_count:
        table.append([cells[0]])
        # Row i (i >= 1) covers cells 3*i-2 .. 3*i.
        table.extend(cells[3 * i - 2:3 * i + 1] for i in range(1, row_count))

    if table:
        term['words'] = table
    return term
def get_signature(self):
    """Return the '签署日期' span of page '0'.

    Returns:
        dict: Copy of ``self.item``; ``words`` is the full text and
        ``position`` the box of the last span on page '0' that contains
        '签署日期'. ``page`` is not set.
    """
    found = self.item.copy()
    for blk in self.pdf_info['0']['blocks']:
        if blk['type'] != 0:
            continue
        pieces = (piece for row in blk['lines'] for piece in row['spans'])
        for piece in pieces:
            box, content = piece['bbox'], piece['text']
            if '签署日期' in content:
                found['words'] = content
                found['position'] = box
    return found
def get_somebody(self, top, bottom):
    """Return the party name and ID found between two marker texts on page '1'.

    The vertical band is bounded by the ``bbox[3]`` value of the last span
    containing ``top`` and of the last span containing ``bottom`` (both
    default to 0 when absent).

    Args:
        top (str): Text marking the upper boundary.
        bottom (str): Text marking the lower boundary.

    Returns:
        tuple: ``(_name, _id)`` copies of ``self.item``. Inside the band,
        the span containing '姓名/名称' fills ``_name`` and the span
        containing '自然人身份证件号码/法人执照号码' fills ``_id``; ``words``
        is the text after the last ':'.
    """
    def _text_spans():
        for blk in self.pdf_info['1']['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    party_name = self.item.copy()
    party_id = self.item.copy()

    # First pass: fix the band limits.
    y_top = 0
    y_bottom = 0
    for box, content in _text_spans():
        if top in content:
            y_top = box[3]
        if bottom in content:
            y_bottom = box[3]

    # Second pass: pick the labelled spans strictly inside the band.
    targets = (
        ('姓名/名称', party_name),
        ('自然人身份证件号码/法人执照号码', party_id),
    )
    for box, content in _text_spans():
        if not (y_top < box[3] < y_bottom):
            continue
        for label, record in targets:
            if label in content:
                record['position'] = box
                record['words'] = content.split(':')[-1]
    return party_name, party_id
def get_seller(self):
    """Return the dealer name printed next to the '经销商' label on page '1'.

    The value is the last span whose mean x coordinate lies between the
    label's ``bbox[2]`` and half of the page width, and whose mean y
    coordinate lies between the label's ``bbox[1]`` and ``bbox[3]``.

    Returns:
        dict: Copy of ``self.item`` with ``words`` / ``position`` filled
        when both the label and such a span exist.
    """
    page = self.pdf_info['1']

    def _text_spans():
        for blk in page['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    seller = self.item.copy()

    # Locate the key first.
    label_box = None
    for box, content in _text_spans():
        if content == '经销商':
            label_box = box
    if not label_box:
        return seller

    # Then match the value relative to the key.
    half_width = page['width'] * 0.5
    for box, content in _text_spans():
        center_x = np.mean(box[::2])
        if not (label_box[2] < center_x < half_width):
            continue
        center_y = np.mean(box[1::2])
        if not (label_box[1] < center_y < label_box[3]):
            continue
        seller['position'] = box
        seller['words'] = content
    return seller
def get_payback_account(self):
    """Extract the repayment account number, holder and bank from page '1'.

    Nothing is extracted unless the page text contains '☑账号'; the
    original comment states that only the format that is not "to be
    notified separately" is output. The values are captured from the
    space-stripped page text and then located in individual spans to obtain
    their boxes.

    Returns:
        tuple: ``(account, account_name, account_bank)`` copies of
        ``self.item``; ``words`` / ``position`` are set only when the
        captured value is also found inside a single span.
    """
    def _text_spans():
        for blk in self.pdf_info['1']['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    def _record(target, words, hit):
        # The last span accepted by ``hit`` supplies the box.
        for box, content in _text_spans():
            if hit(content):
                target['position'] = box
                target['words'] = words

    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    page_text = ''.join(content for _, content in _text_spans())
    if '☑账号' not in page_text:
        return account, account_name, account_bank

    page_text = page_text.replace(' ', '')

    numbers = re.findall(r'账号:(.*)户名', page_text)
    if numbers:
        number = numbers[0]
        _record(account, number, lambda content: number in content)

    holders = re.findall(r'户名:(.*)开户行', page_text)
    if holders:
        holder = holders[0]
        _record(account_name, holder, lambda content: holder in content)

    banks = re.findall(r'开户行:(.*);', page_text)
    if banks:
        bank = banks[0]
        _record(
            account_bank,
            bank,
            lambda content: f'开户行:{bank};' in content.replace(' ', ''),
        )
    return account, account_name, account_bank
def get_repayment_schedule(self):
    """Extract the payment schedule as a ``[index, rent]`` table.

    Collection starts after the span whose stripped text is "61." (its
    ``bbox[0]`` becomes the left limit) and stops at the span containing
    '以上表格中所列序号'. The right limit is ``bbox[2]`` of the span
    containing '剩余融资'. While collecting, a span is skipped when it
    contains a CJK character, contains exactly one digit run (the "1." to
    "61." index labels), or its ``bbox[0]`` is not strictly between the
    limits.

    Returns:
        dict: Copy of ``self.item``. ``words`` is ``[['序号', '租金'], ...]``
        with one ``[str(n), cell]`` row per complete group of four collected
        cells, where ``cell`` is the third cell of the group. ``page`` is
        the page on which "61." was last seen (``None`` if never).
    """
    repayment_schedule = self.item.copy()
    cells = []
    in_table = False
    page = None
    left = 0
    right = 0
    # Compiled once instead of once per span; raw strings avoid the invalid
    # "\d" escape of the original literal.
    has_chinese = re.compile(r'[\u4e00-\u9fff]')
    digit_run = re.compile(r'\d+')
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '剩余融资' in text:
                        right = bbox[2]
                    if '以上表格中所列序号' in text:
                        in_table = False
                    if in_table:
                        # Filter out Chinese text.
                        if has_chinese.search(text):
                            continue
                        # Filter out the "1." - "61." index labels.
                        if len(digit_run.findall(text)) == 1:
                            continue
                        if not left < bbox[0] < right:
                            continue
                        cells.append(text)
                    if text.strip() == "61.":
                        page = pno
                        in_table = True
                        left = bbox[0]

    # Four cells per row; only the index and the rent column are kept.
    repayment_schedule_table = [['序号', '租金']]
    for i in range(len(cells) // 4):
        repayment_schedule_table.append([str(i + 1), cells[i * 4 + 2]])

    repayment_schedule['words'] = repayment_schedule_table
    repayment_schedule['page'] = page
    return repayment_schedule
def get_signature_role_1(self):
    """Return the last '签署日期' span found anywhere in the document.

    Returns:
        dict: Copy of ``self.item``; ``words`` is the span text, ``page``
        the page key and ``position`` the span box. Blank when no span
        contains '签署日期'.
    """
    def _all_spans():
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    found = self.item.copy()
    for page_key, box, content in _all_spans():
        if '签署日期' not in content:
            continue
        found['position'] = box
        found['page'] = page_key
        found['words'] = content
    return found
def get_signature_role_2(self):
    """Report whether the co-borrower signature area contains content.

    Spans are collected from the one containing '共同借款人(共同抵押人)'
    up to (not including) the next span containing '日期', across all pages.

    Returns:
        dict: Copy of the blank record plus a ``page_num`` key (this method
        writes ``page_num``, not ``page``):

        * ``words`` - '有' when more than four spans were collected,
          otherwise '无'.
        * ``position`` - ``[min_x, min_y, max_x, max_y]`` over all collected
          boxes, or ``None`` when nothing was collected (previously
          ``min()`` raised ``ValueError`` on the empty array).
        * ``page_num`` - page key of the last collected span, or ``None``.
    """
    # ``__init__`` only defines ``self.item``; ``self.init_item`` is honoured
    # if some other code sets it, so the method no longer fails with
    # AttributeError.
    signature_role_2 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
def get_signature_role_3(self):
    """Report whether the guarantor-1 signature area contains content.

    Spans are collected from the one containing '保证人1' (on any page whose
    key is not 0 when converted with ``int``) up to, not including, the next
    span containing '日期'.

    Returns:
        dict: Copy of the blank record plus a ``page_num`` key (this method
        writes ``page_num``, not ``page``):

        * ``words`` - '有' when more than four spans were collected,
          otherwise '无'.
        * ``position`` - ``[min_x, min_y, max_x, max_y]`` over all collected
          boxes, or ``None`` when nothing was collected (previously
          ``min()`` raised ``ValueError`` on the empty array).
        * ``page_num`` - page key of the last collected span, or ``None``.
    """
    # ``__init__`` only defines ``self.item``; ``self.init_item`` is honoured
    # if some other code sets it, so the method no longer fails with
    # AttributeError.
    signature_role_3 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
def get_signature_role_4(self):
    """Report whether the guarantor-2 signature area contains content.

    Spans are collected from the one containing '保证人2' (on any page whose
    key is not 0 when converted with ``int``) up to, not including, the next
    span containing '日期'.

    Returns:
        dict: Copy of the blank record plus a ``page_num`` key (this method
        writes ``page_num``, not ``page``):

        * ``words`` - '有' when more than four spans were collected,
          otherwise '无'.
        * ``position`` - ``[min_x, min_y, max_x, max_y]`` over all collected
          boxes, or ``None`` when nothing was collected (previously
          ``min()`` raised ``ValueError`` on the empty array).
        * ``page_num`` - page key of the last collected span, or ``None``.
    """
    # ``__init__`` only defines ``self.item``; ``self.init_item`` is honoured
    # if some other code sets it, so the method no longer fails with
    # AttributeError.
    signature_role_4 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
def get_signature_role_5(self):
    """Report whether the witness signature area contains content.

    Spans are collected from the one containing '见证人签字' (on any page
    whose key is not 0 when converted with ``int``) up to, not including,
    the next span containing '年'.

    Returns:
        dict: Copy of the blank record plus a ``page_num`` key (this method
        writes ``page_num``, not ``page``):

        * ``words`` - '有' when more than four spans were collected,
          otherwise '无'.
        * ``position`` - ``[min_x, min_y, max_x, max_y]`` over all collected
          boxes, or ``None`` when nothing was collected (previously
          ``min()`` raised ``ValueError`` on the empty array).
        * ``page_num`` - page key of the last collected span, or ``None``.
    """
    # ``__init__`` only defines ``self.item``; ``self.init_item`` is honoured
    # if some other code sets it, so the method no longer fails with
    # AttributeError.
    signature_role_5 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(pno) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
def get_last_page_signature(self, page_num, top, bottom):
    """Extract the signer name and signing date between two marker texts.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.
        top (str): Text of the span marking the upper limit (its
            ``bbox[1]`` is used).
        bottom (str): Text of the span marking the lower limit (its
            ``bbox[1]`` is used).

    Returns:
        tuple: ``(signature_name, signature_date)`` copies of ``self.item``.
        For the last span that contains '签署日期' and whose mean y
        coordinate lies strictly between the two limits (truncated with
        ``int``), ``signature_name['words']`` is the text before the first
        space and ``signature_date['words']`` the text after the last ':';
        both records get that span's box as ``position``. Both stay blank
        when either marker is missing.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()

    def _text_spans():
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    anchor_top = None
    anchor_bottom = None
    for bbox, text in _text_spans():
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]

    if anchor_top is not None and anchor_bottom is not None:
        for bbox, text in _text_spans():
            if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
                signature_name['words'] = text.split(' ')[0]
                signature_name['position'] = bbox
                signature_date['words'] = text.split(':')[-1]
                # The original assigned signature_name['position'] a second
                # time here, leaving the date record without a position.
                signature_date['position'] = bbox
    return signature_name, signature_date
def get_electronic_signature(self, top, bottom):
    """Return the '签署日期' span located between two marker texts.

    All pages are scanned. The upper limit is ``bbox[1]`` of the last span
    containing ``top``; the lower limit is ``bbox[3]`` of the last span
    containing ``bottom``.

    Args:
        top (str): Text marking the upper limit.
        bottom (str): Text marking the lower limit.

    Returns:
        dict: Copy of ``self.item``. For the last span (on any page) that
        contains '签署日期' and whose mean y coordinate lies strictly
        between the limits (truncated with ``int``), ``words`` is the span
        text, ``page`` the page key and ``position`` the box. Blank when a
        marker is missing.
    """
    def _all_spans():
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    found = self.item.copy()
    upper_y = None
    lower_y = None
    for _, box, content in _all_spans():
        if top in content:
            upper_y = box[1]
        if bottom in content:
            lower_y = box[3]

    if upper_y is None or lower_y is None:
        return found

    for page_key, box, content in _all_spans():
        if '签署日期' not in content:
            continue
        if int(upper_y) < np.mean(box[1::2]) < int(lower_y):
            found['words'] = content
            found['page'] = page_key
            found['position'] = box
    return found
def get_role_info(self, role_key, page_num='0'):
    """Extract name, ID number and representative of one contract party.

    The top-left corner of the span starting with '保证人3' serves as the
    anchor that splits the page into four quadrants; each supported role has
    its ID and representative lines in one of them.

    Args:
        role_key (str): Label the name span must start with. It is passed
            to ``re.match`` as a pattern. ID and representative are only
            looked up for '承租人:', '保证人1:', '保证人2:' and '保证人3:'.
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(name, id_num, representative)`` copies of ``self.item``;
        each filled record has the text after the last ':' as ``words``,
        ``page_num`` as ``page`` and the span box as ``position``. All three
        stay blank when the anchor is not on the page.
    """
    def _text_spans():
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    # (mean x below anchor x?, mean y below anchor y?) for each role.
    quadrants = {
        '承租人:': (True, True),
        '保证人1:': (True, False),
        '保证人2:': (False, True),
        '保证人3:': (False, False),
    }

    # The top-left corner of guarantor 3 is the reference point.
    anchor = None
    for box, content in _text_spans():
        if re.match('保证人3', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    def _fill(record, box, content):
        record['words'] = content.split(':')[-1]
        record['page'] = page_num
        record['position'] = box

    def _in_quadrant(box, x_below, y_below):
        center_x = np.mean(box[::2])
        if not (center_x < anchor[0] if x_below else center_x > anchor[0]):
            return False
        center_y = np.mean(box[1::2])
        return center_y < anchor[1] if y_below else center_y > anchor[1]

    quadrant = quadrants.get(role_key)
    for box, content in _text_spans():
        # Role name line.
        if re.match(role_key, content) is not None:
            _fill(name, box, content)
        if quadrant is None:
            continue
        # ID number line inside the role's quadrant.
        if re.match('证件号码:', content) is not None and _in_quadrant(box, *quadrant):
            _fill(id_num, box, content)
        # Legal/authorised representative line inside the role's quadrant.
        if re.match('法定代表人或授权代表:', content) is not None and _in_quadrant(box, *quadrant):
            _fill(representative, box, content)
    return name, id_num, representative
def get_table_add_product(self):
    """Extract the table of additional vehicle products.

    The page is the last one with a span containing
    '车辆附加产品(明细见下表)'. Its text spans are converted to eight-value
    quadrilaterals, the header cells '项目', '购买价格', '实际融资金额' and
    the '总计' cell are located by exact text match, and rows are then
    picked up by moving an anchor box from the '项目' header.

    Returns:
        dict: Copy of ``self.item`` with ``words`` set to the list of
        ``[item, purchase price, financed amount]`` rows (header first,
        '总计' row last), ``page`` set to the page key and ``position`` set
        to ``None``.

    NOTE(review): there is no guard for a missing table. If the marker text
    is never found, ``add_product_page_num`` stays ``None`` and the lookup
    below uses the key ``'None'``; if a header or '总计' cell is missing its
    index stays ``None`` and is used to index ``ocr_results``.
    """
    table_add_product = self.item.copy()
    add_product_page_num = None
    # Find the page that carries the table (the last match wins).
    for pno in self.pdf_info:
        for block in self.pdf_info[f'{pno}']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '车辆附加产品(明细见下表)' in text:
                        add_product_page_num = pno
    # Re-express every span box (xmin, ymin, xmax, ymax) as four corner
    # points, the layout caculate_iou / get_table_info receive below.
    ocr_results = []
    for block in self.pdf_info[f'{add_product_page_num}']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                xmin, ymin, xmax, ymax = bbox
                bbox = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
                ocr_results.append([bbox, text])
    lines = [['项目', '购买价格', '实际融资金额']]
    # Indices into ocr_results of the header cells and of the total cell.
    key_xm = None
    key_gmjg = None
    key_sjrzje = None
    key_total = None
    for index, span in enumerate(ocr_results):
        if span[1] == '项目':
            key_xm = index
        if span[1] == '购买价格':
            key_gmjg = index
        if span[1] == '实际融资金额':
            key_sjrzje = index
        if span[1] == '总计':
            key_total = index
    bbox, text = ocr_results[key_xm]
    # Height of the '项目' header box, used as the step size.
    rh = abs(bbox[1] - bbox[-1])
    # Start from the header box with x increased by 2*rh and y by rh.
    anchor = np.array(bbox).reshape((-1, 2))
    anchor[:, 0] += 2 * rh
    anchor[:, 1] += rh
    # Five passes over all spans; each hit appends a row and re-anchors on
    # the hit box with y increased by rh.
    for i in range(5):
        for span in ocr_results:
            iou = caculate_iou(anchor, span[0])
            if iou > 0.01 and span[1].strip() != '所购':
                x = get_table_info(span[0], ocr_results[key_gmjg][0], ocr_results)
                y = get_table_info(span[0], ocr_results[key_sjrzje][0], ocr_results)
                line = [span[1].replace('\u3000', ' '), x, y]
                lines.append(line)
                anchor = np.array(span[0]).reshape((-1, 2))
                anchor[:, 1] += rh
    total = get_table_info(ocr_results[key_total][0], ocr_results[key_sjrzje][0], ocr_results)
    lines.append(['总计', '', total])
    # Product names can be wrapped over two lines in the PDF, e.g.
    #   "所购 BMW悦然焕" / "新服务"
    #   "所购 BMW5年10" / "万公里长悦保养套餐"
    #   "所购 事故维修补偿" / "方案"
    #   "所购 BMW5年10万公里" / "长悦保养套餐"
    #   "所购 MINI4年6万公里长悦" / "保养套餐"
    # Rows are kept only when their first cell starts with '所购', '项目' or
    # '总计'; known truncated names are replaced with the full product name.
    filtered_lines = []
    for line in lines:
        if line[0][:2] not in ['所购', '项目', '总计']:
            continue
        if 'BMW悦然' in line[0]:
            line[0] = '所购 BMW悦然焕新服务'
        if 'BMW5年10' in line[0]:
            line[0] = '所购 BMW5年10万公里长悦保养套餐'
        if '事故维修补' in line[0]:
            line[0] = '所购 事故维修补偿方案'
        if 'MINI4年6万公里长悦' in line[0]:
            line[0] = '所购 MINI4年6万公里长悦保养套餐'
        filtered_lines.append(line)
    table_add_product['words'] = filtered_lines
    table_add_product['page'] = add_product_page_num
    table_add_product['position'] = None
    return table_add_product
def get_contract_no_dy(self):
    """Find the mortgage contract number printed beside '抵押合同编号'.

    Returns:
        dict: Copy of ``self.item``. The label is the last span (any page)
        containing '抵押合同编号'; the value is the last span (any page)
        that contains 'CH-' and whose mean y coordinate lies strictly
        between the label's ``bbox[1]`` and ``bbox[3]``. ``words`` is the
        full span text. Blank when label or value is missing.
    """
    def _all_spans():
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    result = self.item.copy()
    label_box = None
    for _, box, content in _all_spans():
        if '抵押合同编号' in content:
            label_box = box
    if label_box is None:
        return result

    for page_key, box, content in _all_spans():
        on_label_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
        if on_label_row and 'CH-' in content:
            result['position'] = box
            result['page'] = page_key
            result['words'] = content
    return result
def get_dyr_name_id(self):
    """Extract the mortgagor's name and ID number.

    The label is the last span (any page) whose text equals '抵押人'. With
    ``rh`` the label's height, candidate spans are those whose mean y
    coordinate lies strictly between the label's ``bbox[1]`` and
    ``bbox[3] + 3 * rh``.

    Returns:
        tuple: ``(name, _id)`` copies of ``self.item``. A candidate
        containing '姓名' fills ``name`` and one containing '证件号码' fills
        ``_id``; ``words`` is the text after the last ':'. Both stay blank
        without a label.
    """
    def _all_spans():
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    name = self.item.copy()
    _id = self.item.copy()

    label_box = None
    for _, box, content in _all_spans():
        if content == '抵押人':
            label_box = box
    if label_box is None:
        return name, _id

    row_height = abs(label_box[1] - label_box[3])
    band_top = label_box[1]
    band_bottom = label_box[3] + row_height * 3
    targets = (('姓名', name), ('证件号码', _id))
    for page_key, box, content in _all_spans():
        if not (band_top < np.mean(box[1::2]) < band_bottom):
            continue
        for marker, record in targets:
            if marker in content:
                record['position'] = box
                record['page'] = page_key
                record['words'] = content.split(':')[-1]
    return name, _id
def get_dyrpo_name_id(self):
    """Extract the name and ID number of the mortgagor's spouse.

    The label is the last span (any page) whose text equals
    '抵押人配偶(如适'. With ``rh`` the label's height, candidate spans are
    those whose mean y coordinate lies strictly between the label's
    ``bbox[1]`` and ``bbox[3] + 3 * rh``.

    Returns:
        tuple: ``(name, _id)`` copies of ``self.item``. A candidate
        containing '姓名' fills ``name`` and one containing '证件号码' fills
        ``_id``; ``words`` is the text after the last ':'. Both stay blank
        without a label.
    """
    def _all_spans():
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    name = self.item.copy()
    _id = self.item.copy()

    label_box = None
    for _, box, content in _all_spans():
        if content == '抵押人配偶(如适':
            label_box = box
    if label_box is None:
        return name, _id

    row_height = abs(label_box[1] - label_box[3])
    band_top = label_box[1]
    band_bottom = label_box[3] + row_height * 3
    targets = (('姓名', name), ('证件号码', _id))
    for page_key, box, content in _all_spans():
        if not (band_top < np.mean(box[1::2]) < band_bottom):
            continue
        for marker, record in targets:
            if marker in content:
                record['position'] = box
                record['page'] = page_key
                record['words'] = content.split(':')[-1]
    return name, _id
def get_key_value_position(self, key):
    """Return the span that follows the label ``key`` on the same row.

    Args:
        key: Exact text of the label span to look for.

    Returns:
        dict: a copy of ``self.item``; ``'position'``, ``'page'`` and
        ``'words'`` are filled from the last span that qualifies, or left
        as they were when the label or a qualifying span is missing.
    """
    result = self.item.copy()

    # Every span of every type-0 block, in document order.
    candidates = [
        (page_id, span['bbox'], span['text'])
        for page_id in self.pdf_info
        for blk in self.pdf_info[page_id]['blocks']
        if blk['type'] == 0
        for ln in blk['lines']
        for span in ln['spans']
    ]

    # The last span whose text equals the label is the anchor.
    anchor = None
    for _, box, content in candidates:
        if content == key:
            anchor = box
    if anchor is None:
        return result

    row_height = abs(anchor[1] - anchor[3])
    for page_id, box, content in candidates:
        # Mean of box[1] and box[3] must fall strictly inside the anchor's
        # [1]..[3] range.
        if not (anchor[1] < np.mean(box[1::2]) < anchor[3]):
            continue
        # The span must start after the anchor's own start coordinate ...
        if not (anchor[0] < box[0]):
            continue
        # ... and begin within ten row-heights of the anchor's end.
        if not (abs(anchor[2] - box[0]) < row_height * 10):
            continue
        result['position'] = box
        result['page'] = page_id
        result['words'] = content
    return result
def get_role_info_3_3(self, role_key, page_num='0'):
    """Read one role's name, ID number and representative from a page.

    The page is split into four quadrants around the top-left corner of the
    span starting with '保证人2'; each supported role is looked up in the
    quadrant the code associates with it.

    Args:
        role_key: Pattern passed to ``re.match`` to find the role's name
            span; the ID/representative lookup only runs for '承租人一:',
            '共同承租人:', '保证人1:' and '保证人2:'.
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(name, id_num, representative)``, each a copy of
        ``self.item`` with ``'words'``, ``'page'`` and ``'position'`` filled
        in when found. Nothing is filled in when no anchor span exists.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    # Every span of the page's type-0 blocks, in document order.
    page_spans = [
        (span['bbox'], span['text'])
        for blk in self.pdf_info[page_num]['blocks']
        if blk['type'] == 0
        for ln in blk['lines']
        for span in ln['spans']
    ]

    # Anchor on the top-left corner of the (last) span starting with '保证人2'.
    anchor = None
    for box, content in page_spans:
        if re.match('保证人2', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (first coordinate smaller than anchor?, second coordinate
    # smaller than anchor?).
    # NOTE(review): presumably "left of" / "above" the anchor - confirm the
    # bbox layout is (x0, y0, x1, y1).
    quadrants = {
        '承租人一:': (True, True),
        '共同承租人:': (True, False),
        '保证人1:': (False, True),
        '保证人2:': (False, False),
    }
    quadrant = quadrants.get(role_key)

    def in_quadrant(box):
        # Compare the box centre with the anchor, first axis then second,
        # using strict inequalities.
        first_smaller, second_smaller = quadrant
        centre_first = np.mean(box[::2])
        if first_smaller:
            if not centre_first < anchor[0]:
                return False
        elif not centre_first > anchor[0]:
            return False
        centre_second = np.mean(box[1::2])
        if second_smaller:
            return bool(centre_second < anchor[1])
        return bool(centre_second > anchor[1])

    def fill(target, box, content):
        # Keep only what follows the last colon of the span text.
        target['words'] = content.split(':')[-1]
        target['page'] = page_num
        target['position'] = box

    for box, content in page_spans:
        # Role name: any span the role pattern matches at its start.
        if re.match(role_key, content) is not None:
            fill(name, box, content)
        if quadrant is None:
            continue
        # ID number located in the role's quadrant.
        if re.match('证件号码:', content) is not None and in_quadrant(box):
            fill(id_num, box, content)
        # Legal/authorised representative located in the role's quadrant.
        if re.match('法定代表人或授权代表:', content) is not None and in_quadrant(box):
            fill(representative, box, content)

    return name, id_num, representative
def get_value_by_findall(self, prefix, suffix, page_num):
    """Extract the text between ``prefix`` and ``suffix`` on one page.

    The text of all spans in the page's type-0 blocks is concatenated and
    searched for the shortest run enclosed by the two delimiters. The first
    such run is reported together with the bbox of the last single span
    that contains it.

    ``prefix`` and ``suffix`` are matched literally. They used to be pasted
    into the regular expression unescaped, so a delimiter containing
    parentheses added a capture group, ``re.findall`` returned tuples and
    the ``in`` test below raised ``TypeError``.
    NOTE(review): a caller that deliberately passed regex syntax in
    ``prefix``/``suffix`` would be affected - none is visible here.

    Args:
        prefix: Literal text that precedes the wanted value.
        suffix: Literal text that follows the wanted value.
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        dict: a copy of ``self.item``; ``'position'``, ``'page'`` and
        ``'words'`` are set only when the value is found inside one span.
        The untouched copy is returned when the page does not exist.
    """
    value = self.item.copy()
    pno = page_num

    # Callers hard-code page keys (e.g. '3'); a shorter document must not
    # fail with KeyError.
    if pno not in self.pdf_info:
        return value

    spans = [
        span
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    all_text = ''.join(span['text'] for span in spans)

    words_list = re.findall(
        f"{re.escape(prefix)}(.*?){re.escape(suffix)}", all_text)
    if words_list:
        first_match = words_list[0]
        # The last span containing the value supplies the position. A value
        # split across several spans is not reported at all.
        for span in spans:
            if first_match in span['text']:
                value['position'] = span['bbox']
                value['page'] = pno
                value['words'] = first_match
    return value
def get_info(self):
    """Collect the fields of the main contract into ``self.init_result``.

    Only blocks with ``block['type'] == 0`` are scanned: they are the ones
    whose ``lines``/``spans`` text is read here. (The previous docstring
    called type 0 an image block, which contradicts the code.)

    Two layouts are handled. When a span on page '0' contains '共同承租人:'
    the contract is treated as the "车贷分离" (3_3) layout and every role is
    read with ``get_role_info_3_3``. Otherwise ``get_role_info`` is tried
    first and the 3_3 lookups are used as fallbacks when a name is missing.

    Returns:
        dict: ``self.init_result``; nothing is added to it when
        ``self.pdf_info`` is empty.
    """
    def store_role(prefix, name, id_num, representative):
        # First-page role fields share one key scheme.
        self.init_result[f'{prefix}-姓名'] = name
        self.init_result[f'{prefix}-证件号码'] = id_num
        self.init_result[f'{prefix}-法定代表人或授权代表'] = representative

    def store_signature(role, name, electronic_signature):
        # Signature-page fields share one key scheme.
        self.init_result[f'签字页-{role}姓名'] = name
        self.init_result[f'签字页-{role}签章'] = electronic_signature

    if len(self.pdf_info) > 0:
        # Contract number taken from the first page.
        contract_no = self.get_contract_no(page_num='0')
        self.init_result['合同编号'] = contract_no

        # Rough check for the "车贷分离" layout.
        is_cdfl = False
        for block in self.pdf_info['0']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if '共同承租人:' in span['text']:
                        is_cdfl = True

        if not is_cdfl:
            # Names and ID numbers of the roles on the first page.
            name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0')
            if name["words"] is None:
                name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
            store_role('承租人', name, id_num, representative)

            name, id_num, representative = self.get_role_info(role_key='保证人1:', page_num='0')
            store_role('保证人1', name, id_num, representative)
            # 3_3 layout: this slot holds the co-lessee instead.
            if name["words"] is None:
                name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
                store_role('共同承租人', name, id_num, representative)

            name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0')
            store_role('保证人2', name, id_num, representative)
            # 3_3 layout: the '保证人1:' lookup is stored under 保证人2.
            if name["words"] is None:
                name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
                store_role('保证人2', name, id_num, representative)

            name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0')
            store_role('保证人3', name, id_num, representative)
            # 3_3 layout: the '保证人2:' lookup is stored under 保证人3.
            if name["words"] is None:
                name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
                store_role('保证人3', name, id_num, representative)
        else:
            name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
            store_role('承租人', name, id_num, representative)
            name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
            store_role('共同承租人', name, id_num, representative)
            name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
            store_role('保证人1', name, id_num, representative)
            name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
            store_role('保证人2', name, id_num, representative)

        # Contract number from the body text, searched on all pages; it may
        # wrap across lines, so it is reported without a position for now.
        contract_no = self.get_contract_no_one()
        self.init_result['合同编号(正文)'] = contract_no

        # Vehicle identification number.
        vin = self.get_key_value(key='车辆识别代码:')
        self.init_result['车辆识别代码'] = vin

        # Vehicle seller (dealer); fall back to the shorter label.
        seller = self.get_key_value(key='车辆卖方(经销商):')
        if seller['words'] is None:
            seller = self.get_key_value(key='车辆卖方:')
        self.init_result['车辆卖方(经销商)'] = seller

        # Original vehicle sale price.
        vehicle_price = self.get_key_value(key='车辆原始销售价格(《机动车销售统一发票》所列金额):')
        self.init_result['车辆原始销售价格(《机动车销售统一发票》所列金额)'] = vehicle_price

        # Table of additional vehicle products.
        table_add_product = self.get_table_add_product()
        self.init_result['车辆附加产品明细表'] = table_add_product

        # Total financing cost.
        financing_cost = self.get_key_value(key='融资成本总额:')
        self.init_result['融资成本总额'] = financing_cost

        # Lease term.
        lease_term = self.get_key_value(key='租期:')
        self.init_result['租期'] = lease_term

        # Repayment schedule table.
        repayment_schedule = self.get_repayment_schedule()
        self.init_result['付款计划表'] = repayment_schedule

        # Lessee's receiving account (looked up on page '4').
        name = self.get_key_value(key='户名:', page_num='4')
        self.init_result['承租人收款账户-户名'] = name
        account = self.get_key_value(key='银行账号:', page_num='4')
        self.init_result['承租人收款账户-银行账号'] = account
        bank = self.get_key_value(key='开户银行:', page_num='4')
        self.init_result['承租人收款账户-开户行'] = bank

        # Lessee's debit account (looked up on page '5').
        name = self.get_key_value(key='户名:', page_num='5')
        self.init_result['承租人扣款账户-户名'] = name
        account = self.get_key_value(key='银行账号:', page_num='5')
        self.init_result['承租人扣款账户-银行账号'] = account
        bank = self.get_key_value(key='开户银行:', page_num='5')
        self.init_result['承租人扣款账户-开户行'] = bank

        # Signature page: name and electronic signature of each role.
        if not is_cdfl:
            # Lessee.
            name = self.get_key_value(key='承租人姓名:')
            electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:')
            if name["words"] is None:
                name = self.get_key_value(key='承租人一姓名:')
                electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:')
            store_signature('承租人', name, electronic_signature)

            # Guarantor 1.
            name = self.get_key_value(key='保证人1姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
            store_signature('保证人1', name, electronic_signature)
            # This fallback tests for an empty string, not None.
            if name["words"] == "":
                name = self.get_key_value(key='共同承租人名称:')
                electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:')
                store_signature('共同承租人', name, electronic_signature)

            # Guarantor 2.
            name = self.get_key_value(key='保证人2姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:')
            store_signature('保证人2', name, electronic_signature)
            # 3_3 layout fallback.
            if name["words"] == "":
                name = self.get_key_value(key='保证人1姓名:')
                electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
                store_signature('保证人1', name, electronic_signature)

            # Guarantor 3.
            name = self.get_key_value(key='保证人3姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:')
            store_signature('保证人3', name, electronic_signature)
            # 3_3 layout fallback: guarantor 2 is the last role before the date.
            if name["words"] is None:
                name = self.get_key_value(key='保证人2姓名:')
                electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='日期:')
                store_signature('保证人2', name, electronic_signature)
        else:
            name = self.get_key_value(key='承租人一姓名:')
            electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:')
            store_signature('承租人', name, electronic_signature)

            name = self.get_key_value(key='共同承租人名称:')
            electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:')
            store_signature('共同承租人', name, electronic_signature)

            name = self.get_key_value(key='保证人1姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
            store_signature('保证人1', name, electronic_signature)

            name = self.get_key_value(key='保证人2姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:')
            store_signature('保证人2', name, electronic_signature)

    return self.init_result
def get_info_1(self):
    """Fill ``self.init_result_1`` with the fields of the second document type.

    Returns:
        dict: ``self.init_result_1``; returned unchanged when
        ``self.pdf_info`` is empty.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_1

    # First-page fields: contract number, lessee name and ID number.
    self.init_result_1['合同编号'] = self.get_contract_no(page_num='0')
    self.init_result_1['承租人-姓名'] = self.get_key_value(key='承租人:', page_num='0')
    self.init_result_1['承租人-证件号码'] = self.get_key_value(key='证件号码:', page_num='0')

    # Selling dealer; when the key/value lookup yields an empty string the
    # text between the label and '地址:' is used instead.
    dealer = self.get_key_value(key='销售经销商:', page_num='0')
    if dealer['words'] == "":
        dealer = self.get_value_by_findall('销售经销商:', '地址:', page_num='0')
    self.init_result_1['销售经销商'] = dealer

    # Contract number from the body text.
    self.init_result_1['合同编号(正文)'] = self.get_contract_no_one()

    # Signature page: lessee name, ID number and signature.
    self.init_result_1['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称:')
    self.init_result_1['签字页-承租人证件号码'] = self.get_key_value(key='自然人身份证件号码/法人执照号码:')
    self.init_result_1['签字页-承租人签章'] = self.get_signature_role_1()

    # Signature page: selling dealer. Example of the text being parsed:
    # 销售经销商:深圳市宝创汽车贸易有限公司南山分公司(请授权代表签字并请盖章)
    signing_dealer = self.get_key_value(key='销售经销商:')
    if signing_dealer['words'] == "":
        signing_dealer = self.get_value_by_findall('销售经销商:', '(请授权代表签字并请盖章)', page_num='3')
    self.init_result_1['签字页-销售经销商'] = signing_dealer

    # The dealer's seal is not extracted yet.
    return self.init_result_1
def get_info_2(self):
    """Fill ``self.init_result_2`` with the fields of the mortgage document.

    Returns:
        dict: ``self.init_result_2``; returned unchanged when
        ``self.pdf_info`` is empty.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_2

    # Contract numbers: the dedicated lookup first, then the body text.
    self.init_result_2['合同编号'] = self.get_contract_no_dy()
    self.init_result_2['合同编号(正文)'] = self.get_contract_no_one()

    # Mortgagor name and ID number.
    mortgagor_name, mortgagor_id = self.get_dyr_name_id()
    self.init_result_2['抵押人姓名/名称'] = mortgagor_name
    self.init_result_2['抵押人证件号码'] = mortgagor_id

    # Mortgagor's spouse name and ID number.
    spouse_name, spouse_id = self.get_dyrpo_name_id()
    self.init_result_2['抵押人配偶姓名/名称'] = spouse_name
    self.init_result_2['抵押人配偶证件号码'] = spouse_id

    # Vehicle identification number.
    self.init_result_2['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')

    # Total rent and financing-lease term, read from the span beside the label.
    self.init_result_2['租金总额'] = self.get_key_value_position(key='租金总额')
    self.init_result_2['融资租赁期限'] = self.get_key_value_position(key='融资租赁期限')

    # Signature page: mortgagor name and signature.
    signer = self.get_key_value(key='抵押人姓名:')
    seal = self.get_electronic_signature(top='抵押权人盖章', bottom='抵押人配偶姓名:')
    self.init_result_2['签字页-抵押人姓名'] = signer
    self.init_result_2['签字页-抵押人签章'] = seal

    # Signature page: spouse name and signature.
    signer = self.get_key_value(key='抵押人配偶姓名:')
    seal = self.get_electronic_signature(top='抵押人配偶姓名:', bottom='日期')
    self.init_result_2['签字页-抵押人配偶姓名'] = signer
    self.init_result_2['签字页-抵押人配偶签章'] = seal

    return self.init_result_2
\ No newline at end of file
src/common/electronic_hil_contract/hil_contract_ocr.py
View file @
8d595a3
...
...
@@ -6,9 +6,10 @@
# @Description :
from
.get_char
import
Finder
from
.get_char_fsm
import
Finder
as
FSMFinder
def
predict
(
pdf_info
,
file_cls
):
def
predict
(
pdf_info
,
file_cls
,
is_fsm
=
False
):
"""Summary
Args:
...
...
@@ -58,7 +59,11 @@ def predict(pdf_info, file_cls):
pdf_info
=
dict
()
for
pno
,
page_info
in
enumerate
(
pdf_info_1
):
pdf_info
[
str
(
pno
)]
=
page_info
f
=
Finder
(
pdf_info
)
if
is_fsm
:
f
=
FSMFinder
(
pdf_info
)
else
:
f
=
Finder
(
pdf_info
)
if
file_cls
==
0
:
results
=
f
.
get_info
()
if
file_cls
==
1
:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment