Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
64325811
authored
2021-11-05 18:13:40 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/e-contract' into feature/1119
2 parents
e3565c7c
f77b2322
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
2327 additions
and
116 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/models.py
src/apps/doc/ocr/ecm.py
src/apps/doc/views.py
src/celery_compare/tasks.py
src/common/electronic_afc_contract/afc_contract_ocr.py
src/common/electronic_afc_contract/get_char.py
src/common/electronic_hil_contract/get_char.py
src/common/electronic_hil_contract/hil_contract_ocr.py
src/common/tools/mssql_script10.py
src/common/tools/pdf_to_img.py
src/settings/conf/prd.ini
src/settings/conf/sit.ini
src/settings/conf/uat.ini
src/apps/doc/consts.py
View file @
6432581
...
...
@@ -1773,3 +1773,21 @@ APPLICANT_TYPE_MAP = {
}
APPLICANT_TYPE_ORDER
=
[
'Borrower'
,
'Co-Borrower'
,
'Guarantor'
,
'Mortgager'
]
FILE_NAME_PREFIX_MAP
=
{
AFC_PREFIX
:
[
((
CONTRACT_CLASSIFY
,
0
),
'{0}_电子签署-汽车抵押贷款合同'
),
((
HMH_CLASSIFY
,
0
),
'{0}_电子签署-抵押登记豁免函'
),
],
HIL_PREFIX
:
[
((
HIL_CONTRACT_1_CLASSIFY
,
HIL_CONTRACT_3_CLASSIFY
),
'{0}_电子签署-售后回租合同'
),
((
HIL_CONTRACT_2_CLASSIFY
,
0
),
'{0}_电子签署-汽车租赁抵押合同'
),
((
HMH_CLASSIFY
,
0
),
'{0}_电子签署-抵押登记豁免函'
),
]
}
HIL_CONTRACT_TYPE_MAP
=
{
str
(
HIL_CONTRACT_1_CLASSIFY
):
0
,
str
(
HIL_CONTRACT_2_CLASSIFY
):
2
,
str
(
HIL_CONTRACT_3_CLASSIFY
):
1
,
}
\ No newline at end of file
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
6432581
...
...
@@ -18,6 +18,8 @@ from settings import conf
from
common.mixins
import
LoggerMixin
from
common.tools.file_tools
import
write_zip_file
from
common.tools.pdf_to_img
import
PDFHandler
from
common.electronic_afc_contract.afc_contract_ocr
import
predict
as
afc_predict
from
common.electronic_hil_contract.hil_contract_ocr
import
predict
as
hil_predict
from
apps.doc
import
consts
# from apps.doc.ocr.edms import EDMS, rh
from
apps.doc.ocr.ecm
import
ECM
,
rh
...
...
@@ -47,6 +49,7 @@ class Command(BaseCommand, LoggerMixin):
def
__init__
(
self
):
super
()
.
__init__
()
self
.
log_base
=
'[doc ocr process]'
self
.
e_log_base
=
'[e-contract ocr process]'
# 处理文件开关
self
.
switch
=
True
# 睡眠时间
...
...
@@ -90,13 +93,20 @@ class Command(BaseCommand, LoggerMixin):
task_str
,
is_priority
=
rh
.
dequeue
()
if
task_str
is
None
:
self
.
online_log
.
info
(
'{0} [get_doc_info] [queue empty]'
.
format
(
self
.
log_base
))
return
None
,
None
,
None
return
None
,
None
,
None
,
None
,
None
self
.
online_log
.
info
(
'{0} [get_doc_info] [task={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
try
:
# doc, business_type = self.get_doc_object(task_str)
business_type
,
doc_id_str
=
task_str
.
split
(
consts
.
SPLIT_STR
)
info_tuple
=
task_str
.
split
(
consts
.
SPLIT_STR
)
if
len
(
info_tuple
)
==
2
:
business_type
,
doc_id_str
=
info_tuple
classify_1_str
=
classify_2_str
=
'0'
rebuild_task_str
=
task_str
else
:
business_type
,
doc_id_str
,
classify_1_str
,
classify_2_str
=
info_tuple
rebuild_task_str
=
'{0}{1}{2}'
.
format
(
business_type
,
consts
.
SPLIT_STR
,
doc_id_str
)
doc_id
=
int
(
doc_id_str
)
doc_class
=
HILDoc
if
business_type
==
consts
.
HIL_PREFIX
else
AFCDoc
doc
=
doc_class
.
objects
.
filter
(
id
=
doc_id
)
.
first
()
...
...
@@ -104,11 +114,11 @@ class Command(BaseCommand, LoggerMixin):
if
doc
is
None
:
self
.
online_log
.
warn
(
'{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
return
None
,
None
,
None
return
None
,
None
,
None
,
None
,
None
elif
doc
.
status
!=
DocStatus
.
INIT
.
value
:
self
.
online_log
.
warn
(
'{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
'[doc_status={3}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
,
doc
.
status
))
return
None
,
None
,
None
'[doc_status={3}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
,
doc
.
status
))
return
None
,
None
,
None
,
None
,
None
doc
.
status
=
DocStatus
.
PROCESSING
.
value
doc
.
start_time
=
timezone
.
now
()
doc
.
save
()
...
...
@@ -120,7 +130,7 @@ class Command(BaseCommand, LoggerMixin):
else
:
self
.
online_log
.
info
(
'{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
return
doc
,
business_type
,
task
_str
return
doc
,
business_type
,
rebuild_task_str
,
classify_1_str
,
classify_2
_str
# def pdf_download(self, doc, pdf_path):
# if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
...
...
@@ -212,7 +222,7 @@ class Command(BaseCommand, LoggerMixin):
def
contract_process
(
self
,
classify
,
ocr_data
,
contract_result
,
res_list
,
pno
,
ino
,
part_idx
,
img_path
):
contract_dict
=
ocr_data
.
get
(
'data'
)
if
not
contract_dict
or
contract_dict
.
get
(
'page_num'
)
is
None
or
contract_dict
.
get
(
'page_info'
)
is
None
:
if
not
contract_dict
or
contract_dict
.
get
(
'page_num'
)
is
None
or
contract_dict
.
get
(
'page_info'
)
is
None
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_SUCCESS_EMPTY
))
return
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_SUCCESS
))
...
...
@@ -915,11 +925,11 @@ class Command(BaseCommand, LoggerMixin):
# summary['confidence'] = max(summary['confidence'])
return
merged_bs_summary
def
pdf_2_img_2_queue
(
self
,
img_queue
,
todo_count_dict
,
lock
,
error_list
):
def
pdf_2_img_2_queue
(
self
,
img_queue
,
todo_count_dict
,
lock
,
error_list
,
res_dict
,
finish_queue
):
while
self
.
switch
:
try
:
# 1. 从队列获取文件信息
doc
,
business_type
,
task_str
=
self
.
get_doc_info
()
doc
,
business_type
,
task_str
,
classify_1_str
,
classify_2_str
=
self
.
get_doc_info
()
# 队列为空时的处理
if
doc
is
None
:
time
.
sleep
(
self
.
sleep_time_doc_get
)
...
...
@@ -930,55 +940,109 @@ class Command(BaseCommand, LoggerMixin):
error_list
.
append
(
1
)
return
else
:
try
:
# 2. 从EDMS获取PDF文件
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
,
doc
.
document_name
)
max_count_obj
=
Configs
.
objects
.
filter
(
id
=
2
)
.
first
()
try
:
max_img_count
=
int
(
max_count_obj
.
value
)
except
Exception
as
e
:
max_img_count
=
500
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
if
not
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
# self.edms.download(pdf_path, doc.metadata_version_id)
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
,
doc
.
document_scheme
,
business_type
)
self
.
online_log
.
info
(
'{0} [edms download success] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
,
doc
.
document_name
)
# 3.PDF文件提取图片
self
.
online_log
.
info
(
'{0} [pdf to img start] [task={1}] [times={2}]'
.
format
(
self
.
log_base
,
task_str
,
times
))
start_time
=
time
.
time
()
pdf_handler
.
extract_image
(
max_img_count
)
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
))
if
classify_1_str
==
'0'
or
classify_1_str
==
str
(
consts
.
HMH_CLASSIFY
):
try
:
# 2. 从EDMS获取PDF文件
max_count_obj
=
Configs
.
objects
.
filter
(
id
=
2
)
.
first
()
try
:
max_img_count
=
int
(
max_count_obj
.
value
)
except
Exception
as
e
:
self
.
online_log
.
warn
(
'{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
traceback
.
format_exc
()))
max_img_count
=
500
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
if
not
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
# self.edms.download(pdf_path, doc.metadata_version_id)
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
,
doc
.
document_scheme
,
business_type
)
self
.
online_log
.
info
(
'{0} [edms download success] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
# 3.PDF文件提取图片
self
.
online_log
.
info
(
'{0} [pdf to img start] [task={1}] [times={2}]'
.
format
(
self
.
log_base
,
task_str
,
times
))
start_time
=
time
.
time
()
pdf_handler
.
extract_image
(
max_img_count
)
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
))
except
Exception
as
e
:
self
.
online_log
.
warn
(
'{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
traceback
.
format_exc
()))
else
:
break
else
:
break
else
:
raise
Exception
(
'download or pdf to img failed'
)
raise
Exception
(
'download or pdf to img failed'
)
if
pdf_handler
.
img_count
==
0
:
self
.
online_log
.
warn
(
'{0} [pdf to img failed (pdf img empty)] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
raise
Exception
(
'pdf img empty'
)
elif
pdf_handler
.
img_count
>=
max_img_count
:
self
.
online_log
.
info
(
'{0} [too many pdf image] [task={1}] [img_count={2}]'
.
format
(
self
.
log_base
,
task_str
,
pdf_handler
.
img_count
))
if
pdf_handler
.
img_count
==
0
:
self
.
online_log
.
warn
(
'{0} [pdf to img failed (pdf img empty)] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
raise
Exception
(
'pdf img empty'
)
elif
pdf_handler
.
img_count
>=
max_img_count
:
self
.
online_log
.
info
(
'{0} [too many pdf image] [task={1}] [img_count={2}]'
.
format
(
self
.
log_base
,
task_str
,
pdf_handler
.
img_count
))
try
:
report_table
=
HILOCRReport
if
business_type
==
consts
.
HIL_PREFIX
else
AFCOCRReport
report_table
.
objects
.
create
(
case_number
=
doc
.
application_id
,
request_team
=
RequestTeam
.
get_value
(
doc
.
document_scheme
,
0
),
request_trigger
=
RequestTrigger
.
get_value
(
doc
.
data_source
,
0
),
input_file
=
doc
.
document_name
,
transaction_start
=
doc
.
start_time
,
transaction_end
=
doc
.
start_time
,
successful_at_this_level
=
False
,
failure_reason
=
FailureReason
.
IMG_LIMIT
.
value
,
process_name
=
ProcessName
.
ALL
.
value
,
notes
=
'pdf page count: {0}'
.
format
(
str
(
pdf_handler
.
img_count
))
)
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (report db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
else
:
with
lock
:
todo_count_dict
[
task_str
]
=
pdf_handler
.
img_count
for
img_idx
,
img_path
in
enumerate
(
pdf_handler
.
img_path_list
):
while
img_queue
.
full
():
self
.
online_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
if
pdf_handler
.
is_ebank
:
try
:
text_list
=
pdf_handler
.
page_text_list
[
img_idx
]
.
pop
(
'rebuild_text'
)
except
Exception
as
e
:
text_list
=
[]
else
:
text_list
=
[]
img_queue
.
put
((
business_type
,
img_path
,
text_list
))
# except EDMSException as e:
# try:
# doc.status = DocStatus.PROCESS_FAILED.value
# doc.save()
# self.online_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
# self.log_base, task_str, traceback.format_exc()))
# except Exception as e:
# self.online_log.error('{0} [process error (db save 1)] [error={1}]'.format(
# self.log_base, traceback.format_exc()))
# error_list.append(1)
# return
except
Exception
as
e
:
try
:
end_time
=
timezone
.
now
()
report_table
=
HILOCRReport
if
business_type
==
consts
.
HIL_PREFIX
else
AFCOCRReport
report_table
.
objects
.
create
(
case_number
=
doc
.
application_id
,
...
...
@@ -986,11 +1050,10 @@ class Command(BaseCommand, LoggerMixin):
request_trigger
=
RequestTrigger
.
get_value
(
doc
.
data_source
,
0
),
input_file
=
doc
.
document_name
,
transaction_start
=
doc
.
start_time
,
transaction_end
=
doc
.
start
_time
,
transaction_end
=
end
_time
,
successful_at_this_level
=
False
,
failure_reason
=
FailureReason
.
IMG_LIMIT
.
value
,
failure_reason
=
FailureReason
.
PDF
.
value
,
process_name
=
ProcessName
.
ALL
.
value
,
notes
=
'pdf page count: {0}'
.
format
(
str
(
pdf_handler
.
img_count
))
)
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (report db save)] [error={1}]'
.
format
(
...
...
@@ -999,64 +1062,114 @@ class Command(BaseCommand, LoggerMixin):
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
online_log
.
warn
(
'{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
else
:
with
lock
:
todo_count_dict
[
task_str
]
=
pdf_handler
.
img_count
for
img_idx
,
img_path
in
enumerate
(
pdf_handler
.
img_path_list
):
while
img_queue
.
full
():
self
.
online_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
if
pdf_handler
.
is_ebank
:
try
:
text_list
=
pdf_handler
.
page_text_list
[
img_idx
]
.
pop
(
'rebuild_text'
)
except
Exception
as
e
:
text_list
=
[]
else
:
text_list
=
[]
img_queue
.
put
((
business_type
,
img_path
,
text_list
))
# except EDMSException as e:
# try:
# doc.status = DocStatus.PROCESS_FAILED.value
# doc.save()
# self.online_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
# self.log_base, task_str, traceback.format_exc()))
# except Exception as e:
# self.online_log.error('{0} [process error (db save 1)] [error={1}]'.format(
# self.log_base, traceback.format_exc()))
# error_list.append(1)
# return
except
Exception
as
e
:
error_list
.
append
(
1
)
return
else
:
# e-contract
try
:
end_time
=
timezone
.
now
()
report_table
=
HILOCRReport
if
business_type
==
consts
.
HIL_PREFIX
else
AFCOCRReport
report_table
.
objects
.
create
(
case_number
=
doc
.
application_id
,
request_team
=
RequestTeam
.
get_value
(
doc
.
document_scheme
,
0
),
request_trigger
=
RequestTrigger
.
get_value
(
doc
.
data_source
,
0
),
input_file
=
doc
.
document_name
,
transaction_start
=
doc
.
start_time
,
transaction_end
=
end_time
,
successful_at_this_level
=
False
,
failure_reason
=
FailureReason
.
PDF
.
value
,
process_name
=
ProcessName
.
ALL
.
value
,
)
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (report db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
# pdf下载 处理 图片存储 识别
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
,
doc
.
document_scheme
,
business_type
)
self
.
online_log
.
info
(
'{0} [edms download success] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
e_log_base
,
task_str
,
times
,
pdf_path
))
self
.
online_log
.
info
(
'{0} [pdf to img start] [task={1}] [times={2}]'
.
format
(
self
.
e_log_base
,
task_str
,
times
))
pdf_handler
.
e_contract_process
()
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}]'
.
format
(
self
.
e_log_base
,
task_str
,
times
))
except
Exception
as
e
:
self
.
online_log
.
warn
(
'{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'
.
format
(
self
.
e_log_base
,
task_str
,
times
,
traceback
.
format_exc
()))
else
:
break
else
:
raise
Exception
(
'download or pdf to img failed'
)
if
classify_1_str
==
str
(
consts
.
CONTRACT_CLASSIFY
):
ocr_result
=
afc_predict
(
pdf_handler
.
pdf_info
)
page_res
=
{}
for
page_num
,
page_info
in
ocr_result
.
get
(
'page_info'
,
{})
.
items
():
if
isinstance
(
page_num
,
str
)
and
page_num
.
startswith
(
'page_'
):
page_res
[
page_num
]
=
{
'classify'
:
int
(
classify_1_str
),
'page_num'
:
page_num
,
'page_info'
:
page_info
}
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
online_log
.
warn
(
'{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
else
:
file_type_1
=
consts
.
HIL_CONTRACT_TYPE_MAP
.
get
(
classify_1_str
)
file_type_2
=
consts
.
HIL_CONTRACT_TYPE_MAP
.
get
(
classify_2_str
)
ocr_result_1
=
hil_predict
(
pdf_handler
.
pdf_info
,
file_type_1
)
rebuild_res_1
=
{}
page_res
=
{}
for
field_name
,
field_info
in
ocr_result_1
.
items
():
page_num
=
field_info
.
pop
(
'page'
,
'page_1'
)
rebuild_res_1
.
setdefault
(
page_num
,
dict
())[
field_name
]
=
field_info
for
page_num
,
page_info
in
rebuild_res_1
.
items
():
if
isinstance
(
page_num
,
str
)
and
page_num
.
startswith
(
'page_'
):
page_res
[
page_num
]
=
{
'classify'
:
int
(
classify_1_str
),
'page_num'
:
page_num
,
'page_info'
:
page_info
}
if
isinstance
(
file_type_2
,
int
):
rebuild_res_2
=
{}
ocr_result_2
=
hil_predict
(
pdf_handler
.
pdf_info
,
file_type_2
)
for
field_name
,
field_info
in
ocr_result_2
.
items
():
page_num
=
field_info
.
pop
(
'page'
,
'page_1'
)
rebuild_res_2
.
setdefault
(
page_num
,
dict
())[
field_name
]
=
field_info
for
page_num
,
page_info
in
ocr_result_2
.
items
():
if
isinstance
(
page_num
,
str
)
and
page_num
.
startswith
(
'page_'
):
page_res
[
page_num
]
=
{
'classify'
:
int
(
classify_2_str
),
'page_num'
:
page_num
,
'page_info'
:
page_info
}
contract_res
=
{}
for
img_path_tmp
,
page_key
in
pdf_handler
.
img_path_pno_list
:
if
page_key
in
page_res
:
img_contract_res
=
{
'code'
:
1
,
'data'
:
[
{
'classify'
:
page_res
[
page_key
]
.
pop
(
'classify'
,
consts
.
OTHER_CLASSIFY
),
'data'
:
page_res
[
page_key
]
}
]
}
else
:
img_contract_res
=
{
'code'
:
1
,
'data'
:
[
{
'classify'
:
int
(
classify_1_str
),
}
]
}
contract_res
[
img_path_tmp
]
=
img_contract_res
with
lock
:
res_dict
[
task_str
]
=
contract_res
finish_queue
.
put
(
task_str
)
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
error_list
.
append
(
1
)
return
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
online_log
.
warn
(
'{0} [process failed (e-contract)] [task={1}] '
'[error={2}]'
.
format
(
self
.
e_log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
e_log_base
,
traceback
.
format_exc
()))
error_list
.
append
(
1
)
return
def
img_2_ocr_1
(
self
,
img_queue
,
todo_count_dict
,
res_dict
,
finish_queue
,
lock
,
url
,
error_list
):
while
len
(
error_list
)
==
0
or
not
img_queue
.
empty
():
...
...
@@ -1801,7 +1914,7 @@ class Command(BaseCommand, LoggerMixin):
finish_queue
=
Queue
()
process_list
=
[]
pdf_process
=
Process
(
target
=
self
.
pdf_2_img_2_queue
,
args
=
(
img_queue
,
todo_count_dict
,
lock
,
error_list
))
pdf_process
=
Process
(
target
=
self
.
pdf_2_img_2_queue
,
args
=
(
img_queue
,
todo_count_dict
,
lock
,
error_list
,
res_dict
,
finish_queue
))
process_list
.
append
(
pdf_process
)
for
url
in
self
.
ocr_1_urls
.
values
():
...
...
src/apps/doc/models.py
View file @
6432581
...
...
@@ -789,3 +789,24 @@ class HILCACompareResultRecord(models.Model):
db_table
=
'hil_ca_compare_result_record'
class
HILContract
(
models
.
Model
):
id
=
models
.
AutoField
(
primary_key
=
True
,
verbose_name
=
"id"
)
# 主键
application_id
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"申请id"
)
# 索引
create_time
=
models
.
DateTimeField
(
auto_now_add
=
True
,
verbose_name
=
'创建时间'
)
class
Meta
:
managed
=
False
db_table
=
'hil_contract'
class
AFCContract
(
models
.
Model
):
id
=
models
.
AutoField
(
primary_key
=
True
,
verbose_name
=
"id"
)
# 主键
application_id
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"申请id"
)
# 索引
create_time
=
models
.
DateTimeField
(
auto_now_add
=
True
,
verbose_name
=
'创建时间'
)
class
Meta
:
managed
=
False
db_table
=
'afc_contract'
situ_db_label
=
'afc'
...
...
src/apps/doc/ocr/ecm.py
View file @
6432581
import
os
import
base64
import
requests
from
common.redis_cache
import
redis_handler
as
rh
...
...
@@ -44,7 +45,6 @@ class ECM:
"b_coborrower_id"
,
"b_coborrower_name"
,
"b_guarantor_id"
,
"b_guarantor_name"
,
"b_frontend_partner"
,
"b_dealer_code"
,
"b_dealer_name"
,
"b_input_date"
,
"b_comment"
,
"b_contract_no"
,
"b_location"
]
self
.
contract_prefix
=
'电子'
def
update_oauth_token
(
self
):
response
=
requests
.
post
(
self
.
oauth_url
,
headers
=
self
.
oauth_headers
,
data
=
self
.
oauth_payload
,
verify
=
False
)
...
...
@@ -69,9 +69,9 @@ class ECM:
def
get_headers
(
self
):
return
{
'Authorization'
:
'{0} {1}'
.
format
(
self
.
token_type
,
self
.
get_oauth_token
())}
def
search
(
self
,
application_id
,
business_type
):
def
search
(
self
,
application_id
,
business_type
,
prefix
):
sql
=
"select * from {0} where b_application_no='{1}' and object_name like '{2}
%
'"
.
format
(
self
.
settlement_type
,
application_id
,
self
.
contract_
prefix
)
self
.
settlement_type
,
application_id
,
prefix
)
search_args
=
{
"userName"
:
self
.
username
,
"password"
:
self
.
pwd
,
...
...
@@ -96,7 +96,6 @@ class ECM:
result
.
append
((
object_name
,
object_id
))
return
result
def
download
(
self
,
save_path
,
object_id
,
document_scheme
,
business_type
):
doc_type
,
_
,
_
=
self
.
doc_type_map
.
get
(
document_scheme
)
download_json
=
{
...
...
src/apps/doc/views.py
View file @
6432581
...
...
@@ -36,12 +36,14 @@ from .models import (
AFCSECompareResultRecord
,
HILCACompareResultRecord
,
HILSECompareResultRecord
,
HILContract
,
AFCContract
,
)
from
.named_enum
import
ErrorType
from
.mixins
import
DocHandler
from
.
import
consts
from
apps.account.authentication
import
OAuth2AuthenticationWithUser
from
celery_compare.tasks
import
compare
from
celery_compare.tasks
import
compare
,
forwarding_station
class
CustomDate
(
fields
.
Date
):
...
...
@@ -1166,5 +1168,11 @@ class SEContractView(GenericView):
# pos上传e-contract信息接口 SE
@use_args
(
se_contract_args
,
location
=
'data'
)
def
post
(
self
,
request
,
args
):
self
.
running_log
.
info
(
'e-contract in'
)
contract_info
=
args
.
get
(
'content'
,
{})
application_id
=
contract_info
.
get
(
'applicationId'
,
''
)
entity
=
contract_info
.
get
(
'applicationEntity'
,
''
)
table_class
=
HILContract
if
entity
==
consts
.
HIL_PREFIX
else
AFCContract
table_class
.
objects
.
create
(
application_id
=
application_id
)
forwarding_station
.
apply_async
((
application_id
,
entity
),
queue
=
'queue_compare'
,
countdown
=
conf
.
DELAY_SECONDS
)
self
.
running_log
.
info
(
'[e-contract] [application_id={0}] [entity={1}]'
.
format
(
application_id
,
entity
))
return
response
.
ok
()
...
...
src/celery_compare/tasks.py
View file @
6432581
...
...
@@ -27,10 +27,13 @@ from apps.doc.models import (
AFCCACompareResult
,
HILSECompareResult
,
HILCACompareResult
,
AFCDoc
,
HILDoc
)
from
apps.doc
import
consts
from
apps.doc.ocr.gcap
import
gcap
from
apps.doc.ocr.cms
import
cms
from
apps.doc.ocr.ecm
import
ECM
,
rh
from
apps.doc.exceptions
import
GCAPException
from
apps.doc.named_enum
import
RequestTeam
,
RequestTrigger
,
ProcessName
,
ErrorType
from
common.tools.comparison
import
cp
...
...
@@ -38,9 +41,11 @@ from common.tools.des import decode_des
compare_log
=
logging
.
getLogger
(
'compare'
)
log_base
=
'[Compare]'
e_log_base
=
'[e-contract]'
empty_str
=
''
empty_error_type
=
1000
des_key
=
conf
.
CMS_DES_KEY
ecm
=
ECM
()
def
rotate_bound
(
image
,
angle
):
...
...
@@ -1867,4 +1872,32 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True
se_compare
(
application_id
,
application_entity
,
ocr_res_id
,
last_obj
,
ocr_res_dict
,
is_cms
)
@app.task
def
forwarding_station
(
application_id
,
entity
):
compare_log
.
info
(
'{0} [forward start] [application_id={1}] [entity={2}]'
.
format
(
e_log_base
,
application_id
,
entity
))
doc_class
=
HILDoc
if
entity
in
consts
.
HIL_SET
else
AFCDoc
entity_prefix
=
consts
.
HIL_PREFIX
if
entity
in
consts
.
HIL_SET
else
consts
.
AFC_PREFIX
for
(
classify_1
,
classify_2
),
prefix
in
consts
.
FILE_NAME_PREFIX_MAP
.
get
(
entity
):
try
:
file_list
=
ecm
.
search
(
application_id
,
entity
,
prefix
.
format
(
application_id
))
# TODO 获取最新文件
except
Exception
as
e
:
compare_log
.
error
(
'{0} [search failed] [application_id={1}] [entity={2}] [error={3}]'
.
format
(
e_log_base
,
application_id
,
entity
,
traceback
.
format_exc
()))
else
:
compare_log
.
info
(
'{0} [search end] [application_id={1}] [entity={2}] [file_list={3}]'
.
format
(
e_log_base
,
application_id
,
entity
,
file_list
))
for
object_name
,
object_id
in
file_list
:
doc
=
doc_class
.
objects
.
create
(
metadata_version_id
=
object_id
,
application_id
=
application_id
,
document_name
=
object_name
,
document_scheme
=
'SETTLEMENT'
,
data_source
=
'POS'
,
upload_finish_time
=
datetime
.
now
(),
)
task
=
consts
.
SPLIT_STR
.
join
([
entity_prefix
,
str
(
doc
.
id
),
str
(
classify_1
),
str
(
classify_2
)])
enqueue_res
=
rh
.
enqueue
([
task
],
False
)
compare_log
.
info
(
'{0} [upload success] [res={1}] [application_id={2}] [entity={3}] [object_name={4}] '
'[object_id={5}] [doc_id={6}]'
.
format
(
e_log_base
,
enqueue_res
,
application_id
,
entity
,
object_name
,
object_id
,
doc
.
id
))
compare_log
.
info
(
'{0} [forward end] [application_id={1}] [entity={2}]'
.
format
(
e_log_base
,
application_id
,
entity
))
...
...
src/common/electronic_afc_contract/afc_contract_ocr.py
0 → 100644
View file @
6432581
# -*- coding: utf-8 -*-
# @Author : lk
# @Email : 9428.al@gmail.com
# @Created Date : 2021-06-29 17:43:46
# @Last Modified : 2021-09-07 14:11:25
# @Description :
from
.get_char
import
Finder
def
predict
(
pdf_info
):
# 输入是整个 PDF 中的信息
f
=
Finder
(
pdf_info
)
results
=
f
.
get_info
()
return
results
src/common/electronic_afc_contract/get_char.py
0 → 100644
View file @
6432581
# -*- coding: utf-8 -*-
# @Author : lk
# @Email : 9428.al@gmail.com
# @Create Date : 2021-07-20 16:42:41
# @Last Modified : 2021-09-07 19:52:39
# @Description :
import
re
import
numpy
as
np
from
fuzzywuzzy
import
fuzz
class
Finder
:
def
__init__
(
self
,
pdf_info
):
self
.
pdf_info
=
pdf_info
self
.
is_asp
=
False
self
.
item
=
{
"words"
:
None
,
"position"
:
None
,
}
def
gen_init_result
(
self
,
is_asp
):
# 格式化算法输出
self
.
init_result
=
{
"page_1"
:
{
"合同编号"
:
self
.
item
,
"所购车辆价格"
:
self
.
item
,
"车架号"
:
self
.
item
,
"贷款本金金额"
:
{
"大写"
:
self
.
item
,
"小写"
:
self
.
item
,
"车辆贷款本金金额"
:
self
.
item
,
"附加产品融资贷款本金总金额"
:
self
.
item
,
},
"贷款期限"
:
self
.
item
,
"附加产品融资贷款本金总金额明细"
:
self
.
item
,
"借款人签字及时间"
:
self
.
item
,
},
"page_2"
:
{
"合同编号"
:
self
.
item
,
"借款人及抵押人"
:
{
"name"
:
self
.
item
,
"id"
:
self
.
item
,
},
"共同借款人及共同抵押人"
:
{
"name"
:
self
.
item
,
"id"
:
self
.
item
,
},
"保证人1"
:
{
"name"
:
self
.
item
,
"id"
:
self
.
item
,
},
"保证人2"
:
{
"name"
:
self
.
item
,
"id"
:
self
.
item
,
},
"所购车辆价格"
:
self
.
item
,
"车架号"
:
self
.
item
,
"经销商"
:
self
.
item
,
"贷款本金金额"
:
{
"大写"
:
self
.
item
,
"小写"
:
self
.
item
,
"车辆贷款本金金额"
:
self
.
item
,
"附加产品融资贷款本金总金额"
:
self
.
item
,
},
"贷款期限"
:
self
.
item
,
"还款账户"
:
{
"账号"
:
self
.
item
,
"户名"
:
self
.
item
,
"开户行"
:
self
.
item
,
},
},
"page_3"
:
{
"合同编号"
:
self
.
item
,
"还款计划表"
:
self
.
item
,
},
"page_4"
:
{
"合同编号"
:
self
.
item
,
"附加产品融资贷款本金总金额明细"
:
self
.
item
,
},
"page_5"
:
{
"合同编号"
:
self
.
item
,
},
"page_6"
:
{
"合同编号"
:
self
.
item
,
},
}
if
self
.
is_asp
==
False
:
self
.
init_result
[
"page_7"
]
=
{
"合同编号"
:
self
.
item
,
"主借人签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
"共借人签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
"保证人1签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
"保证人2签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
"见证人签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
}
else
:
self
.
init_result
[
"page_7"
]
=
{
"合同编号"
:
self
.
item
,
}
self
.
init_result
[
"page_8"
]
=
{
"合同编号"
:
self
.
item
,
"主借人签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
"共借人签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
"保证人1签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
"保证人2签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
"见证人签字"
:
{
"签字"
:
self
.
item
,
"日期"
:
self
.
item
,
},
}
def
get_contract_no
(
self
,
page_num
):
"""传入页码,查看该页码右上角的编号
Args:
page_num (string):
Returns:
sting:
"""
contract_no
=
self
.
item
.
copy
()
# 只看第一页
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'合同编号:'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
contract_no
[
'position'
]
=
bbox
contract_no
[
'words'
]
=
words
return
contract_no
def
get_vehicle_price
(
self
,
page_num
=
'0'
):
vehicle_price
=
self
.
item
.
copy
()
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'所购车辆价格为人民币'
in
text
:
words
=
text
.
split
(
'币'
)[
-
1
]
vehicle_price
[
'position'
]
=
bbox
vehicle_price
[
'words'
]
=
words
return
vehicle_price
def
get_vin
(
self
,
page_num
=
'0'
):
vin
=
self
.
item
.
copy
()
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'车架号:'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
vin
[
'position'
]
=
bbox
vin
[
'words'
]
=
words
return
vin
def
get_loan_principal
(
self
,
page_num
=
'0'
):
chinese_keywords
=
[
'壹'
,
'贰'
,
'叁'
,
'肆'
,
'伍'
,
'陆'
,
'柒'
,
'捌'
,
'玖'
,
'拾'
,
'佰'
,
'仟'
,
'万'
,
'亿'
,
'元'
,
'角'
,
'分'
,
'零'
,
'整'
]
upper
=
self
.
item
.
copy
()
lower
=
self
.
item
.
copy
()
asp_1
=
self
.
item
.
copy
()
asp_2
=
self
.
item
.
copy
()
anchor_bbox
=
None
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
fuzz
.
ratio
(
''
.
join
(
chinese_keywords
),
text
)
>
15
:
text
=
text
.
split
(
':'
)[
-
1
]
.
strip
()
upper
[
'position'
]
=
bbox
upper
[
'words'
]
=
text
if
'小写:¥'
in
text
:
words
=
text
.
split
(
'¥'
)[
-
1
]
.
strip
()
lower
[
'position'
]
=
bbox
lower
[
'words'
]
=
words
if
'附加产品融资贷款本金总金额'
==
text
:
anchor_bbox
=
bbox
if
anchor_bbox
:
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
np
.
mean
(
bbox
[
1
::
2
])
<
np
.
mean
(
anchor_bbox
[
1
::
2
])
and
'人民币:小写:'
in
text
:
words
=
re
.
findall
(
r'人民币:小写:\[(.*)\]'
,
text
)[
0
]
asp_1
[
'position'
]
=
bbox
asp_1
[
'words'
]
=
words
if
np
.
mean
(
bbox
[
1
::
2
])
>
np
.
mean
(
anchor_bbox
[
1
::
2
])
and
'人民币:小写:'
in
text
:
words
=
re
.
findall
(
r'人民币:小写:\[(.*)\]'
,
text
)[
0
]
asp_2
[
'position'
]
=
bbox
asp_2
[
'words'
]
=
words
return
upper
,
lower
,
asp_1
,
asp_2
def
get_loan_term
(
self
,
page_num
=
'0'
):
loan_term
=
self
.
item
.
copy
()
all_text
=
''
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
all_text
+=
text
matchs
=
re
.
search
(
r'贷款期限(\d+)个月'
,
all_text
)
if
matchs
:
words
=
matchs
.
group
(
1
)
for
block
in
self
.
pdf_info
[
page_num
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
f
'{words}个月'
in
text
:
loan_term
[
'position'
]
=
bbox
loan_term
[
'words'
]
=
words
return
loan_term
def get_asp_details(self, page_num):
    """Collect the add-on product (ASP) detail table of one page.

    Span texts are gathered from the span that equals
    '附加产品融资贷款本金总金额明细' until a span containing '第二条' or
    '征信管理' is met. The first gathered text becomes a one-cell title row;
    the remaining texts are grouped three per row, and an incomplete
    trailing group is dropped.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item`` whose ``words`` is the list of rows, or
        the template value when no table text was gathered.
    """
    result = self.item.copy()
    cells = []
    collecting = False
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                content = span['text']
                if content == '附加产品融资贷款本金总金额明细':
                    collecting = True
                if '第二条' in content or '征信管理' in content:
                    collecting = False
                if collecting:
                    cells.append(content)

    # Row 0 is the title alone; data row i (i >= 1) starts at index 3*i - 2.
    row_count = (len(cells) + 2) // 3
    rows = []
    if row_count:
        rows.append(cells[:1])
        rows.extend(cells[start:start + 3] for start in range(1, 3 * row_count - 3, 3))

    if rows:
        result['words'] = rows
    return result
def get_signature(self):
    """Return the signing-date span found on page '0'.

    Returns:
        dict: Copy of ``self.item``; ``words`` is the full text and
        ``position`` the bbox of the last span containing '签署日期'.
        Template values are kept when there is no such span.
    """
    result = self.item.copy()
    text_blocks = (b for b in self.pdf_info['0']['blocks'] if b['type'] == 0)
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                content = span['text']
                if '签署日期' not in content:
                    continue
                result['words'] = content
                result['position'] = span['bbox']
    return result
def get_somebody(self, top, bottom):
    """Read the name and ID number of one party listed on page '1'.

    The party's section is bounded vertically by two marker texts: the
    bottom edge (bbox[3]) of the last span containing ``top`` and of the
    last span containing ``bottom``. Only spans whose own bottom edge lies
    strictly between those two values are inspected.

    Args:
        top (str): Text marking the start of the section.
        bottom (str): Text marking the start of the following section.

    Returns:
        tuple[dict, dict]: ``(name, id)`` copies of ``self.item``; ``words``
        is the text after the last full-width colon of the matching span.
    """
    party_name = self.item.copy()
    party_id = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # First pass: locate the vertical limits (both default to 0).
    upper = 0
    lower = 0
    for box, content in spans:
        if top in content:
            upper = box[3]
        if bottom in content:
            lower = box[3]

    # Second pass: pick the labelled values inside the limits.
    for box, content in spans:
        if not upper < box[3] < lower:
            continue
        if '姓名/名称' in content:
            party_name['position'] = box
            party_name['words'] = content.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in content:
            party_id['position'] = box
            party_id['words'] = content.split(':')[-1]
    return party_name, party_id
def get_seller(self):
    """Find the dealer name written next to the '经销商' label on page '1'.

    The span whose text equals '经销商' acts as anchor. The value is the last
    span whose horizontal centre lies between the anchor's right edge and
    the middle of the page, and whose vertical centre lies between the
    anchor's top and bottom edges.

    Returns:
        dict: Copy of ``self.item`` with ``position``/``words`` filled when
        a value span is found.
    """
    result = self.item.copy()
    page = self.pdf_info['1']

    spans = [
        (span['bbox'], span['text'])
        for block in page['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Locate the key first (the last exact match wins).
    anchor = None
    for box, content in spans:
        if content == '经销商':
            anchor = box
    if not anchor:
        return result

    # With the key found, match the value relative to it.
    page_middle = page['width'] * 0.5
    for box, content in spans:
        centre_x = np.mean(box[::2])
        centre_y = np.mean(box[1::2])
        right_of_key = anchor[2] < centre_x < page_middle
        same_row = anchor[1] < centre_y < anchor[3]
        if right_of_key and same_row:
            result['position'] = box
            result['words'] = content
    return result
def get_payback_account(self):
    """Extract repayment account number, holder and bank from page '1'.

    Values are only extracted when the concatenated page text contains
    '☑账号', i.e. the account option is ticked rather than the "to be
    notified separately" option. Each value is matched on the page text
    with spaces removed; the spans are then scanned to attach a position.

    Returns:
        tuple[dict, dict, dict]: ``(account, account_name, account_bank)``
        copies of ``self.item``.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    page_text = ''.join(content for _, content in spans)

    # Decide which account format is used; only the ticked one is output.
    if '☑账号' not in page_text:
        return account, account_name, account_bank
    page_text = page_text.replace(' ', '')

    def fill(target, value, is_hit):
        # The last span accepted by ``is_hit`` provides the position.
        for box, content in spans:
            if is_hit(content):
                target['position'] = box
                target['words'] = value

    numbers = re.findall(r'账号:(.*)户名', page_text)
    if numbers:
        number = numbers[0]
        fill(account, number, lambda content: number in content)

    holders = re.findall(r'户名:(.*)开户行', page_text)
    if holders:
        holder = holders[0]
        fill(account_name, holder, lambda content: holder in content)

    banks = re.findall(r'开户行:(.*);', page_text)
    if banks:
        bank = banks[0]
        labelled = f'开户行:{bank};'
        fill(account_bank, bank, lambda content: labelled in content.replace(' ', ''))

    return account, account_name, account_bank
def get_repayment_schedule(self):
    """Rebuild the repayment schedule table from page '2'.

    Span texts are gathered from the span equal to '序号' until a span
    containing '以上表格中所列的序号并非还款期数'. They are then cut into rows
    of five cells. Row building stops at the first row whose second cell
    equals the 1-based row number.

    Returns:
        dict: Copy of ``self.item`` whose ``words`` is the list of rows, or
        the template value when no row was built.
    """
    result = self.item.copy()
    columns = 5  # the table has five columns

    cells = []
    inside_table = False
    for block in self.pdf_info['2']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                content = span['text']
                if content == '序号':
                    inside_table = True
                if '以上表格中所列的序号并非还款期数' in content:
                    inside_table = False
                if inside_table:
                    cells.append(content)

    rows = []
    for row_no in range(1, len(cells) // columns + 1):
        row = cells[(row_no - 1) * columns:row_no * columns]
        if row[1] == str(row_no):
            break
        rows.append(row)

    if rows:
        result['words'] = rows
    return result
def get_signature_role_1(self):
    """Report whether the borrower (mortgagor) signature area is filled in.

    Every page is scanned in order. Collection of spans is switched on by a
    span containing '借款人(抵押人)' and switched off by a span containing
    '日期'; the on/off state carries over from one page to the next. More
    than four collected spans is reported as '有' (present), otherwise '无'.

    Returns:
        dict: Copy of ``self.init_item`` (defined outside this method) with
        ``words`` ('有'/'无'), ``page_num`` (key of the last page that
        contributed a span, else None) and ``position`` ([x0, y0, x1, y1]
        enclosing all collected spans, or None when nothing was collected).
    """
    signature_role_1 = self.init_item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '借款人(抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # min()/max() over an empty array raise ValueError, so the enclosing
    # box is only computed when at least one span was collected.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_1['page_num'] = page_num
    signature_role_1['position'] = position
    signature_role_1['words'] = words
    return signature_role_1
def get_signature_role_2(self):
    """Report whether the co-borrower (co-mortgagor) signature area is filled in.

    Every page is scanned in order. Collection of spans is switched on by a
    span containing '共同借款人(共同抵押人)' and switched off by a span
    containing '日期'; the on/off state carries over from one page to the
    next. More than four collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: Copy of ``self.init_item`` (defined outside this method) with
        ``words`` ('有'/'无'), ``page_num`` (key of the last page that
        contributed a span, else None) and ``position`` ([x0, y0, x1, y1]
        enclosing all collected spans, or None when nothing was collected).
    """
    signature_role_2 = self.init_item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # min()/max() over an empty array raise ValueError, so the enclosing
    # box is only computed when at least one span was collected.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
def get_signature_role_3(self):
    """Report whether the first guarantor's signature area is filled in.

    Every page is scanned in order. Collection of spans is switched on by a
    span containing '保证人1' on any page whose key is not 0 (the label on
    the first page is ignored) and switched off by a span containing '日期';
    the on/off state carries over from one page to the next. More than four
    collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: Copy of ``self.init_item`` (defined outside this method) with
        ``words`` ('有'/'无'), ``page_num`` (key of the last page that
        contributed a span, else None) and ``position`` ([x0, y0, x1, y1]
        enclosing all collected spans, or None when nothing was collected).
    """
    signature_role_3 = self.init_item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # min()/max() over an empty array raise ValueError, so the enclosing
    # box is only computed when at least one span was collected.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
def get_signature_role_4(self):
    """Report whether the second guarantor's signature area is filled in.

    Every page is scanned in order. Collection of spans is switched on by a
    span containing '保证人2' on any page whose key is not 0 (the label on
    the first page is ignored) and switched off by a span containing '日期';
    the on/off state carries over from one page to the next. More than four
    collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: Copy of ``self.init_item`` (defined outside this method) with
        ``words`` ('有'/'无'), ``page_num`` (key of the last page that
        contributed a span, else None) and ``position`` ([x0, y0, x1, y1]
        enclosing all collected spans, or None when nothing was collected).
    """
    signature_role_4 = self.init_item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # min()/max() over an empty array raise ValueError, so the enclosing
    # box is only computed when at least one span was collected.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
def get_signature_role_5(self):
    """Report whether the witness signature area is filled in.

    Every page is scanned in order. Collection of spans is switched on by a
    span containing '见证人签字' on any page whose key is not 0 and switched
    off by a span containing '年'; the on/off state carries over from one
    page to the next. More than four collected spans is reported as '有',
    otherwise '无'.

    Returns:
        dict: Copy of ``self.init_item`` (defined outside this method) with
        ``words`` ('有'/'无'), ``page_num`` (key of the last page that
        contributed a span, else None) and ``position`` ([x0, y0, x1, y1]
        enclosing all collected spans, or None when nothing was collected).
    """
    signature_role_5 = self.init_item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(i) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    # The leftover debug print of the collected texts was removed.
    words = '有' if len(texts) > 4 else '无'

    # min()/max() over an empty array raise ValueError, so the enclosing
    # box is only computed when at least one span was collected.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
def get_last_page_signature(self, page_num, top, bottom):
    """Read one signer's name and signing date from a signature page.

    The signer's area is bounded by the top edge (bbox[1]) of the last span
    containing ``top`` and of the last span containing ``bottom``. Inside
    it, the last span containing '签署日期' whose vertical centre lies
    strictly between the two (integer-truncated) bounds is split into the
    name (text before the first space) and the date (text after the last
    ':').

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.
        top (str): Text marking the start of this signer's area.
        bottom (str): Text marking the start of the next area.

    Returns:
        tuple[dict, dict]: ``(signature_name, signature_date)`` copies of
        ``self.item``. Both share the bbox of the matched span as
        ``position``; template values are kept when nothing matches.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    anchor_top = None
    anchor_bottom = None
    for bbox, text in spans:
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]
    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in spans:
        if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            # The original assigned signature_name['position'] a second
            # time here, so the date never received a position.
            signature_date['position'] = bbox
    return signature_name, signature_date
def get_info(self):
    """Run every field extractor and assemble the per-page result.

    Text is read from blocks whose ``type`` is 0; blocks of any other type
    are skipped (in PyMuPDF's dict output type 0 is a text block and type 1
    an image block).

    The contract is treated as an ASP product when a span on page '0'
    equals '附加产品融资贷款本金总金额'. ASP contracts carry their signatures
    on page key '7' (stored under 'page_8'), other contracts on page key
    '6' (stored under 'page_7').

    Returns:
        dict: ``{"is_asp": self.is_asp, "page_info": self.init_result}``.
    """
    # Decide first whether this is an ASP product: only page '0' is checked
    # for the sentence '附加产品融资贷款本金总金额'.
    for block in self.pdf_info['0']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                if '附加产品融资贷款本金总金额' == span['text']:
                    self.is_asp = True
    self.gen_init_result(self.is_asp)

    # ---------------------------------------------------------------- Page 1
    # Contract number
    self.init_result['page_1']['合同编号'] = self.get_contract_no(page_num='0')
    # Price of the purchased vehicle
    self.init_result['page_1']['所购车辆价格'] = self.get_vehicle_price()
    # VIN. The original stored the vehicle price under this key.
    self.init_result['page_1']['车架号'] = self.get_vin()
    # Loan principal; for ASP products it also carries the vehicle loan
    # principal and the add-on product loan principal.
    upper, lower, asp_1, asp_2 = self.get_loan_principal()
    self.init_result['page_1']['贷款本金金额']['大写'] = upper
    self.init_result['page_1']['贷款本金金额']['小写'] = lower
    self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1
    self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
    # Loan term
    self.init_result['page_1']['贷款期限'] = self.get_loan_term()
    # Add-on product loan principal details (ASP table)
    self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='0')
    # Borrower signature and date
    self.init_result['page_1']['借款人签字及时间'] = self.get_signature()

    # ---------------------------------------------------------------- Page 2
    # Contract number.
    # NOTE(review): this reads page '0' although every later page reads its
    # own page key - confirm whether page '1' was intended.
    self.init_result['page_2']['合同编号'] = self.get_contract_no(page_num='0')
    # Parties: each section ends where the next section's heading starts.
    parties = (
        ('借款人及抵押人', '借款人及抵押人:', '共同借款人及共同抵押人:'),
        ('共同借款人及共同抵押人', '共同借款人及共同抵押人:', '保证人1:'),
        ('保证人1', '保证人1:', '保证人2:'),
        ('保证人2', '保证人2:', '第一章'),
    )
    for field, top, bottom in parties:
        party_name, party_id = self.get_somebody(top=top, bottom=bottom)
        self.init_result['page_2'][field]['name'] = party_name
        self.init_result['page_2'][field]['id'] = party_id
    # Price of the purchased vehicle
    self.init_result['page_2']['所购车辆价格'] = self.get_vehicle_price(page_num='1')
    # VIN
    self.init_result['page_2']['车架号'] = self.get_vin(page_num='1')
    # Dealer
    self.init_result['page_2']['经销商'] = self.get_seller()
    # Loan principal (see page 1)
    upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
    self.init_result['page_2']['贷款本金金额']['大写'] = upper
    self.init_result['page_2']['贷款本金金额']['小写'] = lower
    self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1
    self.init_result['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
    # Loan term
    self.init_result['page_2']['贷款期限'] = self.get_loan_term(page_num='1')
    # Repayment account
    account, account_name, account_bank = self.get_payback_account()
    self.init_result['page_2']['还款账户']['账号'] = account
    self.init_result['page_2']['还款账户']['户名'] = account_name
    self.init_result['page_2']['还款账户']['开户行'] = account_bank

    # ---------------------------------------------------------------- Page 3
    self.init_result['page_3']['合同编号'] = self.get_contract_no(page_num='2')
    # Repayment schedule (table)
    self.init_result['page_3']['还款计划表'] = self.get_repayment_schedule()

    # ---------------------------------------------------------------- Page 4
    self.init_result['page_4']['合同编号'] = self.get_contract_no(page_num='3')
    # Add-on product loan principal details (ASP table)
    self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='3')

    # ----------------------------------------------------------- Pages 5 - 7
    self.init_result['page_5']['合同编号'] = self.get_contract_no(page_num='4')
    self.init_result['page_6']['合同编号'] = self.get_contract_no(page_num='5')
    self.init_result['page_7']['合同编号'] = self.get_contract_no(page_num='6')

    # ------------------------------------------------------- Signature page
    if self.is_asp == False:
        sign_page, sign_key = '6', 'page_7'
    else:
        # ASP contracts have one more page; signatures are on the last one.
        self.init_result['page_8']['合同编号'] = self.get_contract_no(page_num='7')
        sign_page, sign_key = '7', 'page_8'

    signers = (
        ('主借人签字', '借款人(抵押人)', '共同借款人(共同抵押人)'),
        ('共借人签字', '共同借款人(共同抵押人)', '保证人1'),
        ('保证人1签字', '保证人1', '保证人2'),
        ('保证人2签字', '保证人2', '在本人面前亲笔签署本合同'),
        ('见证人签字', '在本人面前亲笔签署本合同', '(以下无正文)'),
    )
    for field, top, bottom in signers:
        signature_name, signature_date = self.get_last_page_signature(
            page_num=sign_page, top=top, bottom=bottom)
        self.init_result[sign_key][field]['签字'] = signature_name
        self.init_result[sign_key][field]['日期'] = signature_date

    # Re-shape the output
    new_results = {"is_asp": self.is_asp, "page_info": self.init_result}
    return new_results
src/common/electronic_hil_contract/get_char.py
0 → 100644
View file @
6432581
# -*- coding: utf-8 -*-
# @Author : lk
# @Email : 9428.al@gmail.com
# @Create Date : 2021-07-20 16:42:41
# @Last Modified : 2021-10-28 17:41:00
# @Description :
import
re
import
cv2
import
base64
import
numpy
as
np
from
fuzzywuzzy
import
fuzz
class
Finder
:
def __init__(self, pdf_info):
    """Store the parsed PDF and prepare the empty result templates.

    Args:
        pdf_info (dict): Parsed PDF content, indexed by page key.

    Attributes set:
        item: Template of a single extracted field.
        init_result: Template for the main contract.
        init_result_1: Template for the vehicle disposal agreement.
        init_result_2: Template for the vehicle lease mortgage contract.
    """
    self.pdf_info = pdf_info
    self.item = {"words": None,
                 "page": None,
                 "position": None,
                 }

    def blank(keys):
        # Every field gets its own copy of the template. The original put
        # the very same ``self.item`` dict under every key, so changing one
        # field in place would have changed all of them (and the template).
        return {key: self.item.copy() for key in keys}

    # Formatted algorithm output
    self.init_result = blank((
        "合同编号",
        "承租人-姓名",
        "承租人-证件号码",
        "承租人-法定代表人或授权代表",
        "保证人1-姓名",
        "保证人1-证件号码",
        "保证人1-法定代表人或授权代表",
        "保证人2-姓名",
        "保证人2-证件号码",
        "保证人2-法定代表人或授权代表",
        "保证人3-姓名",
        "保证人3-证件号码",
        "保证人3-法定代表人或授权代表",
        "合同编号(正文)",
        "车辆识别代码",
        "车辆卖方(经销商)",
        "车辆原始销售价格(《机动车销售统一发票》所列金额)",
        "车辆附加产品明细表",
        "融资成本总额",
        "租期",
        "付款计划表",
        "银行账户-户名",
        "银行账户-银行账号",
        "银行账户-开户行",
        "签字页-承租人姓名",
        "签字页-承租人签章",
        "签字页-保证人1姓名",
        "签字页-保证人1签章",
        "签字页-保证人2姓名",
        "签字页-保证人2签章",
        "签字页-保证人3姓名",
        "签字页-保证人3签章",
    ))
    # Formatted output of the vehicle disposal agreement (different fields)
    self.init_result_1 = blank((
        "合同编号",
        "承租人-姓名",
        "承租人-证件号码",
        "销售经销商",
        "合同编号(正文)",
        "签字页-承租人姓名",
        "签字页-承租人证件号码",
        "签字页-承租人签章",
        "签字页-销售经销商",
        "签字页-销售经销商签章",
    ))
    # Formatted output of the vehicle lease mortgage contract
    self.init_result_2 = blank((
        "合同编号",
        "合同编号(正文)",
        "抵押人姓名/名称",
        "抵押人证件号码",
        "车辆识别代码",
        "租金总额",
        "融资租赁期限",
        "签字页-抵押人姓名",
        "签字页-抵押人签章",
        "签字页-抵押人配偶姓名",
        "签字页-抵押人配偶签章",
    ))
def get_contract_no(self, page_num):
    """Read the contract number printed next to '合同编号:' on a page.

    When the label span carries no value itself (the text after the colon
    is empty), the number is looked for in a separate span: one containing
    'CH' whose top edge lies above the bottom edge of the current match.

    Args:
        page_num (string): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item`` with ``position``, ``page`` and
        ``words`` filled from the last matching span.
    """
    result = self.item.copy()
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    for box, content in spans:
        if '合同编号:' not in content:
            continue
        result['position'] = box
        result['page'] = page_num
        result['words'] = content.split(':')[-1]

    if result['words'] != '':
        return result

    # Label without value: fall back to a nearby span holding the number.
    # The reference position moves with every accepted span.
    for box, content in spans:
        if box[1] < result['position'][3] and 'CH' in content:
            result['position'] = box
            result['page'] = page_num
            result['words'] = content
    return result
def get_vehicle_price(self, page_num='0'):
    """Read the purchase price of the vehicle from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``; ``words`` is the text following the
        last '币' of the last span containing '所购车辆价格为人民币' and
        ``position`` is that span's bbox.
    """
    result = self.item.copy()
    text_blocks = [b for b in self.pdf_info[page_num]['blocks'] if b['type'] == 0]
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                content = span['text']
                if '所购车辆价格为人民币' not in content:
                    continue
                result['position'] = span['bbox']
                result['words'] = content.split('币')[-1]
    return result
def
get_contract_no_one
(
self
):
# 查找正文中的合同编号,有可能存在换行的情况
contract_no
=
self
.
item
.
copy
()
for
pno
in
self
.
pdf_info
:
all_text
=
''
for
block
in
self
.
pdf_info
[
pno
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
all_text
+=
text
all_text
=
all_text
.
replace
(
' '
,
''
)
matchObj
=
re
.
search
(
r'(合同编号:\[(.*?)\])'
,
all_text
)
if
matchObj
:
words
=
matchObj
.
group
(
1
)
contract_no
[
'position'
]
=
None
contract_no
[
'page'
]
=
pno
contract_no
[
'words'
]
=
words
return
contract_no
matchObj
=
re
.
search
(
r'编号为(.*?)的'
,
all_text
)
if
matchObj
:
words
=
matchObj
.
group
(
1
)
.
strip
()
contract_no
[
'position'
]
=
None
contract_no
[
'page'
]
=
pno
contract_no
[
'words'
]
=
words
return
contract_no
matchObj
=
re
.
search
(
r'编号为(.*?))的'
,
all_text
)
if
matchObj
:
words
=
matchObj
.
group
(
1
)
.
strip
()
contract_no
[
'position'
]
=
None
contract_no
[
'page'
]
=
pno
contract_no
[
'words'
]
=
words
return
contract_no
def get_key_value(self, key, page_num=None):
    """Return the value written after a label.

    Args:
        key (str): Label text to look for inside a span.
        page_num (str | None): Page key to search; every page of
            ``self.pdf_info`` is searched, in order, when None.

    Returns:
        dict: Copy of ``self.item``. For the last span containing ``key``,
        ``words`` is the text after the span's last full-width colon,
        ``position`` its bbox and ``page`` the page key.
    """
    result = self.item.copy()
    pages = [page_num] if page_num is not None else list(self.pdf_info)
    for pno in pages:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    content = span['text']
                    if key not in content:
                        continue
                    result['position'] = span['bbox']
                    result['page'] = pno
                    result['words'] = content.split(':')[-1]
    return result
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        tuple[dict, dict, dict, dict]: ``(upper, lower, asp_1, asp_2)``
        copies of ``self.item``:

        * ``upper`` - amount in Chinese capital numerals: the last span
          whose fuzzy ratio against the numeral alphabet exceeds 15;
        * ``lower`` - amount in digits: the text after '¥' of the last
          span containing '小写:¥';
        * ``asp_1`` / ``asp_2`` - bracketed amounts of spans containing
          '人民币:小写:' located above / below the span that equals
          '附加产品融资贷款本金总金额'; filled only when that span exists.
    """
    numerals = '壹贰叁肆伍陆柒捌玖拾佰仟万亿元角分零整'
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    anchor = None
    for box, content in spans:
        if fuzz.ratio(numerals, content) > 15:
            # The trimmed text is also what the two checks below look at.
            content = content.split(':')[-1].strip()
            upper['position'] = box
            upper['words'] = content
        if '小写:¥' in content:
            lower['position'] = box
            lower['words'] = content.split('¥')[-1].strip()
        if content == '附加产品融资贷款本金总金额':
            anchor = box

    if not anchor:
        return upper, lower, asp_1, asp_2

    anchor_centre = np.mean(anchor[1::2])
    for box, content in spans:
        if '人民币:小写:' not in content:
            continue
        centre = np.mean(box[1::2])
        if centre < anchor_centre:
            target = asp_1
        elif centre > anchor_centre:
            target = asp_2
        else:
            continue
        amount = re.findall(r'人民币:小写:\[(.*)\]', content)[0]
        target['position'] = box
        target['words'] = amount
    return upper, lower, asp_1, asp_2
def get_loan_term(self, page_num='0'):
    """Find the loan term (number of months) on one page.

    All text of the page's type-0 blocks is joined so that
    '贷款期限<n>个月' is recognised even when split over spans; the spans are
    then searched for '<n>个月' to locate it.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item`` with ``words`` (month count as string)
        and ``position`` (bbox of the last span containing '<n>个月');
        unchanged when the phrase is absent.
    """
    term = self.item.copy()
    page_spans = []
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            page_spans.extend(line['spans'])

    joined = ''.join(span['text'] for span in page_spans)
    hit = re.search(r'贷款期限(\d+)个月', joined)
    if hit:
        months = hit.group(1)
        for span in page_spans:
            if f'{months}个月' in span['text']:
                term['position'] = span['bbox']
                term['words'] = months
    return term
def get_asp_details(self, page_num):
    """Collect the add-on product (ASP) detail table of one page.

    Texts are taken from the span equal to '附加产品融资贷款本金总金额明细' up
    to the next span containing '第二条' or '征信管理'. The first text forms
    a single-cell title row; the rest is split into rows of three cells,
    and a trailing group of fewer than three cells is discarded.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``; ``words`` is the list of rows when any
        text was collected.
    """
    term = self.item.copy()
    collected = []
    in_table = False
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                content = span['text']
                if content == '附加产品融资贷款本金总金额明细':
                    in_table = True
                if '第二条' in content or '征信管理' in content:
                    in_table = False
                if in_table:
                    collected.append(content)

    table = []
    total_rows = (len(collected) + 2) // 3
    for row_index in range(total_rows):
        if row_index == 0:
            table.append(collected[:1])
            continue
        first = row_index * 3 - 2
        table.append(collected[first:first + 3])

    if table:
        term['words'] = table
    return term
def get_signature(self):
    """Return the signing-date span found on page '0'.

    Returns:
        dict: Copy of ``self.item`` whose ``words`` is the whole text and
        ``position`` the bbox of the last span containing '签署日期';
        unchanged when no span matches.
    """
    signed = self.item.copy()
    for block in self.pdf_info['0']['blocks']:
        if block['type'] == 0:
            for line in block['lines']:
                matching = [s for s in line['spans'] if '签署日期' in s['text']]
                for span in matching:
                    signed['words'] = span['text']
                    signed['position'] = span['bbox']
    return signed
def get_somebody(self, top, bottom):
    """Read the name and ID number of one party listed on page '1'.

    Given an upper and a lower marker text, the customer information
    between them is returned. The limits are the bottom edges (bbox[3]) of
    the last spans containing ``top`` and ``bottom``; a span belongs to the
    section when its own bottom edge is strictly between them.

    Args:
        top (str): Text marking the start of the section.
        bottom (str): Text marking the start of the following section.

    Returns:
        tuple[dict, dict]: ``(name, id)`` copies of ``self.item``; ``words``
        is the text after the last full-width colon of the matching span.
    """
    who = self.item.copy()
    number = self.item.copy()

    def page_spans():
        for block in self.pdf_info['1']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    # Fix the vertical limits first (0 when a marker is missing).
    limit_top = 0
    limit_bottom = 0
    for box, content in page_spans():
        if top in content:
            limit_top = box[3]
        if bottom in content:
            limit_bottom = box[3]

    labelled = (('姓名/名称', who), ('自然人身份证件号码/法人执照号码', number))
    for box, content in page_spans():
        if limit_top < box[3] < limit_bottom:
            for label, target in labelled:
                if label in content:
                    target['position'] = box
                    target['words'] = content.split(':')[-1]
    return who, number
def get_seller(self):
    """Find the dealer name written next to the '经销商' label on page '1'.

    The last span whose text equals '经销商' is the anchor. A span is taken
    as the value when its horizontal centre is right of the anchor but
    left of the page middle, and its vertical centre is within the
    anchor's top and bottom edges. The last such span wins.

    Returns:
        dict: Copy of ``self.item`` with ``position`` and ``words`` set
        when a value span exists.
    """
    dealer = self.item.copy()
    second_page = self.pdf_info['1']

    def page_spans():
        for block in second_page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    # Find the key first.
    key_box = None
    for box, content in page_spans():
        if '经销商' == content:
            key_box = box

    # Once the key is known, match the value against it.
    if key_box:
        middle = second_page['width'] * 0.5
        for box, content in page_spans():
            in_columns = key_box[2] < np.mean(box[::2]) < middle
            in_row = key_box[1] < np.mean(box[1::2]) < key_box[3]
            if in_columns and in_row:
                dealer['position'] = box
                dealer['words'] = content
    return dealer
def get_payback_account(self):
    """Extract repayment account number, holder and bank from page '1'.

    Nothing is extracted unless the page text contains '☑账号' (the ticked
    account option, as opposed to separate notification). The three values
    are matched on the page text with spaces removed, then located in the
    individual spans.

    Returns:
        tuple[dict, dict, dict]: ``(account, account_name, account_bank)``
        copies of ``self.item``.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    page_spans = []
    for block in self.pdf_info['1']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            page_spans.extend(line['spans'])
    joined = ''.join(span['text'] for span in page_spans)

    # Work out which account format is present; only the ticked account
    # format is reported.
    if '☑账号' in joined:
        joined = joined.replace(' ', '')

        found = re.findall(r'账号:(.*)户名', joined)
        if found:
            value = found[0]
            for span in page_spans:
                if value in span['text']:
                    account['position'] = span['bbox']
                    account['words'] = value

        found = re.findall(r'户名:(.*)开户行', joined)
        if found:
            value = found[0]
            for span in page_spans:
                if value in span['text']:
                    account_name['position'] = span['bbox']
                    account_name['words'] = value

        found = re.findall(r'开户行:(.*);', joined)
        if found:
            value = found[0]
            for span in page_spans:
                compact = span['text'].replace(' ', '')
                if f'开户行:{value};' in compact:
                    account_bank['position'] = span['bbox']
                    account_bank['words'] = value

    return account, account_name, account_bank
def get_repayment_schedule(self):
    """Rebuild the payment schedule table from the document.

    All pages are scanned in order. Collection starts with the span that
    follows one containing '61.' and stops at a span containing
    '以上表格中所列序号'. The collected texts are cut into groups of four,
    each prefixed with a running number ('1.', '2.', ...), below a fixed
    header row.

    Returns:
        dict: Copy of ``self.item`` with ``words`` set to the table (always
        at least the header row) and ``page`` to the key of the last page
        on which '61.' was seen (None if it never was).
    """
    schedule = self.item.copy()
    cells = []
    collecting = False
    start_page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    content = span['text']
                    if '以上表格中所列序号' in content:
                        collecting = False
                    if collecting:
                        cells.append(content)
                    if '61.' in content:
                        start_page = pno
                        collecting = True

    width = 4  # four value columns follow the running number
    rows = [['序号', '融资租赁成本', '融资租赁费用', '租金', '剩余融资租赁成本']]
    for index in range(len(cells) // width):
        rows.append([f'{index+1}.'] + cells[index * width:(index + 1) * width])

    schedule['words'] = rows
    schedule['page'] = start_page
    return schedule
def get_signature_role_1(self):
    """Find the span that contains '签署日期' (the signing-date stamp).

    Every page of ``self.pdf_info`` is scanned; when several spans match, the
    last one visited wins.

    Returns:
        dict: a copy of ``self.item`` with 'position', 'page' and 'words'
        taken from the matching span, or the unmodified copy if none matches.
    """
    stamp = self.item.copy()
    for pno, page in self.pdf_info.items():
        for blk in page['blocks']:
            if blk['type'] != 0:
                continue
            spans = (sp for ln in blk['lines'] for sp in ln['spans'])
            for sp in spans:
                box, content = sp['bbox'], sp['text']
                if '签署日期' not in content:
                    continue
                stamp['position'] = box
                stamp['page'] = pno
                stamp['words'] = content
    return stamp
def get_signature_role_2(self):
    """Report whether the co-borrower signature region contains a signature.

    The region opens at a span containing '共同借款人(共同抵押人)' and closes at
    the next span containing '日期'. Spans inside the region (including the
    opening one) are collected across all pages; more than four collected
    spans is reported as '有' (present), otherwise '无' (absent).

    Returns:
        dict: a copy of ``self.init_item`` with
        'page_num' - key of the last page that contributed a span, or None;
        'position' - [min_x, min_y, max_x, max_y] over all collected span
        corners, or None when no span was collected;
        'words'    - '有' or '无'.
    """
    signature_role_2 = self.init_item.copy()
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for pno in list(self.pdf_info.keys()):
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Guard the empty case: min()/max() over an empty array raised ValueError
    # whenever the region marker was not found in the document.
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points.
        corners = np.array(boxes).reshape((-1, 2))
        position = [*corners.min(axis=0), *corners.max(axis=0)]

    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
def get_signature_role_3(self):
    """Report whether the guarantor-1 signature region contains a signature.

    The region opens at a span containing '保证人1' on any page whose key is not
    0 (``int(key) != 0``) and closes at the next span containing '日期'. Spans
    inside the region (including the opening one) are collected; more than
    four collected spans is reported as '有' (present), otherwise '无' (absent).

    Returns:
        dict: a copy of ``self.init_item`` with
        'page_num' - key of the last page that contributed a span, or None;
        'position' - [min_x, min_y, max_x, max_y] over all collected span
        corners, or None when no span was collected;
        'words'    - '有' or '无'.
    """
    signature_role_3 = self.init_item.copy()
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for pno in list(self.pdf_info.keys()):
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The marker on the first page (key 0) is ignored.
                    if '保证人1' in text and int(pno) != 0:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Guard the empty case: min()/max() over an empty array raised ValueError
    # whenever the region marker was not found in the document.
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points.
        corners = np.array(boxes).reshape((-1, 2))
        position = [*corners.min(axis=0), *corners.max(axis=0)]

    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
def get_signature_role_4(self):
    """Report whether the guarantor-2 signature region contains a signature.

    The region opens at a span containing '保证人2' on any page whose key is not
    0 (``int(key) != 0``) and closes at the next span containing '日期'. Spans
    inside the region (including the opening one) are collected; more than
    four collected spans is reported as '有' (present), otherwise '无' (absent).

    Returns:
        dict: a copy of ``self.init_item`` with
        'page_num' - key of the last page that contributed a span, or None;
        'position' - [min_x, min_y, max_x, max_y] over all collected span
        corners, or None when no span was collected;
        'words'    - '有' or '无'.
    """
    signature_role_4 = self.init_item.copy()
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for pno in list(self.pdf_info.keys()):
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The marker on the first page (key 0) is ignored.
                    if '保证人2' in text and int(pno) != 0:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Guard the empty case: min()/max() over an empty array raised ValueError
    # whenever the region marker was not found in the document.
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points.
        corners = np.array(boxes).reshape((-1, 2))
        position = [*corners.min(axis=0), *corners.max(axis=0)]

    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
def get_signature_role_5(self):
    """Report whether the witness signature region contains a signature.

    The region opens at a span containing '见证人签字' on any page whose key is
    not 0 (``int(key) != 0``) and closes at the next span containing '年'.
    Spans inside the region (including the opening one) are collected; more
    than four collected spans is reported as '有' (present), otherwise '无'
    (absent).

    Returns:
        dict: a copy of ``self.init_item`` with
        'page_num' - key of the last page that contributed a span, or None;
        'position' - [min_x, min_y, max_x, max_y] over all collected span
        corners, or None when no span was collected;
        'words'    - '有' or '无'.
    """
    signature_role_5 = self.init_item.copy()
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for pno in list(self.pdf_info.keys()):
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The marker on the first page (key 0) is ignored.
                    if '见证人签字' in text and int(pno) != 0:
                        in_region = True
                    if '年' in text:
                        in_region = False
                    if in_region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    # (A leftover debug print of the collected texts was removed here; the
    # sibling signature_role methods never wrote to stdout.)
    words = '有' if len(texts) > 4 else '无'

    # Guard the empty case: min()/max() over an empty array raised ValueError
    # whenever the region marker was not found in the document.
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points.
        corners = np.array(boxes).reshape((-1, 2))
        position = [*corners.min(axis=0), *corners.max(axis=0)]

    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
def get_last_page_signature(self, page_num, top, bottom):
    """Extract signer name and signing date from one page's signature line.

    The page is scanned once for the two anchor texts; the top edge (y0) of
    the last span containing ``top`` / ``bottom`` delimits a vertical band.
    A span containing '签署日期' whose vertical centre lies strictly inside
    that band is then split into a name (text before the first space) and a
    date (text after the last ':').

    Args:
        page_num: key of the page in ``self.pdf_info`` to inspect.
        top: text marking the upper edge of the band.
        bottom: text marking the lower edge of the band.

    Returns:
        tuple: ``(signature_name, signature_date)``, two copies of
        ``self.item`` with 'words' and 'position' filled in when a matching
        span exists, otherwise left untouched.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()

    def text_spans():
        """Yield (bbox, text) for every span of the page's type-0 blocks."""
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    anchor_top = None
    anchor_bottom = None
    for bbox, text in text_spans():
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]

    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in text_spans():
        if '签署日期' not in text:
            continue
        # bbox[1::2] are the two y coordinates; their mean is the span's centre.
        if int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            # Fix: the position used to be written to signature_name a second
            # time, leaving signature_date without any position.
            signature_date['position'] = bbox
    return signature_name, signature_date
def get_electronic_signature(self, top, bottom):
    """Find the '签署日期' stamp lying between two anchor texts.

    A first pass over every page records the top edge (y0) of the last span
    containing ``top`` and of the last span containing ``bottom``. If both
    were found, a second pass over every page picks spans containing
    '签署日期' whose vertical centre is strictly between the two (integer
    truncated) anchors; the last such span wins.

    Args:
        top: text marking the upper edge of the band.
        bottom: text marking the lower edge of the band.

    Returns:
        dict: a copy of ``self.item`` with 'words', 'page' and 'position'
        filled in from the matching span, or untouched if nothing matches.
    """
    found = self.item.copy()
    upper = None
    lower = None
    for pno in self.pdf_info:
        for blk in self.pdf_info[pno]['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    box, content = sp['bbox'], sp['text']
                    if top in content:
                        upper = box[1]
                    if bottom in content:
                        lower = box[1]

    if upper is None or lower is None:
        return found

    for pno in self.pdf_info:
        for blk in self.pdf_info[pno]['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    box, content = sp['bbox'], sp['text']
                    if '签署日期' not in content:
                        continue
                    # box[1::2] holds the two y coordinates of the span.
                    if int(upper) < np.mean(box[1::2]) < int(lower):
                        found['words'] = content
                        found['page'] = pno
                        found['position'] = box
    return found
def get_role_info(self, role_key, page_num='0'):
    """Read name, ID number and representative for one party on a page.

    The top-left corner of the span starting with '保证人3' serves as an
    anchor that divides the page into four quadrants, one per party. The
    party's name comes from the span that starts with ``role_key``; its
    '证件号码:' and '法定代表人或授权代表:' spans are recognised by the quadrant
    their centre falls into. Without an anchor nothing is extracted.

    Args:
        role_key: label prefix of the party; it is used as a ``re.match``
            pattern. Quadrant lookups exist for '承租人:', '保证人1:',
            '保证人2:' and '保证人3:'.
        page_num: key of the page in ``self.pdf_info``.

    Returns:
        tuple: ``(name, id_num, representative)``, each a copy of
        ``self.item`` with 'words' (text after the last ':'), 'page' and
        'position' filled in where a matching span was found.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    def text_spans():
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    yield sp['bbox'], sp['text']

    # Anchor: top-left corner of the last span that starts with '保证人3'.
    anchor = None
    for box, content in text_spans():
        if re.match('保证人3', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (centre is left of anchor, centre is above anchor)
    quadrants = {
        '承租人:': (True, True),
        '保证人1:': (True, False),
        '保证人2:': (False, True),
        '保证人3:': (False, False),
    }
    quadrant = quadrants.get(role_key)

    def in_quadrant(box):
        want_left, want_above = quadrant
        cx = np.mean(box[::2])
        if not (cx < anchor[0] if want_left else cx > anchor[0]):
            return False
        cy = np.mean(box[1::2])
        return bool(cy < anchor[1] if want_above else cy > anchor[1])

    def fill(target, box, content):
        target['words'] = content.split(':')[-1]
        target['page'] = page_num
        target['position'] = box

    for box, content in text_spans():
        # Party name
        if re.match(role_key, content) is not None:
            fill(name, box, content)
        if quadrant is None:
            continue
        # ID number located in the party's quadrant
        if re.match('证件号码:', content) is not None and in_quadrant(box):
            fill(id_num, box, content)
        # Legal / authorised representative located in the party's quadrant
        if re.match('法定代表人或授权代表:', content) is not None and in_quadrant(box):
            fill(representative, box, content)
    return name, id_num, representative
def get_table_add_product(self):
    """Extract the add-on product table as a list of rows.

    All pages are walked in order. Span texts are collected from the span
    containing '总计' (inclusive) up to the span containing
    '注:出租人向承租人购买租赁车辆的对价' (exclusive). The first two collected
    texts are treated as the total label and total value; the remaining texts
    are grouped three at a time into product rows, and the total is appended
    as the final row.

    Returns:
        dict: a copy of ``self.item`` with
        'words'    - the table, header row first;
        'page'     - key of the page holding the closing note, or None;
        'position' - always None.
    """
    table_add_product = self.item.copy()
    cells = []
    collecting = False
    page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[f'{pno}']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = span['text']
                    if '总计' in text:
                        collecting = True
                    if '注:出租人向承租人购买租赁车辆的对价' in text:
                        page = pno
                        collecting = False
                    if collecting:
                        cells.append(text)

    rows = [['项目', '购买价格', '实际融资金额']]
    # Product cells start after the two "total" cells. Counting complete
    # triples from that offset (instead of from the whole list) avoids the
    # IndexError that occurred whenever the cell count was not 2 + 3*k.
    product_cells = cells[2:]
    for k in range(len(product_cells) // 3):
        rows.append(product_cells[3 * k:3 * k + 3])
    if cells:
        # Tolerate a total label without a value instead of raising.
        total_value = cells[1] if len(cells) > 1 else ''
        rows.append([cells[0], '', total_value])

    table_add_product['words'] = rows
    table_add_product['page'] = page
    table_add_product['position'] = None
    return table_add_product
def get_contract_no_dy(self):
    """Look up the mortgage contract number.

    The bbox of the last span containing '抵押合同编号' is used as the key box.
    Any span (on any page) that contains 'CH-' and whose vertical centre lies
    strictly between the key box's top and bottom edges is taken as the
    value; the last such span wins.

    Returns:
        dict: a copy of ``self.item`` with 'position', 'page' and 'words'
        filled in from the value span, or untouched when none is found.
    """
    contract_no = self.item.copy()

    label_box = None
    for pno, page in self.pdf_info.items():
        for blk in page['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    box, content = sp['bbox'], sp['text']
                    if '抵押合同编号' in content:
                        label_box = box

    if label_box is None:
        return contract_no

    for pno, page in self.pdf_info.items():
        for blk in page['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    box, content = sp['bbox'], sp['text']
                    same_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
                    if same_row and 'CH-' in content:
                        contract_no['position'] = box
                        contract_no['page'] = pno
                        contract_no['words'] = content
    return contract_no
def get_dyr_name_id(self):
    """Read the mortgagor's name and ID number.

    The bbox of the last span whose text equals '抵押人' is the key box. Spans
    whose vertical centre lies strictly between the key box's top edge and
    three key-box heights below its bottom edge are inspected: one containing
    '姓名' yields the name, one containing '证件号码' yields the ID number (in
    both cases the text after the last ':').

    Returns:
        tuple: ``(name, _id)``, two copies of ``self.item`` with 'position',
        'page' and 'words' filled in where a matching span was found.
    """
    name = self.item.copy()
    _id = self.item.copy()

    label_box = None
    for pno, page in self.pdf_info.items():
        for blk in page['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    box, content = sp['bbox'], sp['text']
                    if content == '抵押人':
                        label_box = box

    if label_box is None:
        return name, _id

    row_height = abs(label_box[1] - label_box[3])
    targets = (('姓名', name), ('证件号码', _id))
    for pno, page in self.pdf_info.items():
        for blk in page['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    box, content = sp['bbox'], sp['text']
                    for keyword, target in targets:
                        in_band = label_box[1] < np.mean(box[1::2]) < label_box[3] + row_height * 3
                        if in_band and keyword in content:
                            value = content.split(':')[-1]
                            target['position'] = box
                            target['page'] = pno
                            target['words'] = value
    return name, _id
def get_key_value_position(self, key):
    """Find the value printed to the right of a label, by geometry.

    The bbox of the last span whose text equals ``key`` is the key box. A span
    is accepted as the value when its vertical centre lies strictly between
    the key box's top and bottom edges, its left edge is to the right of the
    key box's left edge, and the gap between the key box's right edge and the
    span's left edge is below ten key-box heights. The last accepted span wins.

    Args:
        key: exact label text to look for.

    Returns:
        dict: a copy of ``self.item`` with 'position', 'page' and 'words'
        filled in from the value span, or untouched when none is found.
    """
    value = self.item.copy()

    label_box = None
    for pno, page in self.pdf_info.items():
        for blk in page['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    box, content = sp['bbox'], sp['text']
                    if content == key:
                        label_box = box

    if label_box is None:
        return value

    row_height = abs(label_box[1] - label_box[3])
    for pno, page in self.pdf_info.items():
        for blk in page['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    box, content = sp['bbox'], sp['text']
                    if not label_box[1] < np.mean(box[1::2]) < label_box[3]:
                        continue
                    if not label_box[0] < box[0]:
                        continue
                    if not abs(label_box[2] - box[0]) < row_height * 10:
                        continue
                    value['position'] = box
                    value['page'] = pno
                    value['words'] = content
    return value
def get_info(self):
    """Fill ``self.init_result`` with the fields of the main contract.

    Nothing is extracted when ``self.pdf_info`` is empty; the result mapping
    is returned either way.

    NOTE(review): the previous docstring said ``block['type'] == 0`` marks an
    image, but the scanning helpers read 'lines'/'spans' from exactly those
    blocks and skip every other type - presumably type 0 is a text block;
    confirm against the PDF library's text-dict format.

    Returns:
        dict: ``self.init_result``.
    """
    if len(self.pdf_info) > 0:
        out = self.init_result
        roles = ('承租人', '保证人1', '保证人2', '保证人3')

        # Contract number printed on the first page
        out['合同编号'] = self.get_contract_no(page_num='0')

        # Name, ID number and representative of the four parties (first page)
        for role in roles:
            name, id_num, representative = self.get_role_info(role_key=f'{role}:', page_num='0')
            out[f'{role}-姓名'] = name
            out[f'{role}-证件号码'] = id_num
            out[f'{role}-法定代表人或授权代表'] = representative

        # Contract number inside the body text, searched on all pages. It may
        # wrap across lines, so for now it is reported without a position.
        out['合同编号(正文)'] = self.get_contract_no_one()

        # VIN, selling dealer and original vehicle sales price
        for label in ('车辆识别代码',
                      '车辆卖方(经销商)',
                      '车辆原始销售价格(《机动车销售统一发票》所列金额)'):
            out[label] = self.get_key_value(key=f'{label}:')

        # Add-on product detail table
        out['车辆附加产品明细表'] = self.get_table_add_product()

        # Total financing cost and lease term
        for label in ('融资成本总额', '租期'):
            out[label] = self.get_key_value(key=f'{label}:')

        # Repayment schedule table
        out['付款计划表'] = self.get_repayment_schedule()

        # Bank account: holder name, account number, bank
        for result_key, label in (('银行账户-户名', '户名:'),
                                  ('银行账户-银行账号', '银行账号:'),
                                  ('银行账户-开户行', '开户银行:')):
            out[result_key] = self.get_key_value(key=label)

        # Signature page: printed name and e-signature of each party. The
        # signature is searched between the party's own name label and the
        # label that follows it ('日期:' after the last party).
        name_labels = [f'{role}姓名:' for role in roles]
        lower_bounds = name_labels[1:] + ['日期:']
        for role, upper, lower in zip(roles, name_labels, lower_bounds):
            name = self.get_key_value(key=upper)
            electronic_signature = self.get_electronic_signature(top=upper, bottom=lower)
            out[f'签字页-{role}姓名'] = name
            out[f'签字页-{role}签章'] = electronic_signature

    return self.init_result
def get_info_1(self):
    """Fill ``self.init_result_1`` with the fields of the second document type.

    Nothing is extracted when ``self.pdf_info`` is empty; the result mapping
    is returned either way. The dealer's seal is not extracted.

    Returns:
        dict: ``self.init_result_1``.
    """
    if len(self.pdf_info) > 0:
        out = self.init_result_1

        # Contract number on the first page
        out['合同编号'] = self.get_contract_no(page_num='0')

        # Lessee name, lessee ID number and selling dealer (first page only)
        for result_key, label in (('承租人-姓名', '承租人:'),
                                  ('承租人-证件号码', '证件号码:'),
                                  ('销售经销商', '销售经销商:')):
            out[result_key] = self.get_key_value(key=label, page_num='0')

        # Contract number inside the body text
        out['合同编号(正文)'] = self.get_contract_no_one()

        # Signature page: lessee name and lessee ID number
        for result_key, label in (('签字页-承租人姓名', '姓名/名称:'),
                                  ('签字页-承租人证件号码', '自然人身份证件号码/法人执照号码:')):
            out[result_key] = self.get_key_value(key=label)

        # Signature page: lessee e-signature
        out['签字页-承租人签章'] = self.get_signature_role_1()

        # Signature page: selling dealer
        out['签字页-销售经销商'] = self.get_key_value(key='销售经销商:')

    return self.init_result_1
def get_info_2(self):
    """Fill ``self.init_result_2`` with the fields of the mortgage contract.

    Nothing is extracted when ``self.pdf_info`` is empty; the result mapping
    is returned either way.

    Returns:
        dict: ``self.init_result_2``.
    """
    if len(self.pdf_info) > 0:
        out = self.init_result_2

        # Mortgage contract number, then the contract number in the body text
        out['合同编号'] = self.get_contract_no_dy()
        out['合同编号(正文)'] = self.get_contract_no_one()

        # Mortgagor name and ID number
        name, _id = self.get_dyr_name_id()
        out['抵押人姓名/名称'] = name
        out['抵押人证件号码'] = _id

        # VIN
        out['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')

        # Total rent and lease period, both located by geometry
        for label in ('租金总额', '融资租赁期限'):
            out[label] = self.get_key_value_position(key=label)

        # Signature page: printed name and e-signature of the mortgagor and of
        # the mortgagor's spouse, as (party, name label, band top, band bottom)
        signers = (
            ('抵押人', '抵押人姓名:', '抵押权人盖章', '抵押人配偶姓名:'),
            ('抵押人配偶', '抵押人配偶姓名:', '抵押人配偶姓名:', '日期'),
        )
        for party, name_label, upper, lower in signers:
            name = self.get_key_value(key=name_label)
            electronic_signature = self.get_electronic_signature(top=upper, bottom=lower)
            out[f'签字页-{party}姓名'] = name
            out[f'签字页-{party}签章'] = electronic_signature

    return self.init_result_2
src/common/electronic_hil_contract/hil_contract_ocr.py
0 → 100644
View file @
6432581
# -*- coding: utf-8 -*-
# @Author : lk
# @Email : 9428.al@gmail.com
# @Created Date : 2021-06-29 17:43:46
# @Last Modified : 2021-11-03 16:07:36
# @Description :
from
.get_char
import
Finder
def
predict
(
pdf_info
,
file_cls
):
"""Summary
Args:
pdf_info (TYPE): Description
file_cls (TYPE): file_cls = 0: 售后回租合同; file_cls = 1: 车辆处置协议; file_cls = 2: 车辆租赁抵押合同
Returns:
TYPE: Description
"""
# 0: 售后回租合同
pdf_info_0
=
[]
for
pno
in
pdf_info
:
for
block
in
pdf_info
[
f
'{pno}'
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'售后回租合同_'
in
text
:
pdf_info_0
.
append
(
pdf_info
[
pno
])
# 1: 车辆处置协议
pdf_info_1
=
[]
for
pno
in
pdf_info
:
for
block
in
pdf_info
[
f
'{pno}'
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'售后回租合同附件一'
in
text
:
pdf_info_1
.
append
(
pdf_info
[
pno
])
# 2: 车辆租赁抵押合同
pdf_info_2
=
[]
for
pno
in
pdf_info
:
for
block
in
pdf_info
[
f
'{pno}'
][
'blocks'
]:
if
block
[
'type'
]
!=
0
:
continue
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
if
'车辆租赁抵押合同_'
in
text
:
pdf_info_2
.
append
(
pdf_info
[
pno
])
is_clczxy
=
False
# 如果 pdf_info_1 == 4 页,则说明此时输入包含了车辆处置协议
if
len
(
pdf_info_1
)
==
4
and
file_cls
==
1
and
len
(
pdf_info_0
)
!=
0
:
is_clczxy
=
True
pdf_info
=
dict
()
for
pno
,
page_info
in
enumerate
(
pdf_info_1
):
pdf_info
[
str
(
pno
)]
=
page_info
f
=
Finder
(
pdf_info
)
if
file_cls
==
0
:
results
=
f
.
get_info
()
if
file_cls
==
1
:
# 提取信息 ———— 车辆处置协议
results
=
f
.
get_info_1
()
if
file_cls
==
2
:
# 提取信息 ———— 车辆租赁抵押合同
results
=
f
.
get_info_2
()
if
is_clczxy
==
True
:
for
key
in
results
:
if
results
[
key
][
'page'
]
is
not
None
:
results
[
key
][
'page'
]
=
str
(
int
(
results
[
key
][
'page'
])
+
6
)
for
key
in
results
:
if
results
[
key
][
'page'
]
is
not
None
:
results
[
key
][
'page'
]
=
'page_'
+
str
(
int
(
results
[
key
][
'page'
])
+
1
)
return
results
src/common/tools/mssql_script10.py
0 → 100644
View file @
6432581
import pyodbc

# DDL for the AFC contract table and its application_id index.
afc_sql = """
create table afc_contract
(
id bigint identity primary key,
application_id nvarchar(64) not null,
create_time datetime not null
);
create index afc_contract_application_id_index
on afc_contract (application_id);
"""

# DDL for the HIL contract table and its application_id index.
hil_sql = """
create table hil_contract
(
id bigint identity primary key,
application_id nvarchar(64) not null,
create_time datetime not null
);
create index hil_contract_application_id_index
on hil_contract (application_id);
"""

# NOTE(review): both connection strings name only the ODBC driver; server,
# database and credentials are presumably filled in before running - confirm.

# Create the HIL table. try/finally guarantees that the cursor and the
# connection are released even when the DDL fails (e.g. table already exists).
hil_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
try:
    hil_cursor = hil_cnxn.cursor()
    try:
        hil_cursor.execute(hil_sql)
    finally:
        hil_cursor.close()
finally:
    hil_cnxn.close()

# Create the AFC table, with the same cleanup guarantees.
afc_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
try:
    afc_cursor = afc_cnxn.cursor()
    try:
        afc_cursor.execute(afc_sql)
    finally:
        afc_cursor.close()
finally:
    afc_cnxn.close()
src/common/tools/pdf_to_img.py
View file @
6432581
import
os
import
json
import
cv2
import
shutil
import
fitz
...
...
@@ -36,6 +37,8 @@ class PDFHandler:
self
.
is_ebank
=
False
self
.
is_e_pdf
=
False
self
.
page_text_list
=
[]
self
.
pdf_info
=
{}
self
.
img_path_pno_list
=
[]
def
get_suffix
(
self
,
file_name
):
if
file_name
is
None
:
...
...
@@ -298,6 +301,17 @@ class PDFHandler:
self
.
is_e_pdf
=
True
# self.page_text_list = page_text_list
def e_contract_process(self):
    """Dump every page of the PDF at ``self.path`` as text JSON and as a PNG.

    For each page the parsed 'json' text extraction is stored in
    ``self.pdf_info`` under the 0-based page index (as a string), the page is
    rendered to the path returned by ``self.get_img_save_path`` and the pair
    ``(image path, 'page_<1-based index>')`` is appended to
    ``self.img_path_pno_list``.
    """
    with fitz.Document(self.path) as doc:
        for index in range(doc.pageCount):
            current = doc.loadPage(index)
            page_json = current.getText('json')
            self.pdf_info[str(index)] = json.loads(page_json)
            pixmap = current.getPixmap()
            png_path = self.get_img_save_path(current.number)
            page_label = f'page_{index + 1}'
            # The path is registered before the image is written.
            self.img_path_pno_list.append((png_path, page_label))
            pixmap.writePNG(png_path)
def
extract_image
(
self
,
max_img_count
=
None
):
self
.
img_path_list
=
[]
self
.
xref_set
=
set
()
...
...
src/settings/conf/prd.ini
View file @
6432581
...
...
@@ -14,3 +14,5 @@ DEALER_CODE = ocr_group
BASE_URL
=
https://li19dkocruat02vm.bmwgroup.net
DELAY_SECONDS
=
60
...
...
src/settings/conf/sit.ini
View file @
6432581
...
...
@@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/Dow
EDMS_UPLOAD_URL
=
https://edms-test.bmw.com/FH/FileHold/DocumentRepository/UploadHandler.ashx
DEALER_CODE
=
ocr_situ_group
BASE_URL
=
https://staging-bmw-ocr.situdata.com
\ No newline at end of file
BASE_URL
=
https://staging-bmw-ocr.situdata.com
DELAY_SECONDS
=
60
\ No newline at end of file
...
...
src/settings/conf/uat.ini
View file @
6432581
...
...
@@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/
EDMS_UPLOAD_URL
=
http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/UploadHandler.ashx
DEALER_CODE
=
ocr_situ_group
BASE_URL
=
https://li19dkocruat01vm.bmwgroup.net
\ No newline at end of file
BASE_URL
=
https://li19dkocruat01vm.bmwgroup.net
DELAY_SECONDS
=
60
\ No newline at end of file
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment