Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
1aa502ce
authored
2024-09-10 14:52:23 +0800
by
chenyao
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
修改抵押登记豁免函的字段处理
1 parent
a0d6443d
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
5 deletions
src/common/fsm_econtract/retriever.py
src/common/fsm_econtract/retriever.py
View file @
1aa502c
...
...
@@ -19,10 +19,18 @@ class HMHRetriever:
def
get_target_fields
(
self
,
pdf_text_list
):
result
=
dict
()
is_find_name_id_company
,
is_find_application_no
,
is_find_name_date
=
False
,
False
,
False
for
bbox
,
text
in
pdf_text_list
.
pop
(
str
(
0
),
[]):
# print(text)
# for bbox, text in pdf_text_list.pop(str(0), []):
pdf_text_items
=
pdf_text_list
.
pop
(
str
(
0
),
[])
for
i
in
range
(
len
(
pdf_text_items
)):
bbox
,
text
=
pdf_text_items
[
i
]
combined_text
=
text
if
i
<
len
(
pdf_text_items
)
-
1
:
combined_text
+=
pdf_text_items
[
i
+
1
][
1
]
if
not
is_find_name_id_company
:
name_id_company_list
=
re
.
findall
(
r'姓名(.*)证件号码(.*)与(.*公司)'
,
text
)
# name_id_company_list = re.findall(r'姓名(.*?)证件号码(.*?)与(.*?公司|.*)', combined_text)
name_id_company_list
=
re
.
findall
(
r'姓名(.*)证件号码(.*)与(.*公司)'
,
combined_text
)
for
name_id_company_tuple
in
name_id_company_list
:
if
len
(
name_id_company_tuple
)
==
3
:
result
[
self
.
search_fields_list
[
0
][
0
]]
=
{
...
...
@@ -40,7 +48,7 @@ class HMHRetriever:
is_find_name_id_company
=
True
break
if
not
is_find_application_no
:
application_no_list
=
re
.
findall
(
r'合同编号.*(CH-B\d*-\d*).*'
,
text
)
application_no_list
=
re
.
findall
(
r'合同编号.*(CH-B\d*-\d*).*'
,
combined_
text
)
if
len
(
application_no_list
)
==
1
:
result
[
self
.
search_fields_list
[
3
][
0
]]
=
{
self
.
words_str
:
application_no_list
[
0
],
...
...
@@ -48,7 +56,7 @@ class HMHRetriever:
}
is_find_application_no
=
True
if
not
is_find_name_date
:
name_date_list
=
re
.
findall
(
r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})'
,
text
)
name_date_list
=
re
.
findall
(
r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})'
,
combined_
text
)
for
name_date_tuple
in
name_date_list
:
if
len
(
name_date_tuple
)
==
2
:
result
[
self
.
search_fields_list
[
4
][
0
]]
=
{
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment