Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
c8bacd75
authored
2025-05-16 15:35:02 +0800
by
冯轩
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge remote-tracking branch 'origin/feature_add_insurance_sec_page'
2 parents
83c49cee
d62f4b3c
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
39 additions
and
1 deletion
src/apps/doc/management/commands/ocr_process.py
src/common/fsm_econtract/retriever.py
src/apps/doc/management/commands/ocr_process.py
View file @
c8bacd7
...
...
@@ -2055,6 +2055,43 @@ class Command(BaseCommand, LoggerMixin):
# src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
# wb.save(src_excel_path)
# need_follow indicates whether the two characters "关注" should be added to the file name when uploading to EDMS
# Work around the stamp not being recognised on the follow-up pages of an insurance policy.
# NOTE(review): indentation was lost in this rendering of the diff, so the exact
# nesting of the statements below cannot be confirmed from here.
if
len
(
license_summary
)
>
0
:
# Only applies when license_summary has an insurance-policy entry
if
consts
.
INSURANCE_CLASSIFY
in
license_summary
.
keys
():
# Fetch the list stored under INSURANCE_CLASSIFY (empty list if absent)
license_list_tmp
=
license_summary
.
get
(
consts
.
INSURANCE_CLASSIFY
,
[])
if
len
(
license_list_tmp
)
>
0
:
# Stamp presence per entry: 1 when the entry is a dict whose "保单章" value is the string "有", otherwise 0
stamp_flag_list
=
[
0
]
*
len
(
license_list_tmp
)
for
license_list_tmp_idx
,
license_dict_tmp
in
enumerate
(
license_list_tmp
):
if
isinstance
(
license_dict_tmp
,
dict
):
if
"保单章"
in
license_dict_tmp
.
keys
():
if
license_dict_tmp
.
get
(
"保单章"
)
is
not
None
:
if
isinstance
(
license_dict_tmp
.
get
(
"保单章"
),
str
)
and
license_dict_tmp
.
get
(
"保单章"
)
==
"有"
:
stamp_flag_list
[
license_list_tmp_idx
]
=
1
# Sum of stamp_flag_list (number of entries flagged as stamped)
stamp_flag_sum
=
sum
(
stamp_flag_list
)
# Check the [被保险人姓名] value of each element of license_list_tmp; if it is empty, that element is not the first page.
# An entry is flagged as a first page when the value is a non-empty str that does not contain "保险".
license_first_page_list
=
[
0
]
*
len
(
license_list_tmp
)
for
license_list_tmp_idx
,
license_dict_tmp
in
enumerate
(
license_list_tmp
):
if
isinstance
(
license_dict_tmp
,
dict
):
if
"被保险人姓名"
in
license_dict_tmp
.
keys
():
if
license_dict_tmp
.
get
(
"被保险人姓名"
)
is
not
None
:
if
isinstance
(
license_dict_tmp
.
get
(
"被保险人姓名"
),
str
)
and
license_dict_tmp
.
get
(
"被保险人姓名"
)
!=
""
and
"保险"
not
in
license_dict_tmp
.
get
(
"被保险人姓名"
):
license_first_page_list
[
license_list_tmp_idx
]
=
1
# NOTE(review): presumably stops the scan at the first qualifying entry; the nesting level of this break is not visible here - confirm
break
# Collect the indices of license_first_page_list whose value is 1
license_first_page_idx
=
[
i
for
i
,
x
in
enumerate
(
license_first_page_list
)
if
x
==
1
]
# Replace the value stored under INSURANCE_CLASSIFY in license_summary with the first-page entry, keeping only that one page
if
len
(
license_first_page_idx
)
>
0
:
license_summary
[
consts
.
INSURANCE_CLASSIFY
]
=
[
license_list_tmp
[
license_first_page_idx
[
0
]]]
# If the sum is > 0 a stamp was found on some page, whichever page it was; mark the kept entry as stamped.
# NOTE(review): this writes into element [0] as a dict; if no first page was found above, the list was not
# replaced and element [0] may not be a dict - confirm whether this branch is nested under the check above.
if
stamp_flag_sum
>
0
:
license_summary
[
consts
.
INSURANCE_CLASSIFY
][
0
][
"保单章"
]
=
"有"
count_list
,
need_follow
=
wb
.
rebuild
(
merged_bs_summary
,
license_summary
,
res_list
,
doc
.
document_scheme
,
contract_result
,
doc
.
metadata
,
financial_statement_dict
,
financial_explanation_dict
,
down_payment_dict
)
wb
.
save
(
excel_path
)
...
...
src/common/fsm_econtract/retriever.py
View file @
c8bacd7
...
...
@@ -209,7 +209,8 @@ class Retriever:
# Handles pno_str == '-1': sets is_last_pno and derives pno_str from the largest
# int-converted key of pdf_text_list (as a string).
# NOTE(review): indentation was lost in this rendering, so which statements sit
# inside the if cannot be confirmed from here.
if
pno_str
==
'-1'
:
is_last_pno
=
True
pno_int_list
=
[
int
(
pno_str
)
for
pno_str
in
pdf_text_list
.
keys
()]
# NOTE(review): this unguarded assignment and the len()-guarded one after it look like the
# removed (-) and added (+) sides of the hunk (header reads -209,7 +209,8); the diff markers
# are not visible here - confirm. max() raises ValueError on an empty list, which the guard avoids.
pno_str
=
str
(
max
(
pno_int_list
))
if
len
(
pno_int_list
)
>
0
:
pno_str
=
str
(
max
(
pno_int_list
))
# Search for keywords
key_text_info
=
dict
()
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment