Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
83e1571a
authored
2024-09-25 18:37:11 +0800
by
冯轩
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/add_log_20240924' into feature/uat-tmp
2 parents
8c0cdbfb
7653f384
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
27 additions
and
8 deletions
src/apps/doc/management/commands/ocr_process.py
src/common/fsm_econtract/retriever.py
src/common/tools/pdf_to_img.py
src/apps/doc/management/commands/ocr_process.py
View file @
83e1571
...
...
@@ -1015,6 +1015,13 @@ class Command(BaseCommand, LoggerMixin):
tmp_res
=
page_info_dict
.
get
(
str
(
pno2
),
{})
.
get
(
key1
,
''
)
img_pno
=
pno1
res
[
key
]
=
tmp_res
# 添加处理,
# [售后回租合同] - 如果 key 是 "承租人签字", 且内容中包含 签署日期:XXXX, 则将签署日期去除
# [车辆租赁抵押合同] - 如果 key 是 "抵押人签字", 且内容中包含 签署日期:XXXX, 则将签署日期去除
if
key
==
'承租人签字'
and
'签署日期'
in
tmp_res
:
res
[
key
]
=
tmp_res
.
split
(
'签署日期'
)[
0
]
if
key
==
"抵押人签字"
and
"签署日期"
in
tmp_res
:
res
[
key
]
=
tmp_res
.
split
(
"签署日期"
)[
0
]
res
.
setdefault
(
consts
.
IMG_PATH_KEY
,
dict
())[
key
]
=
page_info_dict
.
get
(
str
(
img_pno
),
{})
.
get
(
consts
.
IMG_PATH_KEY
,
''
)
else
:
...
...
@@ -1668,16 +1675,20 @@ class Command(BaseCommand, LoggerMixin):
path_split
=
img_path
.
split
(
'/'
)
task_str
=
consts
.
SPLIT_STR
.
join
((
path_split
[
-
5
],
path_split
[
-
3
]))
self
.
online_log
.
info
(
'{0} [before lock] [img={1}] '
.
format
(
self
.
log_base
,
img_path
))
with
lock
:
self
.
online_log
.
info
(
'{0} [get lock] [img={1}] '
.
format
(
self
.
log_base
,
img_path
))
doc_res_dict
=
res_dict
.
setdefault
(
task_str
,
{})
doc_res_dict
[
img_path
]
=
ocr_1_res
res_dict
[
task_str
]
=
doc_res_dict
todo_count
=
todo_count_dict
.
get
(
task_str
)
if
todo_count
==
1
:
finish_queue
.
put
(
task_str
)
self
.
online_log
.
info
(
'{0} [ocr_1 to finish_queue] [img={1}] '
.
format
(
self
.
log_base
,
img_path
))
del
todo_count_dict
[
task_str
]
else
:
todo_count_dict
[
task_str
]
=
todo_count
-
1
self
.
online_log
.
info
(
'{0} [after lock] [img={1}] '
.
format
(
self
.
log_base
,
img_path
))
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (store ocr res)] [img_path={1}] [error={2}]'
.
format
(
self
.
log_base
,
img_path
,
traceback
.
format_exc
()))
...
...
src/common/fsm_econtract/retriever.py
View file @
83e1571
...
...
@@ -19,10 +19,18 @@ class HMHRetriever:
def
get_target_fields
(
self
,
pdf_text_list
):
result
=
dict
()
is_find_name_id_company
,
is_find_application_no
,
is_find_name_date
=
False
,
False
,
False
for
bbox
,
text
in
pdf_text_list
.
pop
(
str
(
0
),
[]):
# print(text)
# for bbox, text in pdf_text_list.pop(str(0), []):
pdf_text_items
=
pdf_text_list
.
pop
(
str
(
0
),
[])
for
i
in
range
(
len
(
pdf_text_items
)):
bbox
,
text
=
pdf_text_items
[
i
]
combined_text
=
text
if
i
<
len
(
pdf_text_items
)
-
1
:
combined_text
+=
pdf_text_items
[
i
+
1
][
1
]
if
not
is_find_name_id_company
:
name_id_company_list
=
re
.
findall
(
r'姓名(.*)证件号码(.*)与(.*公司)'
,
text
)
# name_id_company_list = re.findall(r'姓名(.*?)证件号码(.*?)与(.*?公司|.*)', combined_text)
name_id_company_list
=
re
.
findall
(
r'姓名(.*)证件号码(.*)与(.*公司)'
,
combined_text
)
for
name_id_company_tuple
in
name_id_company_list
:
if
len
(
name_id_company_tuple
)
==
3
:
result
[
self
.
search_fields_list
[
0
][
0
]]
=
{
...
...
@@ -40,7 +48,7 @@ class HMHRetriever:
is_find_name_id_company
=
True
break
if
not
is_find_application_no
:
application_no_list
=
re
.
findall
(
r'合同编号.*(CH-B\d*-\d*).*'
,
text
)
application_no_list
=
re
.
findall
(
r'合同编号.*(CH-B\d*-\d*).*'
,
combined_
text
)
if
len
(
application_no_list
)
==
1
:
result
[
self
.
search_fields_list
[
3
][
0
]]
=
{
self
.
words_str
:
application_no_list
[
0
],
...
...
@@ -48,7 +56,7 @@ class HMHRetriever:
}
is_find_application_no
=
True
if
not
is_find_name_date
:
name_date_list
=
re
.
findall
(
r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})'
,
text
)
name_date_list
=
re
.
findall
(
r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})'
,
combined_
text
)
for
name_date_tuple
in
name_date_list
:
if
len
(
name_date_tuple
)
==
2
:
result
[
self
.
search_fields_list
[
4
][
0
]]
=
{
...
...
src/common/tools/pdf_to_img.py
View file @
83e1571
...
...
@@ -10,9 +10,9 @@ from io import BytesIO
from
unicodedata
import
normalize
# 页面保存为png图片参数
ZOOM_X_1
=
ZOOM_Y_1
=
3
.0
ZOOM_X_2
=
ZOOM_Y_2
=
5
.0
ZOOM_X_3
=
ZOOM_Y_3
=
7
.0
ZOOM_X_1
=
ZOOM_Y_1
=
1
.0
ZOOM_X_2
=
ZOOM_Y_2
=
2
.0
ZOOM_X_3
=
ZOOM_Y_3
=
3
.0
trans_1
=
fitz
.
Matrix
(
ZOOM_X_1
,
ZOOM_X_1
)
.
preRotate
(
0
)
# zoom factor 1 in each dimension
trans_2
=
fitz
.
Matrix
(
ZOOM_X_2
,
ZOOM_X_2
)
.
preRotate
(
0
)
# zoom factor 2 in each dimension
trans_3
=
fitz
.
Matrix
(
ZOOM_X_3
,
ZOOM_X_3
)
.
preRotate
(
0
)
# zoom factor 3 in each dimension
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment