Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
eb259387
authored
2024-10-12 15:01:17 +0800
by
冯轩
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/CHINARPA-4962' into feature/uat-tmp
2 parents
07007f09
2be87904
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
28 additions
and
12 deletions
src/apps/doc/management/commands/ocr_process.py
src/celery_compare/tasks.py
src/common/tools/pdf_to_img.py
src/settings/conf/prd.ini
src/settings/conf/sit.ini
src/settings/conf/uat.ini
src/apps/doc/management/commands/ocr_process.py
View file @
eb25938
...
...
@@ -1018,9 +1018,9 @@ class Command(BaseCommand, LoggerMixin):
# 添加处理,
# [售后回租合同] - 如果 key 是 "承租人签字", 且内容中包含 签署日期:XXXX, 则将签署日期去除
# [车辆租赁抵押合同] - 如果 key 是 ""
if
key
==
'承租人签字'
and
'签署日期'
in
tmp_res
:
if
key
==
'承租人签字'
and
tmp_res
is
not
None
and
'签署日期'
in
tmp_res
:
res
[
key
]
=
tmp_res
.
split
(
'签署日期'
)[
0
]
if
key
==
"抵押人签字"
and
"签署日期"
in
tmp_res
:
if
key
==
"抵押人签字"
and
tmp_res
is
not
None
and
"签署日期"
in
tmp_res
:
res
[
key
]
=
tmp_res
.
split
(
"签署日期"
)[
0
]
res
.
setdefault
(
consts
.
IMG_PATH_KEY
,
dict
())[
key
]
=
page_info_dict
.
get
(
str
(
img_pno
),
{})
.
get
(
consts
.
IMG_PATH_KEY
,
''
)
...
...
@@ -1624,7 +1624,7 @@ class Command(BaseCommand, LoggerMixin):
try
:
channel
,
img_path
,
text_list
=
img_queue
.
get
(
block
=
False
)
except
Exception
as
e
:
# self.online_log.info('{0} [img_2_ocr_1] [queue empty]'.format(self.log_base))
self
.
online_log
.
info
(
'{0} [img_2_ocr_1] [queue empty]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_get
)
continue
else
:
...
...
@@ -1653,6 +1653,7 @@ class Command(BaseCommand, LoggerMixin):
'[error={4}]'
.
format
(
self
.
log_base
,
times
,
url
,
img_path
,
traceback
.
format_exc
()))
else
:
self
.
online_log
.
info
(
'{0} [ocr_1 start] [img={1}] [url={2}]'
.
format
(
self
.
log_base
,
img_path
,
url
))
ocr_1_res
=
ocr_1_response
.
json
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
...
...
@@ -1699,8 +1700,9 @@ class Command(BaseCommand, LoggerMixin):
self
.
online_log
.
info
(
'{0} [res_2_wb] [get task] [queue running] [finish_queue_size={1}]'
.
format
(
self
.
log_base
,
finish_queue
.
qsize
()))
while
len
(
error_list
)
==
0
or
not
img_queue
.
empty
()
or
not
finish_queue
.
empty
():
try
:
self
.
online_log
.
info
(
'{0} [res_2_wb] [finish_queue.get1] [finish_queue_size={1}] [img_queue_size={2}]'
.
format
(
self
.
log_base
,
finish_queue
.
qsize
(),
img_queue
.
qsize
()))
task_str
=
finish_queue
.
get
(
block
=
False
)
self
.
online_log
.
info
(
'{0} [res_2_wb] [finish_queue.get]'
.
format
(
self
.
log_base
))
self
.
online_log
.
info
(
'{0} [res_2_wb] [finish_queue.get2]'
.
format
(
self
.
log_base
))
except
Exception
as
e
:
self
.
online_log
.
info
(
'{0} [res_2_wb] [queue empty]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_task_get
)
...
...
@@ -2463,6 +2465,7 @@ class Command(BaseCommand, LoggerMixin):
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (pdf & img remove)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
self
.
online_log
.
info
(
'{0} [res_2_wb after while] [len(error_list)={1}] [img_queue={2}] [finish_queue={3}]'
.
format
(
self
.
log_base
,
len
(
error_list
),
img_queue
.
empty
(),
finish_queue
.
empty
()))
def
handle
(
self
,
*
args
,
**
kwargs
):
db
.
close_old_connections
()
...
...
src/celery_compare/tasks.py
View file @
eb25938
...
...
@@ -2458,7 +2458,7 @@ def se_bs_compare(license_en, ocr_res_dict, strip_list, is_auto, aa_type):
return
result_field_list
,
field_img_path_dict
def
se_compare_license
(
license_en
,
ocr_res_dict
,
field_list
):
def
se_compare_license
(
license_en
,
ocr_res_dict
,
field_list
,
is_auto
):
ocr_field
,
compare_logic
,
special_expiry_date
=
consts
.
SE_COMPARE_FIELD
[
license_en
]
is_find
=
False
...
...
@@ -2513,6 +2513,14 @@ def se_compare_license(license_en, ocr_res_dict, field_list):
ocr_res_list
[
res_idx
]
.
get
(
consts
.
LOWER_AMOUNT_FIELD
,
''
),
ocr_res_list
[
res_idx
]
.
get
(
consts
.
UPPER_AMOUNT_FIELD
,
''
),
)
# auto 保单 保险费合计 ocr结果需要加上一个基数,再与cms结果做比对
elif
is_auto
and
ocr_field
==
consts
.
BD_FIELD
and
name
==
consts
.
SE_BD_FIELD
[
10
]:
ocr_str
=
ocr_res_list
[
res_idx
]
.
get
(
compare_logic
[
name
][
0
])
compare_log
.
info
(
'{0} [bd_4962_price] [ori ocr_str:{1}] '
.
format
(
log_base
,
ocr_str
))
add_price
=
conf
.
BD_PRICE
compare_log
.
info
(
'{0} [bd_4962_price] [add_price:{1}] '
.
format
(
log_base
,
add_price
))
ocr_str
=
float
(
ocr_str
)
+
float
(
add_price
)
compare_log
.
info
(
'{0} [bd_4962_price] [final ocr_str:{1}] '
.
format
(
log_base
,
ocr_str
))
else
:
ocr_str
=
ocr_res_list
[
res_idx
]
.
get
(
compare_logic
[
name
][
0
])
...
...
@@ -3287,7 +3295,7 @@ def se_compare_process(compare_info, ocr_res_dict, is_gsyh, is_auto, id_res_list
license_en
,
id_res_list
,
strip_list
,
is_auto
)
else
:
result_field_list
,
no_ocr_result
,
field_img_path_dict
=
se_compare_license
(
license_en
,
ocr_res_dict
,
strip_list
)
license_en
,
ocr_res_dict
,
strip_list
,
is_auto
)
each_license_failed_count
=
0
for
name
,
value
,
result
,
ocr_str
,
img_path
,
error_type
,
cn_reason
in
result_field_list
:
if
license_en
not
in
consts
.
SKIP_CARD
or
not
no_ocr_result
:
...
...
@@ -3346,7 +3354,7 @@ def se_compare_process(compare_info, ocr_res_dict, is_gsyh, is_auto, id_res_list
elif
license_en
==
consts
.
FS_EN
:
result_field_list
,
field_img_path_dict
=
se_fs_compare
(
license_en
,
ocr_res_dict
,
strip_list
)
else
:
result_field_list
,
_
,
field_img_path_dict
=
se_compare_license
(
license_en
,
ocr_res_dict
,
strip_list
)
result_field_list
,
_
,
field_img_path_dict
=
se_compare_license
(
license_en
,
ocr_res_dict
,
strip_list
,
is_auto
)
each_license_failed_count
=
0
for
name
,
value
,
result
,
ocr_str
,
img_path
,
error_type
,
cn_reason
in
result_field_list
:
...
...
src/common/tools/pdf_to_img.py
View file @
eb25938
...
...
@@ -345,7 +345,7 @@ class PDFHandler:
# 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
if
page_to_png
:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
,
is_big_img
=
True
)
self
.
page_to_png
(
page
)
def
title_is_ebank
(
self
,
char
):
new_char
=
normalize
(
'NFKC'
,
char
)
...
...
@@ -450,7 +450,7 @@ class PDFHandler:
# 1.页面图片对象数目为0时,保存整个页面为png图片
if
self
.
is_e_pdf
or
self
.
is_ebank
or
len
(
il
)
==
0
:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
,
is_big_img
=
True
)
self
.
page_to_png
(
page
)
# 2.页面图片对象数目为1时:
# 小图(如电子账单的盖章):保存整个页面为png图片
# 大图:提取图片对象
...
...
@@ -459,13 +459,13 @@ class PDFHandler:
# 小图
if
width
<
WH_COUPLE_1
[
0
]
and
height
<
WH_COUPLE_1
[
1
]:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
,
is_big_img
=
True
)
self
.
page_to_png
(
page
)
# 大图
elif
width
>=
WH_COUPLE_6
[
0
]
or
height
>=
WH_COUPLE_6
[
1
]:
self
.
is_new_modify
=
1
is_big_img
=
(
width
<
WH_COUPLE_7
[
0
]
and
height
<
WH_COUPLE_7
[
1
])
# 防止图片过大
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
,
is_big_img
=
True
)
self
.
page_to_png
(
page
,
is_big_img
=
is_big_img
)
elif
xref
not
in
self
.
xref_set
:
self
.
extract_single_image
(
pdf
,
xref
,
smask
,
colorspace
,
pno
)
# 3.页面图片对象数目大于1时,特殊处理
...
...
@@ -480,7 +480,7 @@ class PDFHandler:
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
for
pno
in
range
(
pdf
.
pageCount
):
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
,
is_big_img
=
True
)
self
.
page_to_png
(
page
)
self
.
img_count
=
len
(
self
.
img_path_list
)
def
ebank_draw
(
self
):
...
...
src/settings/conf/prd.ini
View file @
eb25938
...
...
@@ -16,3 +16,4 @@ BASE_URL = https://sfocr-prod.bmwgroup.net
DELAY_SECONDS
=
60
BD_PRICE
=
950
\ No newline at end of file
...
...
src/settings/conf/sit.ini
View file @
eb25938
...
...
@@ -15,3 +15,5 @@ DEALER_CODE = ocr_situ_group
BASE_URL
=
https://staging-bmw-ocr.situdata.com
DELAY_SECONDS
=
60
BD_PRICE
=
950
\ No newline at end of file
...
...
src/settings/conf/uat.ini
View file @
eb25938
...
...
@@ -15,3 +15,5 @@ DEALER_CODE = ocr_situ_group
BASE_URL
=
https://sfocr-uat.bmwgroup.net
DELAY_SECONDS
=
60
BD_PRICE
=
950
\ No newline at end of file
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment