Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
6934c592
authored
4 years ago
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
fix bug
1 parent
b34cd942
master
...
CHINARPA-4562
OCR-recognition-for-FSM-related-documents
feature/202506-monixiadan
feature/4058
feature/CHINAPRA-4447
feature/CHINARPA-3290-FSM-AUTO
feature/CHINARPA-3443
feature/CHINARPA-3523
feature/CHINARPA-3528
feature/CHINARPA-3529
feature/CHINARPA-3577
feature/CHINARPA-3786
feature/CHINARPA-3964
feature/CHINARPA-4137
feature/CHINARPA-4277
feature/CHINARPA-4302/all-pass
feature/CHINARPA-4341
feature/CHINARPA-4357
feature/CHINARPA-4358
feature/CHINARPA-4395
feature/CHINARPA-4495
feature/CHINARPA-4546
feature/CHINARPA-4623
feature/CHINARPA-4659
feature/CHINARPA-4660
feature/CHINARPA-4731
feature/CHINARPA-4846
feature/CHINARPA-4941
feature/CHINARPA-4942
feature/CHINARPA-4944
feature/CHINARPA-4962
feature/CHINARPA-5015
feature/CHINARPA-5075
feature/CHINARPA-5092
feature/CHINARPA-5117
feature/CHINARPA-5118
feature/CHINARPA-5131-5234
feature/CHINARPA-5153
feature/CHINARPA-5155
feature/CHINARPA-5296
feature/CHINARPA-5504
feature/CHINARPA-5619
feature/CHINARPA-5620-dzfp
feature/CHINARPA_5015_SQL
feature/KWOM_July
feature/SE
feature/SE2
feature/SE3
feature/add_log_20240924
feature/add_try_except
feature/admin
feature/admin2
feature/auto-flag
feature/e-bank
feature/enhancement-file-name-change
feature/f3
feature/filter-file
feature/fix_label_40_dydjhmh
feature/fsm-contract
feature/fsm-full
feature/hotfix_insurance
feature/mssql-encrypt
feature/new-pwd
feature/pdftoimg
feature/pentest
feature/pres
feature/pres-3034
feature/qrs
feature/report
feature/report2
feature/rpa
feature/sc
feature/seOct
feature/token
feature/uat-new
feature/uat-tmp
feature/uat-tmp-cms-yace
feature/uat-tmp-cy
feature/uat-tmp-wblog
feature/upgrade_cut_img
feature/weixin-bs
feature/weixin-bs-2
feature/zfb
feature/zip
feature_add_down_payment
feature_add_income_keywords_cy
feature_add_insurance_sec_page
fix/1118上线问题反馈
fix/1227
fix/2024-05-pen-test
fix/20240424
fix/hil_excel_sql
fix/id-card
fix/new_hil_contract
fix/report_ca
hotfix/2025-02
hotfix/2025-04
hotfix/2025-06
master-0117
ocr-Pre-Settlement
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
74 additions
and
64 deletions
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/management/commands/folder_ocr_process.py
View file @
6934c59
...
...
@@ -6,6 +6,7 @@ import base64
import
signal
import
requests
import
traceback
from
datetime
import
datetime
from
django.core.management
import
BaseCommand
from
multiprocessing
import
Process
...
...
@@ -57,7 +58,7 @@ class Command(BaseCommand, LoggerMixin):
@staticmethod
def
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
):
time_stamp
=
int
(
time
.
time
()
)
time_stamp
=
datetime
.
now
()
.
strftime
(
'
%
Y-
%
m-
%
d_
%
H:
%
M:
%
S'
)
new_name
=
'{0}_{1}'
.
format
(
time_stamp
,
name
)
img_save_path
=
os
.
path
.
join
(
img_output_dir
,
new_name
)
pdf_save_path
=
os
.
path
.
join
(
pdf_output_dir
,
new_name
)
...
...
This diff is collapsed.
Click to expand it.
src/apps/doc/management/commands/ocr_process.py
View file @
6934c59
...
...
@@ -365,48 +365,52 @@ class Command(BaseCommand, LoggerMixin):
def
pdf_2_img_2_queue
(
self
,
img_queue
,
todo_count_dict
,
lock
):
while
self
.
switch
:
# 1. 从队列获取文件信息
doc
,
business_type
,
task_str
=
self
.
get_doc_info
()
# 队列为空时的处理
if
doc
is
None
:
time
.
sleep
(
self
.
sleep_time_doc_get
)
continue
try
:
# 2. 从EDMS获取PDF文件
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
self
.
pdf_download
(
doc
,
pdf_path
)
# 3.PDF文件提取图片
self
.
cronjob_log
.
info
(
'{0} [pdf to img start] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
start_time
=
time
.
time
()
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
)
pdf_handler
.
extract_image
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
info
(
'{0} [pdf to img end] [task={1}] [spend_time={2}]'
.
format
(
self
.
log_base
,
task_str
,
speed_time
))
with
lock
:
todo_count_dict
[
task_str
]
=
len
(
pdf_handler
.
img_path_list
)
for
img_path
in
pdf_handler
.
img_path_list
:
while
img_queue
.
full
():
self
.
cronjob_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
img_queue
.
put
(
img_path
)
except
EDMSException
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed (edms download)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
# 1. 从队列获取文件信息
doc
,
business_type
,
task_str
=
self
.
get_doc_info
()
# 队列为空时的处理
if
doc
is
None
:
time
.
sleep
(
self
.
sleep_time_doc_get
)
continue
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed (pdf to img)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
self
.
cronjob_log
.
error
(
'{0} [process failed (get doc into)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
else
:
try
:
# 2. 从EDMS获取PDF文件
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
self
.
pdf_download
(
doc
,
pdf_path
)
# 3.PDF文件提取图片
self
.
cronjob_log
.
info
(
'{0} [pdf to img start] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
start_time
=
time
.
time
()
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
)
pdf_handler
.
extract_image
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
info
(
'{0} [pdf to img end] [task={1}] [spend_time={2}]'
.
format
(
self
.
log_base
,
task_str
,
speed_time
))
with
lock
:
todo_count_dict
[
task_str
]
=
len
(
pdf_handler
.
img_path_list
)
for
img_path
in
pdf_handler
.
img_path_list
:
while
img_queue
.
full
():
self
.
cronjob_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
img_queue
.
put
(
img_path
)
except
EDMSException
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed (edms download)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed (pdf to img)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
def
img_2_ocr_1
(
self
,
img_queue
,
todo_count_dict
,
res_dict
,
finish_queue
,
lock
,
url
):
while
True
:
...
...
@@ -418,16 +422,17 @@ class Command(BaseCommand, LoggerMixin):
continue
else
:
self
.
cronjob_log
.
info
(
'{0} [img_2_ocr_1] [get img] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
with
open
(
img_path
,
'rb'
)
as
f
:
base64_data
=
base64
.
b64encode
(
f
.
read
())
# 获取解码后的base64值
file_data
=
base64_data
.
decode
()
json_data_1
=
{
"file"
:
file_data
}
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
with
open
(
img_path
,
'rb'
)
as
f
:
base64_data
=
base64
.
b64encode
(
f
.
read
())
# 获取解码后的base64值
file_data
=
base64_data
.
decode
()
json_data_1
=
{
"file"
:
file_data
}
start_time
=
time
.
time
()
ocr_1_response
=
requests
.
post
(
url
,
json
=
json_data_1
)
if
ocr_1_response
.
status_code
!=
200
:
...
...
@@ -447,22 +452,26 @@ class Command(BaseCommand, LoggerMixin):
self
.
cronjob_log
.
warn
(
'{0} [ocr_1 failed] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
# continue
del
json_data_1
# /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg
# AFC_2
path_split
=
img_path
.
split
(
'/'
)
task_str
=
consts
.
SPLIT_STR
.
join
((
path_split
[
-
5
],
path_split
[
-
3
]))
with
lock
:
doc_res_dict
=
res_dict
.
setdefault
(
task_str
,
{})
doc_res_dict
[
img_path
]
=
ocr_1_res
res_dict
[
task_str
]
=
doc_res_dict
todo_count
=
todo_count_dict
.
get
(
task_str
)
if
todo_count
==
1
:
finish_queue
.
put
(
task_str
)
del
todo_count_dict
[
task_str
]
else
:
todo_count_dict
[
task_str
]
=
todo_count
-
1
try
:
del
json_data_1
# /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg
# AFC_2
path_split
=
img_path
.
split
(
'/'
)
task_str
=
consts
.
SPLIT_STR
.
join
((
path_split
[
-
5
],
path_split
[
-
3
]))
with
lock
:
doc_res_dict
=
res_dict
.
setdefault
(
task_str
,
{})
doc_res_dict
[
img_path
]
=
ocr_1_res
res_dict
[
task_str
]
=
doc_res_dict
todo_count
=
todo_count_dict
.
get
(
task_str
)
if
todo_count
==
1
:
finish_queue
.
put
(
task_str
)
del
todo_count_dict
[
task_str
]
else
:
todo_count_dict
[
task_str
]
=
todo_count
-
1
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process failed (store ocr res)] [img_path={1}] [error={2}]'
.
format
(
self
.
log_base
,
img_path
,
traceback
.
format_exc
()))
def
res_2_wb
(
self
,
res_dict
,
finish_queue
,
lock
):
while
True
:
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment