Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
1f46e609
authored
2020-11-19 11:20:11 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
fix flow
1 parent
55ba3382
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
64 additions
and
46 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/management/commands/ocr_process.py
View file @
1f46e60
...
...
@@ -55,8 +55,7 @@ class Command(BaseCommand, LoggerMixin):
def
signal_handler
(
self
,
sig
,
frame
):
self
.
switch
=
False
# 停止处理文件
@staticmethod
def
get_doc_object
(
task_str
):
def
get_doc_object
(
self
,
task_str
):
business_type
,
doc_id_str
=
task_str
.
split
(
consts
.
SPLIT_STR
)
doc_id
=
int
(
doc_id_str
)
doc_class
=
HILDoc
if
business_type
==
consts
.
HIL_PREFIX
else
AFCDoc
...
...
@@ -71,28 +70,30 @@ class Command(BaseCommand, LoggerMixin):
self
.
cronjob_log
.
info
(
'{0} [get_doc_info] [queue empty]'
.
format
(
self
.
log_base
))
return
None
,
None
,
None
self
.
cronjob_log
.
info
(
'{0} [get_doc_info success] [task={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
self
.
cronjob_log
.
info
(
'{0} [get_doc_info] [task={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
try
:
doc
,
business_type
=
self
.
get_doc_object
(
task_str
)
if
doc
is
None
:
self
.
cronjob_log
.
warn
(
'{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
return
None
,
None
,
None
elif
doc
.
status
!=
DocStatus
.
INIT
.
value
:
self
.
cronjob_log
.
warn
(
'{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
'[doc_status={3}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
,
doc
.
status
))
return
None
,
None
,
None
doc
.
status
=
DocStatus
.
PROCESSING
.
value
doc
.
start_time
=
timezone
.
now
()
doc
.
save
()
except
Exception
as
e
:
rh
.
enqueue
([
task_str
],
is_priority
)
self
.
cronjob_log
.
error
(
'{0} [process error (get doc info in)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
self
.
cronjob_log
.
error
(
'{0} [process error (get doc info in)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
raise
e
if
doc
is
None
:
self
.
cronjob_log
.
warn
(
'{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'
.
format
(
else
:
self
.
cronjob_log
.
info
(
'{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
return
None
,
None
,
None
elif
doc
.
status
!=
DocStatus
.
INIT
.
value
:
self
.
cronjob_log
.
warn
(
'{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
'[doc_status={3}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
,
doc
.
status
))
return
None
,
None
,
None
doc
.
status
=
DocStatus
.
PROCESSING
.
value
doc
.
start_time
=
timezone
.
now
()
doc
.
save
()
self
.
cronjob_log
.
info
(
'{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
return
doc
,
business_type
,
task_str
return
doc
,
business_type
,
task_str
def
pdf_download
(
self
,
doc
,
pdf_path
):
if
not
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
...
...
@@ -210,21 +211,18 @@ class Command(BaseCommand, LoggerMixin):
else
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_FAILED_2
))
@staticmethod
def
parse_img_path
(
img_path
):
def
parse_img_path
(
self
,
img_path
):
img_name
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
img_path
))
part_list
=
img_name
.
split
(
'_'
)
# page_7_img_11_0
return
int
(
part_list
[
1
])
+
1
,
int
(
part_list
[
3
])
+
1
@staticmethod
def
get_most
(
value_list
):
def
get_most
(
self
,
value_list
):
if
value_list
:
most_common
=
Counter
(
value_list
)
.
most_common
(
1
)
return
most_common
[
0
][
0
]
if
most_common
else
None
@staticmethod
def
date_format
(
date_str
,
format_str
):
def
date_format
(
self
,
date_str
,
format_str
):
try
:
date_res
=
datetime
.
strptime
(
date_str
,
format_str
)
.
date
()
except
Exception
as
e
:
...
...
@@ -402,23 +400,41 @@ class Command(BaseCommand, LoggerMixin):
self
.
cronjob_log
.
info
(
'{0} [pdf to img end] [task={1}] [spend_time={2}]'
.
format
(
self
.
log_base
,
task_str
,
speed_time
))
with
lock
:
todo_count_dict
[
task_str
]
=
len
(
pdf_handler
.
img_path_list
)
for
img_path
in
pdf_handler
.
img_path_list
:
while
img_queue
.
full
():
self
.
cronjob_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
img_queue
.
put
(
img_path
)
img_count
=
len
(
pdf_handler
.
img_path_list
)
if
img_count
==
0
:
self
.
cronjob_log
.
warn
(
'{0} [pdf to img failed (pdf img empty)] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
raise
Exception
(
'pdf img empty'
)
else
:
with
lock
:
todo_count_dict
[
task_str
]
=
img_count
for
img_path
in
pdf_handler
.
img_path_list
:
while
img_queue
.
full
():
self
.
cronjob_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
img_queue
.
put
(
img_path
)
except
EDMSException
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed (edms download)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
warn
(
'{0} [process failed (edms download)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (db save 1)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
error_list
.
append
(
1
)
return
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed (pdf to img)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
warn
(
'{0} [process failed (pdf to img)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (db save 2)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
error_list
.
append
(
1
)
return
def
img_2_ocr_1
(
self
,
img_queue
,
todo_count_dict
,
res_dict
,
finish_queue
,
lock
,
url
,
error_list
):
while
len
(
error_list
)
==
0
or
not
img_queue
.
empty
():
...
...
@@ -447,8 +463,9 @@ class Command(BaseCommand, LoggerMixin):
if
ocr_1_response
.
status_code
!=
200
:
raise
OCR1Exception
(
'ocr_1 status code: {0}'
.
format
(
ocr_1_response
.
status_code
))
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [ocr_1 failed] [times={1}] [img_path={2}] [error={3}]'
.
format
(
self
.
log_base
,
times
,
img_path
,
traceback
.
format_exc
()))
self
.
cronjob_log
.
warn
(
'{0} [ocr_1 failed] [times={1}] [url={2}] [img_path={3}] '
'[error={4}]'
.
format
(
self
.
log_base
,
times
,
url
,
img_path
,
traceback
.
format_exc
()))
else
:
ocr_1_res
=
ocr_1_response
.
json
()
end_time
=
time
.
time
()
...
...
@@ -458,7 +475,8 @@ class Command(BaseCommand, LoggerMixin):
break
else
:
ocr_1_res
=
{}
self
.
cronjob_log
.
warn
(
'{0} [ocr_1 failed] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
self
.
cronjob_log
.
warn
(
'{0} [ocr_1 failed] [img_path={1}] [url={2}]'
.
format
(
self
.
log_base
,
img_path
,
url
))
# continue
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (ocr fetch)] [img_path={1}] [error={2}]'
.
format
(
...
...
@@ -521,14 +539,14 @@ class Command(BaseCommand, LoggerMixin):
ocr_data_list
=
res
.
get
(
'data'
,
[])
if
not
isinstance
(
ocr_data_list
,
list
):
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_FAILED_3
))
self
.
cronjob_log
.
info
(
'{0} [ocr_1 res error] [img={1}]'
.
format
(
self
.
log_base
,
img_path
))
self
.
cronjob_log
.
warn
(
'{0} [ocr_1 res error] [img={1}]'
.
format
(
self
.
log_base
,
img_path
))
else
:
for
part_idx
,
ocr_data
in
enumerate
(
ocr_data_list
):
part_idx
=
part_idx
+
1
classify
=
ocr_data
.
get
(
'classify'
)
if
classify
is
None
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_FAILED_3
))
self
.
cronjob_log
.
info
(
'{0} [ocr_1 res error] [img={1}]'
.
format
(
self
.
cronjob_log
.
warn
(
'{0} [ocr_1 res error] [img={1}]'
.
format
(
self
.
log_base
,
img_path
))
continue
elif
classify
in
consts
.
OTHER_CLASSIFY_SET
:
# 其他类
...
...
@@ -624,7 +642,7 @@ class Command(BaseCommand, LoggerMixin):
doc
,
business_type
=
self
.
get_doc_object
(
task_str
)
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed (res to wb)] [task={1}] [error={2}]'
.
format
(
self
.
cronjob_log
.
warn
(
'{0} [process failed (res to wb)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (wb end)] [task={1}] [error={2}]'
.
format
(
...
...
@@ -673,7 +691,7 @@ class Command(BaseCommand, LoggerMixin):
if
hasattr
(
doc
,
field
):
setattr
(
doc
,
field
,
count
)
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed (edms upload)] [task={1}] [error={2}]'
.
format
(
self
.
cronjob_log
.
warn
(
'{0} [process failed (edms upload)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (edms upload)] [task={1}] [error={2}]'
.
format
(
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment