Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
799a0e94
authored
2020-12-15 16:04:22 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add pdf failed retry
1 parent
85b24dac
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
34 deletions
src/apps/doc/management/commands/ocr_process.py
src/common/tools/pdf_to_img.py
src/apps/doc/management/commands/ocr_process.py
View file @
799a0e9
...
...
@@ -101,20 +101,20 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
task_str
,
is_priority
))
return
doc
,
business_type
,
task_str
def
pdf_download
(
self
,
doc
,
pdf_path
):
if
not
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
)
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [edms download failed] [times={1}] [pdf_path={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
times
,
pdf_path
,
traceback
.
format_exc
()))
edms_exc
=
str
(
e
)
else
:
break
else
:
raise
EDMSException
(
edms_exc
)
self
.
cronjob_log
.
info
(
'{0} [edms download success] [pdf_path={1}]'
.
format
(
self
.
log_base
,
pdf_path
))
#
def pdf_download(self, doc, pdf_path):
#
if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
#
for times in range(consts.RETRY_TIMES):
#
try:
#
self.edms.download(pdf_path, doc.metadata_version_id)
#
except Exception as e:
#
self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
#
'[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
#
edms_exc = str(e)
#
else:
#
break
#
else:
#
raise EDMSException(edms_exc)
#
self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
def
bs_process
(
self
,
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
classify
,
res_list
,
pno
,
ino
,
part_idx
):
sheets
=
ocr_data
.
get
(
'data'
,
[])
...
...
@@ -392,19 +392,35 @@ class Command(BaseCommand, LoggerMixin):
# 2. 从EDMS获取PDF文件
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
self
.
pdf_download
(
doc
,
pdf_path
)
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
)
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
if
not
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
)
self
.
cronjob_log
.
info
(
'{0} [edms download success] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
# 3.PDF文件提取图片
self
.
cronjob_log
.
info
(
'{0} [pdf to img start] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
self
.
cronjob_log
.
info
(
'{0} [pdf to img start] [task={1}] [times={2}]'
.
format
(
self
.
log_base
,
task_str
,
times
))
start_time
=
time
.
time
()
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
)
pdf_handler
.
extract_image
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
info
(
'{0} [pdf to img end] [task={1}] [spend_time={2}]'
.
format
(
self
.
log_base
,
task_str
,
speed_time
))
self
.
cronjob_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
))
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
traceback
.
format_exc
()))
else
:
break
else
:
raise
Exception
(
'download or pdf to img failed'
)
img_count
=
len
(
pdf_handler
.
img_path_list
)
if
img_count
==
0
:
...
...
@@ -419,25 +435,25 @@ class Command(BaseCommand, LoggerMixin):
self
.
cronjob_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
img_queue
.
put
(
img_path
)
except
EDMSException
as
e
:
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
warn
(
'{0} [process failed (edms download)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (db save 1)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
error_list
.
append
(
1
)
return
#
except EDMSException as e:
#
try:
#
doc.status = DocStatus.PROCESS_FAILED.value
#
doc.save()
#
self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
#
self.log_base, task_str, traceback.format_exc()))
#
except Exception as e:
#
self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
#
self.log_base, traceback.format_exc()))
#
error_list.append(1)
#
return
except
Exception
as
e
:
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
warn
(
'{0} [process failed (pdf
to img)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
self
.
cronjob_log
.
warn
(
'{0} [process failed (pdf
_2_img_2_queue)] [task={1}] '
'[error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (db save
2
)] [error={1}]'
.
format
(
self
.
cronjob_log
.
error
(
'{0} [process error (db save
1
)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
error_list
.
append
(
1
)
return
...
...
src/common/tools/pdf_to_img.py
View file @
799a0e9
...
...
@@ -187,6 +187,8 @@ class PDFHandler:
self
.
page_to_png
(
page
)
def
extract_image
(
self
):
self
.
img_path_list
=
[]
self
.
xref_set
=
set
()
os
.
makedirs
(
self
.
img_dir_path
,
exist_ok
=
True
)
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
for
pno
in
range
(
pdf
.
pageCount
):
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment