Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
e325cfc3
authored
2020-06-28 16:49:17 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
modify max sleep time
1 parent
d024de62
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
12 deletions
src/apps/doc/management/commands/doc_process.py
src/apps/doc/management/commands/doc_process.py
View file @
e325cfc
...
...
@@ -184,9 +184,9 @@ class Command(BaseCommand, LoggerMixin):
start
=
i
+
1
return
img_il_list
def
handle
(
self
,
*
args
,
**
kwargs
):
def
handle
(
self
,
*
args
,
**
kwargs
):
# TODO 调用接口重试
sleep_second
=
5
max_sleep_second
=
30
0
max_sleep_second
=
6
0
while
self
.
switch
:
# 从队列获取文件信息
doc_info
=
self
.
get_doc_info
()
...
...
@@ -206,7 +206,7 @@ class Command(BaseCommand, LoggerMixin):
with
fitz
.
Document
(
pdf_path
)
as
pdf
:
self
.
cronjob_log
.
info
(
'{0} [pdf_path={1}] [metadata={2}]'
.
format
(
self
.
log_base
,
pdf_path
,
pdf
.
metadata
))
# xref_list = [] # TODO 图片去重
# xref_list = [] # TODO 图片去重
特殊pdf:如电子发票
for
pno
in
range
(
pdf
.
pageCount
):
il
=
pdf
.
getPageImageList
(
pno
)
il
.
sort
(
key
=
lambda
x
:
x
[
0
])
...
...
@@ -219,8 +219,8 @@ class Command(BaseCommand, LoggerMixin):
save_path
=
os
.
path
.
join
(
img_save_path
,
'page_{0}_img_0.png'
.
format
(
page
.
number
))
pm
.
writePNG
(
save_path
)
img_path_list
.
append
(
save_path
)
self
.
cronjob_log
.
info
(
'{0} [page to img success] [
pdf_path={1}] [page={2}]'
.
format
(
self
.
log_base
,
pdf_path
,
page
.
number
))
self
.
cronjob_log
.
info
(
'{0} [page to img success] [
doc_id={1}] [pdf_path={2}] '
'[page={3}]'
.
format
(
self
.
log_base
,
doc_id
,
pdf_path
,
page
.
number
))
else
:
# 提取图片
for
img_index
,
img_il
in
enumerate
(
img_il_list
):
if
len
(
img_il
)
==
1
:
# 当只有一张图片时, 简化处理
...
...
@@ -232,8 +232,8 @@ class Command(BaseCommand, LoggerMixin):
f
.
write
(
img_data
)
img_path_list
.
append
(
save_path
)
self
.
cronjob_log
.
info
(
'{0} [extract img success] [
pdf_path={1}] [page={2}] [img_index={3}]'
.
format
(
self
.
log_base
,
pdf_path
,
pno
,
img_index
))
'{0} [extract img success] [
doc_id={1}] [pdf_path={2}] [page={3}] '
'[img_index={4}]'
.
format
(
self
.
log_base
,
doc_id
,
pdf_path
,
pno
,
img_index
))
else
:
# 多张图片,竖向拼接
height_sum
=
0
im_list
=
[]
...
...
@@ -262,9 +262,9 @@ class Command(BaseCommand, LoggerMixin):
res
.
save
(
save_path
)
img_path_list
.
append
(
save_path
)
self
.
cronjob_log
.
info
(
'{0} [extract img success] [
pdf_path={1}] [page={2}] [img_index={3}]'
.
format
(
self
.
log_base
,
pdf_path
,
pno
,
img_index
))
self
.
cronjob_log
.
info
(
'{0} [pdf to img success]
'
.
format
(
self
.
log_base
))
'{0} [extract img success] [
doc_id={1}] [pdf_path={2}] [page={3}] '
'[img_index={4}]'
.
format
(
self
.
log_base
,
doc_id
,
pdf_path
,
pno
,
img_index
))
self
.
cronjob_log
.
info
(
'{0} [pdf to img success]
[doc_id={1}]'
.
format
(
self
.
log_base
,
doc_id
))
write_zip_file
(
img_save_path
,
os
.
path
.
join
(
doc_data_path
,
'{0}_img.zip'
.
format
(
doc_id
)))
# 图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件
...
...
@@ -273,11 +273,13 @@ class Command(BaseCommand, LoggerMixin):
tasks
=
[
self
.
img_ocr_excel
(
wb
,
img_path
)
for
img_path
in
img_path_list
]
loop
.
run_until_complete
(
asyncio
.
wait
(
tasks
))
# loop.close()
wb
.
save
(
excel_path
)
wb
.
save
(
excel_path
)
# TODO no sheet (res always [])
# 整合excel文件上传至EDMS
except
Exception
as
e
:
UploadDocRecords
.
objects
.
filter
(
id
=
doc_id
)
.
update
(
status
=
DocStatus
.
PROCESS_FAILED
.
value
)
self
.
cronjob_log
.
error
(
'{0} [process failed] [
err={1}]'
.
format
(
self
.
log_base
,
e
))
self
.
cronjob_log
.
error
(
'{0} [process failed] [
doc_id={1}] [err={2}]'
.
format
(
self
.
log_base
,
doc_id
,
e
))
else
:
UploadDocRecords
.
objects
.
filter
(
id
=
doc_id
)
.
update
(
status
=
DocStatus
.
COMPLETE
.
value
)
self
.
cronjob_log
.
info
(
'{0} [doc process complete] [doc_id={1}]'
.
format
(
self
.
log_base
,
doc_id
))
self
.
cronjob_log
.
info
(
'{0} [stop safely]'
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment