Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
f8904dcb
authored
2020-06-23 10:10:47 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
fix doc list
1 parent
a1a92499
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
47 additions
and
29 deletions
src/apps/doc/management/commands/doc_process.py
src/apps/doc/views.py
src/settings/__init__.py
src/apps/doc/management/commands/doc_process.py
View file @
f8904dc
...
...
@@ -8,7 +8,7 @@ from io import BytesIO
from
django.core.management
import
BaseCommand
from
common.mixins
import
LoggerMixin
from
common.redis_cache
import
redis_handler
as
rh
from
apps.doc.models
import
UploadDocRecords
from
apps.doc.models
import
UploadDocRecords
,
DocStatus
from
settings
import
conf
...
...
@@ -31,26 +31,32 @@ class Command(BaseCommand, LoggerMixin):
def
signal_handler
(
self
,
sig
,
frame
):
self
.
switch
=
False
# 停止处理文件
def
get_
task_info
(
self
):
# TODO 优先队列 & status modify
task
_id
=
rh
.
dequeue
()
if
task
_id
is
None
:
self
.
cronjob_log
.
info
(
'{0} [get_
task
_info] [queue empty]'
.
format
(
self
.
log_base
))
def
get_
doc_info
(
self
):
# TODO 优先队列
doc
_id
=
rh
.
dequeue
()
if
doc
_id
is
None
:
self
.
cronjob_log
.
info
(
'{0} [get_
doc
_info] [queue empty]'
.
format
(
self
.
log_base
))
return
task_info
=
UploadDocRecords
.
objects
.
filter
(
id
=
task
_id
)
.
values
(
doc_info
=
UploadDocRecords
.
objects
.
filter
(
id
=
doc
_id
)
.
values
(
'id'
,
'metadata_version_id'
,
'document_name'
)
.
first
()
if
task_info
is
None
:
self
.
cronjob_log
.
warn
(
'{0} [get_task_info] [task not found] [task_id={1}]'
.
format
(
self
.
log_base
,
task_id
))
self
.
cronjob_log
.
info
(
'{0} [get_task_info success] [task_info={1}]'
.
format
(
self
.
log_base
,
task_info
))
return
task_info
if
doc_info
is
None
:
self
.
cronjob_log
.
warn
(
'{0} [get_doc_info] [doc not found] [doc_id={1}]'
.
format
(
self
.
log_base
,
doc_id
))
return
UploadDocRecords
.
objects
.
filter
(
id
=
doc_id
)
.
update
(
status
=
DocStatus
.
PROCESSING
.
value
)
self
.
cronjob_log
.
info
(
'{0} [get_task_info success] [doc_info={1}]'
.
format
(
self
.
log_base
,
doc_info
))
return
doc_info
def
pdf_download
(
self
,
task
_info
):
if
task
_info
is
None
:
def
pdf_download
(
self
,
doc
_info
):
if
doc
_info
is
None
:
return
# TODO EDMS下载pdf
pdf_path
=
'/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf'
self
.
cronjob_log
.
info
(
'{0} [pdf download success] [task_info={1}] [pdf_path={2}]'
.
format
(
self
.
log_base
,
task_info
,
pdf_path
))
return
pdf_path
# pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf'
# doc_data_path = os.path.dirname(pdf_path)
doc_id
=
doc_info
[
'id'
]
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
str
(
doc_id
))
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc_id
))
self
.
cronjob_log
.
info
(
'{0} [pdf download success] [doc_info={1}] [pdf_path={2}]'
.
format
(
self
.
log_base
,
doc_info
,
pdf_path
))
return
pdf_path
,
doc_data_path
@staticmethod
def
getimage
(
pix
):
...
...
@@ -135,15 +141,15 @@ class Command(BaseCommand, LoggerMixin):
def
handle
(
self
,
*
args
,
**
kwargs
):
while
self
.
switch
:
# 从队列获取文件信息
task_info
=
self
.
get_task
_info
()
doc_info
=
self
.
get_doc
_info
()
# 从EDMS获取PDF文件
pdf_path
=
self
.
pdf_download
(
task
_info
)
pdf_path
,
doc_data_path
=
self
.
pdf_download
(
doc
_info
)
# 队列为空时的处理
if
pdf_path
is
None
:
time
.
sleep
(
10
)
continue
# PDF文件提取图片
img_save_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
pdf_path
)
,
'img'
)
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
os
.
makedirs
(
img_save_path
,
exist_ok
=
True
)
with
fitz
.
Document
(
pdf_path
)
as
pdf
:
self
.
cronjob_log
.
info
(
'{0} [pdf_path={1}] [pdf_metadata={2}]'
.
format
(
...
...
@@ -159,8 +165,7 @@ class Command(BaseCommand, LoggerMixin):
page
=
pdf
.
loadPage
(
pno
)
pm
=
page
.
getPixmap
(
matrix
=
self
.
trans
,
alpha
=
False
)
save_path
=
os
.
path
.
join
(
img_save_path
,
'page_{0}_img_0.png'
.
format
(
page
.
number
))
# pm.writePNG(save_path)
pm
.
writeImage
(
save_path
)
pm
.
writePNG
(
save_path
)
else
:
# 提取图片
for
img_count
,
img_il
in
enumerate
(
img_il_list
):
if
len
(
img_il
)
==
1
:
# 当只有一张图片时, 简化处理
...
...
src/apps/doc/views.py
View file @
f8904dc
...
...
@@ -61,8 +61,10 @@ doc_list_args = {
'application_id'
:
fields
.
Str
(
required
=
False
,
validate
=
validate
.
Length
(
max
=
64
)),
'data_source'
:
fields
.
Str
(
required
=
False
,
validate
=
validate
.
Length
(
max
=
64
)),
'business_type'
:
fields
.
Str
(
required
=
False
,
validate
=
validate
.
Length
(
max
=
64
)),
'upload_finish_time'
:
fields
.
Date
(
required
=
False
),
'create_time'
:
fields
.
Date
(
required
=
False
),
'upload_time_start'
:
fields
.
Date
(
required
=
False
),
'upload_time_end'
:
fields
.
Date
(
required
=
False
),
'create_time_start'
:
fields
.
Date
(
required
=
False
),
'create_time_end'
:
fields
.
Date
(
required
=
False
),
}
upload_pdf_args
=
{
...
...
@@ -133,18 +135,23 @@ class DocView(GenericView, DocHandler):
application_id
=
args
.
get
(
'application_id'
)
data_source
=
args
.
get
(
'data_source'
)
business_type
=
args
.
get
(
'business_type'
)
upload_finish_time
=
args
.
get
(
'upload_finish_time'
)
create_time
=
args
.
get
(
'create_time'
)
upload_time_start
=
args
.
get
(
'upload_time_start'
)
upload_time_end
=
args
.
get
(
'upload_time_end'
)
create_time_start
=
args
.
get
(
'create_time_start'
)
create_time_end
=
args
.
get
(
'create_time_end'
)
status_query
=
Q
(
status
=
status
)
if
status
else
Q
()
application_id_query
=
Q
(
application_id
=
application_id
)
if
application_id
else
Q
()
data_source_query
=
Q
(
data_source
=
data_source
)
if
data_source
else
Q
()
business_type_query
=
Q
(
business_type
=
business_type
)
if
business_type
else
Q
()
upload_finish_time_query
=
Q
(
upload_finish_time
=
upload_finish_time
)
if
upload_finish_time
else
Q
()
create_time_query
=
Q
(
create_time
=
create_time
)
if
create_time
else
Q
()
upload_finish_time_query
=
Q
(
upload_finish_time__gte
=
upload_time_start
,
upload_finish_time__lte
=
upload_time_end
)
\
if
upload_time_start
and
upload_time_end
else
Q
()
create_time_query
=
Q
(
create_time__gte
=
create_time_start
,
create_time__lte
=
create_time_end
)
\
if
create_time_start
and
create_time_end
else
Q
()
query
=
status_query
&
application_id_query
&
data_source_query
&
business_type_query
\
&
upload_finish_time_query
&
create_time_query
doc_queryset
=
UploadDocRecords
.
objects
.
filter
(
query
)
.
values
(
'id'
,
'application_id'
,
'upload_finish_time'
,
'create_time'
,
'business_type'
,
'data_source'
,
'status'
)
val_tuple
=
(
'id'
,
'application_id'
,
'upload_finish_time'
,
'create_time'
,
'business_type'
,
'data_source'
,
'status'
)
doc_queryset
=
UploadDocRecords
.
objects
.
filter
(
query
)
.
values
(
*
val_tuple
)
.
order_by
(
'-upload_finish_time'
)
doc_list
=
self
.
get_doc_list
(
doc_queryset
)
total
=
len
(
doc_list
)
...
...
src/settings/__init__.py
View file @
f8904dc
...
...
@@ -41,6 +41,7 @@ INSTALLED_APPS = [
'django.contrib.sessions'
,
'django.contrib.messages'
,
'django.contrib.staticfiles'
,
# 'corsheaders',
'rest_framework'
,
'common'
,
'apps.account'
,
...
...
@@ -48,6 +49,7 @@ INSTALLED_APPS = [
]
MIDDLEWARE
=
[
# 'corsheaders.middleware.CorsMiddleware',
'django.middleware.security.SecurityMiddleware'
,
'django.contrib.sessions.middleware.SessionMiddleware'
,
'django.middleware.common.CommonMiddleware'
,
...
...
@@ -166,3 +168,7 @@ JWT_AUTH = {
'JWT_VERIFY_EXPIRATION'
:
True
,
'JWT_ALLOW_REFRESH'
:
True
,
}
# 跨域设置
# CORS_ORIGIN_ALLOW_ALL = True
# CORS_ALLOW_CREDENTIALS = True
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment