Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
ad161125
authored
2020-06-30 14:15:02 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
priority queue
1 parent
e325cfc3
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
131 additions
and
63 deletions
src/apps/doc/consts.py
src/apps/doc/urls.py → src/apps/doc/create_urls.py
src/apps/doc/management/commands/doc_process.py
src/apps/doc/models.py
src/apps/doc/priority_urls.py
src/apps/doc/views.py
src/apps/urls.py
src/common/redis_cache/base.py
src/common/redis_cache/handler.py
src/common/tools/pdf_tools.py
src/settings/__init__.py
src/settings/database.py
src/apps/doc/consts.py
View file @
ad16112
PAGE_DEFAULT
=
1
PAGE_SIZE_DEFAULT
=
10
BUSINESS_TYPE
=
[
'HIL'
,
'AFC'
]
HIL_SET
=
{
'HIL'
,
'hil'
,
'CO00002'
,
'C000002'
}
HIL_PREFIX
=
'HIL'
AFC_PREFIX
=
'AFC'
...
...
src/apps/doc/urls.py
→
src/apps/doc/
create_
urls.py
View file @
ad16112
...
...
@@ -3,5 +3,5 @@ from . import views
urlpatterns
=
[
path
(
r''
,
views
.
UploadDocView
.
as_view
()),
path
(
r'
v1
'
,
views
.
UploadDocView
.
as_view
()),
]
...
...
src/apps/doc/management/commands/doc_process.py
View file @
ad16112
...
...
@@ -13,7 +13,8 @@ from django.core.management import BaseCommand
from
common.mixins
import
LoggerMixin
from
common.redis_cache
import
redis_handler
as
rh
from
common.tools.file_tools
import
write_zip_file
from
apps.doc.models
import
UploadDocRecords
,
DocStatus
from
apps.doc.models
import
DocStatus
,
HILDoc
,
AFCDoc
from
apps.doc
import
consts
from
settings
import
conf
...
...
@@ -42,33 +43,39 @@ class Command(BaseCommand, LoggerMixin):
def
signal_handler
(
self
,
sig
,
frame
):
self
.
switch
=
False
# 停止处理文件
def
get_doc_info
(
self
):
# TODO 优先队列
doc_id
=
rh
.
dequeue
()
if
doc_id
is
None
:
def
get_doc_info
(
self
):
task_str
,
is_priority
=
rh
.
dequeue
()
if
task_str
is
None
:
self
.
cronjob_log
.
info
(
'{0} [get_doc_info] [queue empty]'
.
format
(
self
.
log_base
))
return
doc_info
=
UploadDocRecords
.
objects
.
filter
(
id
=
doc_id
)
.
values
(
'id'
,
'metadata_version_id'
,
'document_name'
)
.
first
()
if
doc_info
is
None
:
self
.
cronjob_log
.
warn
(
'{0} [get_doc_info] [doc not found] [doc_id={1}]'
.
format
(
self
.
log_base
,
doc_id
))
return
UploadDocRecords
.
objects
.
filter
(
id
=
doc_id
)
.
update
(
status
=
DocStatus
.
PROCESSING
.
value
)
self
.
cronjob_log
.
info
(
'{0} [get_task_info success] [doc_info={1}]'
.
format
(
self
.
log_base
,
doc_info
))
return
doc_info
return
None
,
None
,
None
,
None
def
pdf_download
(
self
,
doc_info
):
business_type
,
doc_id_str
=
task_str
.
split
(
'_'
)
doc_id
=
int
(
doc_id_str
)
doc_class
=
HILDoc
if
business_type
==
consts
.
HIL_PREFIX
else
AFCDoc
doc_info
=
doc_class
.
objects
.
filter
(
id
=
doc_id
,
status
=
DocStatus
.
INIT
.
value
)
.
values
(
'id'
,
'metadata_version_id'
,
'document_name'
)
.
first
()
# TODO 查不到时是否为None
if
doc_info
is
None
:
self
.
cronjob_log
.
warn
(
'{0} [get_doc_info] [doc completed] [task_str={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
return
None
,
None
,
None
,
None
doc_class
.
objects
.
filter
(
id
=
doc_id
)
.
update
(
status
=
DocStatus
.
PROCESSING
.
value
)
self
.
cronjob_log
.
info
(
'{0} [get_doc_info] [task_str={1}] [is_priority={2}] [doc_info={3}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
,
doc_info
))
return
doc_info
,
doc_class
,
doc_id
,
business_type
def
pdf_download
(
self
,
doc_id
,
doc_info
,
business_type
):
if
doc_info
is
None
:
return
None
,
None
,
None
# TODO EDMS下载pdf
# pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf'
# doc_data_path = os.path.dirname(pdf_path)
doc_id
=
doc_info
[
'id'
]
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
str
(
doc_id
))
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
str
(
doc_id
))
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc_id
))
excel_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.xls'
.
format
(
doc_id
))
self
.
cronjob_log
.
info
(
'{0} [pdf download success] [
doc_info={1}] [pdf_path={2
}]'
.
format
(
self
.
log_base
,
doc_info
,
pdf_path
))
return
doc_data_path
,
excel_path
,
pdf_path
,
doc_id
self
.
cronjob_log
.
info
(
'{0} [pdf download success] [
business_type={1}] [doc_info={2}] [pdf_path={3
}]'
.
format
(
self
.
log_base
,
business_type
,
doc_info
,
pdf_path
))
return
doc_data_path
,
excel_path
,
pdf_path
@staticmethod
def
append_sheet
(
wb
,
sheets_list
,
img_name
):
...
...
@@ -189,9 +196,9 @@ class Command(BaseCommand, LoggerMixin):
max_sleep_second
=
60
while
self
.
switch
:
# 从队列获取文件信息
doc_info
=
self
.
get_doc_info
()
doc_info
,
doc_class
,
doc_id
,
business_type
=
self
.
get_doc_info
()
# 从EDMS获取PDF文件
doc_data_path
,
excel_path
,
pdf_path
,
doc_id
=
self
.
pdf_download
(
doc_info
)
doc_data_path
,
excel_path
,
pdf_path
=
self
.
pdf_download
(
doc_id
,
doc_info
,
business_type
)
# 队列为空时的处理
if
pdf_path
is
None
:
time
.
sleep
(
sleep_second
)
...
...
@@ -276,10 +283,10 @@ class Command(BaseCommand, LoggerMixin):
wb
.
save
(
excel_path
)
# TODO no sheet (res always [])
# 整合excel文件上传至EDMS
except
Exception
as
e
:
UploadDocRecord
s
.
objects
.
filter
(
id
=
doc_id
)
.
update
(
status
=
DocStatus
.
PROCESS_FAILED
.
value
)
doc_clas
s
.
objects
.
filter
(
id
=
doc_id
)
.
update
(
status
=
DocStatus
.
PROCESS_FAILED
.
value
)
self
.
cronjob_log
.
error
(
'{0} [process failed] [doc_id={1}] [err={2}]'
.
format
(
self
.
log_base
,
doc_id
,
e
))
else
:
UploadDocRecord
s
.
objects
.
filter
(
id
=
doc_id
)
.
update
(
status
=
DocStatus
.
COMPLETE
.
value
)
doc_clas
s
.
objects
.
filter
(
id
=
doc_id
)
.
update
(
status
=
DocStatus
.
COMPLETE
.
value
)
self
.
cronjob_log
.
info
(
'{0} [doc process complete] [doc_id={1}]'
.
format
(
self
.
log_base
,
doc_id
))
self
.
cronjob_log
.
info
(
'{0} [stop safely]'
)
self
.
cronjob_log
.
info
(
'{0} [stop safely]'
.
format
(
self
.
log_base
)
)
...
...
src/apps/doc/models.py
View file @
ad16112
...
...
@@ -5,7 +5,7 @@ from .named_enum import DocStatus
# 上传文件记录表/任务表
class
UploadDocRecords
(
models
.
Model
):
# TODO records一张表、文件(任务)根据business_type分库存储
class
UploadDocRecords
(
models
.
Model
):
id
=
models
.
AutoField
(
primary_key
=
True
,
verbose_name
=
"id"
)
metadata_version_id
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"元数据版本id"
)
application_id
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"申请id"
)
...
...
@@ -13,7 +13,6 @@ class UploadDocRecords(models.Model): # TODO records一张表、文件(任务
co_applicant
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"共同申请人"
)
guarantor_1
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"担保人1"
)
guarantor_2
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"担保人2"
)
status
=
models
.
SmallIntegerField
(
default
=
DocStatus
.
INIT
.
value
,
verbose_name
=
"文件状态"
)
document_name
=
models
.
CharField
(
max_length
=
255
,
verbose_name
=
"文件名"
)
document_scheme
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"文件方案"
)
business_type
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"业务类型"
)
...
...
@@ -26,3 +25,62 @@ class UploadDocRecords(models.Model): # TODO records一张表、文件(任务
managed
=
False
db_table
=
'upload_doc_records'
class
HILDoc
(
models
.
Model
):
id
=
models
.
AutoField
(
primary_key
=
True
,
verbose_name
=
"id"
)
record_id
=
models
.
IntegerField
(
verbose_name
=
'记录id'
)
metadata_version_id
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"元数据版本id"
)
application_id
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"申请id"
)
# 联合索引
status
=
models
.
SmallIntegerField
(
default
=
DocStatus
.
INIT
.
value
,
verbose_name
=
"文件状态"
)
# 联合索引
main_applicant
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"主申请人"
)
co_applicant
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"共同申请人"
)
guarantor_1
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"担保人1"
)
guarantor_2
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"担保人2"
)
document_name
=
models
.
CharField
(
max_length
=
255
,
verbose_name
=
"文件名"
)
document_scheme
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"文件方案"
)
data_source
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"数据源"
)
upload_finish_time
=
models
.
DateTimeField
(
verbose_name
=
"上传完成时间"
)
# 索引
update_time
=
models
.
DateTimeField
(
auto_now
=
True
,
verbose_name
=
'修改时间'
)
create_time
=
models
.
DateTimeField
(
auto_now_add
=
True
,
verbose_name
=
'创建时间'
)
# 索引
class
Meta
:
managed
=
False
db_table
=
'hil_doc'
class
AFCDoc
(
models
.
Model
):
id
=
models
.
AutoField
(
primary_key
=
True
,
verbose_name
=
"id"
)
record_id
=
models
.
IntegerField
(
verbose_name
=
'记录id'
)
metadata_version_id
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"元数据版本id"
)
application_id
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"申请id"
)
status
=
models
.
SmallIntegerField
(
default
=
DocStatus
.
INIT
.
value
,
verbose_name
=
"文件状态"
)
main_applicant
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"主申请人"
)
co_applicant
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"共同申请人"
)
guarantor_1
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"担保人1"
)
guarantor_2
=
models
.
CharField
(
max_length
=
16
,
verbose_name
=
"担保人2"
)
document_name
=
models
.
CharField
(
max_length
=
255
,
verbose_name
=
"文件名"
)
document_scheme
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"文件方案"
)
data_source
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"数据源"
)
upload_finish_time
=
models
.
DateTimeField
(
verbose_name
=
"上传完成时间"
)
update_time
=
models
.
DateTimeField
(
auto_now
=
True
,
verbose_name
=
'修改时间'
)
create_time
=
models
.
DateTimeField
(
auto_now_add
=
True
,
verbose_name
=
'创建时间'
)
class
Meta
:
managed
=
False
situ_db_label
=
'afc'
db_table
=
'afc_doc'
class
PriorityApplication
(
models
.
Model
):
id
=
models
.
AutoField
(
primary_key
=
True
,
verbose_name
=
"id"
)
application_id
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"申请id"
)
# 联合索引
business_type
=
models
.
CharField
(
max_length
=
64
,
verbose_name
=
"业务类型"
)
# 联合索引
on_off
=
models
.
BooleanField
(
default
=
True
,
verbose_name
=
"是否有效"
)
# 联合索引
update_time
=
models
.
DateTimeField
(
auto_now
=
True
,
verbose_name
=
'修改时间'
)
create_time
=
models
.
DateTimeField
(
auto_now_add
=
True
,
verbose_name
=
'创建时间'
)
class
Meta
:
managed
=
False
situ_db_label
=
'afc'
db_table
=
'priority_application'
...
...
src/apps/doc/priority_urls.py
0 → 100644
View file @
ad16112
from
django.urls
import
path
from
.
import
views
urlpatterns
=
[
path
(
r'v1'
,
views
.
PriorityDocView
.
as_view
()),
]
src/apps/doc/views.py
View file @
ad16112
This diff is collapsed.
Click to expand it.
src/apps/urls.py
View file @
ad16112
...
...
@@ -19,6 +19,7 @@ from django.urls import path, include
urlpatterns
=
[
path
(
'admin/'
,
admin
.
site
.
urls
),
path
(
r'api/user/'
,
include
(
'apps.account.urls'
)),
path
(
r'api/create/v1'
,
include
(
'apps.doc.urls'
)),
path
(
r'api/create/'
,
include
(
'apps.doc.create_urls'
)),
path
(
r'api/priority/'
,
include
(
'apps.doc.priority_urls'
)),
path
(
r'api/doc/'
,
include
(
'apps.doc.internal_urls'
)),
]
...
...
src/common/redis_cache/base.py
View file @
ad16112
...
...
@@ -92,27 +92,13 @@ class Redis:
def
expire
(
self
,
key
,
value
):
return
self
.
client
.
expire
(
key
,
value
)
def
hmset
(
self
,
name
,
mapping
):
return
self
.
client
.
hmset
(
name
,
mapping
)
def
lpush
(
self
,
key
,
values
):
return
self
.
client
.
lpush
(
key
,
*
values
)
# int
def
hgetall
(
self
,
name
):
return
self
.
client
.
hgetall
(
name
)
def
lrange
(
self
,
key
,
start
,
end
):
return
self
.
client
.
lrange
(
key
,
start
,
end
)
# list
def
hincrby
(
self
,
name
,
key
,
amount
=
1
):
return
self
.
client
.
hincrby
(
name
,
key
,
amount
)
def
rpop
(
self
,
key
):
return
self
.
client
.
rpop
(
key
)
# str or None
def
zadd
(
self
,
name
,
mapping
):
return
self
.
client
.
zadd
(
name
,
mapping
)
def
zremrangebyrank
(
self
,
name
,
start
,
end
):
with
self
.
client
.
pipeline
()
as
pipe
:
pipe
.
zrange
(
name
,
start
,
end
)
# TODO 可能出现不一致性
pipe
.
zremrangebyrank
(
name
,
start
,
end
)
item
=
pipe
.
execute
()
return
item
def
zrank
(
self
,
name
,
value
):
return
self
.
client
.
zrank
(
name
,
value
)
def
zrange
(
self
,
name
,
start
,
end
):
return
self
.
client
.
zrange
(
name
,
start
,
end
)
...
...
src/common/redis_cache/handler.py
View file @
ad16112
...
...
@@ -33,17 +33,20 @@ class RedisHandler:
self
.
time_expires
=
datetime
.
timedelta
(
hours
=
24
)
self
.
time_format
=
'
%
a
%
b
%
d
%
H:
%
M:
%
S
%
Y'
self
.
prefix
=
'bwm_ocr'
self
.
queue_key
=
'{0}:queue'
.
format
(
self
.
prefix
)
self
.
common_queue_key
=
'{0}:common_queue'
.
format
(
self
.
prefix
)
self
.
priority_queue_key
=
'{0}:priority_queue'
.
format
(
self
.
prefix
)
def
enqueue
(
self
,
task
_id
):
def
enqueue
(
self
,
task
s
,
is_priority
=
False
):
# 1
mapping
=
{
task_id
:
time
.
time
()}
return
self
.
redis
.
zadd
(
self
.
queue_key
,
mapping
)
key
=
self
.
priority_queue_key
if
is_priority
else
self
.
common_queue_key
return
self
.
redis
.
lpush
(
key
,
tasks
)
def
dequeue
(
self
):
# model_id:int or None
res_list
=
self
.
redis
.
zremrangebyrank
(
self
.
queue_key
,
0
,
0
)
pop_item_list
=
res_list
[
0
]
pop_item
=
int
(
pop_item_list
[
0
])
if
pop_item_list
else
None
return
pop_item
# task or None
task
=
self
.
redis
.
rpop
(
self
.
priority_queue_key
)
is_priority
=
True
if
task
is
None
:
task
=
self
.
redis
.
rpop
(
self
.
common_queue_key
)
is_priority
=
False
return
task
,
is_priority
...
...
src/common/tools/pdf_tools.py
View file @
ad16112
...
...
@@ -152,7 +152,7 @@ class PdfHandler:
print
(
'----------------------------'
)
print
(
self
.
pdf_name
)
print
(
pdf
.
metadata
)
# xref_list = []
# TODO 图片去重
# xref_list = []
for
pno
in
range
(
pdf
.
pageCount
):
print
(
'========================'
)
il
=
pdf
.
getPageImageList
(
pno
)
...
...
@@ -162,7 +162,7 @@ class PdfHandler:
img_il_list
=
self
.
split_il
(
il
)
il
=
None
print
(
img_il_list
)
print
(
len
(
img_il_list
))
# TODO 判断单页图片过多时,使用页面转图片
print
(
len
(
img_il_list
))
for
img_count
,
img_il
in
enumerate
(
img_il_list
):
print
(
img_il
)
...
...
src/settings/__init__.py
View file @
ad16112
...
...
@@ -91,7 +91,8 @@ WSGI_APPLICATION = 'wsgi.application'
# }
DATABASES
=
{
'default'
:
conf
.
get_namespace
(
'MYSQL_'
)
'default'
:
conf
.
get_namespace
(
'MYSQL_DEFAULT_'
),
'afc'
:
conf
.
get_namespace
(
'MYSQL_AFC_'
)
}
DATABASE_ROUTERS
=
[
'settings.database.DBRouter'
]
MYSQLPOOL_ARGUMENTS
=
database
.
MYSQLPOOL_ARGUMENTS
...
...
src/settings/database.py
View file @
ad16112
...
...
@@ -15,7 +15,7 @@ options.DEFAULT_NAMES = tuple(list(options.DEFAULT_NAMES) + ['situ_db_label'])
# 数据库连接池配置
MYSQLPOOL_ARGUMENTS
=
{
'recycle'
:
30
,
'pool_size'
:
128
,
'pool_size'
:
64
,
'max_overflow'
:
10
,
'timeout'
:
5
,
'use_threadlocal'
:
True
,
...
...
@@ -26,12 +26,12 @@ class DBRouter(object):
def
db_for_read
(
self
,
model
,
**
hints
):
if
hasattr
(
model
.
_meta
,
'situ_db_label'
):
return
model
.
_meta
.
aft
_db_label
return
model
.
_meta
.
situ
_db_label
return
None
def
db_for_write
(
self
,
model
,
**
hints
):
if
hasattr
(
model
.
_meta
,
'situ_db_label'
):
return
model
.
_meta
.
aft
_db_label
return
model
.
_meta
.
situ
_db_label
return
None
def
allow_relation
(
self
,
obj1
,
obj2
,
**
hints
):
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment