ad161125 by 周伟奇

priority queue

1 parent e325cfc3
# Default page number and page size — presumably for paginated list endpoints; verify against callers.
PAGE_DEFAULT = 1
PAGE_SIZE_DEFAULT = 10
# Business types recognised by the service.
BUSINESS_TYPE = ['HIL', 'AFC']
# Raw values that are treated as the HIL business type.
# NOTE(review): 'CO00002' (letter O) and 'C000002' (digit 0) are both listed — confirm this is intentional.
HIL_SET = {'HIL', 'hil', 'CO00002', 'C000002'}
# Business-type prefixes; queued task strings look like '<prefix>_<doc_id>'.
HIL_PREFIX = 'HIL'
AFC_PREFIX = 'AFC'
......
......@@ -3,5 +3,5 @@ from . import views
# Routes of the document-creation API. The project URLconf mounts this module
# under 'api/create/', so the version segment lives here; the former
# empty-path route is dropped because it would also expose the bare prefix.
urlpatterns = [
    path(r'v1', views.UploadDocView.as_view()),
]
......
......@@ -13,7 +13,8 @@ from django.core.management import BaseCommand
from common.mixins import LoggerMixin
from common.redis_cache import redis_handler as rh
from common.tools.file_tools import write_zip_file
from apps.doc.models import UploadDocRecords, DocStatus
from apps.doc.models import DocStatus, HILDoc, AFCDoc
from apps.doc import consts
from settings import conf
......@@ -42,33 +43,39 @@ class Command(BaseCommand, LoggerMixin):
def signal_handler(self, sig, frame):
    """Ask the processing loop to stop after the current document.

    Args:
        sig: Signal number (unused).
        frame: Stack frame at the time of the signal (unused).
    """
    # The main loop runs ``while self.switch``; clearing the flag stops it
    # from picking up further files.
    self.switch = False
def get_doc_info(self):
    """Pop the next task from the queue and mark its document as processing.

    A task is a string of the form ``'<business_type>_<doc_id>'``. The
    business type selects the model (``HILDoc`` for the HIL prefix,
    ``AFCDoc`` otherwise) and only documents still in the INIT status are
    picked up.

    Returns:
        tuple: ``(doc_info, doc_class, doc_id, business_type)``. All four
        items are ``None`` when the queue is empty, the task string is
        malformed, or the document is no longer in the INIT status.
    """
    nothing = (None, None, None, None)

    task_str, is_priority = rh.dequeue()
    if task_str is None:
        self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
        return nothing

    try:
        business_type, doc_id_str = task_str.split('_')
        doc_id = int(doc_id_str)
    except ValueError:
        # A malformed queue entry must not take the worker down; log and drop it.
        self.cronjob_log.warn('{0} [get_doc_info] [invalid task] [task_str={1}] [is_priority={2}]'.format(
            self.log_base, task_str, is_priority))
        return nothing

    doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
    # QuerySet.first() returns None when no row matches, so the check below
    # covers both "unknown id" and "already picked up / completed".
    doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
        'id', 'metadata_version_id', 'document_name').first()
    if doc_info is None:
        self.cronjob_log.warn('{0} [get_doc_info] [doc completed] [task_str={1}] [is_priority={2}]'.format(
            self.log_base, task_str, is_priority))
        return nothing

    doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESSING.value)
    self.cronjob_log.info('{0} [get_doc_info] [task_str={1}] [is_priority={2}] [doc_info={3}]'.format(
        self.log_base, task_str, is_priority, doc_info))
    return doc_info, doc_class, doc_id, business_type
def pdf_download(self, doc_id, doc_info, business_type):
    """Work out the on-disk locations for a document's PDF and Excel files.

    Args:
        doc_id: Primary key of the document.
        doc_info: Document row as returned by ``get_doc_info``; ``None``
            means there is nothing to process.
        business_type: Business-type prefix, used as a sub-directory of
            ``self.data_dir``.

    Returns:
        tuple: ``(doc_data_path, excel_path, pdf_path)``, or three ``None``
        values when ``doc_info`` is ``None``.
    """
    if doc_info is None:
        return None, None, None

    # TODO: actually download the PDF from EDMS; for now only the target
    # paths are computed (the log message below is emitted regardless).
    doc_data_path = os.path.join(self.data_dir, business_type, str(doc_id))
    pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id))
    excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc_id))
    self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_info={2}] [pdf_path={3}]'.format(
        self.log_base, business_type, doc_info, pdf_path))
    return doc_data_path, excel_path, pdf_path
@staticmethod
def append_sheet(wb, sheets_list, img_name):
......@@ -189,9 +196,9 @@ class Command(BaseCommand, LoggerMixin):
max_sleep_second = 60
while self.switch:
# 从队列获取文件信息
doc_info = self.get_doc_info()
doc_info, doc_class, doc_id, business_type = self.get_doc_info()
# 从EDMS获取PDF文件
doc_data_path, excel_path, pdf_path, doc_id = self.pdf_download(doc_info)
doc_data_path, excel_path, pdf_path = self.pdf_download(doc_id, doc_info, business_type)
# 队列为空时的处理
if pdf_path is None:
time.sleep(sleep_second)
......@@ -276,10 +283,10 @@ class Command(BaseCommand, LoggerMixin):
wb.save(excel_path) # TODO no sheet (res always [])
# 整合excel文件上传至EDMS
except Exception as e:
UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value)
doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value)
self.cronjob_log.error('{0} [process failed] [doc_id={1}] [err={2}]'.format(self.log_base, doc_id, e))
else:
UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value)
doc_class.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value)
self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id))
self.cronjob_log.info('{0} [stop safely]')
self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
......
......@@ -5,7 +5,7 @@ from .named_enum import DocStatus
# 上传文件记录表/任务表
class UploadDocRecords(models.Model): # TODO records一张表、文件(任务)根据business_type分库存储
class UploadDocRecords(models.Model):
id = models.AutoField(primary_key=True, verbose_name="id")
metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id")
application_id = models.CharField(max_length=64, verbose_name="申请id")
......@@ -13,7 +13,6 @@ class UploadDocRecords(models.Model): # TODO records一张表、文件(任务
co_applicant = models.CharField(max_length=16, verbose_name="共同申请人")
guarantor_1 = models.CharField(max_length=16, verbose_name="担保人1")
guarantor_2 = models.CharField(max_length=16, verbose_name="担保人2")
status = models.SmallIntegerField(default=DocStatus.INIT.value, verbose_name="文件状态")
document_name = models.CharField(max_length=255, verbose_name="文件名")
document_scheme = models.CharField(max_length=64, verbose_name="文件方案")
business_type = models.CharField(max_length=64, verbose_name="业务类型")
......@@ -26,3 +25,62 @@ class UploadDocRecords(models.Model): # TODO records一张表、文件(任务
managed = False
db_table = 'upload_doc_records'
class HILDoc(models.Model):
    """Document/task row for the HIL business type (unmanaged table ``hil_doc``).

    No ``situ_db_label`` is declared, so the database router returns ``None``
    for this model and Django falls back to the default connection.
    """

    id = models.AutoField(primary_key=True, verbose_name="id")
    record_id = models.IntegerField(verbose_name='记录id')
    metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id")
    application_id = models.CharField(max_length=64, verbose_name="申请id")  # part of a composite index
    status = models.SmallIntegerField(default=DocStatus.INIT.value, verbose_name="文件状态")  # part of a composite index
    main_applicant = models.CharField(max_length=16, verbose_name="主申请人")
    co_applicant = models.CharField(max_length=16, verbose_name="共同申请人")
    guarantor_1 = models.CharField(max_length=16, verbose_name="担保人1")
    guarantor_2 = models.CharField(max_length=16, verbose_name="担保人2")
    document_name = models.CharField(max_length=255, verbose_name="文件名")
    document_scheme = models.CharField(max_length=64, verbose_name="文件方案")
    data_source = models.CharField(max_length=64, verbose_name="数据源")
    upload_finish_time = models.DateTimeField(verbose_name="上传完成时间")  # indexed
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')  # indexed

    class Meta:
        # The table is created and migrated outside of Django.
        managed = False
        db_table = 'hil_doc'
class AFCDoc(models.Model):
    """Document/task row for the AFC business type (unmanaged table ``afc_doc``).

    ``situ_db_label`` is a custom Meta option read by the database router;
    it sends reads and writes for this model to the ``afc`` connection.
    """

    id = models.AutoField(primary_key=True, verbose_name="id")
    record_id = models.IntegerField(verbose_name='记录id')
    metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id")
    application_id = models.CharField(max_length=64, verbose_name="申请id")
    status = models.SmallIntegerField(default=DocStatus.INIT.value, verbose_name="文件状态")
    main_applicant = models.CharField(max_length=16, verbose_name="主申请人")
    co_applicant = models.CharField(max_length=16, verbose_name="共同申请人")
    guarantor_1 = models.CharField(max_length=16, verbose_name="担保人1")
    guarantor_2 = models.CharField(max_length=16, verbose_name="担保人2")
    document_name = models.CharField(max_length=255, verbose_name="文件名")
    document_scheme = models.CharField(max_length=64, verbose_name="文件方案")
    data_source = models.CharField(max_length=64, verbose_name="数据源")
    upload_finish_time = models.DateTimeField(verbose_name="上传完成时间")
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        # The table is created and migrated outside of Django.
        managed = False
        situ_db_label = 'afc'
        db_table = 'afc_doc'
class PriorityApplication(models.Model):
    """Application flagged for priority handling (unmanaged table ``priority_application``).

    ``situ_db_label`` is a custom Meta option read by the database router;
    it sends reads and writes for this model to the ``afc`` connection.
    """

    id = models.AutoField(primary_key=True, verbose_name="id")
    application_id = models.CharField(max_length=64, verbose_name="申请id")  # part of a composite index
    business_type = models.CharField(max_length=64, verbose_name="业务类型")  # part of a composite index
    on_off = models.BooleanField(default=True, verbose_name="是否有效")  # part of a composite index
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        # The table is created and migrated outside of Django.
        managed = False
        situ_db_label = 'afc'
        db_table = 'priority_application'
......
from django.urls import path
from . import views
# Routes of the priority API (v1) — presumably the module the project URLconf
# includes as 'apps.doc.priority_urls' under 'api/priority/'; confirm the file name.
urlpatterns = [
path(r'v1', views.PriorityDocView.as_view()),
]
......@@ -19,6 +19,7 @@ from django.urls import path, include
# Project-level URL table. Each API area is delegated to its own URLconf
# module; the version segment ('v1') is defined inside those modules, so the
# former 'api/create/v1' -> 'apps.doc.urls' include is not kept here.
urlpatterns = [
    path('admin/', admin.site.urls),
    path(r'api/user/', include('apps.account.urls')),
    path(r'api/create/', include('apps.doc.create_urls')),
    path(r'api/priority/', include('apps.doc.priority_urls')),
    path(r'api/doc/', include('apps.doc.internal_urls')),
]
......
......@@ -92,27 +92,13 @@ class Redis:
def expire(self, key, value):
    """Set a time-to-live on ``key`` by delegating to the client's ``expire``.

    Args:
        key: Redis key.
        value: Expiry passed straight through to the client.
    """
    return self.client.expire(key, value)
def hmset(self, name, mapping):
    """Write the field/value pairs in ``mapping`` to hash ``name`` via the client's ``hmset``."""
    return self.client.hmset(name, mapping)
def lpush(self, key, values):
    """Push ``values`` onto the head of the list stored at ``key``.

    Args:
        key: Redis list key.
        values: Iterable of items to push. A single ``str``/``bytes`` is
            treated as one item instead of being unpacked into characters.

    Returns:
        int: Whatever the client's ``lpush`` returns.
    """
    if isinstance(values, (str, bytes)):
        # Unpacking a bare string would push every character separately.
        values = [values]
    return self.client.lpush(key, *values)  # int
def hgetall(self, name):
    """Return all fields of hash ``name`` via the client's ``hgetall``."""
    return self.client.hgetall(name)
def lrange(self, key, start, end):
    """Return the items of list ``key`` between ``start`` and ``end`` via the client's ``lrange``."""
    return self.client.lrange(key, start, end)  # list
def hincrby(self, name, key, amount=1):
    """Increment field ``key`` of hash ``name`` by ``amount`` (default 1) via the client's ``hincrby``."""
    return self.client.hincrby(name, key, amount)
def rpop(self, key):
    """Pop one item from the tail of list ``key`` via the client's ``rpop``.

    Returns:
        The popped item, or ``None`` when there is nothing to pop.
    """
    return self.client.rpop(key)  # str or None
def zadd(self, name, mapping):
    """Add the member/score pairs in ``mapping`` to sorted set ``name`` via the client's ``zadd``."""
    return self.client.zadd(name, mapping)
def zremrangebyrank(self, name, start, end):
    """Read and then remove the members of sorted set ``name`` ranked ``start``..``end``.

    Returns:
        list: ``[members, removed_count]`` — the results of the queued
        ``zrange`` and ``zremrangebyrank`` commands, in that order.
    """
    # Assuming a redis-py client: transaction=True (its default) wraps both
    # commands in MULTI/EXEC, so no other client can change the set between
    # the read and the removal.
    with self.client.pipeline(transaction=True) as pipe:
        pipe.zrange(name, start, end)
        pipe.zremrangebyrank(name, start, end)
        return pipe.execute()
def zrank(self, name, value):
    """Return the rank of ``value`` in sorted set ``name`` via the client's ``zrank``."""
    return self.client.zrank(name, value)
def zrange(self, name, start, end):
    """Return the members of sorted set ``name`` ranked ``start``..``end`` via the client's ``zrange``."""
    return self.client.zrange(name, start, end)
......
......@@ -33,17 +33,20 @@ class RedisHandler:
self.time_expires = datetime.timedelta(hours=24)
self.time_format = '%a %b %d %H:%M:%S %Y'
self.prefix = 'bwm_ocr'
self.queue_key = '{0}:queue'.format(self.prefix)
self.common_queue_key = '{0}:common_queue'.format(self.prefix)
self.priority_queue_key = '{0}:priority_queue'.format(self.prefix)
def enqueue(self, tasks, is_priority=False):
    """Push tasks onto the priority queue or the common queue.

    Args:
        tasks: Task strings to enqueue; passed as-is to ``self.redis.lpush``.
        is_priority: When true the tasks go to the priority queue, which
            ``dequeue`` drains before the common queue.

    Returns:
        Whatever ``self.redis.lpush`` returns.
    """
    key = self.priority_queue_key if is_priority else self.common_queue_key
    return self.redis.lpush(key, tasks)
def dequeue(self):
    """Pop the next task, preferring the priority queue.

    Returns:
        tuple: ``(task, is_priority)``. ``task`` is ``None`` when both
        queues are empty, in which case ``is_priority`` is ``False``.
    """
    task = self.redis.rpop(self.priority_queue_key)
    if task is not None:
        return task, True
    # Priority queue is empty: fall back to the common queue.
    return self.redis.rpop(self.common_queue_key), False
......
......@@ -152,7 +152,7 @@ class PdfHandler:
print('----------------------------')
print(self.pdf_name)
print(pdf.metadata)
# xref_list = [] # TODO 图片去重
# xref_list = []
for pno in range(pdf.pageCount):
print('========================')
il = pdf.getPageImageList(pno)
......@@ -162,7 +162,7 @@ class PdfHandler:
img_il_list = self.split_il(il)
il = None
print(img_il_list)
print(len(img_il_list)) # TODO 判断单页图片过多时,使用页面转图片
print(len(img_il_list))
for img_count, img_il in enumerate(img_il_list):
print(img_il)
......
......@@ -91,7 +91,8 @@ WSGI_APPLICATION = 'wsgi.application'
# }
# One connection per configuration namespace. Models that declare
# ``situ_db_label = 'afc'`` in their Meta are routed to the 'afc' connection
# by the router listed in DATABASE_ROUTERS; everything else uses 'default'.
DATABASES = {
    'default': conf.get_namespace('MYSQL_DEFAULT_'),
    'afc': conf.get_namespace('MYSQL_AFC_'),
}
DATABASE_ROUTERS = ['settings.database.DBRouter']
MYSQLPOOL_ARGUMENTS = database.MYSQLPOOL_ARGUMENTS
......
......@@ -15,7 +15,7 @@ options.DEFAULT_NAMES = tuple(list(options.DEFAULT_NAMES) + ['situ_db_label'])
# 数据库连接池配置
MYSQLPOOL_ARGUMENTS = {
'recycle': 30,
'pool_size': 128,
'pool_size': 64,
'max_overflow': 10,
'timeout': 5,
'use_threadlocal': True,
......@@ -26,12 +26,12 @@ class DBRouter(object):
def db_for_read(self, model, **hints):
    """Pick the database alias used to read ``model``.

    Returns:
        The model's ``Meta.situ_db_label`` when it declares one, otherwise
        ``None`` so that Django falls back to its default routing.
    """
    # The attribute is 'situ_db_label'; reading '_meta.aft_db_label' raised
    # AttributeError for every labelled model.
    return getattr(model._meta, 'situ_db_label', None)
def db_for_write(self, model, **hints):
    """Pick the database alias used to write ``model``.

    Returns:
        The model's ``Meta.situ_db_label`` when it declares one, otherwise
        ``None`` so that Django falls back to its default routing.
    """
    # The attribute is 'situ_db_label'; reading '_meta.aft_db_label' raised
    # AttributeError for every labelled model.
    return getattr(model._meta, 'situ_db_label', None)
def allow_relation(self, obj1, obj2, **hints):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!