ad161125 by 周伟奇

priority queue

1 parent e325cfc3
1 PAGE_DEFAULT = 1 1 PAGE_DEFAULT = 1
2 PAGE_SIZE_DEFAULT = 10 2 PAGE_SIZE_DEFAULT = 10
3
4 BUSINESS_TYPE = ['HIL', 'AFC']
5 HIL_SET = {'HIL', 'hil', 'CO00002', 'C000002'}
6 HIL_PREFIX = 'HIL'
7 AFC_PREFIX = 'AFC'
......
...@@ -3,5 +3,5 @@ from . import views ...@@ -3,5 +3,5 @@ from . import views
3 3
4 4
5 urlpatterns = [ 5 urlpatterns = [
6 path(r'', views.UploadDocView.as_view()), 6 path(r'v1', views.UploadDocView.as_view()),
7 ] 7 ]
......
...@@ -13,7 +13,8 @@ from django.core.management import BaseCommand ...@@ -13,7 +13,8 @@ from django.core.management import BaseCommand
13 from common.mixins import LoggerMixin 13 from common.mixins import LoggerMixin
14 from common.redis_cache import redis_handler as rh 14 from common.redis_cache import redis_handler as rh
15 from common.tools.file_tools import write_zip_file 15 from common.tools.file_tools import write_zip_file
16 from apps.doc.models import UploadDocRecords, DocStatus 16 from apps.doc.models import DocStatus, HILDoc, AFCDoc
17 from apps.doc import consts
17 from settings import conf 18 from settings import conf
18 19
19 20
...@@ -42,33 +43,39 @@ class Command(BaseCommand, LoggerMixin): ...@@ -42,33 +43,39 @@ class Command(BaseCommand, LoggerMixin):
42 def signal_handler(self, sig, frame): 43 def signal_handler(self, sig, frame):
43 self.switch = False # 停止处理文件 44 self.switch = False # 停止处理文件
44 45
45 def get_doc_info(self): # TODO 优先队列 46 def get_doc_info(self):
46 doc_id = rh.dequeue() 47 task_str, is_priority = rh.dequeue()
47 if doc_id is None: 48 if task_str is None:
48 self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) 49 self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
49 return 50 return None, None, None, None
50 doc_info = UploadDocRecords.objects.filter(id=doc_id).values(
51 'id', 'metadata_version_id', 'document_name').first()
52 if doc_info is None:
53 self.cronjob_log.warn('{0} [get_doc_info] [doc not found] [doc_id={1}]'.format(self.log_base, doc_id))
54 return
55 UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.PROCESSING.value)
56 self.cronjob_log.info('{0} [get_task_info success] [doc_info={1}]'.format(self.log_base, doc_info))
57 return doc_info
58 51
59 def pdf_download(self, doc_info): 52 business_type, doc_id_str = task_str.split('_')
53 doc_id = int(doc_id_str)
54 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
55 doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
56 'id', 'metadata_version_id', 'document_name').first() # TODO 查不到时是否为None
60 if doc_info is None: 57 if doc_info is None:
58 self.cronjob_log.warn('{0} [get_doc_info] [doc completed] [task_str={1}] [is_priority={2}]'.format(
59 self.log_base, task_str, is_priority))
61 return None, None, None, None 60 return None, None, None, None
61 doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESSING.value)
62 self.cronjob_log.info('{0} [get_doc_info] [task_str={1}] [is_priority={2}] [doc_info={3}]'.format(
63 self.log_base, task_str, is_priority, doc_info))
64 return doc_info, doc_class, doc_id, business_type
65
66 def pdf_download(self, doc_id, doc_info, business_type):
67 if doc_info is None:
68 return None, None, None
62 # TODO EDMS下载pdf 69 # TODO EDMS下载pdf
63 # pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf' 70 # pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf'
64 # doc_data_path = os.path.dirname(pdf_path) 71 # doc_data_path = os.path.dirname(pdf_path)
65 doc_id = doc_info['id'] 72
66 doc_data_path = os.path.join(self.data_dir, str(doc_id)) 73 doc_data_path = os.path.join(self.data_dir, business_type, str(doc_id))
67 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id)) 74 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id))
68 excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc_id)) 75 excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc_id))
69 self.cronjob_log.info('{0} [pdf download success] [doc_info={1}] [pdf_path={2}]'.format( 76 self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_info={2}] [pdf_path={3}]'.format(
70 self.log_base, doc_info, pdf_path)) 77 self.log_base, business_type, doc_info, pdf_path))
71 return doc_data_path, excel_path, pdf_path, doc_id 78 return doc_data_path, excel_path, pdf_path
72 79
73 @staticmethod 80 @staticmethod
74 def append_sheet(wb, sheets_list, img_name): 81 def append_sheet(wb, sheets_list, img_name):
...@@ -189,9 +196,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -189,9 +196,9 @@ class Command(BaseCommand, LoggerMixin):
189 max_sleep_second = 60 196 max_sleep_second = 60
190 while self.switch: 197 while self.switch:
191 # 从队列获取文件信息 198 # 从队列获取文件信息
192 doc_info = self.get_doc_info() 199 doc_info, doc_class, doc_id, business_type = self.get_doc_info()
193 # 从EDMS获取PDF文件 200 # 从EDMS获取PDF文件
194 doc_data_path, excel_path, pdf_path, doc_id = self.pdf_download(doc_info) 201 doc_data_path, excel_path, pdf_path = self.pdf_download(doc_id, doc_info, business_type)
195 # 队列为空时的处理 202 # 队列为空时的处理
196 if pdf_path is None: 203 if pdf_path is None:
197 time.sleep(sleep_second) 204 time.sleep(sleep_second)
...@@ -276,10 +283,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -276,10 +283,10 @@ class Command(BaseCommand, LoggerMixin):
276 wb.save(excel_path) # TODO no sheet (res always []) 283 wb.save(excel_path) # TODO no sheet (res always [])
277 # 整合excel文件上传至EDMS 284 # 整合excel文件上传至EDMS
278 except Exception as e: 285 except Exception as e:
279 UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value) 286 doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value)
280 self.cronjob_log.error('{0} [process failed] [doc_id={1}] [err={2}]'.format(self.log_base, doc_id, e)) 287 self.cronjob_log.error('{0} [process failed] [doc_id={1}] [err={2}]'.format(self.log_base, doc_id, e))
281 else: 288 else:
282 UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value) 289 doc_class.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value)
283 self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id)) 290 self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id))
284 291
285 self.cronjob_log.info('{0} [stop safely]') 292 self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
......
...@@ -5,7 +5,7 @@ from .named_enum import DocStatus ...@@ -5,7 +5,7 @@ from .named_enum import DocStatus
5 5
6 6
7 # 上传文件记录表/任务表 7 # 上传文件记录表/任务表
8 class UploadDocRecords(models.Model): # TODO records一张表、文件(任务)根据business_type分库存储 8 class UploadDocRecords(models.Model):
9 id = models.AutoField(primary_key=True, verbose_name="id") 9 id = models.AutoField(primary_key=True, verbose_name="id")
10 metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id") 10 metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id")
11 application_id = models.CharField(max_length=64, verbose_name="申请id") 11 application_id = models.CharField(max_length=64, verbose_name="申请id")
...@@ -13,7 +13,6 @@ class UploadDocRecords(models.Model): # TODO records一张表、文件(任务 ...@@ -13,7 +13,6 @@ class UploadDocRecords(models.Model): # TODO records一张表、文件(任务
13 co_applicant = models.CharField(max_length=16, verbose_name="共同申请人") 13 co_applicant = models.CharField(max_length=16, verbose_name="共同申请人")
14 guarantor_1 = models.CharField(max_length=16, verbose_name="担保人1") 14 guarantor_1 = models.CharField(max_length=16, verbose_name="担保人1")
15 guarantor_2 = models.CharField(max_length=16, verbose_name="担保人2") 15 guarantor_2 = models.CharField(max_length=16, verbose_name="担保人2")
16 status = models.SmallIntegerField(default=DocStatus.INIT.value, verbose_name="文件状态")
17 document_name = models.CharField(max_length=255, verbose_name="文件名") 16 document_name = models.CharField(max_length=255, verbose_name="文件名")
18 document_scheme = models.CharField(max_length=64, verbose_name="文件方案") 17 document_scheme = models.CharField(max_length=64, verbose_name="文件方案")
19 business_type = models.CharField(max_length=64, verbose_name="业务类型") 18 business_type = models.CharField(max_length=64, verbose_name="业务类型")
...@@ -26,3 +25,62 @@ class UploadDocRecords(models.Model): # TODO records一张表、文件(任务 ...@@ -26,3 +25,62 @@ class UploadDocRecords(models.Model): # TODO records一张表、文件(任务
26 managed = False 25 managed = False
27 db_table = 'upload_doc_records' 26 db_table = 'upload_doc_records'
28 27
28
29 class HILDoc(models.Model):
30 id = models.AutoField(primary_key=True, verbose_name="id")
31 record_id = models.IntegerField(verbose_name='记录id')
32 metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id")
33 application_id = models.CharField(max_length=64, verbose_name="申请id") # 联合索引
34 status = models.SmallIntegerField(default=DocStatus.INIT.value, verbose_name="文件状态") # 联合索引
35 main_applicant = models.CharField(max_length=16, verbose_name="主申请人")
36 co_applicant = models.CharField(max_length=16, verbose_name="共同申请人")
37 guarantor_1 = models.CharField(max_length=16, verbose_name="担保人1")
38 guarantor_2 = models.CharField(max_length=16, verbose_name="担保人2")
39 document_name = models.CharField(max_length=255, verbose_name="文件名")
40 document_scheme = models.CharField(max_length=64, verbose_name="文件方案")
41 data_source = models.CharField(max_length=64, verbose_name="数据源")
42 upload_finish_time = models.DateTimeField(verbose_name="上传完成时间") # 索引
43 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
44 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') # 索引
45
46 class Meta:
47 managed = False
48 db_table = 'hil_doc'
49
50
51 class AFCDoc(models.Model):
52 id = models.AutoField(primary_key=True, verbose_name="id")
53 record_id = models.IntegerField(verbose_name='记录id')
54 metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id")
55 application_id = models.CharField(max_length=64, verbose_name="申请id")
56 status = models.SmallIntegerField(default=DocStatus.INIT.value, verbose_name="文件状态")
57 main_applicant = models.CharField(max_length=16, verbose_name="主申请人")
58 co_applicant = models.CharField(max_length=16, verbose_name="共同申请人")
59 guarantor_1 = models.CharField(max_length=16, verbose_name="担保人1")
60 guarantor_2 = models.CharField(max_length=16, verbose_name="担保人2")
61 document_name = models.CharField(max_length=255, verbose_name="文件名")
62 document_scheme = models.CharField(max_length=64, verbose_name="文件方案")
63 data_source = models.CharField(max_length=64, verbose_name="数据源")
64 upload_finish_time = models.DateTimeField(verbose_name="上传完成时间")
65 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
66 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
67
68 class Meta:
69 managed = False
70 situ_db_label = 'afc'
71 db_table = 'afc_doc'
72
73
74 class PriorityApplication(models.Model):
75 id = models.AutoField(primary_key=True, verbose_name="id")
76 application_id = models.CharField(max_length=64, verbose_name="申请id") # 联合索引
77 business_type = models.CharField(max_length=64, verbose_name="业务类型") # 联合索引
78 on_off = models.BooleanField(default=True, verbose_name="是否有效") # 联合索引
79 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
80 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
81
82 class Meta:
83 managed = False
84 situ_db_label = 'afc'
85 db_table = 'priority_application'
86
......
1 from django.urls import path
2 from . import views
3
4
5 urlpatterns = [
6 path(r'v1', views.PriorityDocView.as_view()),
7 ]
...@@ -19,6 +19,7 @@ from django.urls import path, include ...@@ -19,6 +19,7 @@ from django.urls import path, include
19 urlpatterns = [ 19 urlpatterns = [
20 path('admin/', admin.site.urls), 20 path('admin/', admin.site.urls),
21 path(r'api/user/', include('apps.account.urls')), 21 path(r'api/user/', include('apps.account.urls')),
22 path(r'api/create/v1', include('apps.doc.urls')), 22 path(r'api/create/', include('apps.doc.create_urls')),
23 path(r'api/priority/', include('apps.doc.priority_urls')),
23 path(r'api/doc/', include('apps.doc.internal_urls')), 24 path(r'api/doc/', include('apps.doc.internal_urls')),
24 ] 25 ]
......
...@@ -92,27 +92,13 @@ class Redis: ...@@ -92,27 +92,13 @@ class Redis:
92 def expire(self, key, value): 92 def expire(self, key, value):
93 return self.client.expire(key, value) 93 return self.client.expire(key, value)
94 94
95 def hmset(self, name, mapping): 95 def lpush(self, key, values):
96 return self.client.hmset(name, mapping) 96 return self.client.lpush(key, *values) # int
97 97
98 def hgetall(self, name): 98 def lrange(self, key, start, end):
99 return self.client.hgetall(name) 99 return self.client.lrange(key, start, end) # list
100 100
101 def hincrby(self, name, key, amount=1): 101 def rpop(self, key):
102 return self.client.hincrby(name, key, amount) 102 return self.client.rpop(key) # str or None
103 103
104 def zadd(self, name, mapping):
105 return self.client.zadd(name, mapping)
106 104
107 def zremrangebyrank(self, name, start, end):
108 with self.client.pipeline() as pipe:
109 pipe.zrange(name, start, end) # TODO 可能出现不一致性
110 pipe.zremrangebyrank(name, start, end)
111 item = pipe.execute()
112 return item
113
114 def zrank(self, name, value):
115 return self.client.zrank(name, value)
116
117 def zrange(self, name, start, end):
118 return self.client.zrange(name, start, end)
......
...@@ -33,17 +33,20 @@ class RedisHandler: ...@@ -33,17 +33,20 @@ class RedisHandler:
33 self.time_expires = datetime.timedelta(hours=24) 33 self.time_expires = datetime.timedelta(hours=24)
34 self.time_format = '%a %b %d %H:%M:%S %Y' 34 self.time_format = '%a %b %d %H:%M:%S %Y'
35 self.prefix = 'bwm_ocr' 35 self.prefix = 'bwm_ocr'
36 self.queue_key = '{0}:queue'.format(self.prefix) 36 self.common_queue_key = '{0}:common_queue'.format(self.prefix)
37 self.priority_queue_key = '{0}:priority_queue'.format(self.prefix)
37 38
38 def enqueue(self, task_id): 39 def enqueue(self, tasks, is_priority=False):
39 # 1 40 # 1
40 mapping = {task_id: time.time()} 41 key = self.priority_queue_key if is_priority else self.common_queue_key
41 return self.redis.zadd(self.queue_key, mapping) 42 return self.redis.lpush(key, tasks)
42 43
43 def dequeue(self): 44 def dequeue(self):
44 # model_id:int or None 45 # task or None
45 res_list = self.redis.zremrangebyrank(self.queue_key, 0, 0) 46 task = self.redis.rpop(self.priority_queue_key)
46 pop_item_list = res_list[0] 47 is_priority = True
47 pop_item = int(pop_item_list[0]) if pop_item_list else None 48 if task is None:
48 return pop_item 49 task = self.redis.rpop(self.common_queue_key)
50 is_priority = False
51 return task, is_priority
49 52
......
...@@ -152,7 +152,7 @@ class PdfHandler: ...@@ -152,7 +152,7 @@ class PdfHandler:
152 print('----------------------------') 152 print('----------------------------')
153 print(self.pdf_name) 153 print(self.pdf_name)
154 print(pdf.metadata) 154 print(pdf.metadata)
155 # xref_list = [] # TODO 图片去重 155 # xref_list = []
156 for pno in range(pdf.pageCount): 156 for pno in range(pdf.pageCount):
157 print('========================') 157 print('========================')
158 il = pdf.getPageImageList(pno) 158 il = pdf.getPageImageList(pno)
...@@ -162,7 +162,7 @@ class PdfHandler: ...@@ -162,7 +162,7 @@ class PdfHandler:
162 img_il_list = self.split_il(il) 162 img_il_list = self.split_il(il)
163 il = None 163 il = None
164 print(img_il_list) 164 print(img_il_list)
165 print(len(img_il_list)) # TODO 判断单页图片过多时,使用页面转图片 165 print(len(img_il_list))
166 166
167 for img_count, img_il in enumerate(img_il_list): 167 for img_count, img_il in enumerate(img_il_list):
168 print(img_il) 168 print(img_il)
......
...@@ -91,7 +91,8 @@ WSGI_APPLICATION = 'wsgi.application' ...@@ -91,7 +91,8 @@ WSGI_APPLICATION = 'wsgi.application'
91 # } 91 # }
92 92
93 DATABASES = { 93 DATABASES = {
94 'default': conf.get_namespace('MYSQL_') 94 'default': conf.get_namespace('MYSQL_DEFAULT_'),
95 'afc': conf.get_namespace('MYSQL_AFC_')
95 } 96 }
96 DATABASE_ROUTERS = ['settings.database.DBRouter'] 97 DATABASE_ROUTERS = ['settings.database.DBRouter']
97 MYSQLPOOL_ARGUMENTS = database.MYSQLPOOL_ARGUMENTS 98 MYSQLPOOL_ARGUMENTS = database.MYSQLPOOL_ARGUMENTS
......
...@@ -15,7 +15,7 @@ options.DEFAULT_NAMES = tuple(list(options.DEFAULT_NAMES) + ['situ_db_label']) ...@@ -15,7 +15,7 @@ options.DEFAULT_NAMES = tuple(list(options.DEFAULT_NAMES) + ['situ_db_label'])
15 # 数据库连接池配置 15 # 数据库连接池配置
16 MYSQLPOOL_ARGUMENTS = { 16 MYSQLPOOL_ARGUMENTS = {
17 'recycle': 30, 17 'recycle': 30,
18 'pool_size': 128, 18 'pool_size': 64,
19 'max_overflow': 10, 19 'max_overflow': 10,
20 'timeout': 5, 20 'timeout': 5,
21 'use_threadlocal': True, 21 'use_threadlocal': True,
...@@ -26,12 +26,12 @@ class DBRouter(object): ...@@ -26,12 +26,12 @@ class DBRouter(object):
26 26
27 def db_for_read(self, model, **hints): 27 def db_for_read(self, model, **hints):
28 if hasattr(model._meta, 'situ_db_label'): 28 if hasattr(model._meta, 'situ_db_label'):
29 return model._meta.aft_db_label 29 return model._meta.situ_db_label
30 return None 30 return None
31 31
32 def db_for_write(self, model, **hints): 32 def db_for_write(self, model, **hints):
33 if hasattr(model._meta, 'situ_db_label'): 33 if hasattr(model._meta, 'situ_db_label'):
34 return model._meta.aft_db_label 34 return model._meta.situ_db_label
35 return None 35 return None
36 36
37 def allow_relation(self, obj1, obj2, **hints): 37 def allow_relation(self, obj1, obj2, **hints):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!