doc process part 1
Showing
10 changed files
with
195 additions
and
183 deletions
| ... | @@ -4,12 +4,17 @@ Django==2.1 | ... | @@ -4,12 +4,17 @@ Django==2.1 |
| 4 | djangorestframework==3.9.0 | 4 | djangorestframework==3.9.0 |
| 5 | djangorestframework-jwt==1.11.0 | 5 | djangorestframework-jwt==1.11.0 |
| 6 | marshmallow==3.6.1 | 6 | marshmallow==3.6.1 |
| 7 | pdfminer3k==1.3.4 | ||
| 8 | Pillow==7.1.2 | ||
| 9 | ply==3.11 | ||
| 7 | PyJWT==1.7.1 | 10 | PyJWT==1.7.1 |
| 8 | PyMuPDF==1.17.0 | 11 | PyMuPDF==1.17.0 |
| 9 | PyMySQL==0.9.3 | 12 | PyMySQL==0.9.3 |
| 10 | pytz==2020.1 | 13 | pytz==2020.1 |
| 11 | # simple-config @ http://gitlab.situdata.com/zhouweiqi/simple_config/repository/archive.tar.gz?ref=master | 14 | PyYAML==5.3.1 |
| 12 | # situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master | 15 | redis==3.4.1 |
| 16 | simple-config @ http://gitlab.situdata.com/zhouweiqi/simple_config/repository/archive.tar.gz?ref=master | ||
| 17 | situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master | ||
| 13 | six==1.14.0 | 18 | six==1.14.0 |
| 14 | SQLAlchemy==0.9.10 | 19 | SQLAlchemy==0.9.10 |
| 15 | webargs==6.1.0 | 20 | webargs==6.1.0 | ... | ... |
| 1 | import time | 1 | import time |
| 2 | import os | ||
| 2 | import signal | 3 | import signal |
| 4 | import fitz | ||
| 5 | from PIL import Image | ||
| 6 | from io import BytesIO | ||
| 3 | 7 | ||
| 4 | from django.core.management import BaseCommand | 8 | from django.core.management import BaseCommand |
| 9 | from common.mixins import LoggerMixin | ||
| 10 | from common.redis_cache import redis_handler as rh | ||
| 11 | from apps.doc.models import UploadDocRecords | ||
| 12 | from settings import conf | ||
| 5 | 13 | ||
| 6 | 14 | ||
| 7 | class Command(BaseCommand): | 15 | class Command(BaseCommand, LoggerMixin): |
| 8 | 16 | ||
| 9 | def __init__(self): | 17 | def __init__(self): |
| 10 | super().__init__() | 18 | super().__init__() |
| 19 | self.log_base = '[doc process]' | ||
| 11 | # 处理文件开关 | 20 | # 处理文件开关 |
| 12 | self.switch = True | 21 | self.switch = True |
| 22 | # 数据目录 | ||
| 23 | self.data_dir = conf.DATA_DIR | ||
| 24 | # pdf页面转图片 | ||
| 25 | self.zoom_x = 2.0 | ||
| 26 | self.zoom_y = 2.0 | ||
| 27 | self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0) # zoom factor 2 in each dimension | ||
| 13 | # 优雅退出信号:15 | 28 | # 优雅退出信号:15 |
| 14 | signal.signal(signal.SIGTERM, self.signal_handler) | 29 | signal.signal(signal.SIGTERM, self.signal_handler) |
| 15 | 30 | ||
| 16 | def signal_handler(self, sig, frame): | 31 | def signal_handler(self, sig, frame): |
| 17 | self.switch = False # 停止处理文件 | 32 | self.switch = False # 停止处理文件 |
| 18 | 33 | ||
| 19 | def get_task_info(self): | 34 | def get_task_info(self): # TODO 优先队列 & status modify |
| 20 | pass | 35 | task_id = rh.dequeue() |
| 36 | if task_id is None: | ||
| 37 | self.cronjob_log.info('{0} [get_task_info] [queue empty]'.format(self.log_base)) | ||
| 38 | return | ||
| 39 | task_info = UploadDocRecords.objects.filter(id=task_id).values( | ||
| 40 | 'id', 'metadata_version_id', 'document_name').first() | ||
| 41 | if task_info is None: | ||
| 42 | self.cronjob_log.warn('{0} [get_task_info] [task not found] [task_id={1}]'.format(self.log_base, task_id)) | ||
| 43 | self.cronjob_log.info('{0} [get_task_info success] [task_info={1}]'.format(self.log_base, task_info)) | ||
| 44 | return task_info | ||
| 21 | 45 | ||
| 22 | def pdf_download(self, task_info): | 46 | def pdf_download(self, task_info): |
| 23 | pass | 47 | if task_info is None: |
| 48 | return | ||
| 49 | # TODO EDMS下载pdf | ||
| 50 | pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf' | ||
| 51 | self.cronjob_log.info('{0} [pdf download success] [task_info={1}] [pdf_path={2}]'.format( | ||
| 52 | self.log_base, task_info, pdf_path)) | ||
| 53 | return pdf_path | ||
| 54 | |||
| 55 | @staticmethod | ||
| 56 | def getimage(pix): | ||
| 57 | if pix.colorspace.n != 4: | ||
| 58 | return pix | ||
| 59 | tpix = fitz.Pixmap(fitz.csRGB, pix) | ||
| 60 | return tpix | ||
| 61 | |||
| 62 | def recoverpix(self, doc, item): | ||
| 63 | x = item[0] # xref of PDF image | ||
| 64 | s = item[1] # xref of its /SMask | ||
| 65 | is_rgb = True if item[5] == 'DeviceRGB' else False | ||
| 66 | |||
| 67 | # RGB | ||
| 68 | if is_rgb: | ||
| 69 | if s == 0: | ||
| 70 | return doc.extractImage(x) | ||
| 71 | # we need to reconstruct the alpha channel with the smask | ||
| 72 | pix1 = fitz.Pixmap(doc, x) | ||
| 73 | pix2 = fitz.Pixmap(doc, s) # create pixmap of the /SMask entry | ||
| 74 | |||
| 75 | # sanity check | ||
| 76 | if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1): | ||
| 77 | pix2 = None | ||
| 78 | return self.getimage(pix1) | ||
| 79 | |||
| 80 | pix = fitz.Pixmap(pix1) # copy of pix1, alpha channel added | ||
| 81 | pix.setAlpha(pix2.samples) # treat pix2.samples as alpha value | ||
| 82 | pix1 = pix2 = None # free temp pixmaps | ||
| 83 | return self.getimage(pix) | ||
| 84 | |||
| 85 | # GRAY/CMYK | ||
| 86 | pix1 = fitz.Pixmap(doc, x) | ||
| 87 | pix = fitz.Pixmap(pix1) # copy of pix1, alpha channel added | ||
| 88 | |||
| 89 | if s != 0: | ||
| 90 | pix2 = fitz.Pixmap(doc, s) # create pixmap of the /SMask entry | ||
| 91 | |||
| 92 | # sanity check | ||
| 93 | if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1): | ||
| 94 | pix2 = None | ||
| 95 | return self.getimage(pix1) | ||
| 96 | |||
| 97 | pix.setAlpha(pix2.samples) # treat pix2.samples as alpha value | ||
| 98 | |||
| 99 | pix1 = pix2 = None # free temp pixmaps | ||
| 100 | |||
| 101 | pix = fitz.Pixmap(fitz.csRGB, pix) # GRAY/CMYK to RGB | ||
| 102 | return self.getimage(pix) | ||
| 103 | |||
| 104 | @staticmethod | ||
| 105 | def get_img_data(pix): | ||
| 106 | if type(pix) is dict: # we got a raw image | ||
| 107 | ext = pix["ext"] | ||
| 108 | img_data = pix["image"] | ||
| 109 | else: # we got a pixmap | ||
| 110 | ext = 'png' | ||
| 111 | img_data = pix.getPNGData() | ||
| 112 | return ext, img_data | ||
| 113 | |||
| 114 | @staticmethod | ||
| 115 | def split_il(il): | ||
| 116 | img_il_list = [] | ||
| 117 | start = 0 | ||
| 118 | length = len(il) | ||
| 119 | for i in range(length): | ||
| 120 | if i == start: | ||
| 121 | if i == length - 1: | ||
| 122 | img_il_list.append(il[start: length]) | ||
| 123 | continue | ||
| 124 | elif i == length - 1: | ||
| 125 | img_il_list.append(il[start: length]) | ||
| 126 | continue | ||
| 127 | if il[i][2] != il[i - 1][2]: | ||
| 128 | img_il_list.append(il[start: i]) | ||
| 129 | start = i | ||
| 130 | elif il[i][3] != il[i - 1][3]: | ||
| 131 | img_il_list.append(il[start: i + 1]) | ||
| 132 | start = i + 1 | ||
| 133 | return img_il_list | ||
| 24 | 134 | ||
| 25 | def handle(self, *args, **kwargs): | 135 | def handle(self, *args, **kwargs): |
| 26 | while self.switch: | 136 | while self.switch: |
| ... | @@ -28,8 +138,65 @@ class Command(BaseCommand): | ... | @@ -28,8 +138,65 @@ class Command(BaseCommand): |
| 28 | task_info = self.get_task_info() | 138 | task_info = self.get_task_info() |
| 29 | # 从EDMS获取PDF文件 | 139 | # 从EDMS获取PDF文件 |
| 30 | pdf_path = self.pdf_download(task_info) | 140 | pdf_path = self.pdf_download(task_info) |
| 141 | # 队列为空时的处理 | ||
| 142 | if pdf_path is None: | ||
| 143 | time.sleep(10) | ||
| 144 | continue | ||
| 31 | # PDF文件提取图片 | 145 | # PDF文件提取图片 |
| 146 | img_save_path = os.path.join(os.path.dirname(pdf_path), 'img') | ||
| 147 | os.makedirs(img_save_path, exist_ok=True) | ||
| 148 | with fitz.Document(pdf_path) as pdf: | ||
| 149 | self.cronjob_log.info('{0} [pdf_path={1}] [pdf_metadata={2}]'.format( | ||
| 150 | self.log_base, pdf_path, pdf.metadata)) | ||
| 151 | # xref_list = [] # TODO 图片去重 | ||
| 152 | for pno in range(pdf.pageCount): | ||
| 153 | il = pdf.getPageImageList(pno) | ||
| 154 | il.sort(key=lambda x: x[0]) | ||
| 155 | img_il_list = self.split_il(il) | ||
| 156 | del il | ||
| 157 | |||
| 158 | if len(img_il_list) > 3: # 单页无规律小图过多时,使用页面转图片 | ||
| 159 | page = pdf.loadPage(pno) | ||
| 160 | pm = page.getPixmap(matrix=self.trans, alpha=False) | ||
| 161 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | ||
| 162 | # pm.writePNG(save_path) | ||
| 163 | pm.writeImage(save_path) | ||
| 164 | else: # 提取图片 | ||
| 165 | for img_count, img_il in enumerate(img_il_list): | ||
| 166 | if len(img_il) == 1: # 当只有一张图片时, 简化处理 | ||
| 167 | pix = self.recoverpix(pdf, img_il[0]) | ||
| 168 | ext, img_data = self.get_img_data(pix) | ||
| 169 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
| 170 | pno, img_count, ext)) | ||
| 171 | with open(save_path, "wb") as f: | ||
| 172 | f.write(img_data) | ||
| 173 | else: # 多张图片,竖向拼接 | ||
| 174 | height_sum = 0 | ||
| 175 | im_list = [] | ||
| 176 | width = img_il[0][2] | ||
| 177 | for img in img_il: | ||
| 178 | # xref = img[0] | ||
| 179 | # if xref in xref_list: | ||
| 180 | # continue | ||
| 181 | height = img[3] | ||
| 182 | pix = self.recoverpix(pdf, img) | ||
| 183 | ext, img_data = self.get_img_data(pix) | ||
| 184 | |||
| 185 | # xref_list.append(xref) | ||
| 186 | |||
| 187 | im = Image.open(BytesIO(img_data)) | ||
| 188 | im_list.append((height, im, ext)) | ||
| 189 | height_sum += height | ||
| 190 | |||
| 191 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
| 192 | pno, img_count, im_list[0][2])) | ||
| 193 | res = Image.new(im_list[0][1].mode, (width, height_sum)) | ||
| 194 | h_now = 0 | ||
| 195 | for h, m, _ in im_list: | ||
| 196 | res.paste(m, box=(0, h_now)) | ||
| 197 | h_now += h | ||
| 198 | res.save(save_path) | ||
| 199 | |||
| 32 | # 图片调用算法判断是否为银行流水 | 200 | # 图片调用算法判断是否为银行流水 |
| 33 | # 图片调用算法OCR为excel文件 | 201 | # 图片调用算法OCR为excel文件 |
| 34 | # 整合excel文件上传至EDMS | 202 | # 整合excel文件上传至EDMS |
| 35 | pass | ... | ... |
| ... | @@ -4,7 +4,7 @@ from django.db import models | ... | @@ -4,7 +4,7 @@ from django.db import models |
| 4 | 4 | ||
| 5 | 5 | ||
| 6 | # 上传文件记录表/任务表 | 6 | # 上传文件记录表/任务表 |
| 7 | class UploadDocRecords(models.Model): | 7 | class UploadDocRecords(models.Model): # TODO add status |
| 8 | id = models.AutoField(primary_key=True, verbose_name="id") | 8 | id = models.AutoField(primary_key=True, verbose_name="id") |
| 9 | metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id") | 9 | metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id") |
| 10 | application_id = models.CharField(max_length=64, verbose_name="申请id") | 10 | application_id = models.CharField(max_length=64, verbose_name="申请id") | ... | ... |
| ... | @@ -5,6 +5,7 @@ from webargs.djangoparser import use_args, parser | ... | @@ -5,6 +5,7 @@ from webargs.djangoparser import use_args, parser |
| 5 | from common.mixins import GenericView | 5 | from common.mixins import GenericView |
| 6 | from common import response | 6 | from common import response |
| 7 | from .models import UploadDocRecords | 7 | from .models import UploadDocRecords |
| 8 | from common.redis_cache import redis_handler as rh | ||
| 8 | 9 | ||
| 9 | # Create your views here. | 10 | # Create your views here. |
| 10 | 11 | ||
| ... | @@ -51,7 +52,7 @@ class DocView(GenericView): | ... | @@ -51,7 +52,7 @@ class DocView(GenericView): |
| 51 | applicant_data = args.get('applicantData') | 52 | applicant_data = args.get('applicantData') |
| 52 | document = args.get('document') | 53 | document = args.get('document') |
| 53 | try: | 54 | try: |
| 54 | UploadDocRecords.objects.create( | 55 | task = UploadDocRecords.objects.create( |
| 55 | metadata_version_id=document.get('metadataVersionId'), | 56 | metadata_version_id=document.get('metadataVersionId'), |
| 56 | application_id=application_data.get('applicationId'), | 57 | application_id=application_data.get('applicationId'), |
| 57 | main_applicant=applicant_data.get('mainApplicantName'), | 58 | main_applicant=applicant_data.get('mainApplicantName'), |
| ... | @@ -68,6 +69,8 @@ class DocView(GenericView): | ... | @@ -68,6 +69,8 @@ class DocView(GenericView): |
| 68 | self.running_log.info('[doc upload fail] [args={0}] [err={1}]'.format(args, e)) | 69 | self.running_log.info('[doc upload fail] [args={0}] [err={1}]'.format(args, e)) |
| 69 | self.invalid_params(msg='metadataVersionId repeat') | 70 | self.invalid_params(msg='metadataVersionId repeat') |
| 70 | else: | 71 | else: |
| 72 | # TODO 查询加入优先队列 or 普通队列 | ||
| 73 | rh.enqueue(task.id) | ||
| 71 | self.running_log.info('[doc upload success] [args={0}]'.format(args)) | 74 | self.running_log.info('[doc upload success] [args={0}]'.format(args)) |
| 72 | return response.ok() | 75 | return response.ok() |
| 73 | 76 | ... | ... |
| ... | @@ -106,7 +106,7 @@ class Redis: | ... | @@ -106,7 +106,7 @@ class Redis: |
| 106 | 106 | ||
| 107 | def zremrangebyrank(self, name, start, end): | 107 | def zremrangebyrank(self, name, start, end): |
| 108 | with self.client.pipeline() as pipe: | 108 | with self.client.pipeline() as pipe: |
| 109 | pipe.zrange(name, start, end) | 109 | pipe.zrange(name, start, end) # TODO 可能出现不一致性 |
| 110 | pipe.zremrangebyrank(name, start, end) | 110 | pipe.zremrangebyrank(name, start, end) |
| 111 | item = pipe.execute() | 111 | item = pipe.execute() |
| 112 | return item | 112 | return item | ... | ... |
| ... | @@ -32,71 +32,12 @@ class RedisHandler: | ... | @@ -32,71 +32,12 @@ class RedisHandler: |
| 32 | self.redis = redis | 32 | self.redis = redis |
| 33 | self.time_expires = datetime.timedelta(hours=24) | 33 | self.time_expires = datetime.timedelta(hours=24) |
| 34 | self.time_format = '%a %b %d %H:%M:%S %Y' | 34 | self.time_format = '%a %b %d %H:%M:%S %Y' |
| 35 | self.prefix = 'automl' | 35 | self.prefix = 'bwm_ocr' |
| 36 | self.training_time_key = '{0}:training_time'.format(self.prefix) | ||
| 37 | self.queue_key = '{0}:queue'.format(self.prefix) | 36 | self.queue_key = '{0}:queue'.format(self.prefix) |
| 38 | self.prefix_training = '{0}:training'.format(self.prefix) | ||
| 39 | self.prefix_models = '{0}:models'.format(self.prefix) | ||
| 40 | self.prefix_img_info = '{0}:img_info'.format(self.prefix) | ||
| 41 | 37 | ||
| 42 | def get_training_model_key(self, user_id, model_type): | 38 | def enqueue(self, task_id): |
| 43 | return '{0}:{1}:{2}'.format(self.prefix_training, user_id, model_type) | ||
| 44 | |||
| 45 | def get_models_list_key(self, user_id, model_type): | ||
| 46 | return '{0}:{1}:{2}'.format(self.prefix_models, user_id, model_type) | ||
| 47 | |||
| 48 | def set_training_model(self, user_id, model_type, model_id, status): | ||
| 49 | # True | ||
| 50 | key = self.get_training_model_key(user_id, model_type) | ||
| 51 | mapping = { | ||
| 52 | 'model_id': model_id, | ||
| 53 | 'model_status': status | ||
| 54 | } | ||
| 55 | return self.redis.hmset(key, mapping) | ||
| 56 | |||
| 57 | def get_training_model(self, user_id, model_type): | ||
| 58 | # {} | ||
| 59 | # {'id': '1', 'status': '1'} | ||
| 60 | key = self.get_training_model_key(user_id, model_type) | ||
| 61 | res = self.redis.hgetall(key) | ||
| 62 | dict_str_to_int(res) | ||
| 63 | return res | ||
| 64 | |||
| 65 | def set_models_list(self, user_id, model_type, models_list): | ||
| 66 | key = self.get_models_list_key(user_id, model_type) | ||
| 67 | value = json.dumps(models_list, cls=DateTimeJSONEncoder) | ||
| 68 | return self.redis.set(key, value, expires=self.time_expires) | ||
| 69 | |||
| 70 | def get_models_list(self, user_id, model_type): | ||
| 71 | # list or None | ||
| 72 | key = self.get_models_list_key(user_id, model_type) | ||
| 73 | res_str = self.redis.get(key) | ||
| 74 | res = None if res_str is None else json.loads(res_str) | ||
| 75 | return res | ||
| 76 | |||
| 77 | def del_models_list(self, user_id, model_type): | ||
| 78 | # None | ||
| 79 | key = self.get_models_list_key(user_id, model_type) | ||
| 80 | return self.redis.delete(key) | ||
| 81 | |||
| 82 | def set_training_finish_time(self, finish_time): | ||
| 83 | # True | ||
| 84 | finish_time_str = datetime.datetime.strftime(finish_time, self.time_format) | ||
| 85 | return self.redis.set(self.training_time_key, finish_time_str) | ||
| 86 | |||
| 87 | def get_training_finish_time(self): | ||
| 88 | # datetime.datetime or None | ||
| 89 | res = self.redis.get(self.training_time_key) | ||
| 90 | finish_time = None if res is None else datetime.datetime.strptime(res, self.time_format) | ||
| 91 | return finish_time | ||
| 92 | |||
| 93 | def del_training_finish_time(self): | ||
| 94 | # None | ||
| 95 | return self.redis.delete(self.training_time_key) | ||
| 96 | |||
| 97 | def enqueue(self, model_id): | ||
| 98 | # 1 | 39 | # 1 |
| 99 | mapping = {model_id: time.time()} | 40 | mapping = {task_id: time.time()} |
| 100 | return self.redis.zadd(self.queue_key, mapping) | 41 | return self.redis.zadd(self.queue_key, mapping) |
| 101 | 42 | ||
| 102 | def dequeue(self): | 43 | def dequeue(self): |
| ... | @@ -106,110 +47,3 @@ class RedisHandler: | ... | @@ -106,110 +47,3 @@ class RedisHandler: |
| 106 | pop_item = int(pop_item_list[0]) if pop_item_list else None | 47 | pop_item = int(pop_item_list[0]) if pop_item_list else None |
| 107 | return pop_item | 48 | return pop_item |
| 108 | 49 | ||
| 109 | def get_queue_end(self): | ||
| 110 | # model_id:int or None | ||
| 111 | res_list = self.redis.zrange(self.queue_key, -1, -1) | ||
| 112 | end_id = int(res_list[0]) if res_list else None | ||
| 113 | return end_id | ||
| 114 | |||
| 115 | def get_queue_rank(self, model_id): | ||
| 116 | # rank:int or None | ||
| 117 | rank = self.redis.zrank(self.queue_key, model_id) | ||
| 118 | if rank is None: | ||
| 119 | return 0 | ||
| 120 | return rank + 1 | ||
| 121 | |||
| 122 | def set_img_info(self, user_id, model_id, count_sum, count_marked): | ||
| 123 | # True | ||
| 124 | key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id) | ||
| 125 | mapping = { | ||
| 126 | 'count_sum': count_sum, | ||
| 127 | 'count_marked': count_marked | ||
| 128 | } | ||
| 129 | return self.redis.hmset(key, mapping) | ||
| 130 | |||
| 131 | def get_img_info(self, user_id, model_id): | ||
| 132 | # {} | ||
| 133 | # {'count_sum': '70', 'count_marked': '0'} | ||
| 134 | key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id) | ||
| 135 | res = self.redis.hgetall(key) | ||
| 136 | dict_str_to_int(res) | ||
| 137 | return res | ||
| 138 | |||
| 139 | def update_img_info(self, user_id, model_id, del_img=False): | ||
| 140 | # res_count:int | ||
| 141 | key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id) | ||
| 142 | if del_img: | ||
| 143 | return self.redis.hincrby(key, 'count_sum', amount=-1) | ||
| 144 | else: | ||
| 145 | return self.redis.hincrby(key, 'count_marked') | ||
| 146 | |||
| 147 | def del_img_info(self, user_id, model_id): | ||
| 148 | # None | ||
| 149 | key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id) | ||
| 150 | return self.redis.delete(key) | ||
| 151 | |||
| 152 | def pipe_trained(self, user_id, model_type, model_id, status, success=True): | ||
| 153 | # redis.set_training_model(user_id, model_type, model_id, model_status) | ||
| 154 | # redis.del_training_finish_time() | ||
| 155 | # redis.del_models_list(user_id, model_type) | ||
| 156 | |||
| 157 | # redis.set_training_model(user_id, model_type, model_id, model_status) | ||
| 158 | # redis.del_training_finish_time() | ||
| 159 | |||
| 160 | training_model_key = self.get_training_model_key(user_id, model_type) | ||
| 161 | models_list_key = self.get_models_list_key(user_id, model_type) | ||
| 162 | mapping = { | ||
| 163 | 'model_id': model_id, | ||
| 164 | 'model_status': status | ||
| 165 | } | ||
| 166 | |||
| 167 | with self.redis.client.pipeline() as pipe: | ||
| 168 | pipe.hmset(training_model_key, mapping) | ||
| 169 | pipe.delete(self.training_time_key) | ||
| 170 | if success is True: | ||
| 171 | pipe.delete(models_list_key) | ||
| 172 | item = pipe.execute() | ||
| 173 | return item | ||
| 174 | |||
| 175 | def pipe_training(self, user_id, model_type, model_id, status, finish_time): | ||
| 176 | # redis.dequeue() | ||
| 177 | # redis.set_training_model(user_id, model_type, model_id, model_status) | ||
| 178 | # redis.set_training_finish_time(proleptic_finish_time) | ||
| 179 | |||
| 180 | training_model_key = self.get_training_model_key(user_id, model_type) | ||
| 181 | mapping = { | ||
| 182 | 'model_id': model_id, | ||
| 183 | 'model_status': status | ||
| 184 | } | ||
| 185 | finish_time_str = datetime.datetime.strftime(finish_time, self.time_format) | ||
| 186 | |||
| 187 | with self.redis.client.pipeline() as pipe: | ||
| 188 | pipe.zremrangebyrank(self.queue_key, 0, 0) | ||
| 189 | pipe.hmset(training_model_key, mapping) | ||
| 190 | pipe.set(self.training_time_key, finish_time_str) | ||
| 191 | item = pipe.execute() | ||
| 192 | return item | ||
| 193 | |||
| 194 | def pipe_enqueue(self, model_id, user_id, model_type, status, section=True): | ||
| 195 | # redis.enqueue(model_id) | ||
| 196 | # redis.set_training_model(user_id, model_type, | ||
| 197 | # model_id, ModelStatus.DATA_PRETREATMENT_DONE.value) | ||
| 198 | # if model_type == ModelType.SECTION.value: | ||
| 199 | # redis.del_img_info(user_id, model_id) | ||
| 200 | |||
| 201 | queue_mapping = {model_id: time.time()} | ||
| 202 | training_model_key = self.get_training_model_key(user_id, model_type) | ||
| 203 | mapping = { | ||
| 204 | 'model_id': model_id, | ||
| 205 | 'model_status': status | ||
| 206 | } | ||
| 207 | img_info_key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id) | ||
| 208 | |||
| 209 | with self.redis.client.pipeline() as pipe: | ||
| 210 | pipe.zadd(self.queue_key, queue_mapping) | ||
| 211 | pipe.hmset(training_model_key, mapping) | ||
| 212 | if section is True: | ||
| 213 | pipe.delete(img_info_key) | ||
| 214 | item = pipe.execute() | ||
| 215 | return item | ... | ... |
| 1 | import fitz | 1 | import fitz |
| 2 | import os | 2 | import os |
| 3 | from PIL import Image, ImageCms | 3 | from PIL import Image |
| 4 | from io import BytesIO | 4 | from io import BytesIO |
| 5 | 5 | ||
| 6 | 6 | ||
| ... | @@ -126,7 +126,8 @@ class PdfHandler: | ... | @@ -126,7 +126,8 @@ class PdfHandler: |
| 126 | fout.close() | 126 | fout.close() |
| 127 | xreflist.append(xref) | 127 | xreflist.append(xref) |
| 128 | 128 | ||
| 129 | def split_il(self, il): | 129 | @staticmethod |
| 130 | def split_il(il): | ||
| 130 | img_il_list = [] | 131 | img_il_list = [] |
| 131 | start = 0 | 132 | start = 0 |
| 132 | length = len(il) | 133 | length = len(il) | ... | ... |
| ... | @@ -4,6 +4,7 @@ import os | ... | @@ -4,6 +4,7 @@ import os |
| 4 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | 4 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 5 | COMMON_CONF_DIR = os.path.dirname(os.path.abspath(__file__)) | 5 | COMMON_CONF_DIR = os.path.dirname(os.path.abspath(__file__)) |
| 6 | SECRET_CONF_DIR = os.path.join(os.path.dirname(BASE_DIR), 'conf') | 6 | SECRET_CONF_DIR = os.path.join(os.path.dirname(BASE_DIR), 'conf') |
| 7 | DATA_DIR = os.path.join(os.path.dirname(BASE_DIR), 'data') | ||
| 7 | SECRET_CONF_FILE = os.path.join(SECRET_CONF_DIR, 'secret.ini') | 8 | SECRET_CONF_FILE = os.path.join(SECRET_CONF_DIR, 'secret.ini') |
| 8 | LOGGING_CONFIG_FILE = os.path.join(COMMON_CONF_DIR, 'logging.conf') | 9 | LOGGING_CONFIG_FILE = os.path.join(COMMON_CONF_DIR, 'logging.conf') |
| 9 | 10 | ... | ... |
-
Please register or sign in to post a comment