d5c6be4b by 周伟奇

doc process part 1

1 parent a4a63da9
...@@ -28,6 +28,7 @@ sftp-config.json ...@@ -28,6 +28,7 @@ sftp-config.json
28 28
29 *.sqlite3 29 *.sqlite3
30 conf/* 30 conf/*
31 data/*
31 32
32 # 脚本 33 # 脚本
33 src/*.sh 34 src/*.sh
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -4,12 +4,17 @@ Django==2.1 ...@@ -4,12 +4,17 @@ Django==2.1
4 djangorestframework==3.9.0 4 djangorestframework==3.9.0
5 djangorestframework-jwt==1.11.0 5 djangorestframework-jwt==1.11.0
6 marshmallow==3.6.1 6 marshmallow==3.6.1
7 pdfminer3k==1.3.4
8 Pillow==7.1.2
9 ply==3.11
7 PyJWT==1.7.1 10 PyJWT==1.7.1
8 PyMuPDF==1.17.0 11 PyMuPDF==1.17.0
9 PyMySQL==0.9.3 12 PyMySQL==0.9.3
10 pytz==2020.1 13 pytz==2020.1
11 # simple-config @ http://gitlab.situdata.com/zhouweiqi/simple_config/repository/archive.tar.gz?ref=master 14 PyYAML==5.3.1
12 # situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master 15 redis==3.4.1
16 simple-config @ http://gitlab.situdata.com/zhouweiqi/simple_config/repository/archive.tar.gz?ref=master
17 situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master
13 six==1.14.0 18 six==1.14.0
14 SQLAlchemy==0.9.10 19 SQLAlchemy==0.9.10
15 webargs==6.1.0 20 webargs==6.1.0
......
1 import time 1 import time
2 import os
2 import signal 3 import signal
4 import fitz
5 from PIL import Image
6 from io import BytesIO
3 7
4 from django.core.management import BaseCommand 8 from django.core.management import BaseCommand
9 from common.mixins import LoggerMixin
10 from common.redis_cache import redis_handler as rh
11 from apps.doc.models import UploadDocRecords
12 from settings import conf
5 13
6 14
7 class Command(BaseCommand): 15 class Command(BaseCommand, LoggerMixin):
8 16
def __init__(self):
    """Initialise worker state and install the SIGTERM handler.

    Sets the log prefix, the processing switch, the data directory read
    from project settings, and the matrix used to render PDF pages to images.
    """
    super().__init__()
    # Prefix prepended to every log line written by this command.
    self.log_base = '[doc process]'
    # File-processing switch; handle() loops while this is True.
    self.switch = True
    # Data directory.
    self.data_dir = conf.DATA_DIR
    # PDF page -> image: zoom factor 2 in each dimension, rotation 0.
    self.zoom_x = self.zoom_y = 2.0
    scale = fitz.Matrix(self.zoom_x, self.zoom_y)
    self.trans = scale.preRotate(0)
    # Graceful-exit signal: 15 (SIGTERM).
    signal.signal(signal.SIGTERM, self.signal_handler)
15 30
def signal_handler(self, sig, frame):
    """Signal callback: turn the processing switch off.

    ``sig`` and ``frame`` are supplied by the ``signal`` module and are not
    used; the main loop checks ``self.switch`` and stops taking new files.
    """
    self.switch = False
18 33
def get_task_info(self):  # TODO: priority queue & status modify
    """Pop the next task id from the Redis queue and load its upload record.

    Returns:
        A dict with the keys 'id', 'metadata_version_id' and
        'document_name', or None when the queue is empty or when no record
        matches the popped id.
    """
    task_id = rh.dequeue()
    if task_id is None:
        self.cronjob_log.info('{0} [get_task_info] [queue empty]'.format(self.log_base))
        return None

    task_info = UploadDocRecords.objects.filter(id=task_id).values(
        'id', 'metadata_version_id', 'document_name').first()
    if task_info is None:
        self.cronjob_log.warn('{0} [get_task_info] [task not found] [task_id={1}]'.format(self.log_base, task_id))
        # Return here: falling through would also log a misleading
        # "success" line with task_info=None for a task that was not found.
        return None

    self.cronjob_log.info('{0} [get_task_info success] [task_info={1}]'.format(self.log_base, task_info))
    return task_info
21 45
def pdf_download(self, task_info):
    """Return the local path of the PDF for ``task_info``.

    Returns None when ``task_info`` is None. Otherwise logs the task and
    returns the path; the path is currently a fixed placeholder.
    """
    if task_info is not None:
        # TODO: download the PDF from EDMS
        pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf'
        message = '{0} [pdf download success] [task_info={1}] [pdf_path={2}]'.format(
            self.log_base, task_info, pdf_path)
        self.cronjob_log.info(message)
        return pdf_path
    return None
54
55 @staticmethod
56 def getimage(pix):
57 if pix.colorspace.n != 4:
58 return pix
59 tpix = fitz.Pixmap(fitz.csRGB, pix)
60 return tpix
61
def recoverpix(self, doc, item):
    """Rebuild one PDF image, applying its soft mask as alpha when present.

    Args:
        doc: the open PDF document the image belongs to.
        item: one image-list entry; item[0] is the image xref, item[1] the
            xref of its /SMask (0 when there is none) and item[5] the
            colorspace name.

    Returns:
        The result of ``doc.extractImage`` for a DeviceRGB image without a
        soft mask; otherwise a pixmap passed through ``self.getimage``.
    """
    xref, smask = item[0], item[1]
    is_rgb = item[5] == 'DeviceRGB'

    # RGB image without a soft mask: use the direct image output.
    if is_rgb and smask == 0:
        return doc.extractImage(xref)

    base = fitz.Pixmap(doc, xref)

    if is_rgb:
        # Reconstruct the alpha channel from the soft mask.
        mask = fitz.Pixmap(doc, smask)
        compatible = (base.irect == mask.irect
                      and base.alpha == mask.alpha == 0
                      and mask.n == 1)
        if not compatible:
            # Sanity check failed: ignore the mask.
            mask = None
            return self.getimage(base)
        merged = fitz.Pixmap(base)  # copy of the base pixmap
        merged.setAlpha(mask.samples)  # mask samples become the alpha values
        base = mask = None  # free temp pixmaps
        return self.getimage(merged)

    # GRAY/CMYK
    merged = fitz.Pixmap(base)  # copy of the base pixmap
    mask = None
    if smask != 0:
        mask = fitz.Pixmap(doc, smask)
        compatible = (base.irect == mask.irect
                      and base.alpha == mask.alpha == 0
                      and mask.n == 1)
        if not compatible:
            # Sanity check failed: ignore the mask.
            mask = None
            return self.getimage(base)
        merged.setAlpha(mask.samples)  # mask samples become the alpha values

    base = mask = None  # free temp pixmaps

    merged = fitz.Pixmap(fitz.csRGB, merged)  # GRAY/CMYK to RGB
    return self.getimage(merged)
103
104 @staticmethod
105 def get_img_data(pix):
106 if type(pix) is dict: # we got a raw image
107 ext = pix["ext"]
108 img_data = pix["image"]
109 else: # we got a pixmap
110 ext = 'png'
111 img_data = pix.getPNGData()
112 return ext, img_data
113
114 @staticmethod
115 def split_il(il):
116 img_il_list = []
117 start = 0
118 length = len(il)
119 for i in range(length):
120 if i == start:
121 if i == length - 1:
122 img_il_list.append(il[start: length])
123 continue
124 elif i == length - 1:
125 img_il_list.append(il[start: length])
126 continue
127 if il[i][2] != il[i - 1][2]:
128 img_il_list.append(il[start: i])
129 start = i
130 elif il[i][3] != il[i - 1][3]:
131 img_il_list.append(il[start: i + 1])
132 start = i + 1
133 return img_il_list
24 134
25 def handle(self, *args, **kwargs): 135 def handle(self, *args, **kwargs):
26 while self.switch: 136 while self.switch:
...@@ -28,8 +138,65 @@ class Command(BaseCommand): ...@@ -28,8 +138,65 @@ class Command(BaseCommand):
28 task_info = self.get_task_info() 138 task_info = self.get_task_info()
29 # 从EDMS获取PDF文件 139 # 从EDMS获取PDF文件
30 pdf_path = self.pdf_download(task_info) 140 pdf_path = self.pdf_download(task_info)
141 # 队列为空时的处理
142 if pdf_path is None:
143 time.sleep(10)
144 continue
31 # PDF文件提取图片 145 # PDF文件提取图片
146 img_save_path = os.path.join(os.path.dirname(pdf_path), 'img')
147 os.makedirs(img_save_path, exist_ok=True)
148 with fitz.Document(pdf_path) as pdf:
149 self.cronjob_log.info('{0} [pdf_path={1}] [pdf_metadata={2}]'.format(
150 self.log_base, pdf_path, pdf.metadata))
151 # xref_list = [] # TODO 图片去重
152 for pno in range(pdf.pageCount):
153 il = pdf.getPageImageList(pno)
154 il.sort(key=lambda x: x[0])
155 img_il_list = self.split_il(il)
156 del il
157
158 if len(img_il_list) > 3: # 单页无规律小图过多时,使用页面转图片
159 page = pdf.loadPage(pno)
160 pm = page.getPixmap(matrix=self.trans, alpha=False)
161 save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
162 # pm.writePNG(save_path)
163 pm.writeImage(save_path)
164 else: # 提取图片
165 for img_count, img_il in enumerate(img_il_list):
166 if len(img_il) == 1: # 当只有一张图片时, 简化处理
167 pix = self.recoverpix(pdf, img_il[0])
168 ext, img_data = self.get_img_data(pix)
169 save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
170 pno, img_count, ext))
171 with open(save_path, "wb") as f:
172 f.write(img_data)
173 else: # 多张图片,竖向拼接
174 height_sum = 0
175 im_list = []
176 width = img_il[0][2]
177 for img in img_il:
178 # xref = img[0]
179 # if xref in xref_list:
180 # continue
181 height = img[3]
182 pix = self.recoverpix(pdf, img)
183 ext, img_data = self.get_img_data(pix)
184
185 # xref_list.append(xref)
186
187 im = Image.open(BytesIO(img_data))
188 im_list.append((height, im, ext))
189 height_sum += height
190
191 save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
192 pno, img_count, im_list[0][2]))
193 res = Image.new(im_list[0][1].mode, (width, height_sum))
194 h_now = 0
195 for h, m, _ in im_list:
196 res.paste(m, box=(0, h_now))
197 h_now += h
198 res.save(save_path)
199
32 # 图片调用算法判断是否为银行流水 200 # 图片调用算法判断是否为银行流水
33 # 图片调用算法OCR为excel文件 201 # 图片调用算法OCR为excel文件
34 # 整合excel文件上传至EDMS 202 # 整合excel文件上传至EDMS
35 pass
......
...@@ -4,7 +4,7 @@ from django.db import models ...@@ -4,7 +4,7 @@ from django.db import models
4 4
5 5
6 # 上传文件记录表/任务表 6 # 上传文件记录表/任务表
7 class UploadDocRecords(models.Model): 7 class UploadDocRecords(models.Model): # TODO add status
8 id = models.AutoField(primary_key=True, verbose_name="id") 8 id = models.AutoField(primary_key=True, verbose_name="id")
9 metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id") 9 metadata_version_id = models.CharField(max_length=64, verbose_name="元数据版本id")
10 application_id = models.CharField(max_length=64, verbose_name="申请id") 10 application_id = models.CharField(max_length=64, verbose_name="申请id")
......
...@@ -5,6 +5,7 @@ from webargs.djangoparser import use_args, parser ...@@ -5,6 +5,7 @@ from webargs.djangoparser import use_args, parser
5 from common.mixins import GenericView 5 from common.mixins import GenericView
6 from common import response 6 from common import response
7 from .models import UploadDocRecords 7 from .models import UploadDocRecords
8 from common.redis_cache import redis_handler as rh
8 9
9 # Create your views here. 10 # Create your views here.
10 11
...@@ -51,7 +52,7 @@ class DocView(GenericView): ...@@ -51,7 +52,7 @@ class DocView(GenericView):
51 applicant_data = args.get('applicantData') 52 applicant_data = args.get('applicantData')
52 document = args.get('document') 53 document = args.get('document')
53 try: 54 try:
54 UploadDocRecords.objects.create( 55 task = UploadDocRecords.objects.create(
55 metadata_version_id=document.get('metadataVersionId'), 56 metadata_version_id=document.get('metadataVersionId'),
56 application_id=application_data.get('applicationId'), 57 application_id=application_data.get('applicationId'),
57 main_applicant=applicant_data.get('mainApplicantName'), 58 main_applicant=applicant_data.get('mainApplicantName'),
...@@ -68,6 +69,8 @@ class DocView(GenericView): ...@@ -68,6 +69,8 @@ class DocView(GenericView):
68 self.running_log.info('[doc upload fail] [args={0}] [err={1}]'.format(args, e)) 69 self.running_log.info('[doc upload fail] [args={0}] [err={1}]'.format(args, e))
69 self.invalid_params(msg='metadataVersionId repeat') 70 self.invalid_params(msg='metadataVersionId repeat')
70 else: 71 else:
72 # TODO 查询加入优先队列 or 普通队列
73 rh.enqueue(task.id)
71 self.running_log.info('[doc upload success] [args={0}]'.format(args)) 74 self.running_log.info('[doc upload success] [args={0}]'.format(args))
72 return response.ok() 75 return response.ok()
73 76
......
...@@ -4,5 +4,5 @@ from settings import conf ...@@ -4,5 +4,5 @@ from settings import conf
4 4
5 redis_url = conf.REDIS_URL 5 redis_url = conf.REDIS_URL
6 6
7 # redis = Redis(redis_url) 7 redis = Redis(redis_url)
8 # redis_handler = RedisHandler(redis) 8 redis_handler = RedisHandler(redis)
......
...@@ -106,7 +106,7 @@ class Redis: ...@@ -106,7 +106,7 @@ class Redis:
106 106
def zremrangebyrank(self, name, start, end):
    """Read and remove the rank range ``start``..``end`` of sorted set ``name``.

    Queues ``zrange`` followed by ``zremrangebyrank`` on one pipeline and
    returns whatever the pipeline's ``execute()`` returns.
    """
    with self.client.pipeline() as pipeline:
        pipeline.zrange(name, start, end)  # TODO: may be inconsistent
        pipeline.zremrangebyrank(name, start, end)
        results = pipeline.execute()
    return results
......
...@@ -32,71 +32,12 @@ class RedisHandler: ...@@ -32,71 +32,12 @@ class RedisHandler:
32 self.redis = redis 32 self.redis = redis
33 self.time_expires = datetime.timedelta(hours=24) 33 self.time_expires = datetime.timedelta(hours=24)
34 self.time_format = '%a %b %d %H:%M:%S %Y' 34 self.time_format = '%a %b %d %H:%M:%S %Y'
35 self.prefix = 'automl' 35 self.prefix = 'bwm_ocr'
36 self.training_time_key = '{0}:training_time'.format(self.prefix)
37 self.queue_key = '{0}:queue'.format(self.prefix) 36 self.queue_key = '{0}:queue'.format(self.prefix)
38 self.prefix_training = '{0}:training'.format(self.prefix)
39 self.prefix_models = '{0}:models'.format(self.prefix)
40 self.prefix_img_info = '{0}:img_info'.format(self.prefix)
41 37
42 def get_training_model_key(self, user_id, model_type): 38 def enqueue(self, task_id):
43 return '{0}:{1}:{2}'.format(self.prefix_training, user_id, model_type)
44
45 def get_models_list_key(self, user_id, model_type):
46 return '{0}:{1}:{2}'.format(self.prefix_models, user_id, model_type)
47
48 def set_training_model(self, user_id, model_type, model_id, status):
49 # True
50 key = self.get_training_model_key(user_id, model_type)
51 mapping = {
52 'model_id': model_id,
53 'model_status': status
54 }
55 return self.redis.hmset(key, mapping)
56
57 def get_training_model(self, user_id, model_type):
58 # {}
59 # {'id': '1', 'status': '1'}
60 key = self.get_training_model_key(user_id, model_type)
61 res = self.redis.hgetall(key)
62 dict_str_to_int(res)
63 return res
64
65 def set_models_list(self, user_id, model_type, models_list):
66 key = self.get_models_list_key(user_id, model_type)
67 value = json.dumps(models_list, cls=DateTimeJSONEncoder)
68 return self.redis.set(key, value, expires=self.time_expires)
69
70 def get_models_list(self, user_id, model_type):
71 # list or None
72 key = self.get_models_list_key(user_id, model_type)
73 res_str = self.redis.get(key)
74 res = None if res_str is None else json.loads(res_str)
75 return res
76
77 def del_models_list(self, user_id, model_type):
78 # None
79 key = self.get_models_list_key(user_id, model_type)
80 return self.redis.delete(key)
81
82 def set_training_finish_time(self, finish_time):
83 # True
84 finish_time_str = datetime.datetime.strftime(finish_time, self.time_format)
85 return self.redis.set(self.training_time_key, finish_time_str)
86
87 def get_training_finish_time(self):
88 # datetime.datetime or None
89 res = self.redis.get(self.training_time_key)
90 finish_time = None if res is None else datetime.datetime.strptime(res, self.time_format)
91 return finish_time
92
93 def del_training_finish_time(self):
94 # None
95 return self.redis.delete(self.training_time_key)
96
97 def enqueue(self, model_id):
98 # 1 39 # 1
99 mapping = {model_id: time.time()} 40 mapping = {task_id: time.time()}
100 return self.redis.zadd(self.queue_key, mapping) 41 return self.redis.zadd(self.queue_key, mapping)
101 42
102 def dequeue(self): 43 def dequeue(self):
...@@ -106,110 +47,3 @@ class RedisHandler: ...@@ -106,110 +47,3 @@ class RedisHandler:
106 pop_item = int(pop_item_list[0]) if pop_item_list else None 47 pop_item = int(pop_item_list[0]) if pop_item_list else None
107 return pop_item 48 return pop_item
108 49
109 def get_queue_end(self):
110 # model_id:int or None
111 res_list = self.redis.zrange(self.queue_key, -1, -1)
112 end_id = int(res_list[0]) if res_list else None
113 return end_id
114
115 def get_queue_rank(self, model_id):
116 # rank:int or None
117 rank = self.redis.zrank(self.queue_key, model_id)
118 if rank is None:
119 return 0
120 return rank + 1
121
122 def set_img_info(self, user_id, model_id, count_sum, count_marked):
123 # True
124 key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id)
125 mapping = {
126 'count_sum': count_sum,
127 'count_marked': count_marked
128 }
129 return self.redis.hmset(key, mapping)
130
131 def get_img_info(self, user_id, model_id):
132 # {}
133 # {'count_sum': '70', 'count_marked': '0'}
134 key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id)
135 res = self.redis.hgetall(key)
136 dict_str_to_int(res)
137 return res
138
139 def update_img_info(self, user_id, model_id, del_img=False):
140 # res_count:int
141 key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id)
142 if del_img:
143 return self.redis.hincrby(key, 'count_sum', amount=-1)
144 else:
145 return self.redis.hincrby(key, 'count_marked')
146
147 def del_img_info(self, user_id, model_id):
148 # None
149 key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id)
150 return self.redis.delete(key)
151
152 def pipe_trained(self, user_id, model_type, model_id, status, success=True):
153 # redis.set_training_model(user_id, model_type, model_id, model_status)
154 # redis.del_training_finish_time()
155 # redis.del_models_list(user_id, model_type)
156
157 # redis.set_training_model(user_id, model_type, model_id, model_status)
158 # redis.del_training_finish_time()
159
160 training_model_key = self.get_training_model_key(user_id, model_type)
161 models_list_key = self.get_models_list_key(user_id, model_type)
162 mapping = {
163 'model_id': model_id,
164 'model_status': status
165 }
166
167 with self.redis.client.pipeline() as pipe:
168 pipe.hmset(training_model_key, mapping)
169 pipe.delete(self.training_time_key)
170 if success is True:
171 pipe.delete(models_list_key)
172 item = pipe.execute()
173 return item
174
175 def pipe_training(self, user_id, model_type, model_id, status, finish_time):
176 # redis.dequeue()
177 # redis.set_training_model(user_id, model_type, model_id, model_status)
178 # redis.set_training_finish_time(proleptic_finish_time)
179
180 training_model_key = self.get_training_model_key(user_id, model_type)
181 mapping = {
182 'model_id': model_id,
183 'model_status': status
184 }
185 finish_time_str = datetime.datetime.strftime(finish_time, self.time_format)
186
187 with self.redis.client.pipeline() as pipe:
188 pipe.zremrangebyrank(self.queue_key, 0, 0)
189 pipe.hmset(training_model_key, mapping)
190 pipe.set(self.training_time_key, finish_time_str)
191 item = pipe.execute()
192 return item
193
194 def pipe_enqueue(self, model_id, user_id, model_type, status, section=True):
195 # redis.enqueue(model_id)
196 # redis.set_training_model(user_id, model_type,
197 # model_id, ModelStatus.DATA_PRETREATMENT_DONE.value)
198 # if model_type == ModelType.SECTION.value:
199 # redis.del_img_info(user_id, model_id)
200
201 queue_mapping = {model_id: time.time()}
202 training_model_key = self.get_training_model_key(user_id, model_type)
203 mapping = {
204 'model_id': model_id,
205 'model_status': status
206 }
207 img_info_key = '{0}:{1}:{2}'.format(self.prefix_img_info, user_id, model_id)
208
209 with self.redis.client.pipeline() as pipe:
210 pipe.zadd(self.queue_key, queue_mapping)
211 pipe.hmset(training_model_key, mapping)
212 if section is True:
213 pipe.delete(img_info_key)
214 item = pipe.execute()
215 return item
......
1 import fitz 1 import fitz
2 import os 2 import os
3 from PIL import Image, ImageCms 3 from PIL import Image
4 from io import BytesIO 4 from io import BytesIO
5 5
6 6
...@@ -126,7 +126,8 @@ class PdfHandler: ...@@ -126,7 +126,8 @@ class PdfHandler:
126 fout.close() 126 fout.close()
127 xreflist.append(xref) 127 xreflist.append(xref)
128 128
129 def split_il(self, il): 129 @staticmethod
130 def split_il(il):
130 img_il_list = [] 131 img_il_list = []
131 start = 0 132 start = 0
132 length = len(il) 133 length = len(il)
......
...@@ -4,6 +4,7 @@ import os ...@@ -4,6 +4,7 @@ import os
4 BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
5 COMMON_CONF_DIR = os.path.dirname(os.path.abspath(__file__)) 5 COMMON_CONF_DIR = os.path.dirname(os.path.abspath(__file__))
6 SECRET_CONF_DIR = os.path.join(os.path.dirname(BASE_DIR), 'conf') 6 SECRET_CONF_DIR = os.path.join(os.path.dirname(BASE_DIR), 'conf')
7 DATA_DIR = os.path.join(os.path.dirname(BASE_DIR), 'data')
7 SECRET_CONF_FILE = os.path.join(SECRET_CONF_DIR, 'secret.ini') 8 SECRET_CONF_FILE = os.path.join(SECRET_CONF_DIR, 'secret.ini')
8 LOGGING_CONFIG_FILE = os.path.join(COMMON_CONF_DIR, 'logging.conf') 9 LOGGING_CONFIG_FILE = os.path.join(COMMON_CONF_DIR, 'logging.conf')
9 10
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!