c1ca6fa5 by 周伟奇

add ltgt wb daily

1 parent c39b3051
1 import os
2 import re
3 import time
4 import shutil
5 import base64
6 import signal
7 import requests
8 import traceback
9 from PIL import Image
10 from datetime import datetime
11 from django.core.management import BaseCommand
12 from multiprocessing import Process, Queue
13 from openpyxl import load_workbook, Workbook
14
15 from settings import conf
16 from common.mixins import LoggerMixin
17 from common.tools.pdf_to_img import PDFHandler
18 from apps.doc import consts
19 from apps.doc.exceptions import OCR1Exception, OCR4Exception, LTGTException
20 from apps.doc.ocr.wb import BSWorkbook
21
22
class TIFFHandler:
    """Split a (possibly multi-frame) TIFF file into one image file per frame."""

    # Modes Pillow's JPEG writer accepts as-is; frames in any other mode are
    # converted to RGB first, otherwise ``save`` raises for e.g. RGBA or P.
    JPEG_MODES = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

    def __init__(self, path, img_save_path):
        """Remember the source TIFF ``path`` and the output directory."""
        self.path = path
        self.img_save_path = img_save_path
        # Filled by extract_image() with the path of every frame written
        self.img_path_list = []

    def extract_image(self):
        """Write each frame to ``<img_save_path>/page_<index>.jpeg``.

        The output directory is created when missing. Saved paths are
        appended to ``self.img_path_list`` in frame order. Extraction stops
        quietly at the first ``EOFError`` (truncated file); any other error
        propagates to the caller.
        """
        os.makedirs(self.img_save_path, exist_ok=True)
        # The context manager releases the file handle even if a frame fails
        with Image.open(self.path) as tiff:
            tiff.load()
            for i in range(tiff.n_frames):
                try:
                    save_path = os.path.join(self.img_save_path, 'page_{0}.jpeg'.format(i))
                    tiff.seek(i)
                    if tiff.mode in self.JPEG_MODES:
                        frame = tiff
                    else:
                        frame = tiff.convert('RGB')
                    frame.save(save_path)
                    self.img_path_list.append(save_path)
                except EOFError:
                    break
43
44
45 class Command(BaseCommand, LoggerMixin):
46
def __init__(self):
    """Initialise the command: log prefix, LTGT sheet layouts, timings and settings."""
    super().__init__()
    self.log_base = '[folder ltgt process]'
    # File-processing switch; cleared by signal_handler to stop the loops
    self.switch = True
    # classify id -> (sheet name, header fields) of the daily workbook
    self.sheet_content = {
        128: ('执行裁定书', ('承办法院', '案号/标号', '被执行人', '债权金额', '诉讼时间')),
        129: ('民事判决书', ('承办法院', '案号/标号', '被告', '判决结果: 贷款本金', '判决结果: 罚息', '判决结果: 律师费', '判决结果: 案件受理费', '诉讼时间')),
        130: ('民事调解书', ('承办法院', '案号/标号', '被告', '协议内容: 支付金额', '协议内容: 案件受理费', '诉讼时间')),
    }
    # classify id -> LTGT label; the labels are exactly the sheet names above
    self.ltgt_classify_mapping = {
        classify: sheet_name
        for classify, (sheet_name, _) in self.sheet_content.items()
    }
    # Keys of the dicts exchanged through the result queue
    self.DATE_KEY = 'date'
    self.CLASSIFY_KEY = 'classify'
    self.RESULT_KEY = 'result'
    # Daily workbook file name; {0} is the date string
    self.daily_wb_name = 'Output_{0}.xlsx'
    # Pauses (seconds) used by wb_process
    self.short_sleep_time = 10
    self.long_sleep_time = 3600
    # Pause (seconds) used by folder_process
    self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
    # Input folders
    self.input_dirs = conf.get_namespace('LTGT_DIR_')
    # OCR settings
    # NOTE(review): ocr_url (conf.OCR_URL_FOLDER) and ocr_url_4 (conf.IC_URL)
    # are deliberately not set here, yet ocr_process / license1_process read
    # them - confirm the non-LTGT path is never used by this command.
    self.ltgt_ocr_url = conf.LTGT_URL
    # Graceful-exit signal: 15 (SIGTERM)
    signal.signal(signal.SIGTERM, self.signal_handler)
78
79 def signal_handler(self, sig, frame):
80 self.switch = False # 停止处理文件
81
def license1_process(self, ocr_data, license_summary, classify, img_path):
    """Post-process one OCR entry and merge its records into ``license_summary``.

    ``ocr_data['data']`` is the list of records; nothing happens when it is
    missing or empty. Records are mutated in place and finally appended to
    ``license_summary[classify]``.

    * ``consts.MVC_CLASSIFY`` (vehicle registration certificate): records
      whose ``page`` is ``'VehicleRegArea'`` get their ``results`` flattened
      into the record itself.
    * ``consts.IC_CLASSIFY``: each record's ``base64_img`` is posted to
      ``self.ocr_url_4`` (with retries) and the mapped outcome is stored
      under ``consts.IC_TURE_OR_FALSE``.

    ``img_path`` is only used in log messages.
    """
    # Original note - category: '0' ID card, '1' residence permit
    license_data = ocr_data.get('data', [])
    if not license_data:
        return

    if classify == consts.MVC_CLASSIFY:
        # Vehicle registration certificate: merge the page 3/4 result
        for mvc_dict in license_data:
            try:
                mvc_page = mvc_dict.pop('page')
            except Exception:
                # Record without a usable 'page' entry: left untouched
                continue
            if mvc_page != 'VehicleRegArea':
                continue
            mvc_res = mvc_dict.pop('results', {})
            register_no = mvc_res.get('register_no', {})
            mvc_dict['机动车登记证书编号'] = register_no.get('words', '')
            for register_info in mvc_res.get('register_info', []):
                for detail in register_info.get('details', {}).values():
                    field_name = detail.get('chinese_key', '未知')
                    mvc_dict.setdefault(field_name, []).append(detail.get('words', ''))

    if classify == consts.IC_CLASSIFY:
        for id_card_dict in license_data:
            try:
                base64_img = id_card_dict.pop('base64_img')
            except Exception:
                # No image to check: skip the ID-card request for this record
                continue

            card_type = -1
            payload = {
                'mode': 1,
                'user_info': {
                    'image_content': base64_img,
                },
                'options': {
                    'distinguish_type': 1,
                    'auto_rotate': True,
                },
            }
            for attempt in range(consts.RETRY_TIMES):
                try:
                    started = time.time()
                    response = requests.post(self.ocr_url_4, json=payload)
                    if response.status_code != 200:
                        raise OCR4Exception('ocr_4 status code: {0}'.format(response.status_code))
                except Exception:
                    self.folder_log.warn(
                        '{0} [ocr_4 failed] [times={1}] [img_path={2}] [error={3}]'.format(
                            self.log_base, attempt, img_path, traceback.format_exc()))
                    continue

                ocr_4_res = response.json()
                speed_time = int(time.time() - started)

                if ocr_4_res.get('code') == 0 and ocr_4_res.get('result', {}).get('rtn') == 0:
                    distinguish = ocr_4_res.get('result', {}).get('idcard_distinguish_result', {})
                    card_type = distinguish.get('result', -1)

                self.folder_log.info(
                    '{0} [ocr_4 success] [img_path={1}] [speed_time={2}]'.format(
                        self.log_base, img_path, speed_time))
                break
            else:
                # Every attempt failed; card_type keeps its -1 default
                self.folder_log.warn(
                    '{0} [ocr_4 failed] [img_path={1}]'.format(self.log_base, img_path))

            id_card_dict[consts.IC_TURE_OR_FALSE] = consts.IC_RES_MAPPING.get(card_type)

    license_summary.setdefault(classify, []).extend(license_data)
149
150 @staticmethod
151 def parse_img_path(img_path):
152 # 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
153 img_name, _ = os.path.splitext(os.path.basename(img_path))
154 if re.match(r'page_\d+_img_\d+', img_name):
155 part_list = img_name.split('_')
156 return img_name, int(part_list[1])+1, int(part_list[3])+1
157 else:
158 return img_name, 1, 1
159
160 @staticmethod
161 def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
162 time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
163 new_name = '{0}_{1}'.format(time_stamp, name)
164 img_save_path = os.path.join(img_output_dir, new_name)
165 pdf_save_path = os.path.join(pdf_output_dir, new_name)
166 excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
167 excel_path = os.path.join(wb_output_dir, excel_name)
168 return img_save_path, excel_path, pdf_save_path
169
def res_process(self, all_res, classify, excel_path):
    """Collect the successful OCR results and save them as a workbook.

    ``all_res`` maps image path -> OCR response. Only dict responses with
    ``code == 1`` and a list under ``data`` are used; each list item goes
    through ``license1_process``. Nothing is written when ``all_res`` is
    empty. Any exception is logged and swallowed.
    """
    try:
        if not all_res:
            return

        license_summary = {}
        for img_path, ocr_res in all_res.items():
            if not isinstance(ocr_res, dict):
                continue
            if ocr_res.get('code') != 1:
                continue
            data_list = ocr_res.get('data', [])
            if not isinstance(data_list, list):
                continue
            for ocr_data in data_list:
                self.license1_process(ocr_data, license_summary, classify, img_path)

        wb = BSWorkbook(set(), set(), set(), set(), set())
        wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
        wb.remove_base_sheet()
        wb.save(excel_path)
    except Exception:
        self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc()))
196
def ocr_process(self, img_path, classify):
    """Send one image to the OCR service and return its parsed JSON reply.

    The file is base64-encoded and posted as ``{"file": ..., "classify": ...}``
    to ``self.ocr_url``, retrying up to ``consts.RETRY_TIMES`` times on
    request errors or non-200 status codes.

    Returns:
        The decoded JSON response, or ``None`` when ``img_path`` does not
        exist or every attempt failed. Both failure cases are logged as a
        warning, so a ``None`` result can always be traced in the log
        (previously only one of the two was logged).
    """
    # NOTE(review): __init__ in this file leaves ``ocr_url`` commented out -
    # confirm it is configured before this path is relied on.
    if os.path.exists(img_path):
        # TODO: validate the image
        with open(img_path, 'rb') as f:
            base64_data = base64.b64encode(f.read())
            # The service expects the base64 payload as text, not bytes
            file_data = base64_data.decode()
        json_data = {
            "file": file_data,
            "classify": classify
        }

        for times in range(consts.RETRY_TIMES):
            try:
                start_time = time.time()
                ocr_response = requests.post(self.ocr_url, json=json_data)
                if ocr_response.status_code != 200:
                    raise OCR1Exception('{0} ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
            except Exception:
                self.folder_log.warn('{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'.format(
                    self.log_base, times, img_path, traceback.format_exc()))
            else:
                ocr_res = ocr_response.json()
                end_time = time.time()
                speed_time = int(end_time - start_time)
                self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format(
                    self.log_base, img_path, ocr_res, speed_time))
                return ocr_res

    # Reached when the image is missing or all retries were used up
    self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
    return None
227
def ltgt_ocr_process(self, img_path_list, label, path):
    """Send all page images of one document to the LTGT OCR service.

    Existing files from ``img_path_list`` are base64-encoded and posted as
    ``{"label": label, "img_data_list": [...]}`` to ``self.ltgt_ocr_url``,
    retrying up to ``consts.RETRY_TIMES`` times. ``path`` (the source
    document) is only used in log messages.

    Returns:
        The decoded JSON response, or ``None`` after all attempts failed.
    """
    img_data_list = []
    for img_path in img_path_list:
        if not os.path.exists(img_path):
            continue
        with open(img_path, 'rb') as img_file:
            # base64 bytes -> text so the payload is JSON serialisable
            img_data_list.append(base64.b64encode(img_file.read()).decode())

    json_data = {
        "label": label,
        "img_data_list": img_data_list
    }

    for attempt in range(consts.RETRY_TIMES):
        try:
            started = time.time()
            response = requests.post(self.ltgt_ocr_url, json=json_data)
            if response.status_code != 200:
                raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, response.status_code))
        except Exception:
            self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
                self.log_base, attempt, path, traceback.format_exc()))
            continue

        ocr_res = response.json()
        speed_time = int(time.time() - started)
        self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
            self.log_base, path, ocr_res, speed_time))
        return ocr_res

    # Only reached when no attempt returned
    self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
262
def ltgt_res_process(self, ocr_res, label, excel_path):
    """Turn an LTGT OCR response into a workbook saved at ``excel_path``.

    Only a dict response with ``code == 1`` is processed; its ``data`` is
    passed to ``BSWorkbook.ltgt_build`` together with ``label``.

    Returns:
        Whatever ``ltgt_build`` returns, or ``None`` when the response is
        unusable or building/saving raised (the error is logged).
    """
    try:
        if not isinstance(ocr_res, dict):
            return None
        if ocr_res.get('code') != 1:
            return None

        result_dict = ocr_res.get('data', {})
        wb = BSWorkbook(set(), set(), set(), set(), set())
        rebuild_res = wb.ltgt_build(label, result_dict)
        wb.remove_base_sheet()
        wb.save(excel_path)
        return rebuild_res
    except Exception:
        self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc()))
277
def ltgt_process(self, img_path_list, label, excel_path, path):
    """Run LTGT OCR on the page images, then build the workbook from the reply.

    Returns the value of ``ltgt_res_process`` for the OCR response.
    """
    return self.ltgt_res_process(
        self.ltgt_ocr_process(img_path_list, label, path),
        label,
        excel_path,
    )
282
283 def images_process(self, img_path_list, classify, excel_path):
284 all_res = {}
285 for img_path in img_path_list:
286 ocr_res = self.ocr_process(img_path, classify)
287 all_res[img_path] = ocr_res
288 self.res_process(all_res, classify, excel_path)
289
def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
    """Convert a PDF to images, OCR them and archive the PDF.

    LTGT classifies use ``PDFHandler.extract_page_image`` and the LTGT
    pipeline; all others use ``extract_image`` and ``images_process``.
    Conversion errors are logged and re-raised. On success the PDF is moved
    to its timestamped location under ``pdf_output_dir``.

    Returns:
        The LTGT rebuild result, or ``None`` for non-LTGT classifies or
        when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return None

    try:
        img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
        self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
        pdf_handler = PDFHandler(path, img_save_path)
        if classify in self.ltgt_classify_mapping:
            extract = pdf_handler.extract_page_image
        else:
            extract = pdf_handler.extract_image
        extract()
        self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
    except Exception as e:
        self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        raise e

    rebuild_res = None
    if classify in self.ltgt_classify_mapping:
        ltgt_label = self.ltgt_classify_mapping[classify]
        rebuild_res = self.ltgt_process(pdf_handler.img_path_list, ltgt_label, excel_path, path)
    else:
        self.images_process(pdf_handler.img_path_list, classify, excel_path)
    shutil.move(path, pdf_save_path)
    return rebuild_res
314
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
    """Split a TIFF into images, OCR them and archive the TIFF.

    Splitting errors are logged and re-raised. LTGT classifies go through
    the LTGT pipeline, all others through ``images_process``. On success
    the TIFF is moved to its timestamped location under ``tiff_output_dir``.

    Returns:
        The LTGT rebuild result, or ``None`` for non-LTGT classifies or
        when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return None

    try:
        img_save_path, excel_path, tiff_save_path = self.get_path(name, img_output_dir, wb_output_dir, tiff_output_dir)
        self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
        tiff_handler = TIFFHandler(path, img_save_path)
        tiff_handler.extract_image()
        self.folder_log.info('{0} [tiff to img end] [path={1}]'.format(self.log_base, path))
    except Exception as e:
        self.folder_log.error('{0} [tiff to img error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        raise e

    rebuild_res = None
    if classify in self.ltgt_classify_mapping:
        ltgt_label = self.ltgt_classify_mapping[classify]
        rebuild_res = self.ltgt_process(tiff_handler.img_path_list, ltgt_label, excel_path, path)
    else:
        self.images_process(tiff_handler.img_path_list, classify, excel_path)
    shutil.move(path, tiff_save_path)
    return rebuild_res
336
def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
    """OCR a single image file and archive it under ``img_output_dir``.

    Note the parameter order: ``wb_output_dir`` comes before
    ``img_output_dir`` here. If the output paths cannot be built, the error
    is logged and the file is left where it is.

    Returns:
        The LTGT rebuild result for LTGT classifies, otherwise ``None``.
    """
    try:
        img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
    except Exception:
        self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        return None

    if classify in self.ltgt_classify_mapping:
        rebuild_res = self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
    else:
        rebuild_res = None
        self.res_process({path: self.ocr_process(path, classify)}, classify, excel_path)
    shutil.move(path, img_save_path)
    return rebuild_res
353
354 def wb_process(self, wb_dir, result_queue):
355 while self.switch:
356 result_list = []
357 date_str = None
358 for i in range(100):
359 try:
360 result = result_queue.get(block=False)
361 except Exception as e:
362 time.sleep(self.short_sleep_time)
363 else:
364 if date_str is None:
365 date_str = result[self.DATE_KEY]
366 result_list.append(result)
367 elif result[self.DATE_KEY] == date_str:
368 result_list.append(result)
369 else:
370 break
371 if date_str is None:
372 time.sleep(self.long_sleep_time)
373 continue
374 else:
375 wb_name = self.daily_wb_name.format(date_str)
376 wb_path = os.path.join(wb_dir, wb_name)
377 if os.path.isfile(wb_path):
378 wb = load_workbook(wb_path)
379 else:
380 wb = Workbook()
381 for result in result_list:
382 sheet_name, head_fields = self.sheet_content[result[self.CLASSIFY_KEY]]
383 row = []
384 for field in head_fields:
385 row.append(result[self.RESULT_KEY].get(field))
386 if sheet_name in wb.sheetnames:
387 ws = wb.get_sheet_by_name(sheet_name)
388 else:
389 ws = wb.create_sheet(sheet_name)
390 ws.append(head_fields)
391 ws.append(row)
392 wb.save(wb_path)
393
394 def folder_process(self, input_dir, classify, result_queue):
395 while not os.path.isdir(input_dir):
396 self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
397 if self.switch:
398 time.sleep(self.sleep_time)
399 continue
400 else:
401 return
402 output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
403 img_output_dir = os.path.join(output_dir, 'image')
404 wb_output_dir = os.path.join(output_dir, 'excel')
405 pdf_output_dir = os.path.join(output_dir, 'pdf')
406 tiff_output_dir = os.path.join(output_dir, 'tiff')
407 failed_output_dir = os.path.join(output_dir, 'failed')
408 os.makedirs(output_dir, exist_ok=True)
409 os.makedirs(img_output_dir, exist_ok=True)
410 os.makedirs(wb_output_dir, exist_ok=True)
411 os.makedirs(pdf_output_dir, exist_ok=True)
412 os.makedirs(tiff_output_dir, exist_ok=True)
413 os.makedirs(failed_output_dir, exist_ok=True)
414 os_error_filename_set = set()
415 while self.switch:
416 # if not os.path.isdir(input_dir):
417 # self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
418 # time.sleep(self.sleep_time)
419 # continue
420 # 1. 从input dir获取pdf or image
421 list_dir = os.listdir(input_dir)
422 if not list_dir and len(os_error_filename_set) == 0:
423 self.folder_log.info('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
424 time.sleep(self.sleep_time)
425 continue
426 all_file_set = set(list_dir)
427 true_file_set = all_file_set - os_error_filename_set
428 if len(true_file_set) == 0 and len(os_error_filename_set) > 0:
429 true_file_set.add(os_error_filename_set.pop())
430 for name in true_file_set:
431 path = os.path.join(input_dir, name)
432
433 try:
434 if os.path.isfile(path):
435 self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
436 if name.endswith('.pdf') or name.endswith('.PDF'):
437 result = self.pdf_process(name, path, classify, img_output_dir,
438 wb_output_dir, pdf_output_dir)
439 elif name.endswith('.tif') or name.endswith('.TIF'):
440 result = self.tif_process(name, path, classify, img_output_dir,
441 wb_output_dir, tiff_output_dir)
442 else:
443 result = self.img_process(name, path, classify, wb_output_dir,
444 img_output_dir, pdf_output_dir)
445 self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
446 else:
447 result = None
448 self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, input_dir))
449 failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
450 shutil.move(path, failed_path)
451 except OSError:
452 os_error_filename_set.add(name)
453 self.folder_log.error('{0} [os error] [path={1}] [error={2}]'.format(
454 self.log_base, path, traceback.format_exc()))
455 except Exception as e:
456 try:
457 self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(self.log_base, path,
458 traceback.format_exc()))
459 failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
460 shutil.move(path, failed_path)
461 except Exception as e:
462 os_error_filename_set.add(name)
463 self.folder_log.error('{0} [file move error] [path={1}] [error={2}]'.format(
464 self.log_base, path, traceback.format_exc()))
465 else:
466 if isinstance(result, dict) and len(result) > 0:
467 date_str = time.strftime("%Y-%m-%d")
468 result_queue.put(
469 {
470 self.CLASSIFY_KEY: classify,
471 self.RESULT_KEY: result,
472 self.DATE_KEY: date_str
473 }
474 )
475 elif isinstance(result, list) and len(result) > 0:
476 date_str = time.strftime("%Y-%m-%d")
477 for res in result:
478 result_queue.put(
479 {
480 self.CLASSIFY_KEY: classify,
481 self.RESULT_KEY: res,
482 self.DATE_KEY: date_str
483 }
484 )
485
486 def handle(self, *args, **kwargs):
487 if len(self.input_dirs) == 0:
488 return
489 result_queue = Queue()
490 process_list = []
491 one_input_dir = None
492 for classify_idx, input_dir in self.input_dirs.items():
493 if one_input_dir is None:
494 one_input_dir = input_dir
495 classify = int(classify_idx.split('_')[0])
496 process = Process(target=self.folder_process, args=(input_dir, classify, result_queue))
497 process_list.append(process)
498
499 wb_dir = os.path.dirname(os.path.dirname(one_input_dir))
500 wb_process = Process(target=self.wb_process, args=(wb_dir, result_queue, ))
501 process_list.append(wb_process)
502
503 for p in process_list:
504 p.start()
505 for p in process_list:
506 p.join()
507
508 self.folder_log.info('{0} [stop safely]'.format(self.log_base))
...@@ -15,7 +15,7 @@ from settings import conf ...@@ -15,7 +15,7 @@ from settings import conf
15 from common.mixins import LoggerMixin 15 from common.mixins import LoggerMixin
16 from common.tools.pdf_to_img import PDFHandler 16 from common.tools.pdf_to_img import PDFHandler
17 from apps.doc import consts 17 from apps.doc import consts
18 from apps.doc.exceptions import OCR1Exception, OCR4Exception, LTGTException 18 from apps.doc.exceptions import OCR1Exception, OCR4Exception
19 from apps.doc.ocr.wb import BSWorkbook 19 from apps.doc.ocr.wb import BSWorkbook
20 20
21 21
...@@ -48,11 +48,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -48,11 +48,6 @@ class Command(BaseCommand, LoggerMixin):
48 self.log_base = '[folder ocr process]' 48 self.log_base = '[folder ocr process]'
49 # 处理文件开关 49 # 处理文件开关
50 self.switch = True 50 self.switch = True
51 self.ltgt_classify_mapping = {
52 128: '执行裁定书',
53 129: '民事判决书',
54 130: '民事调解书'
55 }
56 # 睡眠时间 51 # 睡眠时间
57 self.sleep_time = float(conf.SLEEP_SECOND_FOLDER) 52 self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
58 # input folder 53 # input folder
...@@ -60,7 +55,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -60,7 +55,6 @@ class Command(BaseCommand, LoggerMixin):
60 # ocr相关 55 # ocr相关
61 self.ocr_url = conf.OCR_URL_FOLDER 56 self.ocr_url = conf.OCR_URL_FOLDER
62 self.ocr_url_4 = conf.IC_URL 57 self.ocr_url_4 = conf.IC_URL
63 self.ltgt_ocr_url = conf.LTGT_URL
64 # 优雅退出信号:15 58 # 优雅退出信号:15
65 signal.signal(signal.SIGTERM, self.signal_handler) 59 signal.signal(signal.SIGTERM, self.signal_handler)
66 60
...@@ -213,59 +207,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -213,59 +207,6 @@ class Command(BaseCommand, LoggerMixin):
213 else: 207 else:
214 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) 208 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
215 209
216 def ltgt_ocr_process(self, img_path_list, label, path):
217 img_data_list = []
218
219 for img_path in img_path_list:
220 if os.path.exists(img_path):
221 with open(img_path, 'rb') as f:
222 base64_data = base64.b64encode(f.read())
223 # 获取解码后的base64值
224 file_data = base64_data.decode()
225 img_data_list.append(file_data)
226
227 json_data = {
228 "label": label,
229 "img_data_list": img_data_list
230 }
231
232 for times in range(consts.RETRY_TIMES):
233 try:
234 start_time = time.time()
235 ocr_response = requests.post(self.ltgt_ocr_url, json=json_data)
236 if ocr_response.status_code != 200:
237 raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
238 except Exception as e:
239 self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
240 self.log_base, times, path, traceback.format_exc()))
241 else:
242 ocr_res = ocr_response.json()
243 end_time = time.time()
244 speed_time = int(end_time - start_time)
245 self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
246 self.log_base, path, ocr_res, speed_time))
247 return ocr_res
248 else:
249 self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
250
251 def ltgt_res_process(self, ocr_res, label, excel_path):
252 try:
253 if isinstance(ocr_res, dict):
254 if ocr_res.get('code') == 1:
255 result_dict = ocr_res.get('data', {})
256
257 wb = BSWorkbook(set(), set(), set(), set(), set())
258 rebuild_res = wb.ltgt_build(label, result_dict)
259 wb.remove_base_sheet()
260 wb.save(excel_path)
261 except Exception as e:
262 self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
263 self.log_base, excel_path, traceback.format_exc()))
264
265 def ltgt_process(self, img_path_list, label, excel_path, path):
266 ocr_res = self.ltgt_ocr_process(img_path_list, label, path)
267 self.ltgt_res_process(ocr_res, label, excel_path)
268
269 def images_process(self, img_path_list, classify, excel_path): 210 def images_process(self, img_path_list, classify, excel_path):
270 all_res = {} 211 all_res = {}
271 for img_path in img_path_list: 212 for img_path in img_path_list:
...@@ -279,20 +220,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -279,20 +220,14 @@ class Command(BaseCommand, LoggerMixin):
279 img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir) 220 img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
280 self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path)) 221 self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
281 pdf_handler = PDFHandler(path, img_save_path) 222 pdf_handler = PDFHandler(path, img_save_path)
282 if classify in self.ltgt_classify_mapping: 223 pdf_handler.extract_image()
283 pdf_handler.extract_page_image()
284 else:
285 pdf_handler.extract_image()
286 self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path)) 224 self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
287 except Exception as e: 225 except Exception as e:
288 self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format( 226 self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
289 self.log_base, path, traceback.format_exc())) 227 self.log_base, path, traceback.format_exc()))
290 raise e 228 raise e
291 else: 229 else:
292 if classify in self.ltgt_classify_mapping: 230 self.images_process(pdf_handler.img_path_list, classify, excel_path)
293 self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
294 else:
295 self.images_process(pdf_handler.img_path_list, classify, excel_path)
296 shutil.move(path, pdf_save_path) 231 shutil.move(path, pdf_save_path)
297 232
298 def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir): 233 def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
...@@ -308,10 +243,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -308,10 +243,7 @@ class Command(BaseCommand, LoggerMixin):
308 self.log_base, path, traceback.format_exc())) 243 self.log_base, path, traceback.format_exc()))
309 raise e 244 raise e
310 else: 245 else:
311 if classify in self.ltgt_classify_mapping: 246 self.images_process(tiff_handler.img_path_list, classify, excel_path)
312 self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
313 else:
314 self.images_process(tiff_handler.img_path_list, classify, excel_path)
315 shutil.move(path, tiff_save_path) 247 shutil.move(path, tiff_save_path)
316 248
317 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir): 249 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
...@@ -321,12 +253,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -321,12 +253,9 @@ class Command(BaseCommand, LoggerMixin):
321 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format( 253 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
322 self.log_base, path, traceback.format_exc())) 254 self.log_base, path, traceback.format_exc()))
323 else: 255 else:
324 if classify in self.ltgt_classify_mapping: 256 ocr_res = self.ocr_process(path, classify)
325 self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path) 257 all_res = {path: ocr_res}
326 else: 258 self.res_process(all_res, classify, excel_path)
327 ocr_res = self.ocr_process(path, classify)
328 all_res = {path: ocr_res}
329 self.res_process(all_res, classify, excel_path)
330 shutil.move(path, img_save_path) 259 shutil.move(path, img_save_path)
331 260
332 def folder_process(self, input_dir, classify): 261 def folder_process(self, input_dir, classify):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!