b8745dc6 by 周伟奇

Merge branch 'feature/ltgt' into feature/0611

2 parents d78669c5 f63f9c2a
...@@ -1448,4 +1448,16 @@ SE_SECOND_ID_FIELD_MAPPING = { ...@@ -1448,4 +1448,16 @@ SE_SECOND_ID_FIELD_MAPPING = {
1448 1448
1449 HEAD_LIST = ['Info', 'Index', 'License', 'Field', 'Input', 'OCR', 'Result', 'Position', 'Image', 'errorType'] 1449 HEAD_LIST = ['Info', 'Index', 'License', 'Field', 'Input', 'OCR', 'Result', 'Position', 'Image', 'errorType']
1450 1450
# ---------------- litigation ----------------
# Each entry is a (search_field, write_field) pair: the key looked up in an
# OCR result and the label written to the workbook.

# ID-card fields whose lookup key and written label are the same text.
IC_FIELD_ORDER_2 = tuple(
    (field, field)
    for field in ('姓名', '公民身份号码', '出生年月', '住址', '性别', '民族')
)
# Used instead of IC_FIELD_ORDER_2 when a result contains the '有效期限' key.
IC_FIELD_ORDER_3 = tuple((field, field) for field in ('有效期限', '签发机关'))

# Bank-card fields: English result key -> Chinese label.
BC_FIELD_ORDER_2 = (
    ('BankName', '发卡行名称'),
    ('CardNum', '银行卡号'),
    ('CardType', '银行卡类型'),
)
1451 1463
......
...@@ -13,6 +13,9 @@ class OCR2Exception(Exception): ...@@ -13,6 +13,9 @@ class OCR2Exception(Exception):
13 class OCR4Exception(Exception): 13 class OCR4Exception(Exception):
14 pass 14 pass
15 15
class LTGTException(Exception):
    """Error raised when a call to the LTGT OCR service fails (e.g. non-200 status)."""
18
16 19
17 class GCAPException(Exception): 20 class GCAPException(Exception):
18 pass 21 pass
......
1 import os
2 import re
3 import time
4 import json
5 import shutil
6 import base64
7 import signal
8 import requests
9 import traceback
10 from PIL import Image
11 from datetime import datetime
12 from django.core.management import BaseCommand
13 from multiprocessing import Process, Queue
14 from openpyxl import load_workbook, Workbook
15
16 from settings import conf
17 from common.mixins import LoggerMixin
18 from common.tools.pdf_to_img import PDFHandler
19 from apps.doc import consts
20 from apps.doc.exceptions import OCR1Exception, OCR2Exception, LTGTException
21 from apps.doc.ocr.wb import BSWorkbook
22
23
class TIFFHandler:
    """Split a (possibly multi-frame) TIFF file into one JPEG file per frame."""

    # Image modes Pillow's JPEG writer accepts; any other mode (RGBA, P, ...)
    # must be converted first or ``save`` raises OSError.
    _JPEG_MODES = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

    def __init__(self, path, img_save_path):
        """
        :param path: path of the source TIFF file.
        :param img_save_path: directory that receives the extracted frames.
        """
        self.path = path
        self.img_save_path = img_save_path
        # Paths of the frames written by extract_image(), in frame order.
        self.img_path_list = []

    def extract_image(self):
        """Write every frame to ``page_<index>.jpeg`` and record the paths.

        The image is opened in a ``with`` block so the underlying file handle
        is released even when a frame fails to save.
        """
        os.makedirs(self.img_save_path, exist_ok=True)
        with Image.open(self.path) as tiff:
            # Single-frame formats may not expose n_frames; treat them as one frame.
            for i in range(getattr(tiff, 'n_frames', 1)):
                try:
                    save_path = os.path.join(self.img_save_path, 'page_{0}.jpeg'.format(i))
                    tiff.seek(i)
                    if tiff.mode in self._JPEG_MODES:
                        frame = tiff
                    else:
                        frame = tiff.convert('RGB')
                    frame.save(save_path)
                    self.img_path_list.append(save_path)
                except EOFError:
                    # Fewer readable frames than advertised: keep what we have.
                    break
44
45
46 class Command(BaseCommand, LoggerMixin):
47
def __init__(self):
    """Initialise logging prefix, result-sheet layouts, folders and OCR endpoints."""
    super().__init__()
    self.log_base = '[folder ltgt process]'
    # Run flag: processing loops keep going while this is True.
    self.switch = True

    # Keys of the messages exchanged through the result queue.
    self.DATE_KEY = 'date'
    self.CLASSIFY_KEY = 'classify'
    self.RESULT_KEY = 'result'

    # classify code -> (sheet name, header fields) for the litigation documents.
    self.sheet_content = {
        128: ('执行裁定书', ('承办法院', '案号/标号', '被执行人', '债权金额', '诉讼时间')),
        129: ('民事判决书', ('承办法院', '案号/标号', '被告', '判决结果: 贷款本金', '判决结果: 罚息', '判决结果: 律师费', '判决结果: 案件受理费', '诉讼时间')),
        130: ('民事调解书', ('承办法院', '案号/标号', '被告', '协议内容: 支付金额', '协议内容: 案件受理费', '诉讼时间')),
    }
    # classify code -> label; same names as the sheet names above.
    self.ltgt_classify_mapping = {
        code: sheet_name for code, (sheet_name, _) in self.sheet_content.items()
    }

    # Daily summary workbook name template and polling intervals (seconds).
    self.daily_wb_name = 'Output_{0}.xlsx'
    self.short_sleep_time = 10
    self.long_sleep_time = 3600
    # Sleep time between scans of an input folder.
    self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
    # Input folders, taken from the settings entries prefixed with LTGT_DIR_.
    self.input_dirs = conf.get_namespace('LTGT_DIR_')
    # Sub-folder name used when images are separated by classification.
    self.seperate_map = {
        consts.IC_CLASSIFY: 'IDCard',
        consts.BC_CLASSIFY: 'BankCard'
    }
    # classify -> (sheet name, key field, field order used when the key field
    # is present, default field order).
    self.field_map = {
        consts.VAT_CLASSIFY: (consts.VAT_CN_NAME, None, None, consts.VATS_FIELD_ORDER),
        consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2),
        consts.BC_CLASSIFY: (consts.BC_CN_NAME, None, None, consts.BC_FIELD_ORDER_2)
    }
    # OCR service endpoints.
    self.ocr_url = conf.OCR_URL_FOLDER
    self.ocr_url_2 = conf.OCR2_URL_FOLDER
    self.ltgt_ocr_url = conf.LTGT_URL
    # Graceful shutdown on SIGTERM (signal 15).
    signal.signal(signal.SIGTERM, self.signal_handler)
90
def signal_handler(self, sig, frame):
    """Signal callback: clear the run flag so the processing loops stop.

    :param sig: received signal number (unused).
    :param frame: interrupted stack frame (unused).
    """
    self.switch = False
93
def license1_process(self, ocr_data, all_res, classify):
    """Append the records found in ``ocr_data['data']`` to ``all_res``.

    For ID-card results the bulky ``base64_img`` entry is removed from each
    record first (best effort: records without it are left untouched).

    :param ocr_data: one element of the OCR response ``data`` list.
    :param all_res: list collecting records; extended in place.
    :param classify: classification code of the document being processed.
    """
    # Original note: category '0' = ID card, '1' = residence permit.
    records = ocr_data.get('data', [])
    if records:
        if classify == consts.IC_CLASSIFY:
            for record in records:
                try:
                    del record['base64_img']
                except Exception:
                    pass
        all_res.extend(records)
106
def license2_process(self, ocr_data, all_res, classify, img_path):
    """Run the second-stage OCR on one image and collect its fields.

    The cropped section returned by the first OCR (``section_img``) is sent
    when available; otherwise the whole image file is read and sent.

    :param ocr_data: one element of the first-stage OCR ``data`` list.
    :param all_res: list collecting results; appended to in place.
    :param classify: classification code used to look up the OCR ``pid``.
    :param img_path: path of the image, used as fallback payload and in logs.
    """
    pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
    file_data = ocr_data.get('section_img')
    if file_data is None:
        with open(img_path, 'rb') as f:
            # base64 text of the raw image bytes
            file_data = base64.b64encode(f.read()).decode()
    json_data_2 = {
        "pid": str(pid),
        "filedata": file_data
    }

    for times in range(consts.RETRY_TIMES):
        try:
            start_time = time.time()
            ocr_2_response = requests.post(self.ocr_url_2, data=json_data_2)
            if ocr_2_response.status_code != 200:
                raise OCR2Exception('ocr_2 status code: {0}'.format(ocr_2_response.status_code))
        except Exception:
            self.folder_log.warn(
                '{0} [ocr_2 failed] [times={1}] [img_path={2}] [error={3}]'.format(
                    self.log_base, times, img_path, traceback.format_exc()))
        else:
            ocr_res_2 = json.loads(ocr_2_response.text)
            end_time = time.time()
            speed_time = int(end_time - start_time)
            self.folder_log.info(
                '{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
                    self.log_base, img_path, speed_time))

            if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
                if pid == consts.BC_PID:
                    all_res.append(ocr_res_2)
                else:
                    # business licence and similar documents
                    for result_dict in ocr_res_2.get('ResultList', []):
                        res_dict = {}
                        for field_dict in result_dict.get('FieldList', []):
                            res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
                        all_res.append(res_dict)
            # A response was received and handled: stop retrying. Without this
            # the request was re-sent and its results appended once per retry.
            break
148
@staticmethod
def parse_img_path(img_path):
    """Return ``(img_name, page_no, img_no)`` parsed from an image path.

    File names starting with ``page_<p>_img_<i>`` yield 1-based numbers
    ``p + 1`` and ``i + 1``; any other name yields ``(img_name, 1, 1)``.
    The numbers come from the regex groups, so trailing characters after
    the image index (e.g. ``page_1_img_2x``) no longer break ``int()``.

    :param img_path: image path; directory and extension are ignored.
    """
    # expected name layout: 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
    img_name, _ = os.path.splitext(os.path.basename(img_path))
    matched = re.match(r'page_(\d+)_img_(\d+)', img_name)
    if matched is None:
        return img_name, 1, 1
    return img_name, int(matched.group(1)) + 1, int(matched.group(2)) + 1
158
@staticmethod
def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
    """Build the timestamped output paths for one input file.

    :param name: original file name.
    :param img_output_dir: directory for images.
    :param wb_output_dir: directory for the per-file workbook.
    :param pdf_output_dir: directory the source document is archived in.
    :param seperate_dir: classification sub-folder, or None.
    :return: ``(img_save_path, excel_path, pdf_save_path, seperate_path)``;
        ``seperate_path`` is None when ``seperate_dir`` is None.
    """
    stamped_name = '%s_%s' % (datetime.now().strftime('%Y-%m-%d_%H:%M:%S'), name)
    stem = os.path.splitext(stamped_name)[0]

    if seperate_dir is None:
        seperate_path = None
    else:
        seperate_path = os.path.join(seperate_dir, stamped_name)

    return (
        os.path.join(img_output_dir, stamped_name),
        os.path.join(wb_output_dir, '{0}.xlsx'.format(stem)),
        os.path.join(pdf_output_dir, stamped_name),
        seperate_path,
    )
169
def res_process(self, all_res, excel_path, classify):
    """Write the collected OCR records to a per-file workbook.

    Each record becomes a run of ``(label, value...)`` rows followed by an
    empty separator row. Any failure is logged and swallowed.

    :param all_res: list of record dicts.
    :param excel_path: destination path of the workbook.
    :param classify: classification code selecting the entry of ``field_map``.
    """
    try:
        wb = BSWorkbook(set(), set(), set(), set(), set())
        sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(classify)
        ws = wb.create_sheet(sheet_name)
        for res in all_res:
            # Records containing the key field use the alternative field order.
            use_side = key_field is not None and key_field in res
            for search_field, write_field in (side_field_order if use_side else src_field_order):
                field_value = res.get(search_field, '')
                if isinstance(field_value, list):
                    row = (write_field, *field_value)
                else:
                    row = (write_field, field_value)
                ws.append(row)
            ws.append((None,))
        wb.remove_base_sheet()
        wb.save(excel_path)
    except Exception:
        self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc()))
192
def basename(self, path):
    """Return the last component of ``path``, ignoring trailing separators.

    Unlike ``os.path.basename`` this also yields the directory name for
    paths that end with a separator.
    """
    trailing = os.path.sep
    if os.path.altsep:
        trailing += os.path.altsep
    stripped = path.rstrip(trailing)
    return os.path.basename(stripped)
198
def ocr_process(self, img_path, classify, all_res, seperate_dir):
    """Send one image to the first-stage OCR service and collect its results.

    When ``seperate_dir`` is given, an image recognised as ``classify`` is
    moved into that folder. The image's current location is tracked so the
    follow-up OCR reads the moved file rather than the now-missing original.

    :param img_path: path of the image; nothing happens if it does not exist.
    :param classify: expected classification code.
    :param all_res: list collecting results; filled in place.
    :param seperate_dir: classification folder, or None to keep images in place
        (in which case ``classify`` is sent along with the request).
    """
    if not os.path.exists(img_path):
        return
    # TODO: validate the image
    with open(img_path, 'rb') as f:
        # base64 text of the raw image bytes
        file_data = base64.b64encode(f.read()).decode()
    json_data = {
        "file": file_data,
    }
    if seperate_dir is None:
        json_data["classify"] = classify

    for times in range(consts.RETRY_TIMES):
        try:
            start_time = time.time()
            ocr_response = requests.post(self.ocr_url, json=json_data)
            if ocr_response.status_code != 200:
                raise OCR1Exception('{0} ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
        except Exception:
            self.folder_log.warn('{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'.format(
                self.log_base, times, img_path, traceback.format_exc()))
        else:
            ocr_res = ocr_response.json()
            end_time = time.time()
            speed_time = int(end_time - start_time)
            self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format(
                self.log_base, img_path, ocr_res, speed_time))

            if isinstance(ocr_res, dict) and ocr_res.get('code') == 1:
                data_list = ocr_res.get('data', [])
                if isinstance(data_list, list):
                    # Where the image lives right now; changes once it is moved.
                    current_path = img_path
                    for ocr_data in data_list:
                        if ocr_data.get('classify') == classify:
                            if seperate_dir is not None:
                                os.makedirs(seperate_dir, exist_ok=True)
                                real_dst = os.path.join(seperate_dir, self.basename(img_path))
                                if not os.path.exists(real_dst):
                                    shutil.move(current_path, seperate_dir)
                                    current_path = real_dst
                            if classify in consts.LICENSE_CLASSIFY_SET_1:
                                self.license1_process(ocr_data, all_res, classify)
                            elif classify in consts.LICENSE_CLASSIFY_SET_2:
                                self.license2_process(ocr_data, all_res, classify, current_path)
            break
    else:
        # every attempt failed
        self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
246
def ltgt_ocr_process(self, img_path_list, label, path):
    """Send all page images of one document to the LTGT OCR service.

    :param img_path_list: image paths; paths that do not exist are skipped.
    :param label: document label sent with the request.
    :param path: source document path, used only in log messages.
    :return: the decoded JSON response, or None when every attempt failed.
    """
    img_data_list = []
    for img_path in img_path_list:
        if not os.path.exists(img_path):
            continue
        with open(img_path, 'rb') as f:
            # base64 text of the raw image bytes
            img_data_list.append(base64.b64encode(f.read()).decode())

    json_data = {"label": label, "img_data_list": img_data_list}

    for times in range(consts.RETRY_TIMES):
        try:
            start_time = time.time()
            ocr_response = requests.post(self.ltgt_ocr_url, json=json_data)
            if ocr_response.status_code != 200:
                raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
        except Exception:
            self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
                self.log_base, times, path, traceback.format_exc()))
            continue

        ocr_res = ocr_response.json()
        speed_time = int(time.time() - start_time)
        self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
            self.log_base, path, ocr_res, speed_time))
        return ocr_res
    else:
        # retries exhausted
        self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
281
def ltgt_res_process(self, ocr_res, label, excel_path):
    """Write a successful LTGT OCR response to a workbook.

    :param ocr_res: decoded OCR response; only a dict with ``code == 1`` is used.
    :param label: sheet label passed to ``ltgt_build``.
    :param excel_path: destination path of the workbook.
    :return: the dict produced by ``ltgt_build``, or None when the response
        is unusable or building/saving the workbook failed (failure is logged).
    """
    try:
        if not isinstance(ocr_res, dict):
            return None
        if ocr_res.get('code') != 1:
            return None

        wb = BSWorkbook(set(), set(), set(), set(), set())
        rebuild_res = wb.ltgt_build(label, ocr_res.get('data', {}))
        wb.remove_base_sheet()
        wb.save(excel_path)
        return rebuild_res
    except Exception:
        self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc()))
296
def ltgt_process(self, img_path_list, label, excel_path, path):
    """OCR a litigation document and write its workbook.

    :return: whatever ``ltgt_res_process`` returns for the OCR response.
    """
    return self.ltgt_res_process(
        self.ltgt_ocr_process(img_path_list, label, path),
        label,
        excel_path,
    )
301
def images_process(self, img_path_list, classify, excel_path, seperate_dir):
    """OCR every image, write the per-file workbook and return the records.

    The workbook is written even when no record was collected.

    :return: list of records gathered by ``ocr_process``.
    """
    collected = []
    for one_img in img_path_list:
        self.ocr_process(one_img, classify, collected, seperate_dir)
    self.res_process(collected, excel_path, classify)
    return collected
309
def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
    """Convert a PDF to images, OCR them and archive the PDF.

    Litigation classifications render each page as one image; other
    classifications extract the images embedded in the PDF.

    :return: the OCR result (dict or list), or None when ``path`` is missing.
    :raises Exception: conversion errors are logged and re-raised.
    """
    if not os.path.exists(path):
        return None

    is_ltgt = classify in self.ltgt_classify_mapping
    try:
        img_save_path, excel_path, pdf_save_path, seperate_path = self.get_path(
            name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
        self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
        pdf_handler = PDFHandler(path, img_save_path)
        if is_ltgt:
            pdf_handler.extract_page_image()
        else:
            pdf_handler.extract_image()
        self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
    except Exception:
        self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        raise

    if is_ltgt:
        rebuild_res = self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify],
                                        excel_path, path)
    else:
        rebuild_res = self.images_process(pdf_handler.img_path_list, classify, excel_path, seperate_path)
    shutil.move(path, pdf_save_path)
    return rebuild_res
335
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir):
    """Split a TIFF into frame images, OCR them and archive the TIFF.

    :return: the OCR result (dict or list), or None when ``path`` is missing.
    :raises Exception: frame-extraction errors are logged and re-raised.
    """
    if not os.path.exists(path):
        return None

    try:
        img_save_path, excel_path, tiff_save_path, seperate_path = self.get_path(
            name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir)
        self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
        tiff_handler = TIFFHandler(path, img_save_path)
        tiff_handler.extract_image()
        self.folder_log.info('{0} [tiff to img end] [path={1}]'.format(self.log_base, path))
    except Exception:
        self.folder_log.error('{0} [tiff to img error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        raise

    frames = tiff_handler.img_path_list
    if classify in self.ltgt_classify_mapping:
        rebuild_res = self.ltgt_process(frames, self.ltgt_classify_mapping[classify],
                                        excel_path, path)
    else:
        rebuild_res = self.images_process(frames, classify, excel_path, seperate_path)
    shutil.move(path, tiff_save_path)
    return rebuild_res
358
def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir, seperate_dir):
    """OCR a single image file and archive it in the image output folder.

    :return: the OCR result (dict or list), or None when the output paths
        could not be built (that error is logged, not raised).
    """
    rebuild_res = None
    try:
        img_save_path, excel_path, _, seperate_path = self.get_path(
            name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
    except Exception:
        self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
    else:
        if classify in self.ltgt_classify_mapping:
            rebuild_res = self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
        else:
            rebuild_res = self.images_process([path], classify, excel_path, seperate_path)
        # OCR may already have moved the image into the classification
        # folder; moving a missing file would raise and the caller would
        # keep retrying a name that no longer exists.
        if os.path.exists(path):
            shutil.move(path, img_save_path)
    return rebuild_res
374
def wb_process(self, wb_dir, result_queue):
    """Drain the result queue into one summary workbook per day.

    Results are batched by their date string (at most 100 polls per batch)
    and appended to ``Output_<date>.xlsx`` inside ``wb_dir``. A result that
    belongs to a different day than the current batch is kept and opens
    the next batch instead of being discarded.

    :param wb_dir: directory holding the daily workbooks.
    :param result_queue: queue of dicts keyed by CLASSIFY_KEY / RESULT_KEY / DATE_KEY.
    """
    carried = None  # result dequeued for a later date, not yet written
    while self.switch:
        result_list = []
        date_str = None
        if carried is not None:
            date_str = carried[self.DATE_KEY]
            result_list.append(carried)
            carried = None

        for _ in range(100):
            try:
                result = result_queue.get(block=False)
            except Exception:
                # Nothing available right now (or a transient queue error).
                time.sleep(self.short_sleep_time)
            else:
                if date_str is None:
                    date_str = result[self.DATE_KEY]
                    result_list.append(result)
                elif result[self.DATE_KEY] == date_str:
                    result_list.append(result)
                else:
                    carried = result
                    break

        if date_str is None:
            time.sleep(self.long_sleep_time)
            continue

        wb_path = os.path.join(wb_dir, self.daily_wb_name.format(date_str))
        if os.path.isfile(wb_path):
            wb = load_workbook(wb_path)
        else:
            wb = Workbook()

        for result in result_list:
            try:
                classify = result[self.CLASSIFY_KEY]
                if classify in self.sheet_content:
                    sheet_name, head_fields = self.sheet_content[classify]
                else:
                    sheet_name, key_field, side_field_order, field_order = self.field_map[classify]
                    if key_field is not None and key_field in result[self.RESULT_KEY]:
                        head_fields = [a for a, _ in side_field_order]
                    else:
                        head_fields = [a for a, _ in field_order]
                row = [result[self.RESULT_KEY].get(field) for field in head_fields]
                if sheet_name in wb.sheetnames:
                    # index access replaces the deprecated get_sheet_by_name()
                    ws = wb[sheet_name]
                else:
                    ws = wb.create_sheet(sheet_name)
                    ws.append(head_fields)
                ws.append(row)
            except Exception:
                self.folder_log.info('{0} [daily wb failed] [result={1}] [error={2}]'.format(
                    self.log_base, result, traceback.format_exc()))
        wb.save(wb_path)
425
def folder_process(self, input_dir, classify, is_combined, result_queue):
    """Watch one input folder and process every file that appears in it.

    Output goes to an ``Output`` folder next to ``input_dir``. PDFs and TIFFs
    are converted first, everything else is treated as an image. Non-empty
    results are pushed to ``result_queue`` tagged with classify and date.

    :param input_dir: folder to watch; waits until it exists.
    :param classify: classification code handled by this folder.
    :param is_combined: when true, recognised images are separated into a
        classification sub-folder.
    :param result_queue: queue consumed by ``wb_process``.
    """
    while not os.path.isdir(input_dir):
        self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
        if not self.switch:
            return
        time.sleep(self.sleep_time)

    output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
    seperate_dir = os.path.join(output_dir, self.seperate_map.get(classify, 'Unknown')) if is_combined else None
    img_output_dir = os.path.join(output_dir, 'image')
    wb_output_dir = os.path.join(output_dir, 'excel')
    pdf_output_dir = os.path.join(output_dir, 'pdf')
    tiff_output_dir = os.path.join(output_dir, 'tiff')
    failed_output_dir = os.path.join(output_dir, 'failed')
    for directory in (output_dir, img_output_dir, wb_output_dir, pdf_output_dir,
                      tiff_output_dir, failed_output_dir):
        os.makedirs(directory, exist_ok=True)
    if seperate_dir is not None:
        os.makedirs(seperate_dir, exist_ok=True)

    # Names that failed with an OS error; skipped until nothing else is left.
    os_error_filename_set = set()
    while self.switch:
        # 1. collect pdf / image files from the input dir
        list_dir = os.listdir(input_dir)
        all_file_set = set(list_dir)
        # Forget failed names that are no longer in the folder; retrying
        # them can only fail again and would spin without sleeping.
        os_error_filename_set &= all_file_set
        if not list_dir:
            self.folder_log.info('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
            time.sleep(self.sleep_time)
            continue
        true_file_set = all_file_set - os_error_filename_set
        if len(true_file_set) == 0 and len(os_error_filename_set) > 0:
            # Only previously failed files remain: retry one of them.
            true_file_set.add(os_error_filename_set.pop())
        for name in true_file_set:
            path = os.path.join(input_dir, name)

            try:
                if os.path.isfile(path):
                    self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
                    if name.endswith(('.pdf', '.PDF')):
                        result = self.pdf_process(name, path, classify, img_output_dir, wb_output_dir,
                                                  pdf_output_dir, seperate_dir)
                    elif name.endswith(('.tif', '.TIF')):
                        result = self.tif_process(name, path, classify, img_output_dir, wb_output_dir,
                                                  tiff_output_dir, seperate_dir)
                    else:
                        result = self.img_process(name, path, classify, wb_output_dir, img_output_dir,
                                                  pdf_output_dir, seperate_dir)
                    self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
                else:
                    result = None
                    # log the offending entry itself, not the watched folder
                    self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, path))
                    failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
                    shutil.move(path, failed_path)
            except OSError:
                os_error_filename_set.add(name)
                self.folder_log.error('{0} [os error] [path={1}] [error={2}]'.format(
                    self.log_base, path, traceback.format_exc()))
            except Exception:
                try:
                    self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(self.log_base, path,
                                                                                         traceback.format_exc()))
                    failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
                    shutil.move(path, failed_path)
                except Exception:
                    os_error_filename_set.add(name)
                    self.folder_log.error('{0} [file move error] [path={1}] [error={2}]'.format(
                        self.log_base, path, traceback.format_exc()))
            else:
                # A dict is one record, a list is several; empty ones are dropped.
                if isinstance(result, dict) and len(result) > 0:
                    records = [result]
                elif isinstance(result, list):
                    records = result
                else:
                    records = []
                if records:
                    date_str = time.strftime("%Y-%m-%d")
                    for res in records:
                        result_queue.put(
                            {
                                self.CLASSIFY_KEY: classify,
                                self.RESULT_KEY: res,
                                self.DATE_KEY: date_str
                            }
                        )
520
def handle(self, *args, **kwargs):
    """Command entry point: one worker process per input folder plus a writer.

    Keys of ``input_dirs`` are split on ``_``: part 0 is the classify code
    and part 2 equal to 1 marks a combined folder. The daily workbook is
    written two directory levels above the first input folder. Returns
    immediately when no input folder is configured.
    """
    if len(self.input_dirs) == 0:
        return

    result_queue = Queue()
    workers = []
    first_input_dir = None
    for classify_idx, input_dir in self.input_dirs.items():
        if first_input_dir is None:
            first_input_dir = input_dir
        key_parts = classify_idx.split('_')
        classify = int(key_parts[0])
        is_combined = int(key_parts[2]) == 1
        workers.append(
            Process(target=self.folder_process, args=(input_dir, classify, is_combined, result_queue)))

    wb_dir = os.path.dirname(os.path.dirname(first_input_dir))
    workers.append(Process(target=self.wb_process, args=(wb_dir, result_queue, )))

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    self.folder_log.info('{0} [stop safely]'.format(self.log_base))
...@@ -61,13 +61,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -61,13 +61,11 @@ class Command(BaseCommand, LoggerMixin):
61 def signal_handler(self, sig, frame): 61 def signal_handler(self, sig, frame):
62 self.switch = False # 停止处理文件 62 self.switch = False # 停止处理文件
63 63
64 def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx, img_path): 64 def license1_process(self, ocr_data, license_summary, classify, img_path):
65 # 类别:'0'身份证, '1'居住证 65 # 类别:'0'身份证, '1'居住证
66 license_data = ocr_data.get('data', []) 66 license_data = ocr_data.get('data', [])
67 if not license_data: 67 if not license_data:
68 res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
69 return 68 return
70 res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
71 if classify == consts.MVC_CLASSIFY: # 车辆登记证 3/4页结果整合 69 if classify == consts.MVC_CLASSIFY: # 车辆登记证 3/4页结果整合
72 for mvc_dict in license_data: 70 for mvc_dict in license_data:
73 try: 71 try:
...@@ -154,29 +152,21 @@ class Command(BaseCommand, LoggerMixin): ...@@ -154,29 +152,21 @@ class Command(BaseCommand, LoggerMixin):
154 def res_process(self, all_res, classify, excel_path): 152 def res_process(self, all_res, classify, excel_path):
155 try: 153 try:
156 license_summary = {} 154 license_summary = {}
157 res_list = []
158 155
159 if not all_res: 156 if not all_res:
160 return 157 return
161 else: 158 else:
162 for img_path, ocr_res in all_res.items(): 159 for img_path, ocr_res in all_res.items():
163 img_name, pno, ino = self.parse_img_path(img_path) 160 # img_name, pno, ino = self.parse_img_path(img_path)
164 part_idx = 1 161 # part_idx = 1
165 162
166 if isinstance(ocr_res, dict): 163 if isinstance(ocr_res, dict):
167 if ocr_res.get('code') == 1: 164 if ocr_res.get('code') == 1:
168 data_list = ocr_res.get('data', []) 165 data_list = ocr_res.get('data', [])
169 if isinstance(data_list, list): 166 if isinstance(data_list, list):
170 for part_idx, ocr_data in enumerate(data_list): 167 for ocr_data in data_list:
171 part_idx = part_idx + 1 168 # part_idx = part_idx + 1
172 self.license1_process(ocr_data, license_summary, classify, 169 self.license1_process(ocr_data, license_summary, classify, img_path)
173 res_list, pno, ino, part_idx, img_path)
174 else:
175 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
176 else:
177 res_list.append((pno, ino, part_idx, consts.RES_FAILED))
178 else:
179 res_list.append((pno, ino, part_idx, consts.RES_FAILED))
180 170
181 wb = BSWorkbook(set(), set(), set(), set(), set()) 171 wb = BSWorkbook(set(), set(), set(), set(), set())
182 wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0]) 172 wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
...@@ -216,6 +206,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -216,6 +206,13 @@ class Command(BaseCommand, LoggerMixin):
216 return ocr_res 206 return ocr_res
217 else: 207 else:
218 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) 208 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
209
def images_process(self, img_path_list, classify, excel_path):
    """OCR each image and hand the path -> response mapping to ``res_process``."""
    responses = {
        one_img: self.ocr_process(one_img, classify)
        for one_img in img_path_list
    }
    self.res_process(responses, classify, excel_path)
219 216
220 def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir): 217 def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
221 if os.path.exists(path): 218 if os.path.exists(path):
...@@ -230,11 +227,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -230,11 +227,7 @@ class Command(BaseCommand, LoggerMixin):
230 self.log_base, path, traceback.format_exc())) 227 self.log_base, path, traceback.format_exc()))
231 raise e 228 raise e
232 else: 229 else:
233 all_res = {} 230 self.images_process(pdf_handler.img_path_list, classify, excel_path)
234 for img_path in pdf_handler.img_path_list:
235 ocr_res = self.ocr_process(img_path, classify)
236 all_res[img_path] = ocr_res
237 self.res_process(all_res, classify, excel_path)
238 shutil.move(path, pdf_save_path) 231 shutil.move(path, pdf_save_path)
239 232
240 def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir): 233 def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
...@@ -250,23 +243,18 @@ class Command(BaseCommand, LoggerMixin): ...@@ -250,23 +243,18 @@ class Command(BaseCommand, LoggerMixin):
250 self.log_base, path, traceback.format_exc())) 243 self.log_base, path, traceback.format_exc()))
251 raise e 244 raise e
252 else: 245 else:
253 all_res = {} 246 self.images_process(tiff_handler.img_path_list, classify, excel_path)
254 for img_path in tiff_handler.img_path_list:
255 ocr_res = self.ocr_process(img_path, classify)
256 all_res[img_path] = ocr_res
257 self.res_process(all_res, classify, excel_path)
258 shutil.move(path, tiff_save_path) 247 shutil.move(path, tiff_save_path)
259 248
260 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir): 249 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
261 ocr_res = self.ocr_process(path, classify)
262 all_res = {path: ocr_res}
263
264 try: 250 try:
265 img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir) 251 img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
266 except Exception as e: 252 except Exception as e:
267 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format( 253 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
268 self.log_base, path, traceback.format_exc())) 254 self.log_base, path, traceback.format_exc()))
269 else: 255 else:
256 ocr_res = self.ocr_process(path, classify)
257 all_res = {path: ocr_res}
270 self.res_process(all_res, classify, excel_path) 258 self.res_process(all_res, classify, excel_path)
271 shutil.move(path, img_save_path) 259 shutil.move(path, img_save_path)
272 260
...@@ -312,9 +300,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -312,9 +300,9 @@ class Command(BaseCommand, LoggerMixin):
312 try: 300 try:
313 if os.path.isfile(path): 301 if os.path.isfile(path):
314 self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path)) 302 self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
315 if name.endswith('.pdf'): 303 if name.endswith('.pdf') or name.endswith('.PDF'):
316 self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir) 304 self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
317 elif name.endswith('.tif'): 305 elif name.endswith('.tif') or name.endswith('.TIF'):
318 self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir) 306 self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir)
319 else: 307 else:
320 self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) 308 self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
......
...@@ -702,6 +702,31 @@ class BSWorkbook(Workbook): ...@@ -702,6 +702,31 @@ class BSWorkbook(Workbook):
702 if field_str is not None: 702 if field_str is not None:
703 count_list.append((field_str, count)) 703 count_list.append((field_str, count))
704 704
def ltgt_build(self, label, result_dict):
    """Write a litigation OCR result to a new sheet and return it flattened.

    Values are flattened as follows:

    * list  -> the ``words`` of its items joined with '、'; items without
      ``words`` (or with ``None``) are skipped and non-string words are
      converted with ``str`` so the join cannot fail;
    * dict with ``words`` -> that value;
    * other dict -> one ``"<key>: <sub_key>"`` row per entry, using the
      entry's ``words`` when the entry is itself a dict;
    * anything else -> written unchanged.

    :param label: name of the sheet to create.
    :param result_dict: mapping of field name to OCR value.
    :return: dict of row label -> written value, in row order.
    """
    ws = self.create_sheet(label)
    rebuild_res = {}

    def record(row_label, content):
        # Keep the sheet and the returned mapping in step.
        ws.append((row_label, content))
        rebuild_res[row_label] = content

    for key, value in result_dict.items():
        if isinstance(value, list):
            words = []
            for item in value:
                word = item.get('words') if isinstance(item, dict) else item
                if word is not None:
                    words.append(str(word))
            record(key, '、'.join(words))
        elif isinstance(value, dict):
            if 'words' in value:
                record(key, value['words'])
            else:
                for sub_key, sub_value in value.items():
                    if isinstance(sub_value, dict):
                        sub_value = sub_value.get('words', '')
                    record('{0}: {1}'.format(key, sub_key), sub_value)
        else:
            record(key, value)
    return rebuild_res
729
705 def simple_license_rebuild(self, license_summary, document_scheme): 730 def simple_license_rebuild(self, license_summary, document_scheme):
706 # for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []): 731 # for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
707 # if ic_license_dict.get('类别') == '1': 732 # if ic_license_dict.get('类别') == '1':
......
...@@ -225,3 +225,13 @@ class PDFHandler: ...@@ -225,3 +225,13 @@ class PDFHandler:
225 else: 225 else:
226 self.merge_il(pdf, pno, il) 226 self.merge_il(pdf, pno, il)
227 self.img_count = len(self.img_path_list) 227 self.img_count = len(self.img_path_list)
228
def extract_page_image(self):
    """Render every page of the PDF through ``page_to_png``.

    Resets ``img_path_list`` and ``xref_set`` first and sets ``img_count``
    to the number of collected image paths afterwards.
    NOTE(review): presumably ``page_to_png`` appends to ``img_path_list`` — confirm.
    """
    self.xref_set = set()
    self.img_path_list = []
    os.makedirs(self.img_dir_path, exist_ok=True)
    with fitz.Document(self.path) as pdf:
        for page_index in range(pdf.pageCount):
            self.page_to_png(pdf.loadPage(page_index))
    self.img_count = len(self.img_path_list)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!