2134d7e0 by 周伟奇

Merge branch 'feature/smart_dda' into feature/0918

2 parents d37597e9 4ce34ec0
...@@ -60,13 +60,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -60,13 +60,6 @@ class Command(BaseCommand, LoggerMixin):
60 self.sleep_time = float(conf.SLEEP_SECOND_FOLDER) 60 self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
61 # input folder 61 # input folder
62 self.input_dirs = conf.get_namespace('DDA_DIR_') 62 self.input_dirs = conf.get_namespace('DDA_DIR_')
63 # seperate folder name
64 self.seperate_map = {
65 consts.IC_CLASSIFY: 'IDCard',
66 consts.BC_CLASSIFY: 'BankCard',
67 consts.PP_CLASSIFY: 'Passport',
68 consts.EEP_CLASSIFY: 'EntryPermit',
69 }
70 self.field_map = { 63 self.field_map = {
71 consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2), 64 consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2),
72 consts.BC_CLASSIFY: (consts.BC_CN_NAME, None, None, consts.BC_FIELD_ORDER_2), 65 consts.BC_CLASSIFY: (consts.BC_CN_NAME, None, None, consts.BC_FIELD_ORDER_2),
...@@ -77,6 +70,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -77,6 +70,7 @@ class Command(BaseCommand, LoggerMixin):
77 self.ocr_url = conf.OCR_URL_FOLDER 70 self.ocr_url = conf.OCR_URL_FOLDER
78 self.ocr_url_2 = conf.OCR2_URL_FOLDER 71 self.ocr_url_2 = conf.OCR2_URL_FOLDER
79 # self.ocr_url_4 = conf.IC_URL 72 # self.ocr_url_4 = conf.IC_URL
73 self.classify_set = {consts.IC_CLASSIFY, consts.PP_CLASSIFY, consts.EEP_CLASSIFY, consts.BC_CLASSIFY}
80 # 优雅退出信号:15 74 # 优雅退出信号:15
81 signal.signal(signal.SIGTERM, self.signal_handler) 75 signal.signal(signal.SIGTERM, self.signal_handler)
82 76
...@@ -94,7 +88,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -94,7 +88,7 @@ class Command(BaseCommand, LoggerMixin):
94 id_card_dict.pop('base64_img') 88 id_card_dict.pop('base64_img')
95 except Exception as e: 89 except Exception as e:
96 continue 90 continue
97 all_res.extend(license_data) 91 all_res.setdefault(classify, []).extend(license_data)
98 92
99 def license2_process(self, ocr_data, all_res, classify, img_path): 93 def license2_process(self, ocr_data, all_res, classify, img_path):
100 pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) 94 pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
...@@ -129,14 +123,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -129,14 +123,14 @@ class Command(BaseCommand, LoggerMixin):
129 123
130 if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: 124 if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
131 if pid == consts.BC_PID: 125 if pid == consts.BC_PID:
132 all_res.append(ocr_res_2) 126 all_res.setdefault(classify, []).append(ocr_res_2)
133 else: 127 else:
134 # 营业执照等 128 # 营业执照等
135 for result_dict in ocr_res_2.get('ResultList', []): 129 for result_dict in ocr_res_2.get('ResultList', []):
136 res_dict = {} 130 res_dict = {}
137 for field_dict in result_dict.get('FieldList', []): 131 for field_dict in result_dict.get('FieldList', []):
138 res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '') 132 res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
139 all_res.append(res_dict) 133 all_res.setdefault(classify, []).append(res_dict)
140 break 134 break
141 135
142 @staticmethod 136 @staticmethod
...@@ -150,33 +144,33 @@ class Command(BaseCommand, LoggerMixin): ...@@ -150,33 +144,33 @@ class Command(BaseCommand, LoggerMixin):
150 return img_name, 1, 1 144 return img_name, 1, 1
151 145
152 @staticmethod 146 @staticmethod
153 def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir): 147 def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
154 time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') 148 time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
155 new_name = '{0}_{1}'.format(time_stamp, name) 149 new_name = '{0}_{1}'.format(time_stamp, name)
156 img_save_path = os.path.join(img_output_dir, new_name) 150 img_save_path = os.path.join(img_output_dir, new_name)
157 pdf_save_path = os.path.join(pdf_output_dir, new_name) 151 pdf_save_path = os.path.join(pdf_output_dir, new_name)
158 excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0]) 152 excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
159 excel_path = os.path.join(wb_output_dir, excel_name) 153 excel_path = os.path.join(wb_output_dir, excel_name)
160 seperate_path = None if seperate_dir is None else os.path.join(seperate_dir, new_name) 154 return img_save_path, excel_path, pdf_save_path
161 return img_save_path, excel_path, pdf_save_path, seperate_path
162 155
163 def res_process(self, all_res, excel_path, classify): 156 def res_process(self, all_res, excel_path):
164 try: 157 try:
165 wb = BSWorkbook(set(), set(), set(), set(), set()) 158 wb = BSWorkbook(set(), set(), set(), set(), set())
166 sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(classify) 159 for classify, ress in all_res.items():
167 ws = wb.create_sheet(sheet_name) 160 sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(classify)
168 for res in all_res: 161 ws = wb.create_sheet(sheet_name)
169 if key_field is not None and key_field in res: 162 for res in ress:
170 field_order = side_field_order 163 if key_field is not None and key_field in res:
171 else: 164 field_order = side_field_order
172 field_order = src_field_order
173 for search_field, write_field in field_order:
174 field_value = res.get(search_field, '')
175 if isinstance(field_value, list):
176 ws.append((write_field, *field_value))
177 else: 165 else:
178 ws.append((write_field, field_value)) 166 field_order = src_field_order
179 ws.append((None,)) 167 for search_field, write_field in field_order:
168 field_value = res.get(search_field, '')
169 if isinstance(field_value, list):
170 ws.append((write_field, *field_value))
171 else:
172 ws.append((write_field, field_value))
173 ws.append((None,))
180 wb.remove_base_sheet() 174 wb.remove_base_sheet()
181 wb.save(excel_path) 175 wb.save(excel_path)
182 except Exception as e: 176 except Exception as e:
...@@ -189,7 +183,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -189,7 +183,7 @@ class Command(BaseCommand, LoggerMixin):
189 sep = os.path.sep + (os.path.altsep or '') 183 sep = os.path.sep + (os.path.altsep or '')
190 return os.path.basename(path.rstrip(sep)) 184 return os.path.basename(path.rstrip(sep))
191 185
192 def ocr_process(self, img_path, classify, all_res, seperate_dir): 186 def ocr_process(self, img_path, all_res):
193 if os.path.exists(img_path): 187 if os.path.exists(img_path):
194 # TODO 图片验证 188 # TODO 图片验证
195 with open(img_path, 'rb') as f: 189 with open(img_path, 'rb') as f:
...@@ -199,9 +193,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -199,9 +193,6 @@ class Command(BaseCommand, LoggerMixin):
199 json_data = { 193 json_data = {
200 "file": file_data, 194 "file": file_data,
201 } 195 }
202 if seperate_dir is None:
203 json_data["classify"] = classify
204
205 for times in range(consts.RETRY_TIMES): 196 for times in range(consts.RETRY_TIMES):
206 try: 197 try:
207 start_time = time.time() 198 start_time = time.time()
...@@ -223,34 +214,31 @@ class Command(BaseCommand, LoggerMixin): ...@@ -223,34 +214,31 @@ class Command(BaseCommand, LoggerMixin):
223 data_list = ocr_res.get('data', []) 214 data_list = ocr_res.get('data', [])
224 if isinstance(data_list, list): 215 if isinstance(data_list, list):
225 for ocr_data in data_list: 216 for ocr_data in data_list:
226 if ocr_data.get('classify') == classify: 217 classify = ocr_data.get('classify')
227 if seperate_dir is not None: 218 if classify not in self.classify_set:
228 os.makedirs(seperate_dir, exist_ok=True) 219 continue
229 real_dst = os.path.join(seperate_dir, self.basename(img_path)) 220 if classify in consts.LICENSE_CLASSIFY_SET_1:
230 if not os.path.exists(real_dst): 221 self.license1_process(ocr_data, all_res, classify)
231 shutil.move(img_path, seperate_dir) 222 elif classify in consts.LICENSE_CLASSIFY_SET_2:
232 if classify in consts.LICENSE_CLASSIFY_SET_1: 223 self.license2_process(ocr_data, all_res, classify, img_path)
233 self.license1_process(ocr_data, all_res, classify)
234 elif classify in consts.LICENSE_CLASSIFY_SET_2:
235 self.license2_process(ocr_data, all_res, classify, img_path)
236 break 224 break
237 else: 225 else:
238 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) 226 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
239 227
240 def images_process(self, img_path_list, classify, excel_path, seperate_dir): 228 def images_process(self, img_path_list, excel_path):
241 all_res = [] 229 all_res = {}
242 for img_path in img_path_list: 230 for img_path in img_path_list:
243 self.ocr_process(img_path, classify, all_res, seperate_dir) 231 self.ocr_process(img_path, all_res)
244 # if len(all_res) > 0: 232 # if len(all_res) > 0:
245 self.res_process(all_res, excel_path, classify) 233 self.res_process(all_res, excel_path)
246 return all_res 234 return all_res
247 235
248 def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir): 236 def pdf_process(self, name, path, img_output_dir, wb_output_dir, pdf_output_dir):
249 if os.path.exists(path): 237 if os.path.exists(path):
250 rebuild_res = None 238 rebuild_res = None
251 try: 239 try:
252 img_save_path, excel_path, pdf_save_path, seperate_path = self.get_path( 240 img_save_path, excel_path, pdf_save_path = self.get_path(
253 name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir) 241 name, img_output_dir, wb_output_dir, pdf_output_dir)
254 self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path)) 242 self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
255 pdf_handler = PDFHandler(path, img_save_path) 243 pdf_handler = PDFHandler(path, img_save_path)
256 pdf_handler.extract_image() 244 pdf_handler.extract_image()
...@@ -260,16 +248,16 @@ class Command(BaseCommand, LoggerMixin): ...@@ -260,16 +248,16 @@ class Command(BaseCommand, LoggerMixin):
260 self.log_base, path, traceback.format_exc())) 248 self.log_base, path, traceback.format_exc()))
261 raise e 249 raise e
262 else: 250 else:
263 rebuild_res = self.images_process(pdf_handler.img_path_list, classify, excel_path, seperate_path) 251 rebuild_res = self.images_process(pdf_handler.img_path_list, excel_path)
264 shutil.move(path, pdf_save_path) 252 shutil.move(path, pdf_save_path)
265 return rebuild_res 253 return rebuild_res
266 254
267 def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir): 255 def tif_process(self, name, path, img_output_dir, wb_output_dir, tiff_output_dir):
268 if os.path.exists(path): 256 if os.path.exists(path):
269 rebuild_res = None 257 rebuild_res = None
270 try: 258 try:
271 img_save_path, excel_path, tiff_save_path, seperate_path = self.get_path( 259 img_save_path, excel_path, tiff_save_path = self.get_path(
272 name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir) 260 name, img_output_dir, wb_output_dir, tiff_output_dir)
273 self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path)) 261 self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
274 tiff_handler = TIFFHandler(path, img_save_path) 262 tiff_handler = TIFFHandler(path, img_save_path)
275 tiff_handler.extract_image() 263 tiff_handler.extract_image()
...@@ -279,20 +267,19 @@ class Command(BaseCommand, LoggerMixin): ...@@ -279,20 +267,19 @@ class Command(BaseCommand, LoggerMixin):
279 self.log_base, path, traceback.format_exc())) 267 self.log_base, path, traceback.format_exc()))
280 raise e 268 raise e
281 else: 269 else:
282 rebuild_res = self.images_process(tiff_handler.img_path_list, classify, excel_path, seperate_path) 270 rebuild_res = self.images_process(tiff_handler.img_path_list, excel_path)
283 shutil.move(path, tiff_save_path) 271 shutil.move(path, tiff_save_path)
284 return rebuild_res 272 return rebuild_res
285 273
286 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir, seperate_dir): 274 def img_process(self, name, path, wb_output_dir, img_output_dir, pdf_output_dir):
287 rebuild_res = None 275 rebuild_res = None
288 try: 276 try:
289 img_save_path, excel_path, _, seperate_path = self.get_path( 277 img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
290 name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
291 except Exception as e: 278 except Exception as e:
292 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format( 279 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
293 self.log_base, path, traceback.format_exc())) 280 self.log_base, path, traceback.format_exc()))
294 else: 281 else:
295 rebuild_res = self.images_process([path], classify, excel_path, seperate_path) 282 rebuild_res = self.images_process([path], excel_path)
296 shutil.move(path, img_save_path) 283 shutil.move(path, img_save_path)
297 return rebuild_res 284 return rebuild_res
298 285
...@@ -344,7 +331,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -344,7 +331,7 @@ class Command(BaseCommand, LoggerMixin):
344 self.log_base, result, traceback.format_exc())) 331 self.log_base, result, traceback.format_exc()))
345 wb.save(wb_path) 332 wb.save(wb_path)
346 333
347 def folder_process(self, input_dir, classify, is_combined, result_queue): 334 def folder_process(self, input_dir, result_queue):
348 while not os.path.isdir(input_dir): 335 while not os.path.isdir(input_dir):
349 self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir)) 336 self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
350 if self.switch: 337 if self.switch:
...@@ -353,7 +340,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -353,7 +340,6 @@ class Command(BaseCommand, LoggerMixin):
353 else: 340 else:
354 return 341 return
355 output_dir = os.path.join(os.path.dirname(input_dir), 'Output') 342 output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
356 seperate_dir = os.path.join(output_dir, self.seperate_map.get(classify, 'Unknown')) if is_combined else None
357 img_output_dir = os.path.join(output_dir, 'image') 343 img_output_dir = os.path.join(output_dir, 'image')
358 wb_output_dir = os.path.join(output_dir, 'excel') 344 wb_output_dir = os.path.join(output_dir, 'excel')
359 pdf_output_dir = os.path.join(output_dir, 'pdf') 345 pdf_output_dir = os.path.join(output_dir, 'pdf')
...@@ -365,8 +351,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -365,8 +351,6 @@ class Command(BaseCommand, LoggerMixin):
365 os.makedirs(pdf_output_dir, exist_ok=True) 351 os.makedirs(pdf_output_dir, exist_ok=True)
366 os.makedirs(tiff_output_dir, exist_ok=True) 352 os.makedirs(tiff_output_dir, exist_ok=True)
367 os.makedirs(failed_output_dir, exist_ok=True) 353 os.makedirs(failed_output_dir, exist_ok=True)
368 if seperate_dir is not None:
369 os.makedirs(seperate_dir, exist_ok=True)
370 os_error_filename_set = set() 354 os_error_filename_set = set()
371 while self.switch: 355 while self.switch:
372 # if not os.path.isdir(input_dir): 356 # if not os.path.isdir(input_dir):
...@@ -389,14 +373,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -389,14 +373,14 @@ class Command(BaseCommand, LoggerMixin):
389 if os.path.isfile(path): 373 if os.path.isfile(path):
390 self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path)) 374 self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
391 if name.endswith('.pdf') or name.endswith('.PDF'): 375 if name.endswith('.pdf') or name.endswith('.PDF'):
392 result = self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, 376 result = self.pdf_process(name, path, img_output_dir, wb_output_dir,
393 pdf_output_dir, seperate_dir) 377 pdf_output_dir)
394 elif name.endswith('.tif') or name.endswith('.TIF'): 378 elif name.endswith('.tif') or name.endswith('.TIF'):
395 result = self.tif_process(name, path, classify, img_output_dir, wb_output_dir, 379 result = self.tif_process(name, path, img_output_dir, wb_output_dir,
396 tiff_output_dir, seperate_dir) 380 tiff_output_dir)
397 else: 381 else:
398 result = self.img_process(name, path, classify, wb_output_dir, img_output_dir, 382 result = self.img_process(name, path, wb_output_dir, img_output_dir,
399 pdf_output_dir, seperate_dir) 383 pdf_output_dir)
400 self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) 384 self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
401 else: 385 else:
402 result = None 386 result = None
...@@ -420,23 +404,15 @@ class Command(BaseCommand, LoggerMixin): ...@@ -420,23 +404,15 @@ class Command(BaseCommand, LoggerMixin):
420 else: 404 else:
421 if isinstance(result, dict) and len(result) > 0: 405 if isinstance(result, dict) and len(result) > 0:
422 date_str = time.strftime("%Y-%m-%d") 406 date_str = time.strftime("%Y-%m-%d")
423 result_queue.put( 407 for classify, res_list in result.items():
424 { 408 for res in res_list:
425 self.CLASSIFY_KEY: classify, 409 result_queue.put(
426 self.RESULT_KEY: result, 410 {
427 self.DATE_KEY: date_str 411 self.CLASSIFY_KEY: classify,
428 } 412 self.RESULT_KEY: res,
429 ) 413 self.DATE_KEY: date_str
430 elif isinstance(result, list) and len(result) > 0: 414 }
431 date_str = time.strftime("%Y-%m-%d") 415 )
432 for res in result:
433 result_queue.put(
434 {
435 self.CLASSIFY_KEY: classify,
436 self.RESULT_KEY: res,
437 self.DATE_KEY: date_str
438 }
439 )
440 416
441 def handle(self, *args, **kwargs): 417 def handle(self, *args, **kwargs):
442 if len(self.input_dirs) == 0: 418 if len(self.input_dirs) == 0:
...@@ -444,12 +420,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -444,12 +420,10 @@ class Command(BaseCommand, LoggerMixin):
444 result_queue = Queue() 420 result_queue = Queue()
445 process_list = [] 421 process_list = []
446 one_input_dir = None 422 one_input_dir = None
447 for classify_idx, input_dir in self.input_dirs.items(): 423 for _, input_dir in self.input_dirs.items():
448 if one_input_dir is None: 424 if one_input_dir is None:
449 one_input_dir = input_dir 425 one_input_dir = input_dir
450 classify = int(classify_idx.split('_')[0]) 426 process = Process(target=self.folder_process, args=(input_dir, result_queue))
451 is_combined = True if int(classify_idx.split('_')[2]) == 1 else False
452 process = Process(target=self.folder_process, args=(input_dir, classify, is_combined, result_queue))
453 process_list.append(process) 427 process_list.append(process)
454 428
455 wb_dir = os.path.dirname(os.path.dirname(one_input_dir)) 429 wb_dir = os.path.dirname(os.path.dirname(one_input_dir))
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!