ltgt part 2
Showing
2 changed files
with
178 additions
and
54 deletions
... | @@ -16,6 +16,7 @@ from openpyxl import load_workbook, Workbook | ... | @@ -16,6 +16,7 @@ from openpyxl import load_workbook, Workbook |
16 | from settings import conf | 16 | from settings import conf |
17 | from common.mixins import LoggerMixin | 17 | from common.mixins import LoggerMixin |
18 | from common.tools.pdf_to_img import PDFHandler | 18 | from common.tools.pdf_to_img import PDFHandler |
19 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict | ||
19 | from apps.doc import consts | 20 | from apps.doc import consts |
20 | from apps.doc.exceptions import OCR1Exception, OCR2Exception, LTGTException | 21 | from apps.doc.exceptions import OCR1Exception, OCR2Exception, LTGTException |
21 | from apps.doc.ocr.wb import BSWorkbook | 22 | from apps.doc.ocr.wb import BSWorkbook |
... | @@ -72,14 +73,16 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -72,14 +73,16 @@ class Command(BaseCommand, LoggerMixin): |
72 | # input folder | 73 | # input folder |
73 | self.input_dirs = conf.get_namespace('LTGT_DIR_') | 74 | self.input_dirs = conf.get_namespace('LTGT_DIR_') |
74 | # seperate folder name | 75 | # seperate folder name |
75 | self.seperate_map = { | 76 | self.combined_map = { |
76 | consts.IC_CLASSIFY: 'IDCard', | 77 | consts.IC_CLASSIFY: 'IDCard', |
77 | consts.MVC_CLASSIFY: 'GreenBook', | 78 | consts.MVC_CLASSIFY: 'GreenBook', |
78 | consts.CONTRACT_CLASSIFY: 'Contract', | 79 | consts.CONTRACT_CLASSIFY: 'Contract', |
79 | } | 80 | } |
80 | self.field_map = { | 81 | self.field_map = { |
82 | # sheet_name, key_field, side_field_order, src_field_order | ||
81 | consts.VAT_CLASSIFY: (consts.VAT_CN_NAME, None, None, consts.VATS_FIELD_ORDER), | 83 | consts.VAT_CLASSIFY: (consts.VAT_CN_NAME, None, None, consts.VATS_FIELD_ORDER), |
82 | consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2), | 84 | consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2), |
85 | consts.MVC_CLASSIFY: (consts.MVC_CN_NAME, '机动车登记证书编号', consts.MVC_SE_FIELD_ORDER_3_4, consts.MVC_SE_FIELD_ORDER_1_2), | ||
83 | } | 86 | } |
84 | # ocr相关 | 87 | # ocr相关 |
85 | self.ocr_url = conf.OCR_URL_FOLDER | 88 | self.ocr_url = conf.OCR_URL_FOLDER |
... | @@ -92,18 +95,96 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -92,18 +95,96 @@ class Command(BaseCommand, LoggerMixin): |
92 | def signal_handler(self, sig, frame): | 95 | def signal_handler(self, sig, frame): |
93 | self.switch = False # 停止处理文件 | 96 | self.switch = False # 停止处理文件 |
94 | 97 | ||
def contract_process(self, ocr_data, contract_result, classify):
    """Flatten one e-contract page of OCR output into worksheet-style rows.

    The rows for the page are appended, as one list, to
    ``contract_result[classify][page_number]``.

    Args:
        ocr_data: Mapping whose ``'data'`` entry holds ``'page_num'`` and
            ``'page_info'``. ``page_info`` maps a field label either to a
            mapping that carries text under ``'words'`` or to a mapping of
            sub-fields that each may carry ``'words'``.
        contract_result: Accumulator dict, mutated in place.
        classify: Key under which this page is stored in ``contract_result``.

    Returns:
        None. Nothing is recorded when ``'data'``, ``'page_num'`` or
        ``'page_info'`` is missing or has an unusable shape.
    """
    text_key = 'words'

    def text_rows(label, field):
        # Rows for a field that carries text: a string becomes (label, text),
        # a list becomes a label row followed by the list's own rows, None
        # becomes a label-only row. Any other text type yields no rows.
        text = field[text_key]
        if isinstance(text, str):
            return [(label, text)]
        if isinstance(text, list):
            return [(label,)] + list(text)
        if text is None:
            return [(label,)]
        return []

    contract_dict = ocr_data.get('data')
    if not contract_dict or not isinstance(contract_dict, dict):
        return
    page_num = contract_dict.get('page_num')
    page_info = contract_dict.get('page_info')
    if page_num is None or not isinstance(page_info, dict):
        return

    # 'page_3' -> '3'; anything else (including non-string page numbers,
    # which used to raise on .startswith) is used as the key unchanged.
    if isinstance(page_num, str) and page_num.startswith('page_'):
        page_num_only = page_num.split('_')[-1]
    else:
        page_num_only = page_num

    rebuild_page_info = []
    for key, value in page_info.items():
        if not isinstance(value, dict):
            # None, or an unexpected scalar/list: keep the label row only
            # instead of raising on the membership test / .items() below.
            rebuild_page_info.append((key,))
        elif text_key in value:
            rebuild_page_info.extend(text_rows(key, value))
        else:
            # A group of sub-fields: label row first, then each sub-field.
            rebuild_page_info.append((key,))
            for sub_key, sub_value in value.items():
                if sub_value is None:
                    rebuild_page_info.append((sub_key,))
                elif isinstance(sub_value, dict) and text_key in sub_value:
                    rebuild_page_info.extend(text_rows(sub_key, sub_value))

    contract_result.setdefault(classify, {}).setdefault(page_num_only, []).append(rebuild_page_info)
137 | |||
def license1_process(self, ocr_data, all_res, classify):
    """Normalise one OCR result and file it under ``all_res[classify]``.

    Args:
        ocr_data: Mapping whose ``'data'`` entry is the OCR payload. The
            ID-card and vehicle-certificate branches read it as a mapping;
            the fallback branch accepts a mapping or a list of records.
        all_res: Accumulator dict mapping classify -> list of records,
            mutated in place.
        classify: Document class of this result; selects the branch below.

    Returns:
        None. Nothing is recorded when the payload is empty, when
        ``classify`` is the contract class, or when the ID-card / vehicle
        branch receives a payload that is not a mapping.

    Side effects:
        ``'base64_img'`` is removed from the payload. In the vehicle
        branch ``'page'`` and ``'results'`` are popped from the payload and
        the ``register_type*`` keys are popped from each registration
        entry, as in the code this replaces.
    """
    license_data = ocr_data.get('data')
    if not license_data:
        return

    is_mapping = isinstance(license_data, dict)
    if is_mapping:
        license_data.pop('base64_img', '')
    elif isinstance(license_data, list):
        for entry in license_data:
            if isinstance(entry, dict):
                entry.pop('base64_img', '')

    def words_of(container, field):
        # Text stored under container[field]['words']; '' when any level is
        # missing or None (a None level used to raise AttributeError).
        detail = (container or {}).get(field) or {}
        return detail.get('words', '') if isinstance(detail, dict) else ''

    if classify == consts.IC_CLASSIFY:
        if not is_mapping:
            return
        # Category: '0' = ID card, '1' = residence permit.
        card_type = license_data.get('type') or ''
        is_ic = card_type.startswith('身份证')
        is_info_side = card_type.endswith('信息面')
        if is_ic:
            field_map = consts.IC_MAP_0 if is_info_side else consts.IC_MAP_1
        else:
            field_map = consts.RP_MAP_0 if is_info_side else consts.RP_MAP_1

        words_result = license_data.get('words_result') or {}
        id_card_dict = {'类别': '0' if is_ic else '1'}
        for write_field, search_field in field_map:
            id_card_dict[write_field] = words_of(words_result, search_field)
        if not is_info_side:
            # The non-info side carries the issue and expiry dates, which are
            # joined into a single validity-period field.
            start_time = words_of(words_result, '签发日期')
            end_time = words_of(words_result, '失效日期')
            id_card_dict['有效期限'] = '{0}-{1}'.format(start_time, end_time)
        all_res.setdefault(classify, []).append(id_card_dict)
    elif classify == consts.MVC_CLASSIFY:
        if not is_mapping:
            return
        rebuild_data_dict = {}
        mvc_page = license_data.pop('page', 'VehicleRCI')
        mvc_res = license_data.pop('results', {}) or {}
        if mvc_page == 'VehicleRegArea':
            rebuild_data_dict['机动车登记证书编号'] = words_of(mvc_res, '机动车登记证书编号')
            for register_info in mvc_res.get('登记信息') or []:
                register_info.pop('register_type', None)
                register_info.pop('register_type_name', None)
                # Each registration entry contributes one value per field,
                # so repeated fields accumulate into a list.
                for cn_key in register_info:
                    rebuild_data_dict.setdefault(cn_key, []).append(words_of(register_info, cn_key))
        else:
            for cn_key in mvc_res:
                rebuild_data_dict[cn_key] = words_of(mvc_res, cn_key)
        all_res.setdefault(classify, []).append(rebuild_data_dict)
    elif classify == consts.CONTRACT_CLASSIFY:
        # Contract pages are collected by contract_process, not here.
        pass
    elif is_mapping:
        # A single record: append it. Extending with a dict would add its
        # keys to the result list instead of the record itself.
        all_res.setdefault(classify, []).append(license_data)
    else:
        all_res.setdefault(classify, []).extend(license_data)
107 | 188 | ||
108 | def license2_process(self, ocr_data, all_res, classify, img_path): | 189 | def license2_process(self, ocr_data, all_res, classify, img_path): |
109 | pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) | 190 | pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) |
... | @@ -159,20 +240,24 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -159,20 +240,24 @@ class Command(BaseCommand, LoggerMixin): |
159 | return img_name, 1, 1 | 240 | return img_name, 1, 1 |
160 | 241 | ||
161 | @staticmethod | 242 | @staticmethod |
def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir_map):
    """Build the timestamped output paths for one input file.

    Args:
        name: Original file name, including its extension.
        img_output_dir: Directory for the image output.
        wb_output_dir: Directory for the generated ``.xlsx`` workbook.
        pdf_output_dir: Directory the processed source file is moved to.
        seperate_dir_map: Mapping of classify -> directory. ``None`` is
            accepted and treated as an empty mapping, so callers that still
            pass ``None`` for "no separation" do not hit a ``TypeError``.

    Returns:
        A tuple ``(img_save_path, excel_path, pdf_save_path,
        seperate_path_map)``. ``seperate_path_map`` maps each classify to
        its directory joined with the timestamped name, and is empty when
        no directories were given.
    """
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    new_name = '{0}_{1}'.format(time_stamp, name)
    img_save_path = os.path.join(img_output_dir, new_name)
    pdf_save_path = os.path.join(pdf_output_dir, new_name)
    # The workbook shares the timestamped stem but always uses .xlsx.
    excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
    excel_path = os.path.join(wb_output_dir, excel_name)
    seperate_path_map = {
        c: os.path.join(seperate_dir, new_name)
        for c, seperate_dir in (seperate_dir_map or {}).items()
    }
    return img_save_path, excel_path, pdf_save_path, seperate_path_map
171 | 255 | ||
172 | def res_process(self, all_res, excel_path, classify): | 256 | def res_process(self, all_res, excel_path, classify, contract_result): |
173 | try: | 257 | try: |
174 | wb = BSWorkbook(set(), set(), set(), set(), set()) | 258 | wb = BSWorkbook(set(), set(), set(), set(), set()) |
175 | sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(classify) | 259 | for c, res_list in all_res.items(): |
260 | sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(c) | ||
176 | ws = wb.create_sheet(sheet_name) | 261 | ws = wb.create_sheet(sheet_name) |
177 | for res in all_res: | 262 | for res in all_res: |
178 | if key_field is not None and key_field in res: | 263 | if key_field is not None and key_field in res: |
... | @@ -186,6 +271,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -186,6 +271,7 @@ class Command(BaseCommand, LoggerMixin): |
186 | else: | 271 | else: |
187 | ws.append((write_field, field_value)) | 272 | ws.append((write_field, field_value)) |
188 | ws.append((None,)) | 273 | ws.append((None,)) |
274 | wb.contract_rebuild(contract_result) | ||
189 | wb.remove_base_sheet() | 275 | wb.remove_base_sheet() |
190 | wb.save(excel_path) | 276 | wb.save(excel_path) |
191 | except Exception as e: | 277 | except Exception as e: |
... | @@ -198,7 +284,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -198,7 +284,7 @@ class Command(BaseCommand, LoggerMixin): |
198 | sep = os.path.sep + (os.path.altsep or '') | 284 | sep = os.path.sep + (os.path.altsep or '') |
199 | return os.path.basename(path.rstrip(sep)) | 285 | return os.path.basename(path.rstrip(sep)) |
200 | 286 | ||
201 | def ocr_process(self, img_path, classify, all_res, seperate_dir): | 287 | def ocr_process(self, img_path, classify, all_res, seperate_path_map, contract_result): |
202 | if os.path.exists(img_path): | 288 | if os.path.exists(img_path): |
203 | # TODO 图片验证 | 289 | # TODO 图片验证 |
204 | with open(img_path, 'rb') as f: | 290 | with open(img_path, 'rb') as f: |
... | @@ -208,7 +294,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -208,7 +294,7 @@ class Command(BaseCommand, LoggerMixin): |
208 | json_data = { | 294 | json_data = { |
209 | "file": file_data, | 295 | "file": file_data, |
210 | } | 296 | } |
211 | if seperate_dir is None: | 297 | if len(seperate_path_map) > 0: |
212 | json_data["classify"] = classify | 298 | json_data["classify"] = classify |
213 | 299 | ||
214 | for times in range(consts.RETRY_TIMES): | 300 | for times in range(consts.RETRY_TIMES): |
... | @@ -232,8 +318,9 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -232,8 +318,9 @@ class Command(BaseCommand, LoggerMixin): |
232 | data_list = ocr_res.get('data', []) | 318 | data_list = ocr_res.get('data', []) |
233 | if isinstance(data_list, list): | 319 | if isinstance(data_list, list): |
234 | for ocr_data in data_list: | 320 | for ocr_data in data_list: |
235 | if ocr_data.get('classify') == classify: | 321 | if ocr_data.get('classify') in seperate_path_map or ocr_data.get('classify') == classify: |
236 | if seperate_dir is not None: | 322 | if ocr_data.get('classify') in seperate_path_map: |
323 | seperate_dir = seperate_path_map[ocr_data.get('classify')] | ||
237 | os.makedirs(seperate_dir, exist_ok=True) | 324 | os.makedirs(seperate_dir, exist_ok=True) |
238 | real_dst = os.path.join(seperate_dir, self.basename(img_path)) | 325 | real_dst = os.path.join(seperate_dir, self.basename(img_path)) |
239 | if not os.path.exists(real_dst): | 326 | if not os.path.exists(real_dst): |
... | @@ -242,6 +329,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -242,6 +329,8 @@ class Command(BaseCommand, LoggerMixin): |
242 | self.license1_process(ocr_data, all_res, classify) | 329 | self.license1_process(ocr_data, all_res, classify) |
243 | elif classify in consts.LICENSE_CLASSIFY_SET_2: | 330 | elif classify in consts.LICENSE_CLASSIFY_SET_2: |
244 | self.license2_process(ocr_data, all_res, classify, img_path) | 331 | self.license2_process(ocr_data, all_res, classify, img_path) |
332 | elif classify in consts.CONTRACT_SET: | ||
333 | self.contract_process(ocr_data, contract_result, classify) | ||
245 | break | 334 | break |
246 | else: | 335 | else: |
247 | self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) | 336 | self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) |
... | @@ -301,25 +390,56 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -301,25 +390,56 @@ class Command(BaseCommand, LoggerMixin): |
301 | rebuild_res = self.ltgt_res_process(ocr_res, label, excel_path) | 390 | rebuild_res = self.ltgt_res_process(ocr_res, label, excel_path) |
302 | return rebuild_res | 391 | return rebuild_res |
303 | 392 | ||
304 | def images_process(self, img_path_list, classify, excel_path, seperate_dir): | 393 | def images_process(self, img_path_list, classify, excel_path, seperate_path_map): |
305 | all_res = [] | 394 | all_res = dict() |
395 | contract_result = dict() | ||
306 | for img_path in img_path_list: | 396 | for img_path in img_path_list: |
307 | self.ocr_process(img_path, classify, all_res, seperate_dir) | 397 | self.ocr_process(img_path, classify, all_res, seperate_path_map, contract_result) |
308 | # if len(all_res) > 0: | 398 | # if len(all_res) > 0: |
309 | self.res_process(all_res, excel_path, classify) | 399 | self.res_process(all_res, excel_path, classify, contract_result) |
310 | return all_res | 400 | return all_res |
311 | 401 | ||
312 | def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir): | 402 | def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir_map): |
313 | if os.path.exists(path): | 403 | if os.path.exists(path): |
314 | rebuild_res = None | 404 | rebuild_res = None |
405 | img_save_path, excel_path, pdf_save_path, seperate_path_map = self.get_path( | ||
406 | name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir_map) | ||
407 | pdf_handler = PDFHandler(path, img_save_path) | ||
408 | |||
315 | if classify == consts.CONTRACT_CLASSIFY: | 409 | if classify == consts.CONTRACT_CLASSIFY: |
316 | pass | 410 | try: |
411 | self.folder_log.info('{0} [e-contract pdf to img start] [path={1}]'.format(self.log_base, path)) | ||
412 | pdf_handler.e_contract_process() | ||
413 | self.folder_log.info('{0} [e-contract pdf to img end] [path={1}]'.format(self.log_base, path)) | ||
414 | except Exception as e: | ||
415 | self.folder_log.error('{0} [e-contract pdf to img error] [path={1}] [error={2}]'.format( | ||
416 | self.log_base, path, traceback.format_exc())) | ||
417 | raise e | ||
418 | else: | ||
419 | ocr_result = afc_predict(pdf_handler.pdf_info) | ||
420 | contract_result = dict() | ||
421 | page_res = {} | ||
422 | for page_num, page_info in ocr_result.get('page_info', {}).items(): | ||
423 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
424 | page_res[page_num] = { | ||
425 | 'classify': classify, | ||
426 | "is_asp": ocr_result.get('is_asp', False), | ||
427 | 'page_num': page_num, | ||
428 | 'page_info': page_info | ||
429 | } | ||
430 | for _, page_key in pdf_handler.img_path_pno_list: | ||
431 | if page_key in page_res: | ||
432 | ocr_data = { | ||
433 | 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY), | ||
434 | 'data': page_res[page_key] | ||
435 | } | ||
436 | self.contract_process(ocr_data, contract_result, classify) | ||
437 | self.res_process({}, excel_path, classify, contract_result) | ||
438 | shutil.move(path, pdf_save_path) | ||
317 | else: | 439 | else: |
318 | try: | 440 | try: |
319 | img_save_path, excel_path, pdf_save_path, seperate_path = self.get_path( | ||
320 | name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir) | ||
321 | self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path)) | 441 | self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path)) |
322 | pdf_handler = PDFHandler(path, img_save_path) | 442 | |
323 | if classify in self.ltgt_classify_mapping: | 443 | if classify in self.ltgt_classify_mapping: |
324 | pdf_handler.extract_page_image() | 444 | pdf_handler.extract_page_image() |
325 | else: | 445 | else: |
... | @@ -331,19 +451,22 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -331,19 +451,22 @@ class Command(BaseCommand, LoggerMixin): |
331 | raise e | 451 | raise e |
332 | else: | 452 | else: |
333 | if classify in self.ltgt_classify_mapping: | 453 | if classify in self.ltgt_classify_mapping: |
334 | rebuild_res = self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify], | 454 | ltgt_res = self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify], |
335 | excel_path, path) | 455 | excel_path, path) |
456 | rebuild_res = { | ||
457 | classify: [ltgt_res] | ||
458 | } | ||
336 | else: | 459 | else: |
337 | rebuild_res = self.images_process(pdf_handler.img_path_list, classify, excel_path, seperate_path) | 460 | rebuild_res = self.images_process(pdf_handler.img_path_list, classify, excel_path, seperate_path_map) |
338 | shutil.move(path, pdf_save_path) | 461 | shutil.move(path, pdf_save_path) |
339 | return rebuild_res | 462 | return rebuild_res |
340 | 463 | ||
341 | def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir): | 464 | def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir_map): |
342 | if os.path.exists(path): | 465 | if os.path.exists(path): |
343 | rebuild_res = None | 466 | rebuild_res = None |
344 | try: | 467 | try: |
345 | img_save_path, excel_path, tiff_save_path, seperate_path = self.get_path( | 468 | img_save_path, excel_path, tiff_save_path, seperate_path_map = self.get_path( |
346 | name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir) | 469 | name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir_map) |
347 | self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path)) | 470 | self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path)) |
348 | tiff_handler = TIFFHandler(path, img_save_path) | 471 | tiff_handler = TIFFHandler(path, img_save_path) |
349 | tiff_handler.extract_image() | 472 | tiff_handler.extract_image() |
... | @@ -354,26 +477,32 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -354,26 +477,32 @@ class Command(BaseCommand, LoggerMixin): |
354 | raise e | 477 | raise e |
355 | else: | 478 | else: |
356 | if classify in self.ltgt_classify_mapping: | 479 | if classify in self.ltgt_classify_mapping: |
357 | rebuild_res = self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify], | 480 | ltgt_res = self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify], |
358 | excel_path, path) | 481 | excel_path, path) |
482 | rebuild_res = { | ||
483 | classify: [ltgt_res] | ||
484 | } | ||
359 | else: | 485 | else: |
360 | rebuild_res = self.images_process(tiff_handler.img_path_list, classify, excel_path, seperate_path) | 486 | rebuild_res = self.images_process(tiff_handler.img_path_list, classify, excel_path, seperate_path_map) |
361 | shutil.move(path, tiff_save_path) | 487 | shutil.move(path, tiff_save_path) |
362 | return rebuild_res | 488 | return rebuild_res |
363 | 489 | ||
364 | def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir, seperate_dir): | 490 | def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir, seperate_dir_map): |
365 | rebuild_res = None | 491 | rebuild_res = None |
366 | try: | 492 | try: |
367 | img_save_path, excel_path, _, seperate_path = self.get_path( | 493 | img_save_path, excel_path, _, seperate_path_map = self.get_path( |
368 | name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir) | 494 | name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir_map) |
369 | except Exception as e: | 495 | except Exception as e: |
370 | self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format( | 496 | self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format( |
371 | self.log_base, path, traceback.format_exc())) | 497 | self.log_base, path, traceback.format_exc())) |
372 | else: | 498 | else: |
373 | if classify in self.ltgt_classify_mapping: | 499 | if classify in self.ltgt_classify_mapping: |
374 | rebuild_res = self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path) | 500 | ltgt_res = self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path) |
501 | rebuild_res = { | ||
502 | classify: [ltgt_res] | ||
503 | } | ||
375 | else: | 504 | else: |
376 | rebuild_res = self.images_process([path], classify, excel_path, seperate_path) | 505 | rebuild_res = self.images_process([path], classify, excel_path, seperate_path_map) |
377 | shutil.move(path, img_save_path) | 506 | shutil.move(path, img_save_path) |
378 | return rebuild_res | 507 | return rebuild_res |
379 | 508 | ||
... | @@ -450,10 +579,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -450,10 +579,13 @@ class Command(BaseCommand, LoggerMixin): |
450 | os.makedirs(failed_output_dir, exist_ok=True) | 579 | os.makedirs(failed_output_dir, exist_ok=True) |
451 | 580 | ||
452 | if is_combined: | 581 | if is_combined: |
453 | seperate_dir = os.path.join(output_dir, self.seperate_map.get(classify, 'Unknown')) | 582 | seperate_dir_map = dict() |
583 | for c in self.combined_map.keys(): | ||
584 | seperate_dir = os.path.join(output_dir, self.combined_map[c]) | ||
454 | os.makedirs(seperate_dir, exist_ok=True) | 585 | os.makedirs(seperate_dir, exist_ok=True) |
586 | seperate_dir_map[c] = seperate_dir | ||
455 | else: | 587 | else: |
456 | seperate_dir = None | 588 | seperate_dir_map = dict() |
457 | 589 | ||
458 | os_error_filename_set = set() | 590 | os_error_filename_set = set() |
459 | while self.switch: | 591 | while self.switch: |
... | @@ -479,17 +611,17 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -479,17 +611,17 @@ class Command(BaseCommand, LoggerMixin): |
479 | self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path)) | 611 | self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path)) |
480 | if name.endswith('.pdf') or name.endswith('.PDF'): | 612 | if name.endswith('.pdf') or name.endswith('.PDF'): |
481 | result = self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, | 613 | result = self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, |
482 | pdf_output_dir, seperate_dir) | 614 | pdf_output_dir, seperate_dir_map) |
483 | elif name.endswith('.tif') or name.endswith('.TIF'): | 615 | elif name.endswith('.tif') or name.endswith('.TIF'): |
484 | if classify == consts.CONTRACT_CLASSIFY: | 616 | if classify == consts.CONTRACT_CLASSIFY: |
485 | raise LTGTException('e-contract must be pdf') | 617 | raise LTGTException('e-contract must be pdf') |
486 | result = self.tif_process(name, path, classify, img_output_dir, wb_output_dir, | 618 | result = self.tif_process(name, path, classify, img_output_dir, wb_output_dir, |
487 | tiff_output_dir, seperate_dir) | 619 | tiff_output_dir, seperate_dir_map) |
488 | else: | 620 | else: |
489 | if classify == consts.CONTRACT_CLASSIFY: | 621 | if classify == consts.CONTRACT_CLASSIFY: |
490 | raise LTGTException('e-contract must be pdf') | 622 | raise LTGTException('e-contract must be pdf') |
491 | result = self.img_process(name, path, classify, wb_output_dir, img_output_dir, | 623 | result = self.img_process(name, path, classify, wb_output_dir, img_output_dir, |
492 | pdf_output_dir, seperate_dir) | 624 | pdf_output_dir, seperate_dir_map) |
493 | self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) | 625 | self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) |
494 | else: | 626 | else: |
495 | result = None | 627 | result = None |
... | @@ -513,19 +645,11 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -513,19 +645,11 @@ class Command(BaseCommand, LoggerMixin): |
513 | else: | 645 | else: |
514 | if isinstance(result, dict) and len(result) > 0: | 646 | if isinstance(result, dict) and len(result) > 0: |
515 | date_str = time.strftime("%Y-%m-%d") | 647 | date_str = time.strftime("%Y-%m-%d") |
648 | for c, res_list in result.items(): | ||
649 | for res in res_list: | ||
516 | result_queue.put( | 650 | result_queue.put( |
517 | { | 651 | { |
518 | self.CLASSIFY_KEY: classify, | 652 | self.CLASSIFY_KEY: c, |
519 | self.RESULT_KEY: result, | ||
520 | self.DATE_KEY: date_str | ||
521 | } | ||
522 | ) | ||
523 | elif isinstance(result, list) and len(result) > 0: | ||
524 | date_str = time.strftime("%Y-%m-%d") | ||
525 | for res in result: | ||
526 | result_queue.put( | ||
527 | { | ||
528 | self.CLASSIFY_KEY: classify, | ||
529 | self.RESULT_KEY: res, | 653 | self.RESULT_KEY: res, |
530 | self.DATE_KEY: date_str | 654 | self.DATE_KEY: date_str |
531 | } | 655 | } | ... | ... |
-
Please register or sign in to post a comment