4fad0d1f by 周伟奇

add litigation

1 parent ce86bdd5
...@@ -13,6 +13,9 @@ class OCR2Exception(Exception): ...@@ -13,6 +13,9 @@ class OCR2Exception(Exception):
13 class OCR4Exception(Exception): 13 class OCR4Exception(Exception):
14 pass 14 pass
15 15
class LTGTException(Exception):
    """Error raised when the LTGT OCR service answers with a non-200 status."""
18
16 19
17 class GCAPException(Exception): 20 class GCAPException(Exception):
18 pass 21 pass
......
...@@ -15,7 +15,7 @@ from settings import conf ...@@ -15,7 +15,7 @@ from settings import conf
15 from common.mixins import LoggerMixin 15 from common.mixins import LoggerMixin
16 from common.tools.pdf_to_img import PDFHandler 16 from common.tools.pdf_to_img import PDFHandler
17 from apps.doc import consts 17 from apps.doc import consts
18 from apps.doc.exceptions import OCR1Exception, OCR4Exception 18 from apps.doc.exceptions import OCR1Exception, OCR4Exception, LTGTException
19 from apps.doc.ocr.wb import BSWorkbook 19 from apps.doc.ocr.wb import BSWorkbook
20 20
21 21
...@@ -48,6 +48,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -48,6 +48,11 @@ class Command(BaseCommand, LoggerMixin):
48 self.log_base = '[folder ocr process]' 48 self.log_base = '[folder ocr process]'
49 # 处理文件开关 49 # 处理文件开关
50 self.switch = True 50 self.switch = True
51 self.ltgt_classify_mapping = {
52 128: '执行裁定书',
53 129: '民事判决书',
54 130: '民事调解书'
55 }
51 # 睡眠时间 56 # 睡眠时间
52 self.sleep_time = float(conf.SLEEP_SECOND_FOLDER) 57 self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
53 # input folder 58 # input folder
...@@ -55,19 +60,18 @@ class Command(BaseCommand, LoggerMixin): ...@@ -55,19 +60,18 @@ class Command(BaseCommand, LoggerMixin):
55 # ocr相关 60 # ocr相关
56 self.ocr_url = conf.OCR_URL_FOLDER 61 self.ocr_url = conf.OCR_URL_FOLDER
57 self.ocr_url_4 = conf.IC_URL 62 self.ocr_url_4 = conf.IC_URL
63 self.ltgt_ocr_url = conf.LTGT_URL
58 # 优雅退出信号:15 64 # 优雅退出信号:15
59 signal.signal(signal.SIGTERM, self.signal_handler) 65 signal.signal(signal.SIGTERM, self.signal_handler)
60 66
61 def signal_handler(self, sig, frame): 67 def signal_handler(self, sig, frame):
62 self.switch = False # 停止处理文件 68 self.switch = False # 停止处理文件
63 69
64 def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx, img_path): 70 def license1_process(self, ocr_data, license_summary, classify, img_path):
65 # 类别:'0'身份证, '1'居住证 71 # 类别:'0'身份证, '1'居住证
66 license_data = ocr_data.get('data', []) 72 license_data = ocr_data.get('data', [])
67 if not license_data: 73 if not license_data:
68 res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
69 return 74 return
70 res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
71 if classify == consts.MVC_CLASSIFY: # 车辆登记证 3/4页结果整合 75 if classify == consts.MVC_CLASSIFY: # 车辆登记证 3/4页结果整合
72 for mvc_dict in license_data: 76 for mvc_dict in license_data:
73 try: 77 try:
...@@ -154,29 +158,21 @@ class Command(BaseCommand, LoggerMixin): ...@@ -154,29 +158,21 @@ class Command(BaseCommand, LoggerMixin):
154 def res_process(self, all_res, classify, excel_path): 158 def res_process(self, all_res, classify, excel_path):
155 try: 159 try:
156 license_summary = {} 160 license_summary = {}
157 res_list = []
158 161
159 if not all_res: 162 if not all_res:
160 return 163 return
161 else: 164 else:
162 for img_path, ocr_res in all_res.items(): 165 for img_path, ocr_res in all_res.items():
163 img_name, pno, ino = self.parse_img_path(img_path) 166 # img_name, pno, ino = self.parse_img_path(img_path)
164 part_idx = 1 167 # part_idx = 1
165 168
166 if isinstance(ocr_res, dict): 169 if isinstance(ocr_res, dict):
167 if ocr_res.get('code') == 1: 170 if ocr_res.get('code') == 1:
168 data_list = ocr_res.get('data', []) 171 data_list = ocr_res.get('data', [])
169 if isinstance(data_list, list): 172 if isinstance(data_list, list):
170 for part_idx, ocr_data in enumerate(data_list): 173 for ocr_data in data_list:
171 part_idx = part_idx + 1 174 # part_idx = part_idx + 1
172 self.license1_process(ocr_data, license_summary, classify, 175 self.license1_process(ocr_data, license_summary, classify, img_path)
173 res_list, pno, ino, part_idx, img_path)
174 else:
175 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
176 else:
177 res_list.append((pno, ino, part_idx, consts.RES_FAILED))
178 else:
179 res_list.append((pno, ino, part_idx, consts.RES_FAILED))
180 176
181 wb = BSWorkbook(set(), set(), set(), set(), set()) 177 wb = BSWorkbook(set(), set(), set(), set(), set())
182 wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0]) 178 wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
...@@ -216,6 +212,66 @@ class Command(BaseCommand, LoggerMixin): ...@@ -216,6 +212,66 @@ class Command(BaseCommand, LoggerMixin):
216 return ocr_res 212 return ocr_res
217 else: 213 else:
218 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) 214 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
215
def ltgt_ocr_process(self, img_path_list, label, path):
    """Send all images of one document to the LTGT OCR service.

    Every existing file in ``img_path_list`` is read, base64-encoded and
    posted in a single JSON request to ``self.ltgt_ocr_url``. The request is
    attempted up to ``consts.RETRY_TIMES`` times.

    :param img_path_list: paths of the images to send; paths that do not
        exist on disk are skipped.
    :param label: value sent as the ``label`` field of the request body.
    :param path: path of the source document, used only in log messages.
    :return: the decoded JSON response of the first successful attempt, or
        ``None`` when every attempt failed.
    """
    img_data_list = []
    for img_path in img_path_list:
        if not os.path.exists(img_path):
            continue
        with open(img_path, 'rb') as f:
            # b64encode returns bytes; decode to str so the payload is JSON-serializable.
            img_data_list.append(base64.b64encode(f.read()).decode())

    json_data = {
        "label": label,
        "img_data_list": img_data_list
    }

    # NOTE(review): the request is sent without a timeout, so a stalled
    # service blocks this worker; consider adding one once a value is agreed.
    for times in range(consts.RETRY_TIMES):
        start_time = time.time()
        try:
            ocr_response = requests.post(self.ltgt_ocr_url, json=json_data)
            if ocr_response.status_code != 200:
                raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
            # Decode inside the try block: a 200 response with an invalid
            # JSON body must be logged and retried, not raised to the caller.
            ocr_res = ocr_response.json()
        except Exception:
            self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
                self.log_base, times, path, traceback.format_exc()))
            continue

        speed_time = int(time.time() - start_time)
        self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
            self.log_base, path, ocr_res, speed_time))
        return ocr_res

    # Reached only when every attempt failed.
    self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
    return None
250
def ltgt_res_process(self, ocr_res, label, excel_path):
    """Write a successful LTGT OCR result to an Excel workbook.

    A result is used only when it is a dict whose ``code`` equals 1. Any
    other value (including ``None`` from a failed OCR call) is logged as a
    warning and no workbook is written, so a missing output file can be
    traced in the log.

    :param ocr_res: decoded OCR response, or ``None``.
    :param label: sheet title passed to ``BSWorkbook.ltgt_build``.
    :param excel_path: destination path of the workbook.
    :return: ``None``. Workbook build/save errors are logged, not raised.
    """
    if not isinstance(ocr_res, dict) or ocr_res.get('code') != 1:
        self.folder_log.warn('{0} [ltgt res invalid] [path={1}] [res={2}]'.format(
            self.log_base, excel_path, ocr_res))
        return

    try:
        result_dict = ocr_res.get('data', {})

        wb = BSWorkbook(set(), set(), set(), set(), set())
        wb.ltgt_build(label, result_dict)
        wb.remove_base_sheet()
        wb.save(excel_path)
    except Exception:
        self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc()))
264
def ltgt_process(self, img_path_list, label, excel_path, path):
    """Run the LTGT OCR request for one document and export its result.

    Whatever ``ltgt_ocr_process`` returns is forwarded unchanged to
    ``ltgt_res_process``.
    """
    self.ltgt_res_process(
        self.ltgt_ocr_process(img_path_list, label, path),
        label,
        excel_path,
    )
268
def images_process(self, img_path_list, classify, excel_path):
    """OCR each image on its own and pass the collected results on.

    Builds a mapping of image path to the value returned by ``ocr_process``
    and hands it to ``res_process`` together with ``classify`` and
    ``excel_path``.
    """
    results_by_path = {
        image: self.ocr_process(image, classify) for image in img_path_list
    }
    self.res_process(results_by_path, classify, excel_path)
219 275
220 def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir): 276 def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
221 if os.path.exists(path): 277 if os.path.exists(path):
...@@ -223,18 +279,20 @@ class Command(BaseCommand, LoggerMixin): ...@@ -223,18 +279,20 @@ class Command(BaseCommand, LoggerMixin):
223 img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir) 279 img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
224 self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path)) 280 self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
225 pdf_handler = PDFHandler(path, img_save_path) 281 pdf_handler = PDFHandler(path, img_save_path)
226 pdf_handler.extract_image() 282 if classify in self.ltgt_classify_mapping:
283 pdf_handler.extract_page_image()
284 else:
285 pdf_handler.extract_image()
227 self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path)) 286 self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
228 except Exception as e: 287 except Exception as e:
229 self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format( 288 self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
230 self.log_base, path, traceback.format_exc())) 289 self.log_base, path, traceback.format_exc()))
231 raise e 290 raise e
232 else: 291 else:
233 all_res = {} 292 if classify in self.ltgt_classify_mapping:
234 for img_path in pdf_handler.img_path_list: 293 self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
235 ocr_res = self.ocr_process(img_path, classify) 294 else:
236 all_res[img_path] = ocr_res 295 self.images_process(pdf_handler.img_path_list, classify, excel_path)
237 self.res_process(all_res, classify, excel_path)
238 shutil.move(path, pdf_save_path) 296 shutil.move(path, pdf_save_path)
239 297
240 def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir): 298 def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
...@@ -250,24 +308,25 @@ class Command(BaseCommand, LoggerMixin): ...@@ -250,24 +308,25 @@ class Command(BaseCommand, LoggerMixin):
250 self.log_base, path, traceback.format_exc())) 308 self.log_base, path, traceback.format_exc()))
251 raise e 309 raise e
252 else: 310 else:
253 all_res = {} 311 if classify in self.ltgt_classify_mapping:
254 for img_path in tiff_handler.img_path_list: 312 self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
255 ocr_res = self.ocr_process(img_path, classify) 313 else:
256 all_res[img_path] = ocr_res 314 self.images_process(tiff_handler.img_path_list, classify, excel_path)
257 self.res_process(all_res, classify, excel_path)
258 shutil.move(path, tiff_save_path) 315 shutil.move(path, tiff_save_path)
259 316
260 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir): 317 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
261 ocr_res = self.ocr_process(path, classify)
262 all_res = {path: ocr_res}
263
264 try: 318 try:
265 img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir) 319 img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
266 except Exception as e: 320 except Exception as e:
267 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format( 321 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
268 self.log_base, path, traceback.format_exc())) 322 self.log_base, path, traceback.format_exc()))
269 else: 323 else:
270 self.res_process(all_res, classify, excel_path) 324 if classify in self.ltgt_classify_mapping:
325 self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
326 else:
327 ocr_res = self.ocr_process(path, classify)
328 all_res = {path: ocr_res}
329 self.res_process(all_res, classify, excel_path)
271 shutil.move(path, img_save_path) 330 shutil.move(path, img_save_path)
272 331
273 def folder_process(self, input_dir, classify): 332 def folder_process(self, input_dir, classify):
...@@ -312,9 +371,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -312,9 +371,9 @@ class Command(BaseCommand, LoggerMixin):
312 try: 371 try:
313 if os.path.isfile(path): 372 if os.path.isfile(path):
314 self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path)) 373 self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
315 if name.endswith('.pdf'): 374 if name.endswith('.pdf') or name.endswith('.PDF'):
316 self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir) 375 self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
317 elif name.endswith('.tif'): 376 elif name.endswith('.tif') or name.endswith('.TIF'):
318 self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir) 377 self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir)
319 else: 378 else:
320 self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) 379 self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
......
...@@ -702,6 +702,23 @@ class BSWorkbook(Workbook): ...@@ -702,6 +702,23 @@ class BSWorkbook(Workbook):
702 if field_str is not None: 702 if field_str is not None:
703 count_list.append((field_str, count)) 703 count_list.append((field_str, count))
704 704
def ltgt_build(self, label, result_dict):
    """Create a sheet named ``label`` and write ``result_dict`` into it.

    One row is appended per entry, depending on the type of the value:

    * list: the key followed by every list item;
    * dict containing ``'words'``: the key and the ``'words'`` value;
    * any other dict: one row per sub-entry, titled ``"key: sub_key"``; a
      dict sub-value contributes its ``'words'`` value (``''`` if absent),
      any other sub-value is written as is;
    * anything else: the key and the value.
    """
    sheet = self.create_sheet(label)
    for field, content in result_dict.items():
        if isinstance(content, list):
            sheet.append((field,) + tuple(content))
            continue

        if not isinstance(content, dict):
            sheet.append((field, content))
            continue

        if 'words' in content:
            sheet.append((field, content['words']))
            continue

        # Nested dict without its own 'words': flatten one level.
        for sub_field, sub_content in content.items():
            title = '{0}: {1}'.format(field, sub_field)
            if isinstance(sub_content, dict):
                cell = sub_content.get('words', '')
            else:
                cell = sub_content
            sheet.append((title, cell))
721
705 def simple_license_rebuild(self, license_summary, document_scheme): 722 def simple_license_rebuild(self, license_summary, document_scheme):
706 # for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []): 723 # for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
707 # if ic_license_dict.get('类别') == '1': 724 # if ic_license_dict.get('类别') == '1':
......
...@@ -225,3 +225,13 @@ class PDFHandler: ...@@ -225,3 +225,13 @@ class PDFHandler:
225 else: 225 else:
226 self.merge_il(pdf, pno, il) 226 self.merge_il(pdf, pno, il)
227 self.img_count = len(self.img_path_list) 227 self.img_count = len(self.img_path_list)
228
def extract_page_image(self):
    """Convert every page of the PDF at ``self.path`` via ``page_to_png``.

    Resets ``img_path_list`` and ``xref_set``, makes sure ``img_dir_path``
    exists, then passes each page to ``self.page_to_png`` in page order.
    Finally ``img_count`` is set to the length of ``img_path_list``
    (presumably filled by ``page_to_png`` - confirm against that helper).
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    with fitz.Document(self.path) as document:
        page_total = document.pageCount
        for page_number in range(page_total):
            self.page_to_png(document.loadPage(page_number))

    self.img_count = len(self.img_path_list)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!