c1ca6fa5 by 周伟奇

add ltgt wb daily

1 parent c39b3051
...@@ -15,7 +15,7 @@ from settings import conf ...@@ -15,7 +15,7 @@ from settings import conf
15 from common.mixins import LoggerMixin 15 from common.mixins import LoggerMixin
16 from common.tools.pdf_to_img import PDFHandler 16 from common.tools.pdf_to_img import PDFHandler
17 from apps.doc import consts 17 from apps.doc import consts
18 from apps.doc.exceptions import OCR1Exception, OCR4Exception, LTGTException 18 from apps.doc.exceptions import OCR1Exception, OCR4Exception
19 from apps.doc.ocr.wb import BSWorkbook 19 from apps.doc.ocr.wb import BSWorkbook
20 20
21 21
...@@ -48,11 +48,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -48,11 +48,6 @@ class Command(BaseCommand, LoggerMixin):
48 self.log_base = '[folder ocr process]' 48 self.log_base = '[folder ocr process]'
49 # 处理文件开关 49 # 处理文件开关
50 self.switch = True 50 self.switch = True
51 self.ltgt_classify_mapping = {
52 128: '执行裁定书',
53 129: '民事判决书',
54 130: '民事调解书'
55 }
56 # 睡眠时间 51 # 睡眠时间
57 self.sleep_time = float(conf.SLEEP_SECOND_FOLDER) 52 self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
58 # input folder 53 # input folder
...@@ -60,7 +55,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -60,7 +55,6 @@ class Command(BaseCommand, LoggerMixin):
60 # ocr相关 55 # ocr相关
61 self.ocr_url = conf.OCR_URL_FOLDER 56 self.ocr_url = conf.OCR_URL_FOLDER
62 self.ocr_url_4 = conf.IC_URL 57 self.ocr_url_4 = conf.IC_URL
63 self.ltgt_ocr_url = conf.LTGT_URL
64 # 优雅退出信号:15 58 # 优雅退出信号:15
65 signal.signal(signal.SIGTERM, self.signal_handler) 59 signal.signal(signal.SIGTERM, self.signal_handler)
66 60
...@@ -213,59 +207,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -213,59 +207,6 @@ class Command(BaseCommand, LoggerMixin):
213 else: 207 else:
214 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) 208 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
215 209
216 def ltgt_ocr_process(self, img_path_list, label, path):
217 img_data_list = []
218
219 for img_path in img_path_list:
220 if os.path.exists(img_path):
221 with open(img_path, 'rb') as f:
222 base64_data = base64.b64encode(f.read())
223 # 获取解码后的base64值
224 file_data = base64_data.decode()
225 img_data_list.append(file_data)
226
227 json_data = {
228 "label": label,
229 "img_data_list": img_data_list
230 }
231
232 for times in range(consts.RETRY_TIMES):
233 try:
234 start_time = time.time()
235 ocr_response = requests.post(self.ltgt_ocr_url, json=json_data)
236 if ocr_response.status_code != 200:
237 raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
238 except Exception as e:
239 self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
240 self.log_base, times, path, traceback.format_exc()))
241 else:
242 ocr_res = ocr_response.json()
243 end_time = time.time()
244 speed_time = int(end_time - start_time)
245 self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
246 self.log_base, path, ocr_res, speed_time))
247 return ocr_res
248 else:
249 self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
250
251 def ltgt_res_process(self, ocr_res, label, excel_path):
252 try:
253 if isinstance(ocr_res, dict):
254 if ocr_res.get('code') == 1:
255 result_dict = ocr_res.get('data', {})
256
257 wb = BSWorkbook(set(), set(), set(), set(), set())
258 rebuild_res = wb.ltgt_build(label, result_dict)
259 wb.remove_base_sheet()
260 wb.save(excel_path)
261 except Exception as e:
262 self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
263 self.log_base, excel_path, traceback.format_exc()))
264
265 def ltgt_process(self, img_path_list, label, excel_path, path):
266 ocr_res = self.ltgt_ocr_process(img_path_list, label, path)
267 self.ltgt_res_process(ocr_res, label, excel_path)
268
269 def images_process(self, img_path_list, classify, excel_path): 210 def images_process(self, img_path_list, classify, excel_path):
270 all_res = {} 211 all_res = {}
271 for img_path in img_path_list: 212 for img_path in img_path_list:
...@@ -279,20 +220,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -279,20 +220,14 @@ class Command(BaseCommand, LoggerMixin):
279 img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir) 220 img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
280 self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path)) 221 self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
281 pdf_handler = PDFHandler(path, img_save_path) 222 pdf_handler = PDFHandler(path, img_save_path)
282 if classify in self.ltgt_classify_mapping: 223 pdf_handler.extract_image()
283 pdf_handler.extract_page_image()
284 else:
285 pdf_handler.extract_image()
286 self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path)) 224 self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
287 except Exception as e: 225 except Exception as e:
288 self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format( 226 self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
289 self.log_base, path, traceback.format_exc())) 227 self.log_base, path, traceback.format_exc()))
290 raise e 228 raise e
291 else: 229 else:
292 if classify in self.ltgt_classify_mapping: 230 self.images_process(pdf_handler.img_path_list, classify, excel_path)
293 self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
294 else:
295 self.images_process(pdf_handler.img_path_list, classify, excel_path)
296 shutil.move(path, pdf_save_path) 231 shutil.move(path, pdf_save_path)
297 232
298 def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir): 233 def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
...@@ -308,10 +243,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -308,10 +243,7 @@ class Command(BaseCommand, LoggerMixin):
308 self.log_base, path, traceback.format_exc())) 243 self.log_base, path, traceback.format_exc()))
309 raise e 244 raise e
310 else: 245 else:
311 if classify in self.ltgt_classify_mapping: 246 self.images_process(tiff_handler.img_path_list, classify, excel_path)
312 self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
313 else:
314 self.images_process(tiff_handler.img_path_list, classify, excel_path)
315 shutil.move(path, tiff_save_path) 247 shutil.move(path, tiff_save_path)
316 248
317 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir): 249 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
...@@ -321,12 +253,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -321,12 +253,9 @@ class Command(BaseCommand, LoggerMixin):
321 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format( 253 self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
322 self.log_base, path, traceback.format_exc())) 254 self.log_base, path, traceback.format_exc()))
323 else: 255 else:
324 if classify in self.ltgt_classify_mapping: 256 ocr_res = self.ocr_process(path, classify)
325 self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path) 257 all_res = {path: ocr_res}
326 else: 258 self.res_process(all_res, classify, excel_path)
327 ocr_res = self.ocr_process(path, classify)
328 all_res = {path: ocr_res}
329 self.res_process(all_res, classify, excel_path)
330 shutil.move(path, img_save_path) 259 shutil.move(path, img_save_path)
331 260
332 def folder_process(self, input_dir, classify): 261 def folder_process(self, input_dir, classify):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!