add ltgt wb daily
Showing
2 changed files
with
1 addition
and
72 deletions
This diff is collapsed.
Click to expand it.
... | @@ -15,7 +15,7 @@ from settings import conf | ... | @@ -15,7 +15,7 @@ from settings import conf |
15 | from common.mixins import LoggerMixin | 15 | from common.mixins import LoggerMixin |
16 | from common.tools.pdf_to_img import PDFHandler | 16 | from common.tools.pdf_to_img import PDFHandler |
17 | from apps.doc import consts | 17 | from apps.doc import consts |
18 | from apps.doc.exceptions import OCR1Exception, OCR4Exception, LTGTException | 18 | from apps.doc.exceptions import OCR1Exception, OCR4Exception |
19 | from apps.doc.ocr.wb import BSWorkbook | 19 | from apps.doc.ocr.wb import BSWorkbook |
20 | 20 | ||
21 | 21 | ||
... | @@ -48,11 +48,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -48,11 +48,6 @@ class Command(BaseCommand, LoggerMixin): |
48 | self.log_base = '[folder ocr process]' | 48 | self.log_base = '[folder ocr process]' |
49 | # 处理文件开关 | 49 | # 处理文件开关 |
50 | self.switch = True | 50 | self.switch = True |
51 | self.ltgt_classify_mapping = { | ||
52 | 128: '执行裁定书', | ||
53 | 129: '民事判决书', | ||
54 | 130: '民事调解书' | ||
55 | } | ||
56 | # 睡眠时间 | 51 | # 睡眠时间 |
57 | self.sleep_time = float(conf.SLEEP_SECOND_FOLDER) | 52 | self.sleep_time = float(conf.SLEEP_SECOND_FOLDER) |
58 | # input folder | 53 | # input folder |
... | @@ -60,7 +55,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -60,7 +55,6 @@ class Command(BaseCommand, LoggerMixin): |
60 | # ocr相关 | 55 | # ocr相关 |
61 | self.ocr_url = conf.OCR_URL_FOLDER | 56 | self.ocr_url = conf.OCR_URL_FOLDER |
62 | self.ocr_url_4 = conf.IC_URL | 57 | self.ocr_url_4 = conf.IC_URL |
63 | self.ltgt_ocr_url = conf.LTGT_URL | ||
64 | # 优雅退出信号:15 | 58 | # 优雅退出信号:15 |
65 | signal.signal(signal.SIGTERM, self.signal_handler) | 59 | signal.signal(signal.SIGTERM, self.signal_handler) |
66 | 60 | ||
... | @@ -213,59 +207,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -213,59 +207,6 @@ class Command(BaseCommand, LoggerMixin): |
213 | else: | 207 | else: |
214 | self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) | 208 | self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) |
215 | 209 | ||
216 | def ltgt_ocr_process(self, img_path_list, label, path): | ||
217 | img_data_list = [] | ||
218 | |||
219 | for img_path in img_path_list: | ||
220 | if os.path.exists(img_path): | ||
221 | with open(img_path, 'rb') as f: | ||
222 | base64_data = base64.b64encode(f.read()) | ||
223 | # 获取解码后的base64值 | ||
224 | file_data = base64_data.decode() | ||
225 | img_data_list.append(file_data) | ||
226 | |||
227 | json_data = { | ||
228 | "label": label, | ||
229 | "img_data_list": img_data_list | ||
230 | } | ||
231 | |||
232 | for times in range(consts.RETRY_TIMES): | ||
233 | try: | ||
234 | start_time = time.time() | ||
235 | ocr_response = requests.post(self.ltgt_ocr_url, json=json_data) | ||
236 | if ocr_response.status_code != 200: | ||
237 | raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, ocr_response.status_code)) | ||
238 | except Exception as e: | ||
239 | self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format( | ||
240 | self.log_base, times, path, traceback.format_exc())) | ||
241 | else: | ||
242 | ocr_res = ocr_response.json() | ||
243 | end_time = time.time() | ||
244 | speed_time = int(end_time - start_time) | ||
245 | self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format( | ||
246 | self.log_base, path, ocr_res, speed_time)) | ||
247 | return ocr_res | ||
248 | else: | ||
249 | self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path)) | ||
250 | |||
251 | def ltgt_res_process(self, ocr_res, label, excel_path): | ||
252 | try: | ||
253 | if isinstance(ocr_res, dict): | ||
254 | if ocr_res.get('code') == 1: | ||
255 | result_dict = ocr_res.get('data', {}) | ||
256 | |||
257 | wb = BSWorkbook(set(), set(), set(), set(), set()) | ||
258 | rebuild_res = wb.ltgt_build(label, result_dict) | ||
259 | wb.remove_base_sheet() | ||
260 | wb.save(excel_path) | ||
261 | except Exception as e: | ||
262 | self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format( | ||
263 | self.log_base, excel_path, traceback.format_exc())) | ||
264 | |||
265 | def ltgt_process(self, img_path_list, label, excel_path, path): | ||
266 | ocr_res = self.ltgt_ocr_process(img_path_list, label, path) | ||
267 | self.ltgt_res_process(ocr_res, label, excel_path) | ||
268 | |||
269 | def images_process(self, img_path_list, classify, excel_path): | 210 | def images_process(self, img_path_list, classify, excel_path): |
270 | all_res = {} | 211 | all_res = {} |
271 | for img_path in img_path_list: | 212 | for img_path in img_path_list: |
... | @@ -279,9 +220,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -279,9 +220,6 @@ class Command(BaseCommand, LoggerMixin): |
279 | img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir) | 220 | img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir) |
280 | self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path)) | 221 | self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path)) |
281 | pdf_handler = PDFHandler(path, img_save_path) | 222 | pdf_handler = PDFHandler(path, img_save_path) |
282 | if classify in self.ltgt_classify_mapping: | ||
283 | pdf_handler.extract_page_image() | ||
284 | else: | ||
285 | pdf_handler.extract_image() | 223 | pdf_handler.extract_image() |
286 | self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path)) | 224 | self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path)) |
287 | except Exception as e: | 225 | except Exception as e: |
... | @@ -289,9 +227,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -289,9 +227,6 @@ class Command(BaseCommand, LoggerMixin): |
289 | self.log_base, path, traceback.format_exc())) | 227 | self.log_base, path, traceback.format_exc())) |
290 | raise e | 228 | raise e |
291 | else: | 229 | else: |
292 | if classify in self.ltgt_classify_mapping: | ||
293 | self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path) | ||
294 | else: | ||
295 | self.images_process(pdf_handler.img_path_list, classify, excel_path) | 230 | self.images_process(pdf_handler.img_path_list, classify, excel_path) |
296 | shutil.move(path, pdf_save_path) | 231 | shutil.move(path, pdf_save_path) |
297 | 232 | ||
... | @@ -308,9 +243,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -308,9 +243,6 @@ class Command(BaseCommand, LoggerMixin): |
308 | self.log_base, path, traceback.format_exc())) | 243 | self.log_base, path, traceback.format_exc())) |
309 | raise e | 244 | raise e |
310 | else: | 245 | else: |
311 | if classify in self.ltgt_classify_mapping: | ||
312 | self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path) | ||
313 | else: | ||
314 | self.images_process(tiff_handler.img_path_list, classify, excel_path) | 246 | self.images_process(tiff_handler.img_path_list, classify, excel_path) |
315 | shutil.move(path, tiff_save_path) | 247 | shutil.move(path, tiff_save_path) |
316 | 248 | ||
... | @@ -321,9 +253,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -321,9 +253,6 @@ class Command(BaseCommand, LoggerMixin): |
321 | self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format( | 253 | self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format( |
322 | self.log_base, path, traceback.format_exc())) | 254 | self.log_base, path, traceback.format_exc())) |
323 | else: | 255 | else: |
324 | if classify in self.ltgt_classify_mapping: | ||
325 | self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path) | ||
326 | else: | ||
327 | ocr_res = self.ocr_process(path, classify) | 256 | ocr_res = self.ocr_process(path, classify) |
328 | all_res = {path: ocr_res} | 257 | all_res = {path: ocr_res} |
329 | self.res_process(all_res, classify, excel_path) | 258 | self.res_process(all_res, classify, excel_path) | ... | ... |
-
Please register or sign in to post a comment