c1ca6fa5 by 周伟奇

add ltgt wb daily

1 parent c39b3051
......@@ -15,7 +15,7 @@ from settings import conf
from common.mixins import LoggerMixin
from common.tools.pdf_to_img import PDFHandler
from apps.doc import consts
from apps.doc.exceptions import OCR1Exception, OCR4Exception, LTGTException
from apps.doc.exceptions import OCR1Exception, OCR4Exception
from apps.doc.ocr.wb import BSWorkbook
......@@ -48,11 +48,6 @@ class Command(BaseCommand, LoggerMixin):
self.log_base = '[folder ocr process]'
# 处理文件开关
self.switch = True
self.ltgt_classify_mapping = {
128: '执行裁定书',
129: '民事判决书',
130: '民事调解书'
}
# 睡眠时间
self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
# input folder
......@@ -60,7 +55,6 @@ class Command(BaseCommand, LoggerMixin):
# ocr相关
self.ocr_url = conf.OCR_URL_FOLDER
self.ocr_url_4 = conf.IC_URL
self.ltgt_ocr_url = conf.LTGT_URL
# 优雅退出信号:15
signal.signal(signal.SIGTERM, self.signal_handler)
......@@ -213,59 +207,6 @@ class Command(BaseCommand, LoggerMixin):
else:
self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
def ltgt_ocr_process(self, img_path_list, label, path):
    """Send the images to the LTGT OCR service and return its JSON reply.

    Every existing file in ``img_path_list`` is read, base64-encoded and
    posted together with ``label`` to ``self.ltgt_ocr_url``. The request is
    attempted up to ``consts.RETRY_TIMES`` times.

    :param img_path_list: paths of the images to recognise; paths that do
        not exist on disk are skipped.
    :param label: label value forwarded to the service in the payload.
    :param path: path of the source document, used only in log messages.
    :return: the decoded JSON response of the first successful attempt, or
        ``None`` when every attempt failed.
    """
    img_data_list = []
    for img_path in img_path_list:
        if not os.path.exists(img_path):
            continue
        with open(img_path, 'rb') as f:
            # The JSON payload needs text, so decode the base64 bytes to str.
            img_data_list.append(base64.b64encode(f.read()).decode())

    json_data = {
        "label": label,
        "img_data_list": img_data_list
    }

    for times in range(consts.RETRY_TIMES):
        try:
            start_time = time.time()
            ocr_response = requests.post(self.ltgt_ocr_url, json=json_data)
            if ocr_response.status_code != 200:
                raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
            # Decode inside the try block: a 200 response with a malformed
            # body must be logged and retried like any other failed attempt
            # instead of escaping the retry loop as an unhandled exception.
            ocr_res = ocr_response.json()
        except Exception:
            self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
                self.log_base, times, path, traceback.format_exc()))
        else:
            speed_time = int(time.time() - start_time)
            self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
                self.log_base, path, ocr_res, speed_time))
            return ocr_res

    # Reached only when no attempt succeeded (success returns from the loop).
    self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
    return None
def ltgt_res_process(self, ocr_res, label, excel_path):
    """Build and save the workbook for an LTGT OCR response.

    A workbook is written only when ``ocr_res`` is a dict whose ``code``
    equals 1; any other response is logged and skipped. Errors raised while
    building or saving the workbook are logged and not propagated.

    :param ocr_res: decoded OCR response; may be ``None`` when OCR failed.
    :param label: label passed through to ``BSWorkbook.ltgt_build``.
    :param excel_path: destination path handed to ``wb.save``.
    :return: ``None``.
    """
    if not isinstance(ocr_res, dict) or ocr_res.get('code') != 1:
        # Leave a trace in the log: otherwise a missing workbook cannot be
        # told apart from a document that was never processed.
        self.folder_log.warn('{0} [wb build skipped] [path={1}] [ocr_res={2}]'.format(
            self.log_base, excel_path, ocr_res))
        return

    try:
        result_dict = ocr_res.get('data', {})
        wb = BSWorkbook(set(), set(), set(), set(), set())
        wb.ltgt_build(label, result_dict)
        wb.remove_base_sheet()
        wb.save(excel_path)
    except Exception:
        self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc()))
def ltgt_process(self, img_path_list, label, excel_path, path):
    """Run LTGT OCR on the images and hand the response to the workbook step.

    :param img_path_list: image paths forwarded to ``ltgt_ocr_process``.
    :param label: label forwarded to both steps.
    :param excel_path: output path forwarded to ``ltgt_res_process``.
    :param path: source document path forwarded to ``ltgt_ocr_process``.
    """
    self.ltgt_res_process(
        self.ltgt_ocr_process(img_path_list, label, path),
        label,
        excel_path,
    )
def images_process(self, img_path_list, classify, excel_path):
all_res = {}
for img_path in img_path_list:
......@@ -279,20 +220,14 @@ class Command(BaseCommand, LoggerMixin):
img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
pdf_handler = PDFHandler(path, img_save_path)
if classify in self.ltgt_classify_mapping:
pdf_handler.extract_page_image()
else:
pdf_handler.extract_image()
pdf_handler.extract_image()
self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
except Exception as e:
self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
self.log_base, path, traceback.format_exc()))
raise e
else:
if classify in self.ltgt_classify_mapping:
self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
else:
self.images_process(pdf_handler.img_path_list, classify, excel_path)
self.images_process(pdf_handler.img_path_list, classify, excel_path)
shutil.move(path, pdf_save_path)
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
......@@ -308,10 +243,7 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, path, traceback.format_exc()))
raise e
else:
if classify in self.ltgt_classify_mapping:
self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
else:
self.images_process(tiff_handler.img_path_list, classify, excel_path)
self.images_process(tiff_handler.img_path_list, classify, excel_path)
shutil.move(path, tiff_save_path)
def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
......@@ -321,12 +253,9 @@ class Command(BaseCommand, LoggerMixin):
self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
self.log_base, path, traceback.format_exc()))
else:
if classify in self.ltgt_classify_mapping:
self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
else:
ocr_res = self.ocr_process(path, classify)
all_res = {path: ocr_res}
self.res_process(all_res, classify, excel_path)
ocr_res = self.ocr_process(path, classify)
all_res = {path: ocr_res}
self.res_process(all_res, classify, excel_path)
shutil.move(path, img_save_path)
def folder_process(self, input_dir, classify):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!