4fad0d1f by 周伟奇

add litigation

1 parent ce86bdd5
......@@ -13,6 +13,9 @@ class OCR2Exception(Exception):
class OCR4Exception(Exception):
    """Error type for the OCR4 stage.

    NOTE(review): the raise sites are not part of this excerpt; confirm the
    exact failure conditions against the callers.
    """
class LTGTException(Exception):
    """Error type for failures of the LTGT OCR request.

    NOTE(review): presumably "LTGT" abbreviates "litigation" -- confirm.
    """
class GCAPException(Exception):
    """Error type for GCAP-related failures.

    NOTE(review): neither the meaning of "GCAP" nor the raise sites are
    visible in this excerpt; confirm against the callers.
    """
......
......@@ -15,7 +15,7 @@ from settings import conf
from common.mixins import LoggerMixin
from common.tools.pdf_to_img import PDFHandler
from apps.doc import consts
from apps.doc.exceptions import OCR1Exception, OCR4Exception
from apps.doc.exceptions import OCR1Exception, OCR4Exception, LTGTException
from apps.doc.ocr.wb import BSWorkbook
......@@ -48,6 +48,11 @@ class Command(BaseCommand, LoggerMixin):
self.log_base = '[folder ocr process]'
# 处理文件开关
self.switch = True
self.ltgt_classify_mapping = {
128: '执行裁定书',
129: '民事判决书',
130: '民事调解书'
}
# 睡眠时间
self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
# input folder
......@@ -55,19 +60,18 @@ class Command(BaseCommand, LoggerMixin):
# ocr相关
self.ocr_url = conf.OCR_URL_FOLDER
self.ocr_url_4 = conf.IC_URL
self.ltgt_ocr_url = conf.LTGT_URL
# 优雅退出信号:15
signal.signal(signal.SIGTERM, self.signal_handler)
def signal_handler(self, sig, frame):
self.switch = False # 停止处理文件
def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx, img_path):
def license1_process(self, ocr_data, license_summary, classify, img_path):
# 类别:'0'身份证, '1'居住证
license_data = ocr_data.get('data', [])
if not license_data:
res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
return
res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
if classify == consts.MVC_CLASSIFY: # 车辆登记证 3/4页结果整合
for mvc_dict in license_data:
try:
......@@ -154,29 +158,21 @@ class Command(BaseCommand, LoggerMixin):
def res_process(self, all_res, classify, excel_path):
try:
license_summary = {}
res_list = []
if not all_res:
return
else:
for img_path, ocr_res in all_res.items():
img_name, pno, ino = self.parse_img_path(img_path)
part_idx = 1
# img_name, pno, ino = self.parse_img_path(img_path)
# part_idx = 1
if isinstance(ocr_res, dict):
if ocr_res.get('code') == 1:
data_list = ocr_res.get('data', [])
if isinstance(data_list, list):
for part_idx, ocr_data in enumerate(data_list):
part_idx = part_idx + 1
self.license1_process(ocr_data, license_summary, classify,
res_list, pno, ino, part_idx, img_path)
else:
res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
else:
res_list.append((pno, ino, part_idx, consts.RES_FAILED))
else:
res_list.append((pno, ino, part_idx, consts.RES_FAILED))
for ocr_data in data_list:
# part_idx = part_idx + 1
self.license1_process(ocr_data, license_summary, classify, img_path)
wb = BSWorkbook(set(), set(), set(), set(), set())
wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
......@@ -216,6 +212,66 @@ class Command(BaseCommand, LoggerMixin):
return ocr_res
else:
self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
def ltgt_ocr_process(self, img_path_list, label, path):
    """Submit all images of one document to the LTGT OCR service.

    Every existing image is read, base64-encoded and sent together with
    ``label`` in a single JSON POST to ``self.ltgt_ocr_url``. The request is
    attempted up to ``consts.RETRY_TIMES`` times; each failed attempt is
    logged with its traceback.

    Args:
        img_path_list: paths of the images to submit. Paths that do not
            exist on disk are skipped silently.
        label: value sent as the ``label`` field of the request body.
        path: path of the source document; used only in log messages.

    Returns:
        The decoded JSON body of the first successful (HTTP 200) response,
        or ``None`` when every attempt failed.
    """
    img_data_list = []
    for img_path in img_path_list:
        if not os.path.exists(img_path):
            continue
        with open(img_path, 'rb') as f:
            raw_bytes = f.read()
        # JSON cannot carry bytes, so the base64 output is decoded to str.
        img_data_list.append(base64.b64encode(raw_bytes).decode())

    json_data = {
        "label": label,
        "img_data_list": img_data_list
    }

    # NOTE(review): requests.post is called without a timeout, so a stalled
    # server blocks this worker indefinitely -- consider passing `timeout=`.
    for times in range(consts.RETRY_TIMES):
        try:
            start_time = time.time()
            ocr_response = requests.post(self.ltgt_ocr_url, json=json_data)
            if ocr_response.status_code != 200:
                raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
            # Decode inside the try: a 200 response with a malformed body must
            # be logged and retried like any other failed attempt instead of
            # escaping the retry loop as an unhandled exception.
            ocr_res = ocr_response.json()
        except Exception:
            self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
                self.log_base, times, path, traceback.format_exc()))
        else:
            end_time = time.time()
            speed_time = int(end_time - start_time)
            self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
                self.log_base, path, ocr_res, speed_time))
            return ocr_res

    # Reached only when every attempt failed.
    self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
    return None
def ltgt_res_process(self, ocr_res, label, excel_path):
    """Export a successful LTGT OCR result to an Excel workbook.

    Nothing is written unless ``ocr_res`` is a dict whose ``code`` equals 1.
    In that case its ``data`` entry (default ``{}``) is rendered into a sheet
    via ``BSWorkbook.ltgt_build`` and the workbook is saved to ``excel_path``.
    Any exception raised on the way is logged with its traceback and not
    re-raised.

    Args:
        ocr_res: response returned by the OCR call; may be ``None``.
        label: passed through to ``ltgt_build``.
        excel_path: destination path of the workbook.
    """
    try:
        is_success = isinstance(ocr_res, dict) and ocr_res.get('code') == 1
        if not is_success:
            return
        payload = ocr_res.get('data', {})
        workbook = BSWorkbook(set(), set(), set(), set(), set())
        workbook.ltgt_build(label, payload)
        workbook.remove_base_sheet()
        workbook.save(excel_path)
    except Exception:
        self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc()))
def ltgt_process(self, img_path_list, label, excel_path, path):
    """Run the LTGT OCR request and hand its result to the workbook export.

    Args:
        img_path_list: image paths forwarded to ``ltgt_ocr_process``.
        label: forwarded to both steps.
        excel_path: forwarded to ``ltgt_res_process``.
        path: forwarded to ``ltgt_ocr_process``.
    """
    self.ltgt_res_process(
        self.ltgt_ocr_process(img_path_list, label, path),
        label,
        excel_path,
    )
def images_process(self, img_path_list, classify, excel_path):
    """OCR every image on its own, then process all results together.

    Args:
        img_path_list: image paths; each one is passed to ``ocr_process``.
        classify: forwarded to ``ocr_process`` and ``res_process``.
        excel_path: forwarded to ``res_process``.
    """
    results_by_path = {
        image_path: self.ocr_process(image_path, classify)
        for image_path in img_path_list
    }
    self.res_process(results_by_path, classify, excel_path)
def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
if os.path.exists(path):
......@@ -223,18 +279,20 @@ class Command(BaseCommand, LoggerMixin):
img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
pdf_handler = PDFHandler(path, img_save_path)
pdf_handler.extract_image()
if classify in self.ltgt_classify_mapping:
pdf_handler.extract_page_image()
else:
pdf_handler.extract_image()
self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
except Exception as e:
self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
self.log_base, path, traceback.format_exc()))
raise e
else:
all_res = {}
for img_path in pdf_handler.img_path_list:
ocr_res = self.ocr_process(img_path, classify)
all_res[img_path] = ocr_res
self.res_process(all_res, classify, excel_path)
if classify in self.ltgt_classify_mapping:
self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
else:
self.images_process(pdf_handler.img_path_list, classify, excel_path)
shutil.move(path, pdf_save_path)
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
......@@ -250,24 +308,25 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, path, traceback.format_exc()))
raise e
else:
all_res = {}
for img_path in tiff_handler.img_path_list:
ocr_res = self.ocr_process(img_path, classify)
all_res[img_path] = ocr_res
self.res_process(all_res, classify, excel_path)
if classify in self.ltgt_classify_mapping:
self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
else:
self.images_process(tiff_handler.img_path_list, classify, excel_path)
shutil.move(path, tiff_save_path)
def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
ocr_res = self.ocr_process(path, classify)
all_res = {path: ocr_res}
try:
img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
except Exception as e:
self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
self.log_base, path, traceback.format_exc()))
else:
self.res_process(all_res, classify, excel_path)
if classify in self.ltgt_classify_mapping:
self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
else:
ocr_res = self.ocr_process(path, classify)
all_res = {path: ocr_res}
self.res_process(all_res, classify, excel_path)
shutil.move(path, img_save_path)
def folder_process(self, input_dir, classify):
......@@ -312,9 +371,9 @@ class Command(BaseCommand, LoggerMixin):
try:
if os.path.isfile(path):
self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
if name.endswith('.pdf'):
if name.endswith('.pdf') or name.endswith('.PDF'):
self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
elif name.endswith('.tif'):
elif name.endswith('.tif') or name.endswith('.TIF'):
self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir)
else:
self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
......
......@@ -702,6 +702,23 @@ class BSWorkbook(Workbook):
if field_str is not None:
count_list.append((field_str, count))
def ltgt_build(self, label, result_dict):
    """Create a sheet named ``label`` and write ``result_dict`` into it.

    One or more rows are appended per top-level key, depending on the value:

    * list  -> one row: the key followed by every list item;
    * dict containing ``'words'`` -> one row: key and ``value['words']``;
    * any other dict -> one row per sub-key, titled ``"key: sub_key"``; a
      dict sub-value contributes its ``'words'`` entry (``''`` if absent),
      any other sub-value is written as is;
    * anything else -> one row: key and value.

    Args:
        label: title of the sheet to create.
        result_dict: mapping of field name to OCR value.
    """
    sheet = self.create_sheet(label)
    for field, content in result_dict.items():
        if isinstance(content, list):
            sheet.append((field,) + tuple(content))
            continue
        if not isinstance(content, dict):
            sheet.append((field, content))
            continue
        if 'words' in content:
            sheet.append((field, content['words']))
            continue
        for sub_field, sub_content in content.items():
            row_title = '{0}: {1}'.format(field, sub_field)
            if isinstance(sub_content, dict):
                cell = sub_content.get('words', '')
            else:
                cell = sub_content
            sheet.append((row_title, cell))
def simple_license_rebuild(self, license_summary, document_scheme):
# for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
# if ic_license_dict.get('类别') == '1':
......
......@@ -225,3 +225,13 @@ class PDFHandler:
else:
self.merge_il(pdf, pno, il)
self.img_count = len(self.img_path_list)
def extract_page_image(self):
    """Pass every page of the PDF at ``self.path`` to ``page_to_png``.

    Resets ``img_path_list`` and ``xref_set``, makes sure ``img_dir_path``
    exists, walks the pages in order and finally stores the length of
    ``img_path_list`` in ``img_count``.

    NOTE(review): presumably ``page_to_png`` appends each written image path
    to ``img_path_list`` -- it is not visible here, confirm.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    with fitz.Document(self.path) as document:
        page_total = document.pageCount
        for page_index in range(page_total):
            self.page_to_png(document.loadPage(page_index))

    self.img_count = len(self.img_path_list)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!