a94ce721 by 周伟奇

ltgt part 2

1 parent f417660d
......@@ -16,6 +16,7 @@ from openpyxl import load_workbook, Workbook
from settings import conf
from common.mixins import LoggerMixin
from common.tools.pdf_to_img import PDFHandler
from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict
from apps.doc import consts
from apps.doc.exceptions import OCR1Exception, OCR2Exception, LTGTException
from apps.doc.ocr.wb import BSWorkbook
......@@ -72,14 +73,16 @@ class Command(BaseCommand, LoggerMixin):
# input folder
self.input_dirs = conf.get_namespace('LTGT_DIR_')
# separate folder name
self.seperate_map = {
self.combined_map = {
consts.IC_CLASSIFY: 'IDCard',
consts.MVC_CLASSIFY: 'GreenBook',
consts.CONTRACT_CLASSIFY: 'Contract',
}
self.field_map = {
# sheet_name, key_field, side_field_order, src_field_order
consts.VAT_CLASSIFY: (consts.VAT_CN_NAME, None, None, consts.VATS_FIELD_ORDER),
consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2),
consts.MVC_CLASSIFY: (consts.MVC_CN_NAME, '机动车登记证书编号', consts.MVC_SE_FIELD_ORDER_3_4, consts.MVC_SE_FIELD_ORDER_1_2),
}
# ocr相关
self.ocr_url = conf.OCR_URL_FOLDER
......@@ -92,18 +95,96 @@ class Command(BaseCommand, LoggerMixin):
def signal_handler(self, sig, frame):
# Clears the run flag that the `while self.switch:` loop in this class checks.
# `sig` and `frame` are not used here (presumably the usual signal-handler
# arguments -- confirm where this callback is registered).
self.switch = False # stop processing files
def contract_process(self, ocr_data, contract_result, classify):
    """Flatten one contract page of OCR output into rows and store them.

    Args:
        ocr_data: Mapping whose ``'data'`` entry holds ``'page_num'`` and
            ``'page_info'`` for a single page.
        contract_result: Accumulator, mutated in place. The rows built for
            this page are appended to
            ``contract_result[classify][page_number]``.
        classify: Key under which the page is grouped.

    Returns:
        None. Nothing is recorded when ``'data'``, ``'page_num'`` or
        ``'page_info'`` is missing.
    """
    contract_dict = ocr_data.get('data')
    if not contract_dict:
        return
    page_num = contract_dict.get('page_num')
    page_info = contract_dict.get('page_info')
    if page_num is None or page_info is None:
        return

    # str() keeps a bare numeric page index from crashing on .startswith();
    # string inputs are unaffected.
    page_num = str(page_num)
    page_num_only = page_num.split('_')[-1] if page_num.startswith('page_') else page_num

    text_key = 'words'
    rebuild_page_info = []

    def append_words(name, node):
        # Emit the rows for a node that carries a 'words' entry.
        words = node[text_key]
        if words is None:
            rebuild_page_info.append((name,))
        elif isinstance(words, str):
            rebuild_page_info.append((name, words))
        elif isinstance(words, list):
            # header row with the field name, then one row per table line
            rebuild_page_info.append((name,))
            rebuild_page_info.extend(words)
        # any other type is skipped, as before

    for key, value in page_info.items():
        if value is None:
            rebuild_page_info.append((key,))
        elif text_key in value:
            append_words(key, value)
        else:
            # no 'words' here: treat the value as a group of named sub-fields
            rebuild_page_info.append((key,))
            for sub_key, sub_value in value.items():
                if sub_value is None:
                    rebuild_page_info.append((sub_key,))
                elif text_key in sub_value:
                    append_words(sub_key, sub_value)

    contract_result.setdefault(classify, {}).setdefault(page_num_only, []).append(rebuild_page_info)
def license1_process(self, ocr_data, all_res, classify):
    """Normalise one licence OCR result and file it under ``all_res[classify]``.

    Args:
        ocr_data: Mapping whose ``'data'`` entry is the recognition payload.
        all_res: Accumulator dict, mutated in place; each classify key maps
            to a list of collected results.
        classify: Document class that selects the parsing branch.

    Returns:
        None. Nothing is recorded when the payload is missing or empty.
    """
    license_data = ocr_data.get('data')
    if not license_data:
        return
    if isinstance(license_data, dict):
        # discard the 'base64_img' entry, if present
        license_data.pop('base64_img', '')

    if classify == consts.IC_CLASSIFY:
        words_result = license_data.get('words_result', {})
        card_type = license_data.get('type', '')
        is_ic = card_type.startswith('身份证')
        is_info_side = card_type.endswith('信息面')
        # category: '0' = ID card, '1' = residence permit
        id_card_dict = {'类别': '0' if is_ic else '1'}
        if is_ic:
            field_map = consts.IC_MAP_0 if is_info_side else consts.IC_MAP_1
        else:
            field_map = consts.RP_MAP_0 if is_info_side else consts.RP_MAP_1
        for write_field, search_field in field_map:
            id_card_dict[write_field] = words_result.get(search_field, {}).get('words', '')
        if not is_info_side:
            # the non-info side carries the issue/expiry dates; join them
            # into a single validity-period field
            start_time = words_result.get('签发日期', {}).get('words', '')
            end_time = words_result.get('失效日期', {}).get('words', '')
            id_card_dict['有效期限'] = '{0}-{1}'.format(start_time, end_time)
        all_res.setdefault(classify, []).append(id_card_dict)
    elif classify == consts.MVC_CLASSIFY:
        rebuild_data_dict = {}
        mvc_page = license_data.pop('page', 'VehicleRCI')
        mvc_res = license_data.pop('results', {})
        if mvc_page == 'VehicleRegArea':
            rebuild_data_dict['机动车登记证书编号'] = mvc_res.get('机动车登记证书编号', {}).get('words', '')
            for register_info in mvc_res.get('登记信息', []):
                register_info.pop('register_type', None)
                register_info.pop('register_type_name', None)
                # one list per field, collecting the value of every
                # registration entry in order
                for cn_key, detail_dict in register_info.items():
                    rebuild_data_dict.setdefault(cn_key, []).append(detail_dict.get('words', ''))
        else:
            for cn_key, detail_dict in mvc_res.items():
                rebuild_data_dict[cn_key] = detail_dict.get('words', '')
        all_res.setdefault(classify, []).append(rebuild_data_dict)
    elif classify == consts.CONTRACT_CLASSIFY:
        # nothing is collected for contracts in this method
        pass
    else:
        all_res.setdefault(classify, []).extend(license_data)
def license2_process(self, ocr_data, all_res, classify, img_path):
pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
......@@ -159,33 +240,38 @@ class Command(BaseCommand, LoggerMixin):
return img_name, 1, 1
@staticmethod
def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir_map):
    """Build the timestamped output paths for one input file.

    Args:
        name: Original file name (with extension).
        img_output_dir: Directory for the image copy.
        wb_output_dir: Directory for the generated ``.xlsx`` workbook.
        pdf_output_dir: Directory for the archived source file.
        seperate_dir_map: Mapping of classify -> directory; may be empty
            or None.

    Returns:
        Tuple ``(img_save_path, excel_path, pdf_save_path, seperate_path_map)``
        where ``seperate_path_map`` maps each classify in
        ``seperate_dir_map`` to ``<its directory>/<timestamped name>``.
    """
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    new_name = '{0}_{1}'.format(time_stamp, name)
    img_save_path = os.path.join(img_output_dir, new_name)
    pdf_save_path = os.path.join(pdf_output_dir, new_name)
    excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
    excel_path = os.path.join(wb_output_dir, excel_name)
    # `or {}` lets callers pass None when no per-classify folders are used
    seperate_path_map = {
        c: os.path.join(seperate_dir, new_name)
        for c, seperate_dir in (seperate_dir_map or {}).items()
    }
    return img_save_path, excel_path, pdf_save_path, seperate_path_map
# NOTE(review): this span looks like rendered diff output with the +/- markers
# lost: two `def` lines and two copies of the field-writing loop are present,
# and the block is cut off after `except`. Comments only; code left untouched.
def res_process(self, all_res, excel_path, classify):
def res_process(self, all_res, excel_path, classify, contract_result):
try:
wb = BSWorkbook(set(), set(), set(), set(), set())
sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(classify)
ws = wb.create_sheet(sheet_name)
for res in all_res:
if key_field is not None and key_field in res:
field_order = side_field_order
else:
field_order = src_field_order
for search_field, write_field in field_order:
field_value = res.get(search_field, '')
if isinstance(field_value, list):
ws.append((write_field, *field_value))
# One sheet per classify key found in all_res.
for c, res_list in all_res.items():
# NOTE(review): `self.field_map.get(c)` yields None for a key missing from
# field_map, and unpacking None raises TypeError -- confirm every key that can
# reach all_res is present in field_map.
sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(c)
ws = wb.create_sheet(sheet_name)
# NOTE(review): this iterates `all_res` (the dict whose items are being looped
# over) and leaves `res_list` unused; it looks like `res_list` was intended --
# confirm.
for res in all_res:
# Results containing key_field use the side field order; others use the
# source field order.
if key_field is not None and key_field in res:
field_order = side_field_order
else:
ws.append((write_field, field_value))
ws.append((None,))
field_order = src_field_order
for search_field, write_field in field_order:
field_value = res.get(search_field, '')
# A list value is spread across the columns after the field name.
if isinstance(field_value, list):
ws.append((write_field, *field_value))
else:
ws.append((write_field, field_value))
ws.append((None,))
wb.contract_rebuild(contract_result)
wb.remove_base_sheet()
wb.save(excel_path)
except Exception as e:
......@@ -198,7 +284,7 @@ class Command(BaseCommand, LoggerMixin):
sep = os.path.sep + (os.path.altsep or '')
return os.path.basename(path.rstrip(sep))
def ocr_process(self, img_path, classify, all_res, seperate_dir):
def ocr_process(self, img_path, classify, all_res, seperate_path_map, contract_result):
if os.path.exists(img_path):
# TODO 图片验证
with open(img_path, 'rb') as f:
......@@ -208,7 +294,7 @@ class Command(BaseCommand, LoggerMixin):
json_data = {
"file": file_data,
}
if seperate_dir is None:
if len(seperate_path_map) > 0:
json_data["classify"] = classify
for times in range(consts.RETRY_TIMES):
......@@ -232,8 +318,9 @@ class Command(BaseCommand, LoggerMixin):
data_list = ocr_res.get('data', [])
if isinstance(data_list, list):
for ocr_data in data_list:
if ocr_data.get('classify') == classify:
if seperate_dir is not None:
if ocr_data.get('classify') in seperate_path_map or ocr_data.get('classify') == classify:
if ocr_data.get('classify') in seperate_path_map:
seperate_dir = seperate_path_map[ocr_data.get('classify')]
os.makedirs(seperate_dir, exist_ok=True)
real_dst = os.path.join(seperate_dir, self.basename(img_path))
if not os.path.exists(real_dst):
......@@ -242,6 +329,8 @@ class Command(BaseCommand, LoggerMixin):
self.license1_process(ocr_data, all_res, classify)
elif classify in consts.LICENSE_CLASSIFY_SET_2:
self.license2_process(ocr_data, all_res, classify, img_path)
elif classify in consts.CONTRACT_SET:
self.contract_process(ocr_data, contract_result, classify)
break
else:
self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
......@@ -301,25 +390,56 @@ class Command(BaseCommand, LoggerMixin):
rebuild_res = self.ltgt_res_process(ocr_res, label, excel_path)
return rebuild_res
def images_process(self, img_path_list, classify, excel_path, seperate_path_map):
    """Run OCR over every image and write the collected results to a workbook.

    Args:
        img_path_list: Paths of the images to process, in order.
        classify: Document class forwarded to ``ocr_process`` and
            ``res_process``.
        excel_path: Destination path forwarded to ``res_process``.
        seperate_path_map: Mapping of classify -> path forwarded to
            ``ocr_process``.

    Returns:
        The ``all_res`` dict filled in by ``ocr_process``.
    """
    all_res = {}
    contract_result = {}
    for img_path in img_path_list:
        self.ocr_process(img_path, classify, all_res, seperate_path_map, contract_result)
    # res_process is called even when nothing was collected
    self.res_process(all_res, excel_path, classify, contract_result)
    return all_res
def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir_map):
if os.path.exists(path):
rebuild_res = None
img_save_path, excel_path, pdf_save_path, seperate_path_map = self.get_path(
name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir_map)
pdf_handler = PDFHandler(path, img_save_path)
if classify == consts.CONTRACT_CLASSIFY:
pass
try:
self.folder_log.info('{0} [e-contract pdf to img start] [path={1}]'.format(self.log_base, path))
pdf_handler.e_contract_process()
self.folder_log.info('{0} [e-contract pdf to img end] [path={1}]'.format(self.log_base, path))
except Exception as e:
self.folder_log.error('{0} [e-contract pdf to img error] [path={1}] [error={2}]'.format(
self.log_base, path, traceback.format_exc()))
raise e
else:
ocr_result = afc_predict(pdf_handler.pdf_info)
contract_result = dict()
page_res = {}
for page_num, page_info in ocr_result.get('page_info', {}).items():
if isinstance(page_num, str) and page_num.startswith('page_'):
page_res[page_num] = {
'classify': classify,
"is_asp": ocr_result.get('is_asp', False),
'page_num': page_num,
'page_info': page_info
}
for _, page_key in pdf_handler.img_path_pno_list:
if page_key in page_res:
ocr_data = {
'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY),
'data': page_res[page_key]
}
self.contract_process(ocr_data, contract_result, classify)
self.res_process({}, excel_path, classify, contract_result)
shutil.move(path, pdf_save_path)
else:
try:
img_save_path, excel_path, pdf_save_path, seperate_path = self.get_path(
name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
pdf_handler = PDFHandler(path, img_save_path)
if classify in self.ltgt_classify_mapping:
pdf_handler.extract_page_image()
else:
......@@ -331,19 +451,22 @@ class Command(BaseCommand, LoggerMixin):
raise e
else:
if classify in self.ltgt_classify_mapping:
rebuild_res = self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify],
ltgt_res = self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify],
excel_path, path)
rebuild_res = {
classify: [ltgt_res]
}
else:
rebuild_res = self.images_process(pdf_handler.img_path_list, classify, excel_path, seperate_path)
rebuild_res = self.images_process(pdf_handler.img_path_list, classify, excel_path, seperate_path_map)
shutil.move(path, pdf_save_path)
return rebuild_res
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir):
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir_map):
if os.path.exists(path):
rebuild_res = None
try:
img_save_path, excel_path, tiff_save_path, seperate_path = self.get_path(
name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir)
img_save_path, excel_path, tiff_save_path, seperate_path_map = self.get_path(
name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir_map)
self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
tiff_handler = TIFFHandler(path, img_save_path)
tiff_handler.extract_image()
......@@ -354,26 +477,32 @@ class Command(BaseCommand, LoggerMixin):
raise e
else:
if classify in self.ltgt_classify_mapping:
rebuild_res = self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify],
ltgt_res = self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify],
excel_path, path)
rebuild_res = {
classify: [ltgt_res]
}
else:
rebuild_res = self.images_process(tiff_handler.img_path_list, classify, excel_path, seperate_path)
rebuild_res = self.images_process(tiff_handler.img_path_list, classify, excel_path, seperate_path_map)
shutil.move(path, tiff_save_path)
return rebuild_res
def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir, seperate_dir_map):
    """Process a single image file and move it to its output location.

    Args:
        name: File name of the image.
        path: Current path of the image.
        classify: Document class; classes present in
            ``self.ltgt_classify_mapping`` go through ``ltgt_process``,
            all others through ``images_process``.
        wb_output_dir: Directory for the generated workbook.
        img_output_dir: Directory the image is moved to.
        pdf_output_dir: Forwarded to ``get_path``; its result is unused here.
        seperate_dir_map: Mapping of classify -> directory forwarded to
            ``get_path``.

    Returns:
        Dict of classify -> list of results, or None when the output paths
        could not be built (the error is logged, not raised).
    """
    rebuild_res = None
    try:
        img_save_path, excel_path, _, seperate_path_map = self.get_path(
            name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir_map)
    except Exception:
        self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
    else:
        if classify in self.ltgt_classify_mapping:
            ltgt_res = self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
            # wrap so both branches return the same classify -> list shape
            rebuild_res = {
                classify: [ltgt_res]
            }
        else:
            rebuild_res = self.images_process([path], classify, excel_path, seperate_path_map)
        shutil.move(path, img_save_path)
    return rebuild_res
......@@ -450,10 +579,13 @@ class Command(BaseCommand, LoggerMixin):
os.makedirs(failed_output_dir, exist_ok=True)
if is_combined:
seperate_dir = os.path.join(output_dir, self.seperate_map.get(classify, 'Unknown'))
os.makedirs(seperate_dir, exist_ok=True)
seperate_dir_map = dict()
for c in self.combined_map.keys():
seperate_dir = os.path.join(output_dir, self.combined_map[c])
os.makedirs(seperate_dir, exist_ok=True)
seperate_dir_map[c] = seperate_dir
else:
seperate_dir = None
seperate_dir_map = dict()
os_error_filename_set = set()
while self.switch:
......@@ -479,17 +611,17 @@ class Command(BaseCommand, LoggerMixin):
self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
if name.endswith('.pdf') or name.endswith('.PDF'):
result = self.pdf_process(name, path, classify, img_output_dir, wb_output_dir,
pdf_output_dir, seperate_dir)
pdf_output_dir, seperate_dir_map)
elif name.endswith('.tif') or name.endswith('.TIF'):
if classify == consts.CONTRACT_CLASSIFY:
raise LTGTException('e-contract must be pdf')
result = self.tif_process(name, path, classify, img_output_dir, wb_output_dir,
tiff_output_dir, seperate_dir)
tiff_output_dir, seperate_dir_map)
else:
if classify == consts.CONTRACT_CLASSIFY:
raise LTGTException('e-contract must be pdf')
result = self.img_process(name, path, classify, wb_output_dir, img_output_dir,
pdf_output_dir, seperate_dir)
pdf_output_dir, seperate_dir_map)
self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
else:
result = None
......@@ -513,23 +645,15 @@ class Command(BaseCommand, LoggerMixin):
else:
if isinstance(result, dict) and len(result) > 0:
date_str = time.strftime("%Y-%m-%d")
result_queue.put(
{
self.CLASSIFY_KEY: classify,
self.RESULT_KEY: result,
self.DATE_KEY: date_str
}
)
elif isinstance(result, list) and len(result) > 0:
date_str = time.strftime("%Y-%m-%d")
for res in result:
result_queue.put(
{
self.CLASSIFY_KEY: classify,
self.RESULT_KEY: res,
self.DATE_KEY: date_str
}
)
for c, res_list in result.items():
for res in res_list:
result_queue.put(
{
self.CLASSIFY_KEY: c,
self.RESULT_KEY: res,
self.DATE_KEY: date_str
}
)
def handle(self, *args, **kwargs):
if len(self.input_dirs) == 0:
......
......@@ -282,7 +282,7 @@ class Command(BaseCommand, LoggerMixin):
elif isinstance(sub_value[text_key], str):
page_compare_dict[key][sub_key] = sub_value[text_key]
contract_result_compare.setdefault(classify, dict())[consts.ASP_KEY] = contract_dict.get(consts.ASP_KEY, False)
contract_result_compare.setdefault(classify, dict())[consts.ASP_KEY] = contract_dict.get(consts.ASP_KEY, False)
contract_result_compare.setdefault(classify, dict())[page_num_only] = page_compare_dict
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!