2134d7e0 by 周伟奇

Merge branch 'feature/smart_dda' into feature/0918

2 parents d37597e9 4ce34ec0
......@@ -60,13 +60,6 @@ class Command(BaseCommand, LoggerMixin):
self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
# input folder
self.input_dirs = conf.get_namespace('DDA_DIR_')
# seperate folder name
self.seperate_map = {
consts.IC_CLASSIFY: 'IDCard',
consts.BC_CLASSIFY: 'BankCard',
consts.PP_CLASSIFY: 'Passport',
consts.EEP_CLASSIFY: 'EntryPermit',
}
self.field_map = {
consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2),
consts.BC_CLASSIFY: (consts.BC_CN_NAME, None, None, consts.BC_FIELD_ORDER_2),
......@@ -77,6 +70,7 @@ class Command(BaseCommand, LoggerMixin):
self.ocr_url = conf.OCR_URL_FOLDER
self.ocr_url_2 = conf.OCR2_URL_FOLDER
# self.ocr_url_4 = conf.IC_URL
self.classify_set = {consts.IC_CLASSIFY, consts.PP_CLASSIFY, consts.EEP_CLASSIFY, consts.BC_CLASSIFY}
# 优雅退出信号:15
signal.signal(signal.SIGTERM, self.signal_handler)
......@@ -94,7 +88,7 @@ class Command(BaseCommand, LoggerMixin):
id_card_dict.pop('base64_img')
except Exception as e:
continue
all_res.extend(license_data)
all_res.setdefault(classify, []).extend(license_data)
def license2_process(self, ocr_data, all_res, classify, img_path):
pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
......@@ -129,14 +123,14 @@ class Command(BaseCommand, LoggerMixin):
if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
if pid == consts.BC_PID:
all_res.append(ocr_res_2)
all_res.setdefault(classify, []).append(ocr_res_2)
else:
# 营业执照等
for result_dict in ocr_res_2.get('ResultList', []):
res_dict = {}
for field_dict in result_dict.get('FieldList', []):
res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
all_res.append(res_dict)
all_res.setdefault(classify, []).append(res_dict)
break
@staticmethod
......@@ -150,22 +144,22 @@ class Command(BaseCommand, LoggerMixin):
return img_name, 1, 1
@staticmethod
def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
new_name = '{0}_{1}'.format(time_stamp, name)
img_save_path = os.path.join(img_output_dir, new_name)
pdf_save_path = os.path.join(pdf_output_dir, new_name)
excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
excel_path = os.path.join(wb_output_dir, excel_name)
seperate_path = None if seperate_dir is None else os.path.join(seperate_dir, new_name)
return img_save_path, excel_path, pdf_save_path, seperate_path
return img_save_path, excel_path, pdf_save_path
def res_process(self, all_res, excel_path, classify):
def res_process(self, all_res, excel_path):
try:
wb = BSWorkbook(set(), set(), set(), set(), set())
for classify, ress in all_res.items():
sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(classify)
ws = wb.create_sheet(sheet_name)
for res in all_res:
for res in ress:
if key_field is not None and key_field in res:
field_order = side_field_order
else:
......@@ -189,7 +183,7 @@ class Command(BaseCommand, LoggerMixin):
sep = os.path.sep + (os.path.altsep or '')
return os.path.basename(path.rstrip(sep))
def ocr_process(self, img_path, classify, all_res, seperate_dir):
def ocr_process(self, img_path, all_res):
if os.path.exists(img_path):
# TODO 图片验证
with open(img_path, 'rb') as f:
......@@ -199,9 +193,6 @@ class Command(BaseCommand, LoggerMixin):
json_data = {
"file": file_data,
}
if seperate_dir is None:
json_data["classify"] = classify
for times in range(consts.RETRY_TIMES):
try:
start_time = time.time()
......@@ -223,12 +214,9 @@ class Command(BaseCommand, LoggerMixin):
data_list = ocr_res.get('data', [])
if isinstance(data_list, list):
for ocr_data in data_list:
if ocr_data.get('classify') == classify:
if seperate_dir is not None:
os.makedirs(seperate_dir, exist_ok=True)
real_dst = os.path.join(seperate_dir, self.basename(img_path))
if not os.path.exists(real_dst):
shutil.move(img_path, seperate_dir)
classify = ocr_data.get('classify')
if classify not in self.classify_set:
continue
if classify in consts.LICENSE_CLASSIFY_SET_1:
self.license1_process(ocr_data, all_res, classify)
elif classify in consts.LICENSE_CLASSIFY_SET_2:
......@@ -237,20 +225,20 @@ class Command(BaseCommand, LoggerMixin):
else:
self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
def images_process(self, img_path_list, classify, excel_path, seperate_dir):
all_res = []
def images_process(self, img_path_list, excel_path):
all_res = {}
for img_path in img_path_list:
self.ocr_process(img_path, classify, all_res, seperate_dir)
self.ocr_process(img_path, all_res)
# if len(all_res) > 0:
self.res_process(all_res, excel_path, classify)
self.res_process(all_res, excel_path)
return all_res
def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
def pdf_process(self, name, path, img_output_dir, wb_output_dir, pdf_output_dir):
if os.path.exists(path):
rebuild_res = None
try:
img_save_path, excel_path, pdf_save_path, seperate_path = self.get_path(
name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
img_save_path, excel_path, pdf_save_path = self.get_path(
name, img_output_dir, wb_output_dir, pdf_output_dir)
self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
pdf_handler = PDFHandler(path, img_save_path)
pdf_handler.extract_image()
......@@ -260,16 +248,16 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, path, traceback.format_exc()))
raise e
else:
rebuild_res = self.images_process(pdf_handler.img_path_list, classify, excel_path, seperate_path)
rebuild_res = self.images_process(pdf_handler.img_path_list, excel_path)
shutil.move(path, pdf_save_path)
return rebuild_res
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir):
def tif_process(self, name, path, img_output_dir, wb_output_dir, tiff_output_dir):
if os.path.exists(path):
rebuild_res = None
try:
img_save_path, excel_path, tiff_save_path, seperate_path = self.get_path(
name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir)
img_save_path, excel_path, tiff_save_path = self.get_path(
name, img_output_dir, wb_output_dir, tiff_output_dir)
self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
tiff_handler = TIFFHandler(path, img_save_path)
tiff_handler.extract_image()
......@@ -279,20 +267,19 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, path, traceback.format_exc()))
raise e
else:
rebuild_res = self.images_process(tiff_handler.img_path_list, classify, excel_path, seperate_path)
rebuild_res = self.images_process(tiff_handler.img_path_list, excel_path)
shutil.move(path, tiff_save_path)
return rebuild_res
def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir, seperate_dir):
def img_process(self, name, path, wb_output_dir, img_output_dir, pdf_output_dir):
rebuild_res = None
try:
img_save_path, excel_path, _, seperate_path = self.get_path(
name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
except Exception as e:
self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
self.log_base, path, traceback.format_exc()))
else:
rebuild_res = self.images_process([path], classify, excel_path, seperate_path)
rebuild_res = self.images_process([path], excel_path)
shutil.move(path, img_save_path)
return rebuild_res
......@@ -344,7 +331,7 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, result, traceback.format_exc()))
wb.save(wb_path)
def folder_process(self, input_dir, classify, is_combined, result_queue):
def folder_process(self, input_dir, result_queue):
while not os.path.isdir(input_dir):
self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
if self.switch:
......@@ -353,7 +340,6 @@ class Command(BaseCommand, LoggerMixin):
else:
return
output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
seperate_dir = os.path.join(output_dir, self.seperate_map.get(classify, 'Unknown')) if is_combined else None
img_output_dir = os.path.join(output_dir, 'image')
wb_output_dir = os.path.join(output_dir, 'excel')
pdf_output_dir = os.path.join(output_dir, 'pdf')
......@@ -365,8 +351,6 @@ class Command(BaseCommand, LoggerMixin):
os.makedirs(pdf_output_dir, exist_ok=True)
os.makedirs(tiff_output_dir, exist_ok=True)
os.makedirs(failed_output_dir, exist_ok=True)
if seperate_dir is not None:
os.makedirs(seperate_dir, exist_ok=True)
os_error_filename_set = set()
while self.switch:
# if not os.path.isdir(input_dir):
......@@ -389,14 +373,14 @@ class Command(BaseCommand, LoggerMixin):
if os.path.isfile(path):
self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
if name.endswith('.pdf') or name.endswith('.PDF'):
result = self.pdf_process(name, path, classify, img_output_dir, wb_output_dir,
pdf_output_dir, seperate_dir)
result = self.pdf_process(name, path, img_output_dir, wb_output_dir,
pdf_output_dir)
elif name.endswith('.tif') or name.endswith('.TIF'):
result = self.tif_process(name, path, classify, img_output_dir, wb_output_dir,
tiff_output_dir, seperate_dir)
result = self.tif_process(name, path, img_output_dir, wb_output_dir,
tiff_output_dir)
else:
result = self.img_process(name, path, classify, wb_output_dir, img_output_dir,
pdf_output_dir, seperate_dir)
result = self.img_process(name, path, wb_output_dir, img_output_dir,
pdf_output_dir)
self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
else:
result = None
......@@ -420,16 +404,8 @@ class Command(BaseCommand, LoggerMixin):
else:
if isinstance(result, dict) and len(result) > 0:
date_str = time.strftime("%Y-%m-%d")
result_queue.put(
{
self.CLASSIFY_KEY: classify,
self.RESULT_KEY: result,
self.DATE_KEY: date_str
}
)
elif isinstance(result, list) and len(result) > 0:
date_str = time.strftime("%Y-%m-%d")
for res in result:
for classify, res_list in result.items():
for res in res_list:
result_queue.put(
{
self.CLASSIFY_KEY: classify,
......@@ -444,12 +420,10 @@ class Command(BaseCommand, LoggerMixin):
result_queue = Queue()
process_list = []
one_input_dir = None
for classify_idx, input_dir in self.input_dirs.items():
for _, input_dir in self.input_dirs.items():
if one_input_dir is None:
one_input_dir = input_dir
classify = int(classify_idx.split('_')[0])
is_combined = True if int(classify_idx.split('_')[2]) == 1 else False
process = Process(target=self.folder_process, args=(input_dir, classify, is_combined, result_queue))
process = Process(target=self.folder_process, args=(input_dir, result_queue))
process_list.append(process)
wb_dir = os.path.dirname(os.path.dirname(one_input_dir))
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!