83e1571a by 冯轩

Merge branch 'feature/add_log_20240924' into feature/uat-tmp

2 parents 8c0cdbfb 7653f384
......@@ -1015,6 +1015,13 @@ class Command(BaseCommand, LoggerMixin):
tmp_res = page_info_dict.get(str(pno2), {}).get(key1, '')
img_pno = pno1
res[key] = tmp_res
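# Extra post-processing of the extracted value:
# [Sale-and-leaseback contract] - if key is "承租人签字" and the text contains "签署日期:XXXX", keep only the text before "签署日期"
# [Vehicle lease mortgage contract] - if key is "抵押人签字" and the text contains "签署日期", keep only the text before "签署日期"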
if key == '承租人签字' and '签署日期' in tmp_res:
res[key] = tmp_res.split('签署日期')[0]
if key == "抵押人签字" and "签署日期" in tmp_res:
res[key] = tmp_res.split("签署日期")[0]
res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(img_pno), {}).get(
consts.IMG_PATH_KEY, '')
else:
......@@ -1668,16 +1675,20 @@ class Command(BaseCommand, LoggerMixin):
path_split = img_path.split('/')
task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3]))
self.online_log.info('{0} [before lock] [img={1}] '.format(self.log_base, img_path))
with lock:
self.online_log.info('{0} [get lock] [img={1}] '.format(self.log_base, img_path))
doc_res_dict = res_dict.setdefault(task_str, {})
doc_res_dict[img_path] = ocr_1_res
res_dict[task_str] = doc_res_dict
todo_count = todo_count_dict.get(task_str)
if todo_count == 1:
finish_queue.put(task_str)
self.online_log.info('{0} [ocr_1 to finish_queue] [img={1}] '.format(self.log_base, img_path))
del todo_count_dict[task_str]
else:
todo_count_dict[task_str] = todo_count - 1
self.online_log.info('{0} [after lock] [img={1}] '.format(self.log_base, img_path))
except Exception as e:
self.online_log.error('{0} [process error (store ocr res)] [img_path={1}] [error={2}]'.format(
self.log_base, img_path, traceback.format_exc()))
......
......@@ -19,10 +19,18 @@ class HMHRetriever:
def get_target_fields(self, pdf_text_list):
result = dict()
is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False
for bbox, text in pdf_text_list.pop(str(0), []):
# print(text)
# for bbox, text in pdf_text_list.pop(str(0), []):
pdf_text_items = pdf_text_list.pop(str(0), [])
for i in range(len(pdf_text_items)):
bbox, text = pdf_text_items[i]
combined_text = text
if i < len(pdf_text_items) - 1:
combined_text += pdf_text_items[i + 1][1]
if not is_find_name_id_company:
name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', text)
# name_id_company_list = re.findall(r'姓名(.*?)证件号码(.*?)与(.*?公司|.*)', combined_text)
name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', combined_text)
for name_id_company_tuple in name_id_company_list:
if len(name_id_company_tuple) == 3:
result[self.search_fields_list[0][0]] = {
......@@ -40,7 +48,7 @@ class HMHRetriever:
is_find_name_id_company = True
break
if not is_find_application_no:
application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text)
application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', combined_text)
if len(application_no_list) == 1:
result[self.search_fields_list[3][0]] = {
self.words_str: application_no_list[0],
......@@ -48,7 +56,7 @@ class HMHRetriever:
}
is_find_application_no = True
if not is_find_name_date:
name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text)
name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', combined_text)
for name_date_tuple in name_date_list:
if len(name_date_tuple) == 2:
result[self.search_fields_list[4][0]] = {
......
......@@ -10,9 +10,9 @@ from io import BytesIO
from unicodedata import normalize
# Parameters used when saving a PDF page as a PNG image.
# NOTE(review): the three assignments directly below (3.0 / 5.0 / 7.0) are
# immediately overwritten by the next three (1.0 / 2.0 / 3.0), so only the
# second set is in effect when the matrices are built. This looks like
# old/new diff residue -- confirm which set of zoom factors is intended.
ZOOM_X_1 = ZOOM_Y_1 = 3.0
ZOOM_X_2 = ZOOM_Y_2 = 5.0
ZOOM_X_3 = ZOOM_Y_3 = 7.0
ZOOM_X_1 = ZOOM_Y_1 = 1.0
ZOOM_X_2 = ZOOM_Y_2 = 2.0
ZOOM_X_3 = ZOOM_Y_3 = 3.0
# Each matrix passes the X zoom constant for both arguments; the ZOOM_Y_*
# names are assigned the same values but are not referenced in these lines.
# preRotate is called with 0 -- presumably "no rotation"; verify against the
# PyMuPDF version in use.
trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0)  # zoom factor 1 in each dimension
trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0)  # zoom factor 2 in each dimension
trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0)  # zoom factor 3 in each dimension
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!