83e1571a by 冯轩

Merge branch 'feature/add_log_20240924' into feature/uat-tmp

2 parents 8c0cdbfb 7653f384
...@@ -1015,6 +1015,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1015,6 +1015,13 @@ class Command(BaseCommand, LoggerMixin):
1015 tmp_res = page_info_dict.get(str(pno2), {}).get(key1, '') 1015 tmp_res = page_info_dict.get(str(pno2), {}).get(key1, '')
1016 img_pno = pno1 1016 img_pno = pno1
1017 res[key] = tmp_res 1017 res[key] = tmp_res
1018 # 添加处理,
1019 # [售后回租合同] - 如果 key 是 "承租人签字", 且内容中包含 签署日期:XXXX, 则将签署日期去除
1020 # [车辆租赁抵押合同] - 如果 key 是 "抵押人签字", 且内容中包含 签署日期:XXXX, 则将签署日期去除
1021 if key == '承租人签字' and '签署日期' in tmp_res:
1022 res[key] = tmp_res.split('签署日期')[0]
1023 if key == "抵押人签字" and "签署日期" in tmp_res:
1024 res[key] = tmp_res.split("签署日期")[0]
1018 res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(img_pno), {}).get( 1025 res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(img_pno), {}).get(
1019 consts.IMG_PATH_KEY, '') 1026 consts.IMG_PATH_KEY, '')
1020 else: 1027 else:
...@@ -1668,16 +1675,20 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1668,16 +1675,20 @@ class Command(BaseCommand, LoggerMixin):
1668 path_split = img_path.split('/') 1675 path_split = img_path.split('/')
1669 task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3])) 1676 task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3]))
1670 1677
1678 self.online_log.info('{0} [before lock] [img={1}] '.format(self.log_base, img_path))
1671 with lock: 1679 with lock:
1680 self.online_log.info('{0} [get lock] [img={1}] '.format(self.log_base, img_path))
1672 doc_res_dict = res_dict.setdefault(task_str, {}) 1681 doc_res_dict = res_dict.setdefault(task_str, {})
1673 doc_res_dict[img_path] = ocr_1_res 1682 doc_res_dict[img_path] = ocr_1_res
1674 res_dict[task_str] = doc_res_dict 1683 res_dict[task_str] = doc_res_dict
1675 todo_count = todo_count_dict.get(task_str) 1684 todo_count = todo_count_dict.get(task_str)
1676 if todo_count == 1: 1685 if todo_count == 1:
1677 finish_queue.put(task_str) 1686 finish_queue.put(task_str)
1687 self.online_log.info('{0} [ocr_1 to finish_queue] [img={1}] '.format(self.log_base, img_path))
1678 del todo_count_dict[task_str] 1688 del todo_count_dict[task_str]
1679 else: 1689 else:
1680 todo_count_dict[task_str] = todo_count - 1 1690 todo_count_dict[task_str] = todo_count - 1
1691 self.online_log.info('{0} [after lock] [img={1}] '.format(self.log_base, img_path))
1681 except Exception as e: 1692 except Exception as e:
1682 self.online_log.error('{0} [process error (store ocr res)] [img_path={1}] [error={2}]'.format( 1693 self.online_log.error('{0} [process error (store ocr res)] [img_path={1}] [error={2}]'.format(
1683 self.log_base, img_path, traceback.format_exc())) 1694 self.log_base, img_path, traceback.format_exc()))
......
...@@ -19,10 +19,18 @@ class HMHRetriever: ...@@ -19,10 +19,18 @@ class HMHRetriever:
19 def get_target_fields(self, pdf_text_list): 19 def get_target_fields(self, pdf_text_list):
20 result = dict() 20 result = dict()
21 is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False 21 is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False
22 for bbox, text in pdf_text_list.pop(str(0), []): 22 # for bbox, text in pdf_text_list.pop(str(0), []):
23 # print(text) 23 pdf_text_items = pdf_text_list.pop(str(0), [])
24
25 for i in range(len(pdf_text_items)):
26 bbox, text = pdf_text_items[i]
27 combined_text = text
28 if i < len(pdf_text_items) - 1:
29 combined_text += pdf_text_items[i + 1][1]
30
24 if not is_find_name_id_company: 31 if not is_find_name_id_company:
25 name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', text) 32 # name_id_company_list = re.findall(r'姓名(.*?)证件号码(.*?)与(.*?公司|.*)', combined_text)
33 name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', combined_text)
26 for name_id_company_tuple in name_id_company_list: 34 for name_id_company_tuple in name_id_company_list:
27 if len(name_id_company_tuple) == 3: 35 if len(name_id_company_tuple) == 3:
28 result[self.search_fields_list[0][0]] = { 36 result[self.search_fields_list[0][0]] = {
...@@ -40,7 +48,7 @@ class HMHRetriever: ...@@ -40,7 +48,7 @@ class HMHRetriever:
40 is_find_name_id_company = True 48 is_find_name_id_company = True
41 break 49 break
42 if not is_find_application_no: 50 if not is_find_application_no:
43 application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text) 51 application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', combined_text)
44 if len(application_no_list) == 1: 52 if len(application_no_list) == 1:
45 result[self.search_fields_list[3][0]] = { 53 result[self.search_fields_list[3][0]] = {
46 self.words_str: application_no_list[0], 54 self.words_str: application_no_list[0],
...@@ -48,7 +56,7 @@ class HMHRetriever: ...@@ -48,7 +56,7 @@ class HMHRetriever:
48 } 56 }
49 is_find_application_no = True 57 is_find_application_no = True
50 if not is_find_name_date: 58 if not is_find_name_date:
51 name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text) 59 name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', combined_text)
52 for name_date_tuple in name_date_list: 60 for name_date_tuple in name_date_list:
53 if len(name_date_tuple) == 2: 61 if len(name_date_tuple) == 2:
54 result[self.search_fields_list[4][0]] = { 62 result[self.search_fields_list[4][0]] = {
......
...@@ -10,9 +10,9 @@ from io import BytesIO ...@@ -10,9 +10,9 @@ from io import BytesIO
10 from unicodedata import normalize 10 from unicodedata import normalize
11 11
12 # 页面保存为png图片参数 12 # 页面保存为png图片参数
13 ZOOM_X_1 = ZOOM_Y_1 = 3.0 13 ZOOM_X_1 = ZOOM_Y_1 = 1.0
14 ZOOM_X_2 = ZOOM_Y_2 = 5.0 14 ZOOM_X_2 = ZOOM_Y_2 = 2.0
15 ZOOM_X_3 = ZOOM_Y_3 = 7.0 15 ZOOM_X_3 = ZOOM_Y_3 = 3.0
16 trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension 16 trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension
17 trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension 17 trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension
18 trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension 18 trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!