Merge branch 'feature/add_log_20240924' into feature/uat-tmp
Showing
3 changed files
with
27 additions
and
8 deletions
... | @@ -1015,6 +1015,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1015,6 +1015,13 @@ class Command(BaseCommand, LoggerMixin): |
1015 | tmp_res = page_info_dict.get(str(pno2), {}).get(key1, '') | 1015 | tmp_res = page_info_dict.get(str(pno2), {}).get(key1, '') |
1016 | img_pno = pno1 | 1016 | img_pno = pno1 |
1017 | res[key] = tmp_res | 1017 | res[key] = tmp_res |
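1018 | # Added post-processing of signature fields: | ||
1019 | # [售后回租合同] - if key is "承租人签字" and the text contains "签署日期:XXXX", keep only the text before "签署日期" | ||
1020 | # [车辆租赁抵押合同] - if key is "抵押人签字" and the text contains "签署日期", likewise keep only the text before "签署日期" | ||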
1021 | if key == '承租人签字' and '签署日期' in tmp_res: | ||
1022 | res[key] = tmp_res.split('签署日期')[0] | ||
1023 | if key == "抵押人签字" and "签署日期" in tmp_res: | ||
1024 | res[key] = tmp_res.split("签署日期")[0] | ||
1018 | res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(img_pno), {}).get( | 1025 | res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(img_pno), {}).get( |
1019 | consts.IMG_PATH_KEY, '') | 1026 | consts.IMG_PATH_KEY, '') |
1020 | else: | 1027 | else: |
... | @@ -1668,16 +1675,20 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1668,16 +1675,20 @@ class Command(BaseCommand, LoggerMixin): |
1668 | path_split = img_path.split('/') | 1675 | path_split = img_path.split('/') |
1669 | task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3])) | 1676 | task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3])) |
1670 | 1677 | ||
1678 | self.online_log.info('{0} [before lock] [img={1}] '.format(self.log_base, img_path)) | ||
1671 | with lock: | 1679 | with lock: |
1680 | self.online_log.info('{0} [get lock] [img={1}] '.format(self.log_base, img_path)) | ||
1672 | doc_res_dict = res_dict.setdefault(task_str, {}) | 1681 | doc_res_dict = res_dict.setdefault(task_str, {}) |
1673 | doc_res_dict[img_path] = ocr_1_res | 1682 | doc_res_dict[img_path] = ocr_1_res |
1674 | res_dict[task_str] = doc_res_dict | 1683 | res_dict[task_str] = doc_res_dict |
1675 | todo_count = todo_count_dict.get(task_str) | 1684 | todo_count = todo_count_dict.get(task_str) |
1676 | if todo_count == 1: | 1685 | if todo_count == 1: |
1677 | finish_queue.put(task_str) | 1686 | finish_queue.put(task_str) |
1687 | self.online_log.info('{0} [ocr_1 to finish_queue] [img={1}] '.format(self.log_base, img_path)) | ||
1678 | del todo_count_dict[task_str] | 1688 | del todo_count_dict[task_str] |
1679 | else: | 1689 | else: |
1680 | todo_count_dict[task_str] = todo_count - 1 | 1690 | todo_count_dict[task_str] = todo_count - 1 |
1691 | self.online_log.info('{0} [after lock] [img={1}] '.format(self.log_base, img_path)) | ||
1681 | except Exception as e: | 1692 | except Exception as e: |
1682 | self.online_log.error('{0} [process error (store ocr res)] [img_path={1}] [error={2}]'.format( | 1693 | self.online_log.error('{0} [process error (store ocr res)] [img_path={1}] [error={2}]'.format( |
1683 | self.log_base, img_path, traceback.format_exc())) | 1694 | self.log_base, img_path, traceback.format_exc())) | ... | ... |
... | @@ -19,10 +19,18 @@ class HMHRetriever: | ... | @@ -19,10 +19,18 @@ class HMHRetriever: |
19 | def get_target_fields(self, pdf_text_list): | 19 | def get_target_fields(self, pdf_text_list): |
20 | result = dict() | 20 | result = dict() |
21 | is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False | 21 | is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False |
22 | for bbox, text in pdf_text_list.pop(str(0), []): | 22 | # for bbox, text in pdf_text_list.pop(str(0), []): |
23 | # print(text) | 23 | pdf_text_items = pdf_text_list.pop(str(0), []) |
24 | |||
25 | for i in range(len(pdf_text_items)): | ||
26 | bbox, text = pdf_text_items[i] | ||
27 | combined_text = text | ||
28 | if i < len(pdf_text_items) - 1: | ||
29 | combined_text += pdf_text_items[i + 1][1] | ||
30 | |||
24 | if not is_find_name_id_company: | 31 | if not is_find_name_id_company: |
25 | name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', text) | 32 | # name_id_company_list = re.findall(r'姓名(.*?)证件号码(.*?)与(.*?公司|.*)', combined_text) |
33 | name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', combined_text) | ||
26 | for name_id_company_tuple in name_id_company_list: | 34 | for name_id_company_tuple in name_id_company_list: |
27 | if len(name_id_company_tuple) == 3: | 35 | if len(name_id_company_tuple) == 3: |
28 | result[self.search_fields_list[0][0]] = { | 36 | result[self.search_fields_list[0][0]] = { |
... | @@ -40,7 +48,7 @@ class HMHRetriever: | ... | @@ -40,7 +48,7 @@ class HMHRetriever: |
40 | is_find_name_id_company = True | 48 | is_find_name_id_company = True |
41 | break | 49 | break |
42 | if not is_find_application_no: | 50 | if not is_find_application_no: |
43 | application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text) | 51 | application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', combined_text) |
44 | if len(application_no_list) == 1: | 52 | if len(application_no_list) == 1: |
45 | result[self.search_fields_list[3][0]] = { | 53 | result[self.search_fields_list[3][0]] = { |
46 | self.words_str: application_no_list[0], | 54 | self.words_str: application_no_list[0], |
... | @@ -48,7 +56,7 @@ class HMHRetriever: | ... | @@ -48,7 +56,7 @@ class HMHRetriever: |
48 | } | 56 | } |
49 | is_find_application_no = True | 57 | is_find_application_no = True |
50 | if not is_find_name_date: | 58 | if not is_find_name_date: |
51 | name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text) | 59 | name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', combined_text) |
52 | for name_date_tuple in name_date_list: | 60 | for name_date_tuple in name_date_list: |
53 | if len(name_date_tuple) == 2: | 61 | if len(name_date_tuple) == 2: |
54 | result[self.search_fields_list[4][0]] = { | 62 | result[self.search_fields_list[4][0]] = { | ... | ... |
... | @@ -10,9 +10,9 @@ from io import BytesIO | ... | @@ -10,9 +10,9 @@ from io import BytesIO |
10 | from unicodedata import normalize | 10 | from unicodedata import normalize |
11 | 11 | ||
12 | # 页面保存为png图片参数 | 12 | # 页面保存为png图片参数 |
13 | ZOOM_X_1 = ZOOM_Y_1 = 3.0 | 13 | ZOOM_X_1 = ZOOM_Y_1 = 1.0 |
14 | ZOOM_X_2 = ZOOM_Y_2 = 5.0 | 14 | ZOOM_X_2 = ZOOM_Y_2 = 2.0 |
15 | ZOOM_X_3 = ZOOM_Y_3 = 7.0 | 15 | ZOOM_X_3 = ZOOM_Y_3 = 3.0 |
16 | trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension | 16 | trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension |
17 | trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension | 17 | trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension |
18 | trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension | 18 | trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension | ... | ... |
-
Please register or sign in to post a comment