修改抵押登记豁免函的字段处理
Showing 1 changed file with 13 additions and 5 deletions
... | @@ -19,10 +19,18 @@ class HMHRetriever: | ... | @@ -19,10 +19,18 @@ class HMHRetriever: |
19 | def get_target_fields(self, pdf_text_list): | 19 | def get_target_fields(self, pdf_text_list): |
20 | result = dict() | 20 | result = dict() |
21 | is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False | 21 | is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False |
22 | for bbox, text in pdf_text_list.pop(str(0), []): | 22 | # for bbox, text in pdf_text_list.pop(str(0), []): |
23 | # print(text) | 23 | pdf_text_items = pdf_text_list.pop(str(0), []) |
24 | |||
25 | for i in range(len(pdf_text_items)): | ||
26 | bbox, text = pdf_text_items[i] | ||
27 | combined_text = text | ||
28 | if i < len(pdf_text_items) - 1: | ||
29 | combined_text += pdf_text_items[i + 1][1] | ||
30 | |||
24 | if not is_find_name_id_company: | 31 | if not is_find_name_id_company: |
25 | name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', text) | 32 | # name_id_company_list = re.findall(r'姓名(.*?)证件号码(.*?)与(.*?公司|.*)', combined_text) |
33 | name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', combined_text) | ||
26 | for name_id_company_tuple in name_id_company_list: | 34 | for name_id_company_tuple in name_id_company_list: |
27 | if len(name_id_company_tuple) == 3: | 35 | if len(name_id_company_tuple) == 3: |
28 | result[self.search_fields_list[0][0]] = { | 36 | result[self.search_fields_list[0][0]] = { |
... | @@ -40,7 +48,7 @@ class HMHRetriever: | ... | @@ -40,7 +48,7 @@ class HMHRetriever: |
40 | is_find_name_id_company = True | 48 | is_find_name_id_company = True |
41 | break | 49 | break |
42 | if not is_find_application_no: | 50 | if not is_find_application_no: |
43 | application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text) | 51 | application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', combined_text) |
44 | if len(application_no_list) == 1: | 52 | if len(application_no_list) == 1: |
45 | result[self.search_fields_list[3][0]] = { | 53 | result[self.search_fields_list[3][0]] = { |
46 | self.words_str: application_no_list[0], | 54 | self.words_str: application_no_list[0], |
... | @@ -48,7 +56,7 @@ class HMHRetriever: | ... | @@ -48,7 +56,7 @@ class HMHRetriever: |
48 | } | 56 | } |
49 | is_find_application_no = True | 57 | is_find_application_no = True |
50 | if not is_find_name_date: | 58 | if not is_find_name_date: |
51 | name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text) | 59 | name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', combined_text) |
52 | for name_date_tuple in name_date_list: | 60 | for name_date_tuple in name_date_list: |
53 | if len(name_date_tuple) == 2: | 61 | if len(name_date_tuple) == 2: |
54 | result[self.search_fields_list[4][0]] = { | 62 | result[self.search_fields_list[4][0]] = { | ... | ... |
-
Please register or sign in to post a comment