修改抵押登记豁免函的字段处理
Showing 1 changed file with 13 additions and 5 deletions
| ... | @@ -19,10 +19,18 @@ class HMHRetriever: | ... | @@ -19,10 +19,18 @@ class HMHRetriever: |
| 19 | def get_target_fields(self, pdf_text_list): | 19 | def get_target_fields(self, pdf_text_list): |
| 20 | result = dict() | 20 | result = dict() |
| 21 | is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False | 21 | is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False |
| 22 | for bbox, text in pdf_text_list.pop(str(0), []): | 22 | # for bbox, text in pdf_text_list.pop(str(0), []): |
| 23 | # print(text) | 23 | pdf_text_items = pdf_text_list.pop(str(0), []) |
| 24 | |||
| 25 | for i in range(len(pdf_text_items)): | ||
| 26 | bbox, text = pdf_text_items[i] | ||
| 27 | combined_text = text | ||
| 28 | if i < len(pdf_text_items) - 1: | ||
| 29 | combined_text += pdf_text_items[i + 1][1] | ||
| 30 | |||
| 24 | if not is_find_name_id_company: | 31 | if not is_find_name_id_company: |
| 25 | name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', text) | 32 | # name_id_company_list = re.findall(r'姓名(.*?)证件号码(.*?)与(.*?公司|.*)', combined_text) |
| 33 | name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', combined_text) | ||
| 26 | for name_id_company_tuple in name_id_company_list: | 34 | for name_id_company_tuple in name_id_company_list: |
| 27 | if len(name_id_company_tuple) == 3: | 35 | if len(name_id_company_tuple) == 3: |
| 28 | result[self.search_fields_list[0][0]] = { | 36 | result[self.search_fields_list[0][0]] = { |
| ... | @@ -40,7 +48,7 @@ class HMHRetriever: | ... | @@ -40,7 +48,7 @@ class HMHRetriever: |
| 40 | is_find_name_id_company = True | 48 | is_find_name_id_company = True |
| 41 | break | 49 | break |
| 42 | if not is_find_application_no: | 50 | if not is_find_application_no: |
| 43 | application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text) | 51 | application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', combined_text) |
| 44 | if len(application_no_list) == 1: | 52 | if len(application_no_list) == 1: |
| 45 | result[self.search_fields_list[3][0]] = { | 53 | result[self.search_fields_list[3][0]] = { |
| 46 | self.words_str: application_no_list[0], | 54 | self.words_str: application_no_list[0], |
| ... | @@ -48,7 +56,7 @@ class HMHRetriever: | ... | @@ -48,7 +56,7 @@ class HMHRetriever: |
| 48 | } | 56 | } |
| 49 | is_find_application_no = True | 57 | is_find_application_no = True |
| 50 | if not is_find_name_date: | 58 | if not is_find_name_date: |
| 51 | name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text) | 59 | name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', combined_text) |
| 52 | for name_date_tuple in name_date_list: | 60 | for name_date_tuple in name_date_list: |
| 53 | if len(name_date_tuple) == 2: | 61 | if len(name_date_tuple) == 2: |
| 54 | result[self.search_fields_list[4][0]] = { | 62 | result[self.search_fields_list[4][0]] = { | ... | ... |
-
Please register or sign in to post a comment