1aa502ce by chenyao

修改抵押登记豁免函的字段处理

1 parent a0d6443d
...@@ -19,10 +19,18 @@ class HMHRetriever: ...@@ -19,10 +19,18 @@ class HMHRetriever:
19 def get_target_fields(self, pdf_text_list): 19 def get_target_fields(self, pdf_text_list):
20 result = dict() 20 result = dict()
21 is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False 21 is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False
22 for bbox, text in pdf_text_list.pop(str(0), []): 22 # for bbox, text in pdf_text_list.pop(str(0), []):
23 # print(text) 23 pdf_text_items = pdf_text_list.pop(str(0), [])
24
25 for i in range(len(pdf_text_items)):
26 bbox, text = pdf_text_items[i]
27 combined_text = text
28 if i < len(pdf_text_items) - 1:
29 combined_text += pdf_text_items[i + 1][1]
30
24 if not is_find_name_id_company: 31 if not is_find_name_id_company:
25 name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', text) 32 # name_id_company_list = re.findall(r'姓名(.*?)证件号码(.*?)与(.*?公司|.*)', combined_text)
33 name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', combined_text)
26 for name_id_company_tuple in name_id_company_list: 34 for name_id_company_tuple in name_id_company_list:
27 if len(name_id_company_tuple) == 3: 35 if len(name_id_company_tuple) == 3:
28 result[self.search_fields_list[0][0]] = { 36 result[self.search_fields_list[0][0]] = {
...@@ -40,7 +48,7 @@ class HMHRetriever: ...@@ -40,7 +48,7 @@ class HMHRetriever:
40 is_find_name_id_company = True 48 is_find_name_id_company = True
41 break 49 break
42 if not is_find_application_no: 50 if not is_find_application_no:
43 application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text) 51 application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', combined_text)
44 if len(application_no_list) == 1: 52 if len(application_no_list) == 1:
45 result[self.search_fields_list[3][0]] = { 53 result[self.search_fields_list[3][0]] = {
46 self.words_str: application_no_list[0], 54 self.words_str: application_no_list[0],
...@@ -48,7 +56,7 @@ class HMHRetriever: ...@@ -48,7 +56,7 @@ class HMHRetriever:
48 } 56 }
49 is_find_application_no = True 57 is_find_application_no = True
50 if not is_find_name_date: 58 if not is_find_name_date:
51 name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text) 59 name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', combined_text)
52 for name_date_tuple in name_date_list: 60 for name_date_tuple in name_date_list:
53 if len(name_date_tuple) == 2: 61 if len(name_date_tuple) == 2:
54 result[self.search_fields_list[4][0]] = { 62 result[self.search_fields_list[4][0]] = {
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!