1aa502ce by chenyao

修改抵押登记豁免函的字段处理

1 parent a0d6443d
......@@ -19,10 +19,18 @@ class HMHRetriever:
def get_target_fields(self, pdf_text_list):
result = dict()
is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False
for bbox, text in pdf_text_list.pop(str(0), []):
# print(text)
# for bbox, text in pdf_text_list.pop(str(0), []):
pdf_text_items = pdf_text_list.pop(str(0), [])
for i in range(len(pdf_text_items)):
bbox, text = pdf_text_items[i]
combined_text = text
if i < len(pdf_text_items) - 1:
combined_text += pdf_text_items[i + 1][1]
if not is_find_name_id_company:
name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', text)
# name_id_company_list = re.findall(r'姓名(.*?)证件号码(.*?)与(.*?公司|.*)', combined_text)
name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', combined_text)
for name_id_company_tuple in name_id_company_list:
if len(name_id_company_tuple) == 3:
result[self.search_fields_list[0][0]] = {
......@@ -40,7 +48,7 @@ class HMHRetriever:
is_find_name_id_company = True
break
if not is_find_application_no:
application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text)
application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', combined_text)
if len(application_no_list) == 1:
result[self.search_fields_list[3][0]] = {
self.words_str: application_no_list[0],
......@@ -48,7 +56,7 @@ class HMHRetriever:
}
is_find_application_no = True
if not is_find_name_date:
name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text)
name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', combined_text)
for name_date_tuple in name_date_list:
if len(name_date_tuple) == 2:
result[self.search_fields_list[4][0]] = {
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!