d3de42e6 by 周伟奇

fix HIL contract

1 parent a7933381
...@@ -6,14 +6,11 @@ ...@@ -6,14 +6,11 @@
6 # @Description : 6 # @Description :
7 7
8 import re 8 import re
9 import cv2
10 import base64
11 import numpy as np 9 import numpy as np
12 from fuzzywuzzy import fuzz 10 from fuzzywuzzy import fuzz
13 11
14 12
15 class Finder: 13 class Finder:
16
17 def __init__(self, pdf_info): 14 def __init__(self, pdf_info):
18 self.pdf_info = pdf_info 15 self.pdf_info = pdf_info
19 self.item = {"words": None, 16 self.item = {"words": None,
...@@ -25,6 +22,9 @@ class Finder: ...@@ -25,6 +22,9 @@ class Finder:
25 "承租人-姓名": self.item, 22 "承租人-姓名": self.item,
26 "承租人-证件号码": self.item, 23 "承租人-证件号码": self.item,
27 "承租人-法定代表人或授权代表": self.item, 24 "承租人-法定代表人或授权代表": self.item,
25 "共同承租人-姓名": self.item,
26 "共同承租人-证件号码": self.item,
27 "共同承租人-法定代表人或授权代表": self.item,
28 "保证人1-姓名": self.item, 28 "保证人1-姓名": self.item,
29 "保证人1-证件号码": self.item, 29 "保证人1-证件号码": self.item,
30 "保证人1-法定代表人或授权代表": self.item, 30 "保证人1-法定代表人或授权代表": self.item,
...@@ -47,6 +47,8 @@ class Finder: ...@@ -47,6 +47,8 @@ class Finder:
47 "银行账户-开户行": self.item, 47 "银行账户-开户行": self.item,
48 "签字页-承租人姓名": self.item, 48 "签字页-承租人姓名": self.item,
49 "签字页-承租人签章": self.item, 49 "签字页-承租人签章": self.item,
50 "签字页-共同承租人姓名": self.item,
51 "签字页-共同承租人签章": self.item,
50 "签字页-保证人1姓名": self.item, 52 "签字页-保证人1姓名": self.item,
51 "签字页-保证人1签章": self.item, 53 "签字页-保证人1签章": self.item,
52 "签字页-保证人2姓名": self.item, 54 "签字页-保证人2姓名": self.item,
...@@ -54,7 +56,6 @@ class Finder: ...@@ -54,7 +56,6 @@ class Finder:
54 "签字页-保证人3姓名": self.item, 56 "签字页-保证人3姓名": self.item,
55 "签字页-保证人3签章": self.item, 57 "签字页-保证人3签章": self.item,
56 } 58 }
57
58 # 格式化输出 车辆处置协议 要是别的字段 59 # 格式化输出 车辆处置协议 要是别的字段
59 self.init_result_1 = {"合同编号": self.item, 60 self.init_result_1 = {"合同编号": self.item,
60 "承租人-姓名": self.item, 61 "承租人-姓名": self.item,
...@@ -66,9 +67,7 @@ class Finder: ...@@ -66,9 +67,7 @@ class Finder:
66 "签字页-承租人签章": self.item, 67 "签字页-承租人签章": self.item,
67 "签字页-销售经销商": self.item, 68 "签字页-销售经销商": self.item,
68 "签字页-销售经销商签章": self.item, 69 "签字页-销售经销商签章": self.item,
69
70 } 70 }
71
72 # 格式化输出 车辆租赁抵押合同 71 # 格式化输出 车辆租赁抵押合同
73 self.init_result_2 = {"合同编号": self.item, 72 self.init_result_2 = {"合同编号": self.item,
74 "合同编号(正文)": self.item, 73 "合同编号(正文)": self.item,
...@@ -150,23 +149,24 @@ class Finder: ...@@ -150,23 +149,24 @@ class Finder:
150 words = matchObj.group(1) 149 words = matchObj.group(1)
151 contract_no['position'] = None 150 contract_no['position'] = None
152 contract_no['page'] = pno 151 contract_no['page'] = pno
153 contract_no['words'] = words 152 # contract_no['words'] = words
153 contract_no['words'] = re.sub("\s", "", words).replace(")", "")
154 return contract_no 154 return contract_no
155
156 matchObj = re.search(r'编号为(.*?)的', all_text) 155 matchObj = re.search(r'编号为(.*?)的', all_text)
157 if matchObj: 156 if matchObj:
158 words = matchObj.group(1).strip() 157 words = matchObj.group(1).strip()
159 contract_no['position'] = None 158 contract_no['position'] = None
160 contract_no['page'] = pno 159 contract_no['page'] = pno
161 contract_no['words'] = words 160 # contract_no['words'] = words
161 contract_no['words'] = re.sub("\s", "", words).replace(")", "")
162 return contract_no 162 return contract_no
163
164 matchObj = re.search(r'编号为(.*?))的', all_text) 163 matchObj = re.search(r'编号为(.*?))的', all_text)
165 if matchObj: 164 if matchObj:
166 words = matchObj.group(1).strip() 165 words = matchObj.group(1).strip()
167 contract_no['position'] = None 166 contract_no['position'] = None
168 contract_no['page'] = pno 167 contract_no['page'] = pno
169 contract_no['words'] = words 168 # contract_no['words'] = words
169 contract_no['words'] = re.sub("\s", "", words)
170 return contract_no 170 return contract_no
171 171
172 def get_key_value(self, key, page_num=None): 172 def get_key_value(self, key, page_num=None):
...@@ -180,10 +180,11 @@ class Finder: ...@@ -180,10 +180,11 @@ class Finder:
180 for span in line['spans']: 180 for span in line['spans']:
181 bbox, text = span['bbox'], span['text'] 181 bbox, text = span['bbox'], span['text']
182 if key in text: 182 if key in text:
183 words = text.split(':')[-1] 183 words = text.split(':')[-1].replace("。", "")
184 value['position'] = bbox 184 value['position'] = bbox
185 value['page'] = pno 185 value['page'] = pno
186 value['words'] = words 186 # value['words'] = words
187 value['words'] = re.sub("\s", "", words)
187 else: 188 else:
188 for pno in self.pdf_info: 189 for pno in self.pdf_info:
189 for block in self.pdf_info[pno]['blocks']: 190 for block in self.pdf_info[pno]['blocks']:
...@@ -194,10 +195,11 @@ class Finder: ...@@ -194,10 +195,11 @@ class Finder:
194 bbox, text = span['bbox'], span['text'] 195 bbox, text = span['bbox'], span['text']
195 if key in text: 196 if key in text:
196 # print(self.pdf_info[pno]) 197 # print(self.pdf_info[pno])
197 words = text.split(':')[-1] 198 words = text.split(':')[-1].replace("。", "")
198 value['position'] = bbox 199 value['position'] = bbox
199 value['page'] = pno 200 value['page'] = pno
200 value['words'] = words 201 # value['words'] = words
202 value['words'] = re.sub("\s", "", words)
201 return value 203 return value
202 204
203 def get_loan_principal(self, page_num='0'): 205 def get_loan_principal(self, page_num='0'):
...@@ -267,7 +269,6 @@ class Finder: ...@@ -267,7 +269,6 @@ class Finder:
267 269
268 def get_asp_details(self, page_num): 270 def get_asp_details(self, page_num):
269 asp_details_table_term = self.item.copy() 271 asp_details_table_term = self.item.copy()
270
271 asp_details_table = [] 272 asp_details_table = []
272 asp_details_text_list = [] 273 asp_details_text_list = []
273 table = False 274 table = False
...@@ -283,25 +284,20 @@ class Finder: ...@@ -283,25 +284,20 @@ class Finder:
283 table = False 284 table = False
284 if table == True: 285 if table == True:
285 asp_details_text_list.append(text) 286 asp_details_text_list.append(text)
286 287 for i in range((len(asp_details_text_list) + 2) // 3):
287 for i in range((len(asp_details_text_list)+2)//3):
288
289 line = [] 288 line = []
290 if i == 0: 289 if i == 0:
291 line = [asp_details_text_list[0]] 290 line = [asp_details_text_list[0]]
292 else: 291 else:
293 for j in range(3): 292 for j in range(3):
294 line.append(asp_details_text_list[i*3-2+j]) 293 line.append(asp_details_text_list[i * 3 - 2 + j])
295
296 asp_details_table.append(line) 294 asp_details_table.append(line)
297
298 if len(asp_details_table) > 0: 295 if len(asp_details_table) > 0:
299 asp_details_table_term['words'] = asp_details_table 296 asp_details_table_term['words'] = asp_details_table
300 return asp_details_table_term 297 return asp_details_table_term
301 298
302 def get_signature(self): 299 def get_signature(self):
303 signature = self.item.copy() 300 signature = self.item.copy()
304
305 for block in self.pdf_info['0']['blocks']: 301 for block in self.pdf_info['0']['blocks']:
306 if block['type'] != 0: 302 if block['type'] != 0:
307 continue 303 continue
...@@ -369,8 +365,8 @@ class Finder: ...@@ -369,8 +365,8 @@ class Finder:
369 for line in block['lines']: 365 for line in block['lines']:
370 for span in line['spans']: 366 for span in line['spans']:
371 bbox, text = span['bbox'], span['text'] 367 bbox, text = span['bbox'], span['text']
372 if anchor_bbox[2]<np.mean(bbox[::2])<half_width and \ 368 if anchor_bbox[2] < np.mean(bbox[::2]) < half_width and \
373 anchor_bbox[1]<np.mean(bbox[1::2])<anchor_bbox[3]: 369 anchor_bbox[1] < np.mean(bbox[1::2]) < anchor_bbox[3]:
374 seller['position'] = bbox 370 seller['position'] = bbox
375 seller['words'] = text 371 seller['words'] = text
376 return seller 372 return seller
...@@ -430,7 +426,6 @@ class Finder: ...@@ -430,7 +426,6 @@ class Finder:
430 426
431 def get_repayment_schedule(self): 427 def get_repayment_schedule(self):
432 repayment_schedule = self.item.copy() 428 repayment_schedule = self.item.copy()
433
434 repayment_schedule_text_list = [] 429 repayment_schedule_text_list = []
435 table = False 430 table = False
436 page = None 431 page = None
...@@ -444,20 +439,25 @@ class Finder: ...@@ -444,20 +439,25 @@ class Finder:
444 if '以上表格中所列序号' in text: 439 if '以上表格中所列序号' in text:
445 table = False 440 table = False
446 if table == True: 441 if table == True:
442 # 过滤汉字
443 if re.compile(r'[\u4e00-\u9fff]').search(text):
444 continue
445 # 过滤 1. - 61. 这些标题
446 if re.findall("\d+", text):
447 if len(re.findall("\d+", text)) == 1:
448 continue
447 repayment_schedule_text_list.append(text) 449 repayment_schedule_text_list.append(text)
448 if '61.' in text: 450 if '61.' in text:
449 page = pno 451 page = pno
450 table = True 452 table = True
451 453 # print("repayment_schedule_text_list = ", repayment_schedule_text_list)
452 repayment_schedule_table = [['序号', '融资租赁成本', '融资租赁费用', '租金', '剩余融资租赁成本']] 454 repayment_schedule_table = [['序号', '融资租赁成本', '融资租赁费用', '租金', '剩余融资租赁成本']]
453 for i in range(len(repayment_schedule_text_list)//4): 455 for i in range(len(repayment_schedule_text_list) // 4):
454 line = [f'{i+1}.'] 456 line = [f'{i + 1}.']
455 # 4表示4列的意思 457 # 4表示4列的意思
456 for j in range(4): 458 for j in range(4):
457 line.append(repayment_schedule_text_list[i*4+j]) 459 line.append(repayment_schedule_text_list[i * 4 + j])
458
459 repayment_schedule_table.append(line) 460 repayment_schedule_table.append(line)
460
461 repayment_schedule['words'] = repayment_schedule_table 461 repayment_schedule['words'] = repayment_schedule_table
462 repayment_schedule['page'] = page 462 repayment_schedule['page'] = page
463 return repayment_schedule 463 return repayment_schedule
...@@ -506,7 +506,7 @@ class Finder: ...@@ -506,7 +506,7 @@ class Finder:
506 else: 506 else:
507 words = '无' 507 words = '无'
508 boxes = np.array(boxes).reshape((-1, 2)) 508 boxes = np.array(boxes).reshape((-1, 2))
509 position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] 509 position = [min(boxes[:, 0]), min(boxes[:, 1]), max(boxes[:, 0]), max(boxes[:, 1])]
510 signature_role_2['page_num'] = page_num 510 signature_role_2['page_num'] = page_num
511 signature_role_2['position'] = position 511 signature_role_2['position'] = position
512 signature_role_2['words'] = words 512 signature_role_2['words'] = words
...@@ -541,7 +541,7 @@ class Finder: ...@@ -541,7 +541,7 @@ class Finder:
541 else: 541 else:
542 words = '无' 542 words = '无'
543 boxes = np.array(boxes).reshape((-1, 2)) 543 boxes = np.array(boxes).reshape((-1, 2))
544 position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] 544 position = [min(boxes[:, 0]), min(boxes[:, 1]), max(boxes[:, 0]), max(boxes[:, 1])]
545 signature_role_3['page_num'] = page_num 545 signature_role_3['page_num'] = page_num
546 signature_role_3['position'] = position 546 signature_role_3['position'] = position
547 signature_role_3['words'] = words 547 signature_role_3['words'] = words
...@@ -576,7 +576,7 @@ class Finder: ...@@ -576,7 +576,7 @@ class Finder:
576 else: 576 else:
577 words = '无' 577 words = '无'
578 boxes = np.array(boxes).reshape((-1, 2)) 578 boxes = np.array(boxes).reshape((-1, 2))
579 position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] 579 position = [min(boxes[:, 0]), min(boxes[:, 1]), max(boxes[:, 0]), max(boxes[:, 1])]
580 signature_role_4['page_num'] = page_num 580 signature_role_4['page_num'] = page_num
581 signature_role_4['position'] = position 581 signature_role_4['position'] = position
582 signature_role_4['words'] = words 582 signature_role_4['words'] = words
...@@ -612,7 +612,7 @@ class Finder: ...@@ -612,7 +612,7 @@ class Finder:
612 else: 612 else:
613 words = '无' 613 words = '无'
614 boxes = np.array(boxes).reshape((-1, 2)) 614 boxes = np.array(boxes).reshape((-1, 2))
615 position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] 615 position = [min(boxes[:, 0]), min(boxes[:, 1]), max(boxes[:, 0]), max(boxes[:, 1])]
616 signature_role_5['page_num'] = page_num 616 signature_role_5['page_num'] = page_num
617 signature_role_5['position'] = position 617 signature_role_5['position'] = position
618 signature_role_5['words'] = words 618 signature_role_5['words'] = words
...@@ -640,7 +640,7 @@ class Finder: ...@@ -640,7 +640,7 @@ class Finder:
640 for line in block['lines']: 640 for line in block['lines']:
641 for span in line['spans']: 641 for span in line['spans']:
642 bbox, text = span['bbox'], span['text'] 642 bbox, text = span['bbox'], span['text']
643 if '签署日期' in text and int(anchor_top)<np.mean(bbox[1::2])<int(anchor_bottom): 643 if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
644 name = text.split(' ')[0] 644 name = text.split(' ')[0]
645 date = text.split(':')[-1] 645 date = text.split(':')[-1]
646 signature_name['words'] = name 646 signature_name['words'] = name
...@@ -663,7 +663,7 @@ class Finder: ...@@ -663,7 +663,7 @@ class Finder:
663 if top in text: 663 if top in text:
664 anchor_top = bbox[1] 664 anchor_top = bbox[1]
665 if bottom in text: 665 if bottom in text:
666 anchor_bottom = bbox[1] 666 anchor_bottom = bbox[3]
667 if anchor_top is not None and anchor_bottom is not None: 667 if anchor_top is not None and anchor_bottom is not None:
668 for pno in self.pdf_info: 668 for pno in self.pdf_info:
669 for block in self.pdf_info[pno]['blocks']: 669 for block in self.pdf_info[pno]['blocks']:
...@@ -672,7 +672,9 @@ class Finder: ...@@ -672,7 +672,9 @@ class Finder:
672 for line in block['lines']: 672 for line in block['lines']:
673 for span in line['spans']: 673 for span in line['spans']:
674 bbox, text = span['bbox'], span['text'] 674 bbox, text = span['bbox'], span['text']
675 if '签署日期' in text and int(anchor_top)<np.mean(bbox[1::2])<int(anchor_bottom): 675 # ------------ #
676 # print("--text = ", text)
677 if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
676 words = text 678 words = text
677 signature['words'] = words 679 signature['words'] = words
678 signature['page'] = pno 680 signature['page'] = pno
...@@ -683,7 +685,6 @@ class Finder: ...@@ -683,7 +685,6 @@ class Finder:
683 name = self.item.copy() 685 name = self.item.copy()
684 id_num = self.item.copy() 686 id_num = self.item.copy()
685 representative = self.item.copy() 687 representative = self.item.copy()
686
687 # 以保证人3 的左上角为定位点 688 # 以保证人3 的左上角为定位点
688 anchor = None 689 anchor = None
689 for block in self.pdf_info[page_num]['blocks']: 690 for block in self.pdf_info[page_num]['blocks']:
...@@ -695,7 +696,6 @@ class Finder: ...@@ -695,7 +696,6 @@ class Finder:
695 # 找到角色姓名 696 # 找到角色姓名
696 if re.match('保证人3', text) is not None: 697 if re.match('保证人3', text) is not None:
697 anchor = [bbox[0], bbox[1]] 698 anchor = [bbox[0], bbox[1]]
698
699 if anchor is not None: 699 if anchor is not None:
700 for block in self.pdf_info[page_num]['blocks']: 700 for block in self.pdf_info[page_num]['blocks']:
701 if block['type'] != 0: 701 if block['type'] != 0:
...@@ -711,52 +711,60 @@ class Finder: ...@@ -711,52 +711,60 @@ class Finder:
711 name['position'] = bbox 711 name['position'] = bbox
712 if role_key == '承租人:': 712 if role_key == '承租人:':
713 # 找到证件号码且确定位置 713 # 找到证件号码且确定位置
714 if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) < anchor[1]: 714 if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(
715 bbox[1::2]) < anchor[1]:
715 words = text.split(':')[-1] 716 words = text.split(':')[-1]
716 id_num['words'] = words 717 id_num['words'] = words
717 id_num['page'] = page_num 718 id_num['page'] = page_num
718 id_num['position'] = bbox 719 id_num['position'] = bbox
719 # 找到法人代表且确定位置 720 # 找到法人代表且确定位置
720 if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) < anchor[1]: 721 if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(
722 bbox[1::2]) < anchor[1]:
721 words = text.split(':')[-1] 723 words = text.split(':')[-1]
722 representative['words'] = words 724 representative['words'] = words
723 representative['page'] = page_num 725 representative['page'] = page_num
724 representative['position'] = bbox 726 representative['position'] = bbox
725 if role_key == '保证人1:': 727 if role_key == '保证人1:':
726 # 找到证件号码且确定位置 728 # 找到证件号码且确定位置
727 if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) > anchor[1]: 729 if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(
730 bbox[1::2]) > anchor[1]:
728 words = text.split(':')[-1] 731 words = text.split(':')[-1]
729 id_num['words'] = words 732 id_num['words'] = words
730 id_num['page'] = page_num 733 id_num['page'] = page_num
731 id_num['position'] = bbox 734 id_num['position'] = bbox
732 # 找到法人代表且确定位置 735 # 找到法人代表且确定位置
733 if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) > anchor[1]: 736 if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(
737 bbox[1::2]) > anchor[1]:
734 words = text.split(':')[-1] 738 words = text.split(':')[-1]
735 representative['words'] = words 739 representative['words'] = words
736 representative['page'] = page_num 740 representative['page'] = page_num
737 representative['position'] = bbox 741 representative['position'] = bbox
738 if role_key == '保证人2:': 742 if role_key == '保证人2:':
739 # 找到证件号码且确定位置 743 # 找到证件号码且确定位置
740 if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) < anchor[1]: 744 if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(
745 bbox[1::2]) < anchor[1]:
741 words = text.split(':')[-1] 746 words = text.split(':')[-1]
742 id_num['words'] = words 747 id_num['words'] = words
743 id_num['page'] = page_num 748 id_num['page'] = page_num
744 id_num['position'] = bbox 749 id_num['position'] = bbox
745 # 找到法人代表且确定位置 750 # 找到法人代表且确定位置
746 if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) < anchor[1]: 751 if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(
752 bbox[1::2]) < anchor[1]:
747 words = text.split(':')[-1] 753 words = text.split(':')[-1]
748 representative['words'] = words 754 representative['words'] = words
749 representative['page'] = page_num 755 representative['page'] = page_num
750 representative['position'] = bbox 756 representative['position'] = bbox
751 if role_key == '保证人3:': 757 if role_key == '保证人3:':
752 # 找到证件号码且确定位置 758 # 找到证件号码且确定位置
753 if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) > anchor[1]: 759 if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(
760 bbox[1::2]) > anchor[1]:
754 words = text.split(':')[-1] 761 words = text.split(':')[-1]
755 id_num['words'] = words 762 id_num['words'] = words
756 id_num['page'] = page_num 763 id_num['page'] = page_num
757 id_num['position'] = bbox 764 id_num['position'] = bbox
758 # 找到法人代表且确定位置 765 # 找到法人代表且确定位置
759 if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) > anchor[1]: 766 if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(
767 bbox[1::2]) > anchor[1]:
760 words = text.split(':')[-1] 768 words = text.split(':')[-1]
761 representative['words'] = words 769 representative['words'] = words
762 representative['page'] = page_num 770 representative['page'] = page_num
...@@ -783,12 +791,10 @@ class Finder: ...@@ -783,12 +791,10 @@ class Finder:
783 start = False 791 start = False
784 if start == True: 792 if start == True:
785 items.append(text) 793 items.append(text)
786
787 lines = [['项目', '购买价格', '实际融资金额']] 794 lines = [['项目', '购买价格', '实际融资金额']]
788 for i in range(len(items)//3): 795 for i in range(len(items) // 3):
789 line = [items[2+i*3+0], items[2+i*3+1], items[2+i*3+2]] 796 line = [items[2 + i * 3 + 0], items[2 + i * 3 + 1], items[2 + i * 3 + 2]]
790 lines.append(line) 797 lines.append(line)
791
792 if len(items) > 0: 798 if len(items) > 0:
793 lines.append([items[0], '', items[1]]) 799 lines.append([items[0], '', items[1]])
794 800
...@@ -800,7 +806,6 @@ class Finder: ...@@ -800,7 +806,6 @@ class Finder:
800 def get_contract_no_dy(self): 806 def get_contract_no_dy(self):
801 # 查找抵押合同编号 807 # 查找抵押合同编号
802 contract_no = self.item.copy() 808 contract_no = self.item.copy()
803
804 key_box = None 809 key_box = None
805 for pno in self.pdf_info: 810 for pno in self.pdf_info:
806 for block in self.pdf_info[pno]['blocks']: 811 for block in self.pdf_info[pno]['blocks']:
...@@ -811,7 +816,6 @@ class Finder: ...@@ -811,7 +816,6 @@ class Finder:
811 bbox, text = span['bbox'], span['text'] 816 bbox, text = span['bbox'], span['text']
812 if '抵押合同编号' in text: 817 if '抵押合同编号' in text:
813 key_box = bbox 818 key_box = bbox
814
815 if key_box is not None: 819 if key_box is not None:
816 for pno in self.pdf_info: 820 for pno in self.pdf_info:
817 for block in self.pdf_info[pno]['blocks']: 821 for block in self.pdf_info[pno]['blocks']:
...@@ -829,7 +833,6 @@ class Finder: ...@@ -829,7 +833,6 @@ class Finder:
829 def get_dyr_name_id(self): 833 def get_dyr_name_id(self):
830 name = self.item.copy() 834 name = self.item.copy()
831 _id = self.item.copy() 835 _id = self.item.copy()
832
833 key_box = None 836 key_box = None
834 for pno in self.pdf_info: 837 for pno in self.pdf_info:
835 for block in self.pdf_info[pno]['blocks']: 838 for block in self.pdf_info[pno]['blocks']:
...@@ -842,7 +845,7 @@ class Finder: ...@@ -842,7 +845,7 @@ class Finder:
842 key_box = bbox 845 key_box = bbox
843 846
844 if key_box is not None: 847 if key_box is not None:
845 rh = abs(key_box[1]-key_box[3]) 848 rh = abs(key_box[1] - key_box[3])
846 for pno in self.pdf_info: 849 for pno in self.pdf_info:
847 for block in self.pdf_info[pno]['blocks']: 850 for block in self.pdf_info[pno]['blocks']:
848 if block['type'] != 0: 851 if block['type'] != 0:
...@@ -850,12 +853,12 @@ class Finder: ...@@ -850,12 +853,12 @@ class Finder:
850 for line in block['lines']: 853 for line in block['lines']:
851 for span in line['spans']: 854 for span in line['spans']:
852 bbox, text = span['bbox'], span['text'] 855 bbox, text = span['bbox'], span['text']
853 if key_box[1] < np.mean(bbox[1::2]) < key_box[3]+rh*3 and '姓名' in text: 856 if key_box[1] < np.mean(bbox[1::2]) < key_box[3] + rh * 3 and '姓名' in text:
854 words = text.split(':')[-1] 857 words = text.split(':')[-1]
855 name['position'] = bbox 858 name['position'] = bbox
856 name['page'] = pno 859 name['page'] = pno
857 name['words'] = words 860 name['words'] = words
858 if key_box[1] < np.mean(bbox[1::2]) < key_box[3]+rh*3 and '证件号码' in text: 861 if key_box[1] < np.mean(bbox[1::2]) < key_box[3] + rh * 3 and '证件号码' in text:
859 words = text.split(':')[-1] 862 words = text.split(':')[-1]
860 _id['position'] = bbox 863 _id['position'] = bbox
861 _id['page'] = pno 864 _id['page'] = pno
...@@ -864,7 +867,6 @@ class Finder: ...@@ -864,7 +867,6 @@ class Finder:
864 867
865 def get_key_value_position(self, key): 868 def get_key_value_position(self, key):
866 value = self.item.copy() 869 value = self.item.copy()
867
868 key_box = None 870 key_box = None
869 for pno in self.pdf_info: 871 for pno in self.pdf_info:
870 for block in self.pdf_info[pno]['blocks']: 872 for block in self.pdf_info[pno]['blocks']:
...@@ -875,9 +877,8 @@ class Finder: ...@@ -875,9 +877,8 @@ class Finder:
875 bbox, text = span['bbox'], span['text'] 877 bbox, text = span['bbox'], span['text']
876 if text == key: 878 if text == key:
877 key_box = bbox 879 key_box = bbox
878
879 if key_box is not None: 880 if key_box is not None:
880 rh = abs(key_box[1]-key_box[3]) 881 rh = abs(key_box[1] - key_box[3])
881 for pno in self.pdf_info: 882 for pno in self.pdf_info:
882 for block in self.pdf_info[pno]['blocks']: 883 for block in self.pdf_info[pno]['blocks']:
883 if block['type'] != 0: 884 if block['type'] != 0:
...@@ -885,13 +886,104 @@ class Finder: ...@@ -885,13 +886,104 @@ class Finder:
885 for line in block['lines']: 886 for line in block['lines']:
886 for span in line['spans']: 887 for span in line['spans']:
887 bbox, text = span['bbox'], span['text'] 888 bbox, text = span['bbox'], span['text']
888 if key_box[1] < np.mean(bbox[1::2]) < key_box[3] and key_box[0] < bbox[0] and abs(key_box[2]-bbox[0]) < rh*10: 889 if key_box[1] < np.mean(bbox[1::2]) < key_box[3] and key_box[0] < bbox[0] and abs(
890 key_box[2] - bbox[0]) < rh * 10:
889 words = text 891 words = text
890 value['position'] = bbox 892 value['position'] = bbox
891 value['page'] = pno 893 value['page'] = pno
892 value['words'] = words 894 value['words'] = words
893 return value 895 return value
894 896
def get_role_info_3_3(self, role_key, page_num='0'):
    """Locate a role's name, ID number and representative on one page.

    Variant of the role lookup whose positional anchor is the top-left
    corner of the span starting with '保证人2' (the last such span on the
    page wins).  The ID-number and representative spans are attributed to
    a role by the quadrant, relative to that anchor, in which the centre
    of their bounding box falls:

        '承租人一:'   -> left of and above the anchor
        '共同承租人:' -> left of and below the anchor
        '保证人1:'    -> right of and above the anchor
        '保证人2:'    -> right of and below the anchor

    Args:
        role_key: Label that prefixes the role's name span.  It is passed
            to ``re.match`` as a pattern, so it is matched at the start of
            the span text.
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        A ``(name, id_num, representative)`` tuple of copies of
        ``self.item``.  Their 'words', 'page' and 'position' entries are
        filled for whatever was found; when several spans qualify, the
        last one wins.  If no anchor span exists on the page all three
        are returned untouched.  For a ``role_key`` outside the four
        labels above only ``name`` can be filled.

    Raises:
        KeyError: If ``page_num`` is not a key of ``self.pdf_info``.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    # Flatten the page once; blocks whose type is not 0 are ignored.
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Anchor: top-left corner of the (last) span starting with '保证人2'.
    anchor = None
    for bbox, text in spans:
        if re.match('保证人2', text) is not None:
            anchor = [bbox[0], bbox[1]]
    if anchor is None:
        return name, id_num, representative

    # role label -> (centre is left of anchor?, centre is above anchor?)
    quadrants = {
        '承租人一:': (True, True),
        '共同承租人:': (True, False),
        '保证人1:': (False, True),
        '保证人2:': (False, False),
    }
    quadrant = quadrants.get(role_key)

    def in_quadrant(bbox):
        # Comparisons are strict on purpose: a centre lying exactly on an
        # anchor coordinate belongs to no quadrant.
        want_left, want_above = quadrant
        centre_x = np.mean(bbox[::2])
        centre_y = np.mean(bbox[1::2])
        x_ok = centre_x < anchor[0] if want_left else centre_x > anchor[0]
        y_ok = centre_y < anchor[1] if want_above else centre_y > anchor[1]
        return x_ok and y_ok

    def fill(target, bbox, text):
        # The value is whatever follows the last ':' of the span text.
        target['words'] = text.split(':')[-1]
        target['page'] = page_num
        target['position'] = bbox

    labelled_targets = (
        ('证件号码:', id_num),
        ('法定代表人或授权代表:', representative),
    )

    for bbox, text in spans:
        # Role name: matched by label only, no positional constraint.
        if re.match(role_key, text) is not None:
            fill(name, bbox, text)
        if quadrant is None:
            continue
        for label, target in labelled_targets:
            # Check the label first so the geometry is only evaluated
            # for candidate spans.
            if re.match(label, text) is not None and in_quadrant(bbox):
                fill(target, bbox, text)
    return name, id_num, representative
986
895 def get_info(self): 987 def get_info(self):
896 """ 988 """
897 block['type'] == 0 : 表示该元素为图片 989 block['type'] == 0 : 表示该元素为图片
...@@ -905,6 +997,8 @@ class Finder: ...@@ -905,6 +997,8 @@ class Finder:
905 self.init_result['合同编号'] = contract_no 997 self.init_result['合同编号'] = contract_no
906 # 从第一页上取四个角色的姓名和证件号码 998 # 从第一页上取四个角色的姓名和证件号码
907 name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0') 999 name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0')
1000 if name["words"] == None:
1001 name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
908 self.init_result['承租人-姓名'] = name 1002 self.init_result['承租人-姓名'] = name
909 self.init_result['承租人-证件号码'] = id_num 1003 self.init_result['承租人-证件号码'] = id_num
910 self.init_result['承租人-法定代表人或授权代表'] = representative 1004 self.init_result['承租人-法定代表人或授权代表'] = representative
...@@ -912,14 +1006,31 @@ class Finder: ...@@ -912,14 +1006,31 @@ class Finder:
912 self.init_result['保证人1-姓名'] = name 1006 self.init_result['保证人1-姓名'] = name
913 self.init_result['保证人1-证件号码'] = id_num 1007 self.init_result['保证人1-证件号码'] = id_num
914 self.init_result['保证人1-法定代表人或授权代表'] = representative 1008 self.init_result['保证人1-法定代表人或授权代表'] = representative
1009 # if条件判别 对应3_3版本
1010 if name["words"] == None:
1011 name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
1012 self.init_result['共同承租人-姓名'] = name
1013 self.init_result['共同承租人-证件号码'] = id_num
1014 self.init_result['共同承租人-法定代表人或授权代表'] = representative
915 name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0') 1015 name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0')
916 self.init_result['保证人2-姓名'] = name 1016 self.init_result['保证人2-姓名'] = name
917 self.init_result['保证人2-证件号码'] = id_num 1017 self.init_result['保证人2-证件号码'] = id_num
918 self.init_result['保证人2-法定代表人或授权代表'] = representative 1018 self.init_result['保证人2-法定代表人或授权代表'] = representative
1019 # if条件判别 对应3_3版本
1020 if name["words"] == None:
1021 name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
1022 self.init_result['保证人2-姓名'] = name
1023 self.init_result['保证人2-证件号码'] = id_num
1024 self.init_result['保证人2-法定代表人或授权代表'] = representative
919 name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0') 1025 name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0')
920 self.init_result['保证人3-姓名'] = name 1026 self.init_result['保证人3-姓名'] = name
921 self.init_result['保证人3-证件号码'] = id_num 1027 self.init_result['保证人3-证件号码'] = id_num
922 self.init_result['保证人3-法定代表人或授权代表'] = representative 1028 self.init_result['保证人3-法定代表人或授权代表'] = representative
1029 if name["words"] == None:
1030 name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
1031 self.init_result['保证人3-姓名'] = name
1032 self.init_result['保证人3-证件号码'] = id_num
1033 self.init_result['保证人3-法定代表人或授权代表'] = representative
923 # 在所有页面中找正文中(第二部分 融资租赁主要条款及付款计划)的那个编号,因为存在换行的情况所以暂时不带位置输出 1034 # 在所有页面中找正文中(第二部分 融资租赁主要条款及付款计划)的那个编号,因为存在换行的情况所以暂时不带位置输出
924 contract_no = self.get_contract_no_one() 1035 contract_no = self.get_contract_no_one()
925 self.init_result['合同编号(正文)'] = contract_no 1036 self.init_result['合同编号(正文)'] = contract_no
...@@ -955,6 +1066,9 @@ class Finder: ...@@ -955,6 +1066,9 @@ class Finder:
955 # 承租人姓名、签章 1066 # 承租人姓名、签章
956 name = self.get_key_value(key='承租人姓名:') 1067 name = self.get_key_value(key='承租人姓名:')
957 electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:') 1068 electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:')
1069 if name["words"] == None:
1070 name = self.get_key_value(key='承租人一姓名:')
1071 electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:')
958 self.init_result['签字页-承租人姓名'] = name 1072 self.init_result['签字页-承租人姓名'] = name
959 self.init_result['签字页-承租人签章'] = electronic_signature 1073 self.init_result['签字页-承租人签章'] = electronic_signature
960 # 保证人1姓名、签章 1074 # 保证人1姓名、签章
...@@ -962,19 +1076,35 @@ class Finder: ...@@ -962,19 +1076,35 @@ class Finder:
962 electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:') 1076 electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
963 self.init_result['签字页-保证人1姓名'] = name 1077 self.init_result['签字页-保证人1姓名'] = name
964 self.init_result['签字页-保证人1签章'] = electronic_signature 1078 self.init_result['签字页-保证人1签章'] = electronic_signature
1079 # 这里用的是 name["words"] == ""
1080 if name["words"] == "":
1081 name = self.get_key_value(key='共同承租人名称:')
1082 electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:')
1083 self.init_result['签字页-共同承租人姓名'] = name
1084 self.init_result['签字页-共同承租人签章'] = electronic_signature
965 # 保证人2姓名、签章 1085 # 保证人2姓名、签章
966 name = self.get_key_value(key='保证人2姓名:') 1086 name = self.get_key_value(key='保证人2姓名:')
967 electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:') 1087 electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:')
968 self.init_result['签字页-保证人2姓名'] = name 1088 self.init_result['签字页-保证人2姓名'] = name
969 self.init_result['签字页-保证人2签章'] = electronic_signature 1089 self.init_result['签字页-保证人2签章'] = electronic_signature
970 # 保证人2姓名、签章 1090 # if判断条件对应3_3版本
1091 if name["words"] == "":
1092 name = self.get_key_value(key='保证人1姓名:')
1093 electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
1094 self.init_result['签字页-保证人1姓名'] = name
1095 self.init_result['签字页-保证人1签章'] = electronic_signature
1096 # 保证人3姓名、签章
971 name = self.get_key_value(key='保证人3姓名:') 1097 name = self.get_key_value(key='保证人3姓名:')
972 electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:') 1098 electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:')
973 self.init_result['签字页-保证人3姓名'] = name 1099 self.init_result['签字页-保证人3姓名'] = name
974 self.init_result['签字页-保证人3签章'] = electronic_signature 1100 self.init_result['签字页-保证人3签章'] = electronic_signature
975 1101 # if判断条件对应3_3版本
1102 if name["words"] == None:
1103 name = self.get_key_value(key='保证人2姓名:')
1104 electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='日期:')
1105 self.init_result['签字页-保证人2姓名'] = name
1106 self.init_result['签字页-保证人2签章'] = electronic_signature
976 return self.init_result 1107 return self.init_result
977
978 # results['is_shhz_contract'] = True 1108 # results['is_shhz_contract'] = True
979 # results['pdf_info'] = self.init_result 1109 # results['pdf_info'] = self.init_result
980 1110
......
...@@ -18,7 +18,6 @@ def predict(pdf_info, file_cls): ...@@ -18,7 +18,6 @@ def predict(pdf_info, file_cls):
18 Returns: 18 Returns:
19 TYPE: Description 19 TYPE: Description
20 """ 20 """
21
22 # 0: 售后回租合同 21 # 0: 售后回租合同
23 pdf_info_0 = [] 22 pdf_info_0 = []
24 for pno in pdf_info: 23 for pno in pdf_info:
...@@ -30,7 +29,6 @@ def predict(pdf_info, file_cls): ...@@ -30,7 +29,6 @@ def predict(pdf_info, file_cls):
30 bbox, text = span['bbox'], span['text'] 29 bbox, text = span['bbox'], span['text']
31 if '售后回租合同_' in text: 30 if '售后回租合同_' in text:
32 pdf_info_0.append(pdf_info[pno]) 31 pdf_info_0.append(pdf_info[pno])
33
34 # 1: 车辆处置协议 32 # 1: 车辆处置协议
35 pdf_info_1 = [] 33 pdf_info_1 = []
36 for pno in pdf_info: 34 for pno in pdf_info:
...@@ -42,7 +40,6 @@ def predict(pdf_info, file_cls): ...@@ -42,7 +40,6 @@ def predict(pdf_info, file_cls):
42 bbox, text = span['bbox'], span['text'] 40 bbox, text = span['bbox'], span['text']
43 if '售后回租合同附件一' in text: 41 if '售后回租合同附件一' in text:
44 pdf_info_1.append(pdf_info[pno]) 42 pdf_info_1.append(pdf_info[pno])
45
46 # 2: 车辆租赁抵押合同 43 # 2: 车辆租赁抵押合同
47 pdf_info_2 = [] 44 pdf_info_2 = []
48 for pno in pdf_info: 45 for pno in pdf_info:
...@@ -54,7 +51,6 @@ def predict(pdf_info, file_cls): ...@@ -54,7 +51,6 @@ def predict(pdf_info, file_cls):
54 bbox, text = span['bbox'], span['text'] 51 bbox, text = span['bbox'], span['text']
55 if '车辆租赁抵押合同_' in text: 52 if '车辆租赁抵押合同_' in text:
56 pdf_info_2.append(pdf_info[pno]) 53 pdf_info_2.append(pdf_info[pno])
57
58 is_clczxy = False 54 is_clczxy = False
59 # 如果 pdf_info_1 == 4 页,则说明此时输入包含了车辆处置协议 55 # 如果 pdf_info_1 == 4 页,则说明此时输入包含了车辆处置协议
60 if len(pdf_info_1) == 4 and file_cls == 1 and len(pdf_info_0) != 0: 56 if len(pdf_info_1) == 4 and file_cls == 1 and len(pdf_info_0) != 0:
...@@ -62,7 +58,6 @@ def predict(pdf_info, file_cls): ...@@ -62,7 +58,6 @@ def predict(pdf_info, file_cls):
62 pdf_info = dict() 58 pdf_info = dict()
63 for pno, page_info in enumerate(pdf_info_1): 59 for pno, page_info in enumerate(pdf_info_1):
64 pdf_info[str(pno)] = page_info 60 pdf_info[str(pno)] = page_info
65
66 f = Finder(pdf_info) 61 f = Finder(pdf_info)
67 if file_cls == 0: 62 if file_cls == 0:
68 results = f.get_info() 63 results = f.get_info()
...@@ -72,13 +67,11 @@ def predict(pdf_info, file_cls): ...@@ -72,13 +67,11 @@ def predict(pdf_info, file_cls):
72 if file_cls == 2: 67 if file_cls == 2:
73 # 提取信息 ———— 车辆租赁抵押合同 68 # 提取信息 ———— 车辆租赁抵押合同
74 results = f.get_info_2() 69 results = f.get_info_2()
75 70 if is_clczxy is True:
76 if is_clczxy == True:
77 for key in results: 71 for key in results:
78 if results[key]['page'] is not None: 72 if results[key]['page'] is not None:
79 results[key]['page'] = str(int(results[key]['page'])+6) 73 results[key]['page'] = str(int(results[key]['page']) + 6)
80
81 for key in results: 74 for key in results:
82 if results[key]['page'] is not None: 75 if results[key]['page'] is not None:
83 results[key]['page'] = 'page_' + str(int(results[key]['page'])+1) 76 results[key]['page'] = 'page_' + str(int(results[key]['page']) + 1)
84 return results 77 return results
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!