fix HIL contract
Showing
2 changed files
with
199 additions
and
76 deletions
... | @@ -6,14 +6,11 @@ | ... | @@ -6,14 +6,11 @@ |
6 | # @Description : | 6 | # @Description : |
7 | 7 | ||
8 | import re | 8 | import re |
9 | import cv2 | ||
10 | import base64 | ||
11 | import numpy as np | 9 | import numpy as np |
12 | from fuzzywuzzy import fuzz | 10 | from fuzzywuzzy import fuzz |
13 | 11 | ||
14 | 12 | ||
15 | class Finder: | 13 | class Finder: |
16 | |||
17 | def __init__(self, pdf_info): | 14 | def __init__(self, pdf_info): |
18 | self.pdf_info = pdf_info | 15 | self.pdf_info = pdf_info |
19 | self.item = {"words": None, | 16 | self.item = {"words": None, |
... | @@ -25,6 +22,9 @@ class Finder: | ... | @@ -25,6 +22,9 @@ class Finder: |
25 | "承租人-姓名": self.item, | 22 | "承租人-姓名": self.item, |
26 | "承租人-证件号码": self.item, | 23 | "承租人-证件号码": self.item, |
27 | "承租人-法定代表人或授权代表": self.item, | 24 | "承租人-法定代表人或授权代表": self.item, |
25 | "共同承租人-姓名": self.item, | ||
26 | "共同承租人-证件号码": self.item, | ||
27 | "共同承租人-法定代表人或授权代表": self.item, | ||
28 | "保证人1-姓名": self.item, | 28 | "保证人1-姓名": self.item, |
29 | "保证人1-证件号码": self.item, | 29 | "保证人1-证件号码": self.item, |
30 | "保证人1-法定代表人或授权代表": self.item, | 30 | "保证人1-法定代表人或授权代表": self.item, |
... | @@ -47,6 +47,8 @@ class Finder: | ... | @@ -47,6 +47,8 @@ class Finder: |
47 | "银行账户-开户行": self.item, | 47 | "银行账户-开户行": self.item, |
48 | "签字页-承租人姓名": self.item, | 48 | "签字页-承租人姓名": self.item, |
49 | "签字页-承租人签章": self.item, | 49 | "签字页-承租人签章": self.item, |
50 | "签字页-共同承租人姓名": self.item, | ||
51 | "签字页-共同承租人签章": self.item, | ||
50 | "签字页-保证人1姓名": self.item, | 52 | "签字页-保证人1姓名": self.item, |
51 | "签字页-保证人1签章": self.item, | 53 | "签字页-保证人1签章": self.item, |
52 | "签字页-保证人2姓名": self.item, | 54 | "签字页-保证人2姓名": self.item, |
... | @@ -54,7 +56,6 @@ class Finder: | ... | @@ -54,7 +56,6 @@ class Finder: |
54 | "签字页-保证人3姓名": self.item, | 56 | "签字页-保证人3姓名": self.item, |
55 | "签字页-保证人3签章": self.item, | 57 | "签字页-保证人3签章": self.item, |
56 | } | 58 | } |
57 | |||
58 | # 格式化输出 车辆处置协议 要是别的字段 | 59 | # 格式化输出 车辆处置协议 要是别的字段 |
59 | self.init_result_1 = {"合同编号": self.item, | 60 | self.init_result_1 = {"合同编号": self.item, |
60 | "承租人-姓名": self.item, | 61 | "承租人-姓名": self.item, |
... | @@ -66,9 +67,7 @@ class Finder: | ... | @@ -66,9 +67,7 @@ class Finder: |
66 | "签字页-承租人签章": self.item, | 67 | "签字页-承租人签章": self.item, |
67 | "签字页-销售经销商": self.item, | 68 | "签字页-销售经销商": self.item, |
68 | "签字页-销售经销商签章": self.item, | 69 | "签字页-销售经销商签章": self.item, |
69 | |||
70 | } | 70 | } |
71 | |||
72 | # 格式化输出 车辆租赁抵押合同 | 71 | # 格式化输出 车辆租赁抵押合同 |
73 | self.init_result_2 = {"合同编号": self.item, | 72 | self.init_result_2 = {"合同编号": self.item, |
74 | "合同编号(正文)": self.item, | 73 | "合同编号(正文)": self.item, |
... | @@ -150,23 +149,24 @@ class Finder: | ... | @@ -150,23 +149,24 @@ class Finder: |
150 | words = matchObj.group(1) | 149 | words = matchObj.group(1) |
151 | contract_no['position'] = None | 150 | contract_no['position'] = None |
152 | contract_no['page'] = pno | 151 | contract_no['page'] = pno |
153 | contract_no['words'] = words | 152 | # contract_no['words'] = words |
153 | contract_no['words'] = re.sub("\s", "", words).replace(")", "") | ||
154 | return contract_no | 154 | return contract_no |
155 | |||
156 | matchObj = re.search(r'编号为(.*?)的', all_text) | 155 | matchObj = re.search(r'编号为(.*?)的', all_text) |
157 | if matchObj: | 156 | if matchObj: |
158 | words = matchObj.group(1).strip() | 157 | words = matchObj.group(1).strip() |
159 | contract_no['position'] = None | 158 | contract_no['position'] = None |
160 | contract_no['page'] = pno | 159 | contract_no['page'] = pno |
161 | contract_no['words'] = words | 160 | # contract_no['words'] = words |
161 | contract_no['words'] = re.sub("\s", "", words).replace(")", "") | ||
162 | return contract_no | 162 | return contract_no |
163 | |||
164 | matchObj = re.search(r'编号为(.*?))的', all_text) | 163 | matchObj = re.search(r'编号为(.*?))的', all_text) |
165 | if matchObj: | 164 | if matchObj: |
166 | words = matchObj.group(1).strip() | 165 | words = matchObj.group(1).strip() |
167 | contract_no['position'] = None | 166 | contract_no['position'] = None |
168 | contract_no['page'] = pno | 167 | contract_no['page'] = pno |
169 | contract_no['words'] = words | 168 | # contract_no['words'] = words |
169 | contract_no['words'] = re.sub("\s", "", words) | ||
170 | return contract_no | 170 | return contract_no |
171 | 171 | ||
172 | def get_key_value(self, key, page_num=None): | 172 | def get_key_value(self, key, page_num=None): |
... | @@ -180,10 +180,11 @@ class Finder: | ... | @@ -180,10 +180,11 @@ class Finder: |
180 | for span in line['spans']: | 180 | for span in line['spans']: |
181 | bbox, text = span['bbox'], span['text'] | 181 | bbox, text = span['bbox'], span['text'] |
182 | if key in text: | 182 | if key in text: |
183 | words = text.split(':')[-1] | 183 | words = text.split(':')[-1].replace("。", "") |
184 | value['position'] = bbox | 184 | value['position'] = bbox |
185 | value['page'] = pno | 185 | value['page'] = pno |
186 | value['words'] = words | 186 | # value['words'] = words |
187 | value['words'] = re.sub("\s", "", words) | ||
187 | else: | 188 | else: |
188 | for pno in self.pdf_info: | 189 | for pno in self.pdf_info: |
189 | for block in self.pdf_info[pno]['blocks']: | 190 | for block in self.pdf_info[pno]['blocks']: |
... | @@ -194,10 +195,11 @@ class Finder: | ... | @@ -194,10 +195,11 @@ class Finder: |
194 | bbox, text = span['bbox'], span['text'] | 195 | bbox, text = span['bbox'], span['text'] |
195 | if key in text: | 196 | if key in text: |
196 | # print(self.pdf_info[pno]) | 197 | # print(self.pdf_info[pno]) |
197 | words = text.split(':')[-1] | 198 | words = text.split(':')[-1].replace("。", "") |
198 | value['position'] = bbox | 199 | value['position'] = bbox |
199 | value['page'] = pno | 200 | value['page'] = pno |
200 | value['words'] = words | 201 | # value['words'] = words |
202 | value['words'] = re.sub("\s", "", words) | ||
201 | return value | 203 | return value |
202 | 204 | ||
203 | def get_loan_principal(self, page_num='0'): | 205 | def get_loan_principal(self, page_num='0'): |
... | @@ -267,7 +269,6 @@ class Finder: | ... | @@ -267,7 +269,6 @@ class Finder: |
267 | 269 | ||
268 | def get_asp_details(self, page_num): | 270 | def get_asp_details(self, page_num): |
269 | asp_details_table_term = self.item.copy() | 271 | asp_details_table_term = self.item.copy() |
270 | |||
271 | asp_details_table = [] | 272 | asp_details_table = [] |
272 | asp_details_text_list = [] | 273 | asp_details_text_list = [] |
273 | table = False | 274 | table = False |
... | @@ -283,25 +284,20 @@ class Finder: | ... | @@ -283,25 +284,20 @@ class Finder: |
283 | table = False | 284 | table = False |
284 | if table == True: | 285 | if table == True: |
285 | asp_details_text_list.append(text) | 286 | asp_details_text_list.append(text) |
286 | 287 | for i in range((len(asp_details_text_list) + 2) // 3): | |
287 | for i in range((len(asp_details_text_list)+2)//3): | ||
288 | |||
289 | line = [] | 288 | line = [] |
290 | if i == 0: | 289 | if i == 0: |
291 | line = [asp_details_text_list[0]] | 290 | line = [asp_details_text_list[0]] |
292 | else: | 291 | else: |
293 | for j in range(3): | 292 | for j in range(3): |
294 | line.append(asp_details_text_list[i*3-2+j]) | 293 | line.append(asp_details_text_list[i * 3 - 2 + j]) |
295 | |||
296 | asp_details_table.append(line) | 294 | asp_details_table.append(line) |
297 | |||
298 | if len(asp_details_table) > 0: | 295 | if len(asp_details_table) > 0: |
299 | asp_details_table_term['words'] = asp_details_table | 296 | asp_details_table_term['words'] = asp_details_table |
300 | return asp_details_table_term | 297 | return asp_details_table_term |
301 | 298 | ||
302 | def get_signature(self): | 299 | def get_signature(self): |
303 | signature = self.item.copy() | 300 | signature = self.item.copy() |
304 | |||
305 | for block in self.pdf_info['0']['blocks']: | 301 | for block in self.pdf_info['0']['blocks']: |
306 | if block['type'] != 0: | 302 | if block['type'] != 0: |
307 | continue | 303 | continue |
... | @@ -369,8 +365,8 @@ class Finder: | ... | @@ -369,8 +365,8 @@ class Finder: |
369 | for line in block['lines']: | 365 | for line in block['lines']: |
370 | for span in line['spans']: | 366 | for span in line['spans']: |
371 | bbox, text = span['bbox'], span['text'] | 367 | bbox, text = span['bbox'], span['text'] |
372 | if anchor_bbox[2]<np.mean(bbox[::2])<half_width and \ | 368 | if anchor_bbox[2] < np.mean(bbox[::2]) < half_width and \ |
373 | anchor_bbox[1]<np.mean(bbox[1::2])<anchor_bbox[3]: | 369 | anchor_bbox[1] < np.mean(bbox[1::2]) < anchor_bbox[3]: |
374 | seller['position'] = bbox | 370 | seller['position'] = bbox |
375 | seller['words'] = text | 371 | seller['words'] = text |
376 | return seller | 372 | return seller |
... | @@ -430,7 +426,6 @@ class Finder: | ... | @@ -430,7 +426,6 @@ class Finder: |
430 | 426 | ||
431 | def get_repayment_schedule(self): | 427 | def get_repayment_schedule(self): |
432 | repayment_schedule = self.item.copy() | 428 | repayment_schedule = self.item.copy() |
433 | |||
434 | repayment_schedule_text_list = [] | 429 | repayment_schedule_text_list = [] |
435 | table = False | 430 | table = False |
436 | page = None | 431 | page = None |
... | @@ -444,20 +439,25 @@ class Finder: | ... | @@ -444,20 +439,25 @@ class Finder: |
444 | if '以上表格中所列序号' in text: | 439 | if '以上表格中所列序号' in text: |
445 | table = False | 440 | table = False |
446 | if table == True: | 441 | if table == True: |
442 | # 过滤汉字 | ||
443 | if re.compile(r'[\u4e00-\u9fff]').search(text): | ||
444 | continue | ||
445 | # 过滤 1. - 61. 这些标题 | ||
446 | if re.findall("\d+", text): | ||
447 | if len(re.findall("\d+", text)) == 1: | ||
448 | continue | ||
447 | repayment_schedule_text_list.append(text) | 449 | repayment_schedule_text_list.append(text) |
448 | if '61.' in text: | 450 | if '61.' in text: |
449 | page = pno | 451 | page = pno |
450 | table = True | 452 | table = True |
451 | 453 | # print("repayment_schedule_text_list = ", repayment_schedule_text_list) | |
452 | repayment_schedule_table = [['序号', '融资租赁成本', '融资租赁费用', '租金', '剩余融资租赁成本']] | 454 | repayment_schedule_table = [['序号', '融资租赁成本', '融资租赁费用', '租金', '剩余融资租赁成本']] |
453 | for i in range(len(repayment_schedule_text_list)//4): | 455 | for i in range(len(repayment_schedule_text_list) // 4): |
454 | line = [f'{i+1}.'] | 456 | line = [f'{i + 1}.'] |
455 | # 4表示4列的意思 | 457 | # 4表示4列的意思 |
456 | for j in range(4): | 458 | for j in range(4): |
457 | line.append(repayment_schedule_text_list[i*4+j]) | 459 | line.append(repayment_schedule_text_list[i * 4 + j]) |
458 | |||
459 | repayment_schedule_table.append(line) | 460 | repayment_schedule_table.append(line) |
460 | |||
461 | repayment_schedule['words'] = repayment_schedule_table | 461 | repayment_schedule['words'] = repayment_schedule_table |
462 | repayment_schedule['page'] = page | 462 | repayment_schedule['page'] = page |
463 | return repayment_schedule | 463 | return repayment_schedule |
... | @@ -506,7 +506,7 @@ class Finder: | ... | @@ -506,7 +506,7 @@ class Finder: |
506 | else: | 506 | else: |
507 | words = '无' | 507 | words = '无' |
508 | boxes = np.array(boxes).reshape((-1, 2)) | 508 | boxes = np.array(boxes).reshape((-1, 2)) |
509 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 509 | position = [min(boxes[:, 0]), min(boxes[:, 1]), max(boxes[:, 0]), max(boxes[:, 1])] |
510 | signature_role_2['page_num'] = page_num | 510 | signature_role_2['page_num'] = page_num |
511 | signature_role_2['position'] = position | 511 | signature_role_2['position'] = position |
512 | signature_role_2['words'] = words | 512 | signature_role_2['words'] = words |
... | @@ -541,7 +541,7 @@ class Finder: | ... | @@ -541,7 +541,7 @@ class Finder: |
541 | else: | 541 | else: |
542 | words = '无' | 542 | words = '无' |
543 | boxes = np.array(boxes).reshape((-1, 2)) | 543 | boxes = np.array(boxes).reshape((-1, 2)) |
544 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 544 | position = [min(boxes[:, 0]), min(boxes[:, 1]), max(boxes[:, 0]), max(boxes[:, 1])] |
545 | signature_role_3['page_num'] = page_num | 545 | signature_role_3['page_num'] = page_num |
546 | signature_role_3['position'] = position | 546 | signature_role_3['position'] = position |
547 | signature_role_3['words'] = words | 547 | signature_role_3['words'] = words |
... | @@ -576,7 +576,7 @@ class Finder: | ... | @@ -576,7 +576,7 @@ class Finder: |
576 | else: | 576 | else: |
577 | words = '无' | 577 | words = '无' |
578 | boxes = np.array(boxes).reshape((-1, 2)) | 578 | boxes = np.array(boxes).reshape((-1, 2)) |
579 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 579 | position = [min(boxes[:, 0]), min(boxes[:, 1]), max(boxes[:, 0]), max(boxes[:, 1])] |
580 | signature_role_4['page_num'] = page_num | 580 | signature_role_4['page_num'] = page_num |
581 | signature_role_4['position'] = position | 581 | signature_role_4['position'] = position |
582 | signature_role_4['words'] = words | 582 | signature_role_4['words'] = words |
... | @@ -612,7 +612,7 @@ class Finder: | ... | @@ -612,7 +612,7 @@ class Finder: |
612 | else: | 612 | else: |
613 | words = '无' | 613 | words = '无' |
614 | boxes = np.array(boxes).reshape((-1, 2)) | 614 | boxes = np.array(boxes).reshape((-1, 2)) |
615 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 615 | position = [min(boxes[:, 0]), min(boxes[:, 1]), max(boxes[:, 0]), max(boxes[:, 1])] |
616 | signature_role_5['page_num'] = page_num | 616 | signature_role_5['page_num'] = page_num |
617 | signature_role_5['position'] = position | 617 | signature_role_5['position'] = position |
618 | signature_role_5['words'] = words | 618 | signature_role_5['words'] = words |
... | @@ -640,7 +640,7 @@ class Finder: | ... | @@ -640,7 +640,7 @@ class Finder: |
640 | for line in block['lines']: | 640 | for line in block['lines']: |
641 | for span in line['spans']: | 641 | for span in line['spans']: |
642 | bbox, text = span['bbox'], span['text'] | 642 | bbox, text = span['bbox'], span['text'] |
643 | if '签署日期' in text and int(anchor_top)<np.mean(bbox[1::2])<int(anchor_bottom): | 643 | if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom): |
644 | name = text.split(' ')[0] | 644 | name = text.split(' ')[0] |
645 | date = text.split(':')[-1] | 645 | date = text.split(':')[-1] |
646 | signature_name['words'] = name | 646 | signature_name['words'] = name |
... | @@ -663,7 +663,7 @@ class Finder: | ... | @@ -663,7 +663,7 @@ class Finder: |
663 | if top in text: | 663 | if top in text: |
664 | anchor_top = bbox[1] | 664 | anchor_top = bbox[1] |
665 | if bottom in text: | 665 | if bottom in text: |
666 | anchor_bottom = bbox[1] | 666 | anchor_bottom = bbox[3] |
667 | if anchor_top is not None and anchor_bottom is not None: | 667 | if anchor_top is not None and anchor_bottom is not None: |
668 | for pno in self.pdf_info: | 668 | for pno in self.pdf_info: |
669 | for block in self.pdf_info[pno]['blocks']: | 669 | for block in self.pdf_info[pno]['blocks']: |
... | @@ -672,7 +672,9 @@ class Finder: | ... | @@ -672,7 +672,9 @@ class Finder: |
672 | for line in block['lines']: | 672 | for line in block['lines']: |
673 | for span in line['spans']: | 673 | for span in line['spans']: |
674 | bbox, text = span['bbox'], span['text'] | 674 | bbox, text = span['bbox'], span['text'] |
675 | if '签署日期' in text and int(anchor_top)<np.mean(bbox[1::2])<int(anchor_bottom): | 675 | # ------------ # |
676 | # print("--text = ", text) | ||
677 | if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom): | ||
676 | words = text | 678 | words = text |
677 | signature['words'] = words | 679 | signature['words'] = words |
678 | signature['page'] = pno | 680 | signature['page'] = pno |
... | @@ -683,7 +685,6 @@ class Finder: | ... | @@ -683,7 +685,6 @@ class Finder: |
683 | name = self.item.copy() | 685 | name = self.item.copy() |
684 | id_num = self.item.copy() | 686 | id_num = self.item.copy() |
685 | representative = self.item.copy() | 687 | representative = self.item.copy() |
686 | |||
687 | # 以保证人3 的左上角为定位点 | 688 | # 以保证人3 的左上角为定位点 |
688 | anchor = None | 689 | anchor = None |
689 | for block in self.pdf_info[page_num]['blocks']: | 690 | for block in self.pdf_info[page_num]['blocks']: |
... | @@ -695,7 +696,6 @@ class Finder: | ... | @@ -695,7 +696,6 @@ class Finder: |
695 | # 找到角色姓名 | 696 | # 找到角色姓名 |
696 | if re.match('保证人3', text) is not None: | 697 | if re.match('保证人3', text) is not None: |
697 | anchor = [bbox[0], bbox[1]] | 698 | anchor = [bbox[0], bbox[1]] |
698 | |||
699 | if anchor is not None: | 699 | if anchor is not None: |
700 | for block in self.pdf_info[page_num]['blocks']: | 700 | for block in self.pdf_info[page_num]['blocks']: |
701 | if block['type'] != 0: | 701 | if block['type'] != 0: |
... | @@ -711,52 +711,60 @@ class Finder: | ... | @@ -711,52 +711,60 @@ class Finder: |
711 | name['position'] = bbox | 711 | name['position'] = bbox |
712 | if role_key == '承租人:': | 712 | if role_key == '承租人:': |
713 | # 找到证件号码且确定位置 | 713 | # 找到证件号码且确定位置 |
714 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) < anchor[1]: | 714 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean( |
715 | bbox[1::2]) < anchor[1]: | ||
715 | words = text.split(':')[-1] | 716 | words = text.split(':')[-1] |
716 | id_num['words'] = words | 717 | id_num['words'] = words |
717 | id_num['page'] = page_num | 718 | id_num['page'] = page_num |
718 | id_num['position'] = bbox | 719 | id_num['position'] = bbox |
719 | # 找到法人代表且确定位置 | 720 | # 找到法人代表且确定位置 |
720 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) < anchor[1]: | 721 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean( |
722 | bbox[1::2]) < anchor[1]: | ||
721 | words = text.split(':')[-1] | 723 | words = text.split(':')[-1] |
722 | representative['words'] = words | 724 | representative['words'] = words |
723 | representative['page'] = page_num | 725 | representative['page'] = page_num |
724 | representative['position'] = bbox | 726 | representative['position'] = bbox |
725 | if role_key == '保证人1:': | 727 | if role_key == '保证人1:': |
726 | # 找到证件号码且确定位置 | 728 | # 找到证件号码且确定位置 |
727 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) > anchor[1]: | 729 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean( |
730 | bbox[1::2]) > anchor[1]: | ||
728 | words = text.split(':')[-1] | 731 | words = text.split(':')[-1] |
729 | id_num['words'] = words | 732 | id_num['words'] = words |
730 | id_num['page'] = page_num | 733 | id_num['page'] = page_num |
731 | id_num['position'] = bbox | 734 | id_num['position'] = bbox |
732 | # 找到法人代表且确定位置 | 735 | # 找到法人代表且确定位置 |
733 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) > anchor[1]: | 736 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean( |
737 | bbox[1::2]) > anchor[1]: | ||
734 | words = text.split(':')[-1] | 738 | words = text.split(':')[-1] |
735 | representative['words'] = words | 739 | representative['words'] = words |
736 | representative['page'] = page_num | 740 | representative['page'] = page_num |
737 | representative['position'] = bbox | 741 | representative['position'] = bbox |
738 | if role_key == '保证人2:': | 742 | if role_key == '保证人2:': |
739 | # 找到证件号码且确定位置 | 743 | # 找到证件号码且确定位置 |
740 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) < anchor[1]: | 744 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean( |
745 | bbox[1::2]) < anchor[1]: | ||
741 | words = text.split(':')[-1] | 746 | words = text.split(':')[-1] |
742 | id_num['words'] = words | 747 | id_num['words'] = words |
743 | id_num['page'] = page_num | 748 | id_num['page'] = page_num |
744 | id_num['position'] = bbox | 749 | id_num['position'] = bbox |
745 | # 找到法人代表且确定位置 | 750 | # 找到法人代表且确定位置 |
746 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) < anchor[1]: | 751 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean( |
752 | bbox[1::2]) < anchor[1]: | ||
747 | words = text.split(':')[-1] | 753 | words = text.split(':')[-1] |
748 | representative['words'] = words | 754 | representative['words'] = words |
749 | representative['page'] = page_num | 755 | representative['page'] = page_num |
750 | representative['position'] = bbox | 756 | representative['position'] = bbox |
751 | if role_key == '保证人3:': | 757 | if role_key == '保证人3:': |
752 | # 找到证件号码且确定位置 | 758 | # 找到证件号码且确定位置 |
753 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) > anchor[1]: | 759 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean( |
760 | bbox[1::2]) > anchor[1]: | ||
754 | words = text.split(':')[-1] | 761 | words = text.split(':')[-1] |
755 | id_num['words'] = words | 762 | id_num['words'] = words |
756 | id_num['page'] = page_num | 763 | id_num['page'] = page_num |
757 | id_num['position'] = bbox | 764 | id_num['position'] = bbox |
758 | # 找到法人代表且确定位置 | 765 | # 找到法人代表且确定位置 |
759 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) > anchor[1]: | 766 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean( |
767 | bbox[1::2]) > anchor[1]: | ||
760 | words = text.split(':')[-1] | 768 | words = text.split(':')[-1] |
761 | representative['words'] = words | 769 | representative['words'] = words |
762 | representative['page'] = page_num | 770 | representative['page'] = page_num |
... | @@ -783,12 +791,10 @@ class Finder: | ... | @@ -783,12 +791,10 @@ class Finder: |
783 | start = False | 791 | start = False |
784 | if start == True: | 792 | if start == True: |
785 | items.append(text) | 793 | items.append(text) |
786 | |||
787 | lines = [['项目', '购买价格', '实际融资金额']] | 794 | lines = [['项目', '购买价格', '实际融资金额']] |
788 | for i in range(len(items)//3): | 795 | for i in range(len(items) // 3): |
789 | line = [items[2+i*3+0], items[2+i*3+1], items[2+i*3+2]] | 796 | line = [items[2 + i * 3 + 0], items[2 + i * 3 + 1], items[2 + i * 3 + 2]] |
790 | lines.append(line) | 797 | lines.append(line) |
791 | |||
792 | if len(items) > 0: | 798 | if len(items) > 0: |
793 | lines.append([items[0], '', items[1]]) | 799 | lines.append([items[0], '', items[1]]) |
794 | 800 | ||
... | @@ -800,7 +806,6 @@ class Finder: | ... | @@ -800,7 +806,6 @@ class Finder: |
800 | def get_contract_no_dy(self): | 806 | def get_contract_no_dy(self): |
801 | # 查找抵押合同编号 | 807 | # 查找抵押合同编号 |
802 | contract_no = self.item.copy() | 808 | contract_no = self.item.copy() |
803 | |||
804 | key_box = None | 809 | key_box = None |
805 | for pno in self.pdf_info: | 810 | for pno in self.pdf_info: |
806 | for block in self.pdf_info[pno]['blocks']: | 811 | for block in self.pdf_info[pno]['blocks']: |
... | @@ -811,7 +816,6 @@ class Finder: | ... | @@ -811,7 +816,6 @@ class Finder: |
811 | bbox, text = span['bbox'], span['text'] | 816 | bbox, text = span['bbox'], span['text'] |
812 | if '抵押合同编号' in text: | 817 | if '抵押合同编号' in text: |
813 | key_box = bbox | 818 | key_box = bbox |
814 | |||
815 | if key_box is not None: | 819 | if key_box is not None: |
816 | for pno in self.pdf_info: | 820 | for pno in self.pdf_info: |
817 | for block in self.pdf_info[pno]['blocks']: | 821 | for block in self.pdf_info[pno]['blocks']: |
... | @@ -829,7 +833,6 @@ class Finder: | ... | @@ -829,7 +833,6 @@ class Finder: |
829 | def get_dyr_name_id(self): | 833 | def get_dyr_name_id(self): |
830 | name = self.item.copy() | 834 | name = self.item.copy() |
831 | _id = self.item.copy() | 835 | _id = self.item.copy() |
832 | |||
833 | key_box = None | 836 | key_box = None |
834 | for pno in self.pdf_info: | 837 | for pno in self.pdf_info: |
835 | for block in self.pdf_info[pno]['blocks']: | 838 | for block in self.pdf_info[pno]['blocks']: |
... | @@ -842,7 +845,7 @@ class Finder: | ... | @@ -842,7 +845,7 @@ class Finder: |
842 | key_box = bbox | 845 | key_box = bbox |
843 | 846 | ||
844 | if key_box is not None: | 847 | if key_box is not None: |
845 | rh = abs(key_box[1]-key_box[3]) | 848 | rh = abs(key_box[1] - key_box[3]) |
846 | for pno in self.pdf_info: | 849 | for pno in self.pdf_info: |
847 | for block in self.pdf_info[pno]['blocks']: | 850 | for block in self.pdf_info[pno]['blocks']: |
848 | if block['type'] != 0: | 851 | if block['type'] != 0: |
... | @@ -850,12 +853,12 @@ class Finder: | ... | @@ -850,12 +853,12 @@ class Finder: |
850 | for line in block['lines']: | 853 | for line in block['lines']: |
851 | for span in line['spans']: | 854 | for span in line['spans']: |
852 | bbox, text = span['bbox'], span['text'] | 855 | bbox, text = span['bbox'], span['text'] |
853 | if key_box[1] < np.mean(bbox[1::2]) < key_box[3]+rh*3 and '姓名' in text: | 856 | if key_box[1] < np.mean(bbox[1::2]) < key_box[3] + rh * 3 and '姓名' in text: |
854 | words = text.split(':')[-1] | 857 | words = text.split(':')[-1] |
855 | name['position'] = bbox | 858 | name['position'] = bbox |
856 | name['page'] = pno | 859 | name['page'] = pno |
857 | name['words'] = words | 860 | name['words'] = words |
858 | if key_box[1] < np.mean(bbox[1::2]) < key_box[3]+rh*3 and '证件号码' in text: | 861 | if key_box[1] < np.mean(bbox[1::2]) < key_box[3] + rh * 3 and '证件号码' in text: |
859 | words = text.split(':')[-1] | 862 | words = text.split(':')[-1] |
860 | _id['position'] = bbox | 863 | _id['position'] = bbox |
861 | _id['page'] = pno | 864 | _id['page'] = pno |
... | @@ -864,7 +867,6 @@ class Finder: | ... | @@ -864,7 +867,6 @@ class Finder: |
864 | 867 | ||
865 | def get_key_value_position(self, key): | 868 | def get_key_value_position(self, key): |
866 | value = self.item.copy() | 869 | value = self.item.copy() |
867 | |||
868 | key_box = None | 870 | key_box = None |
869 | for pno in self.pdf_info: | 871 | for pno in self.pdf_info: |
870 | for block in self.pdf_info[pno]['blocks']: | 872 | for block in self.pdf_info[pno]['blocks']: |
... | @@ -875,9 +877,8 @@ class Finder: | ... | @@ -875,9 +877,8 @@ class Finder: |
875 | bbox, text = span['bbox'], span['text'] | 877 | bbox, text = span['bbox'], span['text'] |
876 | if text == key: | 878 | if text == key: |
877 | key_box = bbox | 879 | key_box = bbox |
878 | |||
879 | if key_box is not None: | 880 | if key_box is not None: |
880 | rh = abs(key_box[1]-key_box[3]) | 881 | rh = abs(key_box[1] - key_box[3]) |
881 | for pno in self.pdf_info: | 882 | for pno in self.pdf_info: |
882 | for block in self.pdf_info[pno]['blocks']: | 883 | for block in self.pdf_info[pno]['blocks']: |
883 | if block['type'] != 0: | 884 | if block['type'] != 0: |
... | @@ -885,13 +886,104 @@ class Finder: | ... | @@ -885,13 +886,104 @@ class Finder: |
885 | for line in block['lines']: | 886 | for line in block['lines']: |
886 | for span in line['spans']: | 887 | for span in line['spans']: |
887 | bbox, text = span['bbox'], span['text'] | 888 | bbox, text = span['bbox'], span['text'] |
888 | if key_box[1] < np.mean(bbox[1::2]) < key_box[3] and key_box[0] < bbox[0] and abs(key_box[2]-bbox[0]) < rh*10: | 889 | if key_box[1] < np.mean(bbox[1::2]) < key_box[3] and key_box[0] < bbox[0] and abs( |
890 | key_box[2] - bbox[0]) < rh * 10: | ||
889 | words = text | 891 | words = text |
890 | value['position'] = bbox | 892 | value['position'] = bbox |
891 | value['page'] = pno | 893 | value['page'] = pno |
892 | value['words'] = words | 894 | value['words'] = words |
893 | return value | 895 | return value |
894 | 896 | ||
897 | def get_role_info_3_3(self, role_key, page_num='0'): | ||
898 | name = self.item.copy() | ||
899 | id_num = self.item.copy() | ||
900 | representative = self.item.copy() | ||
901 | # 以保证人2 的左上角为定位点 | ||
902 | anchor = None | ||
903 | for block in self.pdf_info[page_num]['blocks']: | ||
904 | if block['type'] != 0: | ||
905 | continue | ||
906 | for line in block['lines']: | ||
907 | for span in line['spans']: | ||
908 | bbox, text = span['bbox'], span['text'] | ||
909 | # 找到角色姓名 | ||
910 | if re.match('保证人2', text) is not None: | ||
911 | anchor = [bbox[0], bbox[1]] | ||
912 | if anchor is not None: | ||
913 | for block in self.pdf_info[page_num]['blocks']: | ||
914 | if block['type'] != 0: | ||
915 | continue | ||
916 | for line in block['lines']: | ||
917 | for span in line['spans']: | ||
918 | bbox, text = span['bbox'], span['text'] | ||
919 | # 找到角色姓名 | ||
920 | if re.match(role_key, text) is not None: | ||
921 | words = text.split(':')[-1] | ||
922 | name['words'] = words | ||
923 | name['page'] = page_num | ||
924 | name['position'] = bbox | ||
925 | if role_key == '承租人一:': | ||
926 | # 找到证件号码且确定位置 | ||
927 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean( | ||
928 | bbox[1::2]) < anchor[1]: | ||
929 | words = text.split(':')[-1] | ||
930 | id_num['words'] = words | ||
931 | id_num['page'] = page_num | ||
932 | id_num['position'] = bbox | ||
933 | # 找到法人代表且确定位置 | ||
934 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean( | ||
935 | bbox[1::2]) < anchor[1]: | ||
936 | words = text.split(':')[-1] | ||
937 | representative['words'] = words | ||
938 | representative['page'] = page_num | ||
939 | representative['position'] = bbox | ||
940 | if role_key == '共同承租人:': | ||
941 | # 找到证件号码且确定位置 | ||
942 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean( | ||
943 | bbox[1::2]) > anchor[1]: | ||
944 | words = text.split(':')[-1] | ||
945 | id_num['words'] = words | ||
946 | id_num['page'] = page_num | ||
947 | id_num['position'] = bbox | ||
948 | # 找到法人代表且确定位置 | ||
949 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean( | ||
950 | bbox[1::2]) > anchor[1]: | ||
951 | words = text.split(':')[-1] | ||
952 | representative['words'] = words | ||
953 | representative['page'] = page_num | ||
954 | representative['position'] = bbox | ||
955 | if role_key == '保证人1:': | ||
956 | # 找到证件号码且确定位置 | ||
957 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean( | ||
958 | bbox[1::2]) < anchor[1]: | ||
959 | words = text.split(':')[-1] | ||
960 | id_num['words'] = words | ||
961 | id_num['page'] = page_num | ||
962 | id_num['position'] = bbox | ||
963 | # 找到法人代表且确定位置 | ||
964 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean( | ||
965 | bbox[1::2]) < anchor[1]: | ||
966 | words = text.split(':')[-1] | ||
967 | representative['words'] = words | ||
968 | representative['page'] = page_num | ||
969 | representative['position'] = bbox | ||
970 | if role_key == '保证人2:': | ||
971 | # 找到证件号码且确定位置 | ||
972 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean( | ||
973 | bbox[1::2]) > anchor[1]: | ||
974 | words = text.split(':')[-1] | ||
975 | id_num['words'] = words | ||
976 | id_num['page'] = page_num | ||
977 | id_num['position'] = bbox | ||
978 | # 找到法人代表且确定位置 | ||
979 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean( | ||
980 | bbox[1::2]) > anchor[1]: | ||
981 | words = text.split(':')[-1] | ||
982 | representative['words'] = words | ||
983 | representative['page'] = page_num | ||
984 | representative['position'] = bbox | ||
985 | return name, id_num, representative | ||
986 | |||
895 | def get_info(self): | 987 | def get_info(self): |
896 | """ | 988 | """ |
897 | block['type'] == 0 : 表示该元素为图片 | 989 | block['type'] == 0 : 表示该元素为图片 |
... | @@ -905,6 +997,8 @@ class Finder: | ... | @@ -905,6 +997,8 @@ class Finder: |
905 | self.init_result['合同编号'] = contract_no | 997 | self.init_result['合同编号'] = contract_no |
906 | # 从第一页上取四个角色的姓名和证件号码 | 998 | # 从第一页上取四个角色的姓名和证件号码 |
907 | name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0') | 999 | name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0') |
1000 | if name["words"] == None: | ||
1001 | name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0') | ||
908 | self.init_result['承租人-姓名'] = name | 1002 | self.init_result['承租人-姓名'] = name |
909 | self.init_result['承租人-证件号码'] = id_num | 1003 | self.init_result['承租人-证件号码'] = id_num |
910 | self.init_result['承租人-法定代表人或授权代表'] = representative | 1004 | self.init_result['承租人-法定代表人或授权代表'] = representative |
... | @@ -912,14 +1006,31 @@ class Finder: | ... | @@ -912,14 +1006,31 @@ class Finder: |
912 | self.init_result['保证人1-姓名'] = name | 1006 | self.init_result['保证人1-姓名'] = name |
913 | self.init_result['保证人1-证件号码'] = id_num | 1007 | self.init_result['保证人1-证件号码'] = id_num |
914 | self.init_result['保证人1-法定代表人或授权代表'] = representative | 1008 | self.init_result['保证人1-法定代表人或授权代表'] = representative |
1009 | # if条件判别 对应3_3版本 | ||
1010 | if name["words"] == None: | ||
1011 | name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0') | ||
1012 | self.init_result['共同承租人-姓名'] = name | ||
1013 | self.init_result['共同承租人-证件号码'] = id_num | ||
1014 | self.init_result['共同承租人-法定代表人或授权代表'] = representative | ||
915 | name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0') | 1015 | name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0') |
916 | self.init_result['保证人2-姓名'] = name | 1016 | self.init_result['保证人2-姓名'] = name |
917 | self.init_result['保证人2-证件号码'] = id_num | 1017 | self.init_result['保证人2-证件号码'] = id_num |
918 | self.init_result['保证人2-法定代表人或授权代表'] = representative | 1018 | self.init_result['保证人2-法定代表人或授权代表'] = representative |
1019 | # if条件判别 对应3_3版本 | ||
1020 | if name["words"] == None: | ||
1021 | name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0') | ||
1022 | self.init_result['保证人2-姓名'] = name | ||
1023 | self.init_result['保证人2-证件号码'] = id_num | ||
1024 | self.init_result['保证人2-法定代表人或授权代表'] = representative | ||
919 | name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0') | 1025 | name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0') |
920 | self.init_result['保证人3-姓名'] = name | 1026 | self.init_result['保证人3-姓名'] = name |
921 | self.init_result['保证人3-证件号码'] = id_num | 1027 | self.init_result['保证人3-证件号码'] = id_num |
922 | self.init_result['保证人3-法定代表人或授权代表'] = representative | 1028 | self.init_result['保证人3-法定代表人或授权代表'] = representative |
1029 | if name["words"] == None: | ||
1030 | name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0') | ||
1031 | self.init_result['保证人3-姓名'] = name | ||
1032 | self.init_result['保证人3-证件号码'] = id_num | ||
1033 | self.init_result['保证人3-法定代表人或授权代表'] = representative | ||
923 | # 在所有页面中找正文中(第二部分 融资租赁主要条款及付款计划)的那个编号,因为存在换行的情况所以暂时不带位置输出 | 1034 | # 在所有页面中找正文中(第二部分 融资租赁主要条款及付款计划)的那个编号,因为存在换行的情况所以暂时不带位置输出 |
924 | contract_no = self.get_contract_no_one() | 1035 | contract_no = self.get_contract_no_one() |
925 | self.init_result['合同编号(正文)'] = contract_no | 1036 | self.init_result['合同编号(正文)'] = contract_no |
... | @@ -955,6 +1066,9 @@ class Finder: | ... | @@ -955,6 +1066,9 @@ class Finder: |
955 | # 承租人姓名、签章 | 1066 | # 承租人姓名、签章 |
956 | name = self.get_key_value(key='承租人姓名:') | 1067 | name = self.get_key_value(key='承租人姓名:') |
957 | electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:') | 1068 | electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:') |
1069 | if name["words"] == None: | ||
1070 | name = self.get_key_value(key='承租人一姓名:') | ||
1071 | electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:') | ||
958 | self.init_result['签字页-承租人姓名'] = name | 1072 | self.init_result['签字页-承租人姓名'] = name |
959 | self.init_result['签字页-承租人签章'] = electronic_signature | 1073 | self.init_result['签字页-承租人签章'] = electronic_signature |
960 | # 保证人1姓名、签章 | 1074 | # 保证人1姓名、签章 |
... | @@ -962,19 +1076,35 @@ class Finder: | ... | @@ -962,19 +1076,35 @@ class Finder: |
962 | electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:') | 1076 | electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:') |
963 | self.init_result['签字页-保证人1姓名'] = name | 1077 | self.init_result['签字页-保证人1姓名'] = name |
964 | self.init_result['签字页-保证人1签章'] = electronic_signature | 1078 | self.init_result['签字页-保证人1签章'] = electronic_signature |
1079 | # 这里用的是 name["words"] == "" | ||
1080 | if name["words"] == "": | ||
1081 | name = self.get_key_value(key='共同承租人名称:') | ||
1082 | electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:') | ||
1083 | self.init_result['签字页-共同承租人姓名'] = name | ||
1084 | self.init_result['签字页-共同承租人签章'] = electronic_signature | ||
965 | # 保证人2姓名、签章 | 1085 | # 保证人2姓名、签章 |
966 | name = self.get_key_value(key='保证人2姓名:') | 1086 | name = self.get_key_value(key='保证人2姓名:') |
967 | electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:') | 1087 | electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:') |
968 | self.init_result['签字页-保证人2姓名'] = name | 1088 | self.init_result['签字页-保证人2姓名'] = name |
969 | self.init_result['签字页-保证人2签章'] = electronic_signature | 1089 | self.init_result['签字页-保证人2签章'] = electronic_signature |
970 | # 保证人2姓名、签章 | 1090 | # if判断条件对应3_3版本 |
1091 | if name["words"] == "": | ||
1092 | name = self.get_key_value(key='保证人1姓名:') | ||
1093 | electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:') | ||
1094 | self.init_result['签字页-保证人1姓名'] = name | ||
1095 | self.init_result['签字页-保证人1签章'] = electronic_signature | ||
1096 | # 保证人3姓名、签章 | ||
971 | name = self.get_key_value(key='保证人3姓名:') | 1097 | name = self.get_key_value(key='保证人3姓名:') |
972 | electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:') | 1098 | electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:') |
973 | self.init_result['签字页-保证人3姓名'] = name | 1099 | self.init_result['签字页-保证人3姓名'] = name |
974 | self.init_result['签字页-保证人3签章'] = electronic_signature | 1100 | self.init_result['签字页-保证人3签章'] = electronic_signature |
975 | 1101 | # if判断条件对应3_3版本 | |
1102 | if name["words"] == None: | ||
1103 | name = self.get_key_value(key='保证人2姓名:') | ||
1104 | electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='日期:') | ||
1105 | self.init_result['签字页-保证人2姓名'] = name | ||
1106 | self.init_result['签字页-保证人2签章'] = electronic_signature | ||
976 | return self.init_result | 1107 | return self.init_result |
977 | |||
978 | # results['is_shhz_contract'] = True | 1108 | # results['is_shhz_contract'] = True |
979 | # results['pdf_info'] = self.init_result | 1109 | # results['pdf_info'] = self.init_result |
980 | 1110 | ... | ... |
... | @@ -18,7 +18,6 @@ def predict(pdf_info, file_cls): | ... | @@ -18,7 +18,6 @@ def predict(pdf_info, file_cls): |
18 | Returns: | 18 | Returns: |
19 | TYPE: Description | 19 | TYPE: Description |
20 | """ | 20 | """ |
21 | |||
22 | # 0: 售后回租合同 | 21 | # 0: 售后回租合同 |
23 | pdf_info_0 = [] | 22 | pdf_info_0 = [] |
24 | for pno in pdf_info: | 23 | for pno in pdf_info: |
... | @@ -30,7 +29,6 @@ def predict(pdf_info, file_cls): | ... | @@ -30,7 +29,6 @@ def predict(pdf_info, file_cls): |
30 | bbox, text = span['bbox'], span['text'] | 29 | bbox, text = span['bbox'], span['text'] |
31 | if '售后回租合同_' in text: | 30 | if '售后回租合同_' in text: |
32 | pdf_info_0.append(pdf_info[pno]) | 31 | pdf_info_0.append(pdf_info[pno]) |
33 | |||
34 | # 1: 车辆处置协议 | 32 | # 1: 车辆处置协议 |
35 | pdf_info_1 = [] | 33 | pdf_info_1 = [] |
36 | for pno in pdf_info: | 34 | for pno in pdf_info: |
... | @@ -42,7 +40,6 @@ def predict(pdf_info, file_cls): | ... | @@ -42,7 +40,6 @@ def predict(pdf_info, file_cls): |
42 | bbox, text = span['bbox'], span['text'] | 40 | bbox, text = span['bbox'], span['text'] |
43 | if '售后回租合同附件一' in text: | 41 | if '售后回租合同附件一' in text: |
44 | pdf_info_1.append(pdf_info[pno]) | 42 | pdf_info_1.append(pdf_info[pno]) |
45 | |||
46 | # 2: 车辆租赁抵押合同 | 43 | # 2: 车辆租赁抵押合同 |
47 | pdf_info_2 = [] | 44 | pdf_info_2 = [] |
48 | for pno in pdf_info: | 45 | for pno in pdf_info: |
... | @@ -54,7 +51,6 @@ def predict(pdf_info, file_cls): | ... | @@ -54,7 +51,6 @@ def predict(pdf_info, file_cls): |
54 | bbox, text = span['bbox'], span['text'] | 51 | bbox, text = span['bbox'], span['text'] |
55 | if '车辆租赁抵押合同_' in text: | 52 | if '车辆租赁抵押合同_' in text: |
56 | pdf_info_2.append(pdf_info[pno]) | 53 | pdf_info_2.append(pdf_info[pno]) |
57 | |||
58 | is_clczxy = False | 54 | is_clczxy = False |
59 | # 如果 pdf_info_1 == 4 页,则说明此时输入包含了车辆处置协议 | 55 | # 如果 pdf_info_1 == 4 页,则说明此时输入包含了车辆处置协议 |
60 | if len(pdf_info_1) == 4 and file_cls == 1 and len(pdf_info_0) != 0: | 56 | if len(pdf_info_1) == 4 and file_cls == 1 and len(pdf_info_0) != 0: |
... | @@ -62,7 +58,6 @@ def predict(pdf_info, file_cls): | ... | @@ -62,7 +58,6 @@ def predict(pdf_info, file_cls): |
62 | pdf_info = dict() | 58 | pdf_info = dict() |
63 | for pno, page_info in enumerate(pdf_info_1): | 59 | for pno, page_info in enumerate(pdf_info_1): |
64 | pdf_info[str(pno)] = page_info | 60 | pdf_info[str(pno)] = page_info |
65 | |||
66 | f = Finder(pdf_info) | 61 | f = Finder(pdf_info) |
67 | if file_cls == 0: | 62 | if file_cls == 0: |
68 | results = f.get_info() | 63 | results = f.get_info() |
... | @@ -72,13 +67,11 @@ def predict(pdf_info, file_cls): | ... | @@ -72,13 +67,11 @@ def predict(pdf_info, file_cls): |
72 | if file_cls == 2: | 67 | if file_cls == 2: |
73 | # 提取信息 ———— 车辆租赁抵押合同 | 68 | # 提取信息 ———— 车辆租赁抵押合同 |
74 | results = f.get_info_2() | 69 | results = f.get_info_2() |
75 | 70 | if is_clczxy is True: | |
76 | if is_clczxy == True: | ||
77 | for key in results: | 71 | for key in results: |
78 | if results[key]['page'] is not None: | 72 | if results[key]['page'] is not None: |
79 | results[key]['page'] = str(int(results[key]['page'])+6) | 73 | results[key]['page'] = str(int(results[key]['page']) + 6) |
80 | |||
81 | for key in results: | 74 | for key in results: |
82 | if results[key]['page'] is not None: | 75 | if results[key]['page'] is not None: |
83 | results[key]['page'] = 'page_' + str(int(results[key]['page'])+1) | 76 | results[key]['page'] = 'page_' + str(int(results[key]['page']) + 1) |
84 | return results | 77 | return results | ... | ... |
-
Please register or sign in to post a comment