fix format
Showing 1 changed file with 46 additions and 14 deletions
| ... | @@ -18,6 +18,7 @@ class Finder: | ... | @@ -18,6 +18,7 @@ class Finder: |
| 18 | self.item = {"words": None, | 18 | self.item = {"words": None, |
| 19 | "position": None, | 19 | "position": None, |
| 20 | } | 20 | } |
| 21 | |||
| 21 | def gen_init_result(self, is_asp): | 22 | def gen_init_result(self, is_asp): |
| 22 | # 格式化算法输出 | 23 | # 格式化算法输出 |
| 23 | self.init_result = {"page_1": {"合同编号": self.item, | 24 | self.init_result = {"page_1": {"合同编号": self.item, |
| ... | @@ -108,10 +109,12 @@ class Finder: | ... | @@ -108,10 +109,12 @@ class Finder: |
| 108 | "日期": self.item, | 109 | "日期": self.item, |
| 109 | }, | 110 | }, |
| 110 | } | 111 | } |
| 112 | |||
| 111 | def poly_to_rectangle(self, poly): | 113 | def poly_to_rectangle(self, poly): |
| 112 | xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax = poly | 114 | xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax = poly |
| 113 | bbox = [xmin, ymin, xmax, ymax] | 115 | bbox = [xmin, ymin, xmax, ymax] |
| 114 | return bbox | 116 | return bbox |
| 117 | |||
| 115 | def get_contract_no(self, page_num): | 118 | def get_contract_no(self, page_num): |
| 116 | """传入页码,查看该页码右上角的编号 | 119 | """传入页码,查看该页码右上角的编号 |
| 117 | 120 | ||
| ... | @@ -133,6 +136,7 @@ class Finder: | ... | @@ -133,6 +136,7 @@ class Finder: |
| 133 | contract_no['words'] = words | 136 | contract_no['words'] = words |
| 134 | contract_no['position'] = location | 137 | contract_no['position'] = location |
| 135 | return contract_no | 138 | return contract_no |
| 139 | |||
| 136 | def get_vehicle_price(self, page_num='0'): | 140 | def get_vehicle_price(self, page_num='0'): |
| 137 | vehicle_price = self.item.copy() | 141 | vehicle_price = self.item.copy() |
| 138 | # vehicle_price['words'] = '' | 142 | # vehicle_price['words'] = '' |
| ... | @@ -145,6 +149,7 @@ class Finder: | ... | @@ -145,6 +149,7 @@ class Finder: |
| 145 | vehicle_price['words'] = words | 149 | vehicle_price['words'] = words |
| 146 | vehicle_price['position'] = location | 150 | vehicle_price['position'] = location |
| 147 | return vehicle_price | 151 | return vehicle_price |
| 152 | |||
| 148 | def get_vin(self, page_num='0'): | 153 | def get_vin(self, page_num='0'): |
| 149 | vin = self.item.copy() | 154 | vin = self.item.copy() |
| 150 | # vin['words'] = '' | 155 | # vin['words'] = '' |
| ... | @@ -157,6 +162,7 @@ class Finder: | ... | @@ -157,6 +162,7 @@ class Finder: |
| 157 | vin['words'] = words | 162 | vin['words'] = words |
| 158 | vin['position'] = location | 163 | vin['position'] = location |
| 159 | return vin | 164 | return vin |
| 165 | |||
| 160 | def get_loan_principal(self, page_num='0'): | 166 | def get_loan_principal(self, page_num='0'): |
| 161 | chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', | 167 | chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', |
| 162 | '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'] | 168 | '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'] |
| ... | @@ -197,6 +203,7 @@ class Finder: | ... | @@ -197,6 +203,7 @@ class Finder: |
| 197 | asp_2['position'] = bbox | 203 | asp_2['position'] = bbox |
| 198 | asp_2['words'] = words | 204 | asp_2['words'] = words |
| 199 | return upper, lower, asp_1, asp_2 | 205 | return upper, lower, asp_1, asp_2 |
| 206 | |||
| 200 | def get_loan_term(self, page_num='0'): | 207 | def get_loan_term(self, page_num='0'): |
| 201 | loan_term = self.item.copy() | 208 | loan_term = self.item.copy() |
| 202 | all_text = '' | 209 | all_text = '' |
| ... | @@ -220,6 +227,7 @@ class Finder: | ... | @@ -220,6 +227,7 @@ class Finder: |
| 220 | loan_term['position'] = bbox | 227 | loan_term['position'] = bbox |
| 221 | loan_term['words'] = words | 228 | loan_term['words'] = words |
| 222 | return loan_term | 229 | return loan_term |
| 230 | |||
| 223 | def mergelist(self, text_list): | 231 | def mergelist(self, text_list): |
| 224 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 | 232 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 |
| 225 | mergeindex = -1 | 233 | mergeindex = -1 |
| ... | @@ -230,8 +238,10 @@ class Finder: | ... | @@ -230,8 +238,10 @@ class Finder: |
| 230 | if mergeindex == -1: | 238 | if mergeindex == -1: |
| 231 | return text_list | 239 | return text_list |
| 232 | else: | 240 | else: |
| 233 | new_text_list = text_list[:mergeindex] + [text_list[mergeindex] + text_list[mergeindex+1]] + text_list[mergeindex+2:] | 241 | new_text_list = text_list[:mergeindex] + [ |
| 242 | text_list[mergeindex] + text_list[mergeindex+1]] + text_list[mergeindex+2:] | ||
| 234 | return self.mergelist(new_text_list) | 243 | return self.mergelist(new_text_list) |
| 244 | |||
| 235 | def get_asp_details(self, page_num): | 245 | def get_asp_details(self, page_num): |
| 236 | asp_details_table_term = self.item.copy() | 246 | asp_details_table_term = self.item.copy() |
| 237 | asp_details_table = [] | 247 | asp_details_table = [] |
| ... | @@ -262,6 +272,7 @@ class Finder: | ... | @@ -262,6 +272,7 @@ class Finder: |
| 262 | if len(asp_details_table) > 0: | 272 | if len(asp_details_table) > 0: |
| 263 | asp_details_table_term['words'] = asp_details_table | 273 | asp_details_table_term['words'] = asp_details_table |
| 264 | return asp_details_table_term | 274 | return asp_details_table_term |
| 275 | |||
| 265 | def get_signature(self): | 276 | def get_signature(self): |
| 266 | signature = self.item.copy() | 277 | signature = self.item.copy() |
| 267 | for block in self.pdf_info['0']['blocks']: | 278 | for block in self.pdf_info['0']['blocks']: |
| ... | @@ -275,6 +286,7 @@ class Finder: | ... | @@ -275,6 +286,7 @@ class Finder: |
| 275 | signature['words'] = words | 286 | signature['words'] = words |
| 276 | signature['position'] = bbox | 287 | signature['position'] = bbox |
| 277 | return signature | 288 | return signature |
| 289 | |||
| 278 | def get_somebody(self, top, bottom): | 290 | def get_somebody(self, top, bottom): |
| 279 | # 指定上下边界后,返回上下边界内的客户信息 | 291 | # 指定上下边界后,返回上下边界内的客户信息 |
| 280 | _name = self.item.copy() | 292 | _name = self.item.copy() |
| ... | @@ -309,6 +321,7 @@ class Finder: | ... | @@ -309,6 +321,7 @@ class Finder: |
| 309 | _id['position'] = bbox | 321 | _id['position'] = bbox |
| 310 | _id['words'] = words | 322 | _id['words'] = words |
| 311 | return _name, _id | 323 | return _name, _id |
| 324 | |||
| 312 | def get_seller(self): | 325 | def get_seller(self): |
| 313 | seller = self.item.copy() | 326 | seller = self.item.copy() |
| 314 | # 先找到 key | 327 | # 先找到 key |
| ... | @@ -330,11 +343,12 @@ class Finder: | ... | @@ -330,11 +343,12 @@ class Finder: |
| 330 | for line in block['lines']: | 343 | for line in block['lines']: |
| 331 | for span in line['spans']: | 344 | for span in line['spans']: |
| 332 | bbox, text = span['bbox'], span['text'] | 345 | bbox, text = span['bbox'], span['text'] |
| 333 | if anchor_bbox[2]<np.mean(bbox[::2])<half_width and \ | 346 | if anchor_bbox[2] < np.mean(bbox[::2]) < half_width and \ |
| 334 | anchor_bbox[1]<np.mean(bbox[1::2])<anchor_bbox[3]: | 347 | anchor_bbox[1] < np.mean(bbox[1::2]) < anchor_bbox[3]: |
| 335 | seller['position'] = bbox | 348 | seller['position'] = bbox |
| 336 | seller['words'] = text | 349 | seller['words'] = text |
| 337 | return seller | 350 | return seller |
| 351 | |||
| 338 | def get_payback_account(self): | 352 | def get_payback_account(self): |
| 339 | account = self.item.copy() | 353 | account = self.item.copy() |
| 340 | account_name = self.item.copy() | 354 | account_name = self.item.copy() |
| ... | @@ -387,6 +401,7 @@ class Finder: | ... | @@ -387,6 +401,7 @@ class Finder: |
| 387 | account_bank['position'] = bbox | 401 | account_bank['position'] = bbox |
| 388 | account_bank['words'] = words | 402 | account_bank['words'] = words |
| 389 | return account, account_name, account_bank | 403 | return account, account_name, account_bank |
| 404 | |||
| 390 | def get_repayment_schedule(self): | 405 | def get_repayment_schedule(self): |
| 391 | repayment_schedule = self.item.copy() | 406 | repayment_schedule = self.item.copy() |
| 392 | # 只看第二页 | 407 | # 只看第二页 |
| ... | @@ -416,6 +431,7 @@ class Finder: | ... | @@ -416,6 +431,7 @@ class Finder: |
| 416 | if len(repayment_schedule_table) > 0: | 431 | if len(repayment_schedule_table) > 0: |
| 417 | repayment_schedule['words'] = repayment_schedule_table | 432 | repayment_schedule['words'] = repayment_schedule_table |
| 418 | return repayment_schedule | 433 | return repayment_schedule |
| 434 | |||
| 419 | def get_signature_role_1(self): | 435 | def get_signature_role_1(self): |
| 420 | signature_role_1 = self.init_item.copy() | 436 | signature_role_1 = self.init_item.copy() |
| 421 | # 先定位签字区域 | 437 | # 先定位签字区域 |
| ... | @@ -445,11 +461,13 @@ class Finder: | ... | @@ -445,11 +461,13 @@ class Finder: |
| 445 | else: | 461 | else: |
| 446 | words = '无' | 462 | words = '无' |
| 447 | boxes = np.array(boxes).reshape((-1, 2)) | 463 | boxes = np.array(boxes).reshape((-1, 2)) |
| 448 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 464 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
| 465 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
| 449 | signature_role_1['page_num'] = page_num | 466 | signature_role_1['page_num'] = page_num |
| 450 | signature_role_1['position'] = position | 467 | signature_role_1['position'] = position |
| 451 | signature_role_1['words'] = words | 468 | signature_role_1['words'] = words |
| 452 | return signature_role_1 | 469 | return signature_role_1 |
| 470 | |||
| 453 | def get_signature_role_2(self): | 471 | def get_signature_role_2(self): |
| 454 | signature_role_2 = self.init_item.copy() | 472 | signature_role_2 = self.init_item.copy() |
| 455 | # 先定位签字区域 | 473 | # 先定位签字区域 |
| ... | @@ -479,11 +497,13 @@ class Finder: | ... | @@ -479,11 +497,13 @@ class Finder: |
| 479 | else: | 497 | else: |
| 480 | words = '无' | 498 | words = '无' |
| 481 | boxes = np.array(boxes).reshape((-1, 2)) | 499 | boxes = np.array(boxes).reshape((-1, 2)) |
| 482 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 500 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
| 501 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
| 483 | signature_role_2['page_num'] = page_num | 502 | signature_role_2['page_num'] = page_num |
| 484 | signature_role_2['position'] = position | 503 | signature_role_2['position'] = position |
| 485 | signature_role_2['words'] = words | 504 | signature_role_2['words'] = words |
| 486 | return signature_role_2 | 505 | return signature_role_2 |
| 506 | |||
| 487 | def get_signature_role_3(self): | 507 | def get_signature_role_3(self): |
| 488 | signature_role_3 = self.init_item.copy() | 508 | signature_role_3 = self.init_item.copy() |
| 489 | # 先定位签字区域 | 509 | # 先定位签字区域 |
| ... | @@ -513,11 +533,13 @@ class Finder: | ... | @@ -513,11 +533,13 @@ class Finder: |
| 513 | else: | 533 | else: |
| 514 | words = '无' | 534 | words = '无' |
| 515 | boxes = np.array(boxes).reshape((-1, 2)) | 535 | boxes = np.array(boxes).reshape((-1, 2)) |
| 516 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 536 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
| 537 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
| 517 | signature_role_3['page_num'] = page_num | 538 | signature_role_3['page_num'] = page_num |
| 518 | signature_role_3['position'] = position | 539 | signature_role_3['position'] = position |
| 519 | signature_role_3['words'] = words | 540 | signature_role_3['words'] = words |
| 520 | return signature_role_3 | 541 | return signature_role_3 |
| 542 | |||
| 521 | def get_signature_role_4(self): | 543 | def get_signature_role_4(self): |
| 522 | signature_role_4 = self.init_item.copy() | 544 | signature_role_4 = self.init_item.copy() |
| 523 | # 先定位签字区域 | 545 | # 先定位签字区域 |
| ... | @@ -547,11 +569,13 @@ class Finder: | ... | @@ -547,11 +569,13 @@ class Finder: |
| 547 | else: | 569 | else: |
| 548 | words = '无' | 570 | words = '无' |
| 549 | boxes = np.array(boxes).reshape((-1, 2)) | 571 | boxes = np.array(boxes).reshape((-1, 2)) |
| 550 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 572 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
| 573 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
| 551 | signature_role_4['page_num'] = page_num | 574 | signature_role_4['page_num'] = page_num |
| 552 | signature_role_4['position'] = position | 575 | signature_role_4['position'] = position |
| 553 | signature_role_4['words'] = words | 576 | signature_role_4['words'] = words |
| 554 | return signature_role_4 | 577 | return signature_role_4 |
| 578 | |||
| 555 | def get_signature_role_5(self): | 579 | def get_signature_role_5(self): |
| 556 | signature_role_5 = self.init_item.copy() | 580 | signature_role_5 = self.init_item.copy() |
| 557 | # 先定位签字区域 | 581 | # 先定位签字区域 |
| ... | @@ -582,11 +606,13 @@ class Finder: | ... | @@ -582,11 +606,13 @@ class Finder: |
| 582 | else: | 606 | else: |
| 583 | words = '无' | 607 | words = '无' |
| 584 | boxes = np.array(boxes).reshape((-1, 2)) | 608 | boxes = np.array(boxes).reshape((-1, 2)) |
| 585 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 609 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
| 610 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
| 586 | signature_role_5['page_num'] = page_num | 611 | signature_role_5['page_num'] = page_num |
| 587 | signature_role_5['position'] = position | 612 | signature_role_5['position'] = position |
| 588 | signature_role_5['words'] = words | 613 | signature_role_5['words'] = words |
| 589 | return signature_role_5 | 614 | return signature_role_5 |
| 615 | |||
| 590 | def get_last_page_signature(self, page_num, top, bottom): | 616 | def get_last_page_signature(self, page_num, top, bottom): |
| 591 | signature_name = self.item.copy() | 617 | signature_name = self.item.copy() |
| 592 | signature_date = self.item.copy() | 618 | signature_date = self.item.copy() |
| ... | @@ -610,7 +636,7 @@ class Finder: | ... | @@ -610,7 +636,7 @@ class Finder: |
| 610 | for line in block['lines']: | 636 | for line in block['lines']: |
| 611 | for span in line['spans']: | 637 | for span in line['spans']: |
| 612 | bbox, text = span['bbox'], span['text'] | 638 | bbox, text = span['bbox'], span['text'] |
| 613 | if '签署日期' in text and int(anchor_top)<np.mean(bbox[1::2])<int(anchor_bottom): | 639 | if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom): |
| 614 | name = text.split(' ')[0] | 640 | name = text.split(' ')[0] |
| 615 | date = text.split(':')[-1] | 641 | date = text.split(':')[-1] |
| 616 | signature_name['words'] = name | 642 | signature_name['words'] = name |
| ... | @@ -618,6 +644,7 @@ class Finder: | ... | @@ -618,6 +644,7 @@ class Finder: |
| 618 | signature_date['words'] = date | 644 | signature_date['words'] = date |
| 619 | signature_date['position'] = bbox | 645 | signature_date['position'] = bbox |
| 620 | return signature_name, signature_date | 646 | return signature_name, signature_date |
| 647 | |||
| 621 | def get_info(self): | 648 | def get_info(self): |
| 622 | """ | 649 | """ |
| 623 | block['type'] == 0 : 表示该元素为图片 | 650 | block['type'] == 0 : 表示该元素为图片 |
| ... | @@ -672,22 +699,27 @@ class Finder: | ... | @@ -672,22 +699,27 @@ class Finder: |
| 672 | contract_no = self.get_contract_no(page_num='0') | 699 | contract_no = self.get_contract_no(page_num='0') |
| 673 | self.init_result['page_2']['合同编号'] = contract_no | 700 | self.init_result['page_2']['合同编号'] = contract_no |
| 674 | # 找借款人及抵押人(地址字段原本有空格) | 701 | # 找借款人及抵押人(地址字段原本有空格) |
| 675 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:') | 702 | borrower_name, borrower_id = self.get_somebody( |
| 703 | top='借款人及抵押人:', bottom='共同借款人:') | ||
| 676 | # 这是为了同时兼容 8.1 版本 | 704 | # 这是为了同时兼容 8.1 版本 |
| 677 | if borrower_name['words'] == None: | 705 | if borrower_name['words'] == None: |
| 678 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:') | 706 | borrower_name, borrower_id = self.get_somebody( |
| 707 | top='借款人及抵押人:', bottom='共同借款人及共同抵押人:') | ||
| 679 | self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name | 708 | self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name |
| 680 | self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id | 709 | self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id |
| 681 | # 找共同借款人及共同抵押人 | 710 | # 找共同借款人及共同抵押人 |
| 682 | co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:') | 711 | co_borrower_name, co_borrower_id = self.get_somebody( |
| 712 | top='共同借款人:', bottom='保证人1:') | ||
| 683 | self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name | 713 | self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name |
| 684 | self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id | 714 | self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id |
| 685 | # 保证人1 | 715 | # 保证人1 |
| 686 | first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:') | 716 | first_guarantor_name, first_guarantor_id = self.get_somebody( |
| 717 | top='保证人1:', bottom='保证人2:') | ||
| 687 | self.init_result['page_2']['保证人1']['name'] = first_guarantor_name | 718 | self.init_result['page_2']['保证人1']['name'] = first_guarantor_name |
| 688 | self.init_result['page_2']['保证人1']['id'] = first_guarantor_id | 719 | self.init_result['page_2']['保证人1']['id'] = first_guarantor_id |
| 689 | # 保证人2 | 720 | # 保证人2 |
| 690 | second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章') | 721 | second_guarantor_name, second_guarantor_id = self.get_somebody( |
| 722 | top='保证人2:', bottom='第一章') | ||
| 691 | self.init_result['page_2']['保证人2']['name'] = second_guarantor_name | 723 | self.init_result['page_2']['保证人2']['name'] = second_guarantor_name |
| 692 | self.init_result['page_2']['保证人2']['id'] = second_guarantor_id | 724 | self.init_result['page_2']['保证人2']['id'] = second_guarantor_id |
| 693 | # 所购车辆价格 | 725 | # 所购车辆价格 | ... | ... |
-
Please register or sign in to post a comment