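Fix code formatting: add blank lines between methods, add spaces around operators and after commas, and wrap over-long lines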
Showing 1 changed file with 46 additions and 14 deletions
... | @@ -18,6 +18,7 @@ class Finder: | ... | @@ -18,6 +18,7 @@ class Finder: |
18 | self.item = {"words": None, | 18 | self.item = {"words": None, |
19 | "position": None, | 19 | "position": None, |
20 | } | 20 | } |
21 | |||
21 | def gen_init_result(self, is_asp): | 22 | def gen_init_result(self, is_asp): |
22 | # 格式化算法输出 | 23 | # 格式化算法输出 |
23 | self.init_result = {"page_1": {"合同编号": self.item, | 24 | self.init_result = {"page_1": {"合同编号": self.item, |
... | @@ -108,10 +109,12 @@ class Finder: | ... | @@ -108,10 +109,12 @@ class Finder: |
108 | "日期": self.item, | 109 | "日期": self.item, |
109 | }, | 110 | }, |
110 | } | 111 | } |
112 | |||
111 | def poly_to_rectangle(self, poly): | 113 | def poly_to_rectangle(self, poly): |
112 | xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax = poly | 114 | xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax = poly |
113 | bbox = [xmin, ymin, xmax, ymax] | 115 | bbox = [xmin, ymin, xmax, ymax] |
114 | return bbox | 116 | return bbox |
117 | |||
115 | def get_contract_no(self, page_num): | 118 | def get_contract_no(self, page_num): |
116 | """传入页码,查看该页码右上角的编号 | 119 | """传入页码,查看该页码右上角的编号 |
117 | 120 | ||
... | @@ -133,6 +136,7 @@ class Finder: | ... | @@ -133,6 +136,7 @@ class Finder: |
133 | contract_no['words'] = words | 136 | contract_no['words'] = words |
134 | contract_no['position'] = location | 137 | contract_no['position'] = location |
135 | return contract_no | 138 | return contract_no |
139 | |||
136 | def get_vehicle_price(self, page_num='0'): | 140 | def get_vehicle_price(self, page_num='0'): |
137 | vehicle_price = self.item.copy() | 141 | vehicle_price = self.item.copy() |
138 | # vehicle_price['words'] = '' | 142 | # vehicle_price['words'] = '' |
... | @@ -145,6 +149,7 @@ class Finder: | ... | @@ -145,6 +149,7 @@ class Finder: |
145 | vehicle_price['words'] = words | 149 | vehicle_price['words'] = words |
146 | vehicle_price['position'] = location | 150 | vehicle_price['position'] = location |
147 | return vehicle_price | 151 | return vehicle_price |
152 | |||
148 | def get_vin(self, page_num='0'): | 153 | def get_vin(self, page_num='0'): |
149 | vin = self.item.copy() | 154 | vin = self.item.copy() |
150 | # vin['words'] = '' | 155 | # vin['words'] = '' |
... | @@ -157,6 +162,7 @@ class Finder: | ... | @@ -157,6 +162,7 @@ class Finder: |
157 | vin['words'] = words | 162 | vin['words'] = words |
158 | vin['position'] = location | 163 | vin['position'] = location |
159 | return vin | 164 | return vin |
165 | |||
160 | def get_loan_principal(self, page_num='0'): | 166 | def get_loan_principal(self, page_num='0'): |
161 | chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', | 167 | chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', |
162 | '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'] | 168 | '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'] |
... | @@ -197,6 +203,7 @@ class Finder: | ... | @@ -197,6 +203,7 @@ class Finder: |
197 | asp_2['position'] = bbox | 203 | asp_2['position'] = bbox |
198 | asp_2['words'] = words | 204 | asp_2['words'] = words |
199 | return upper, lower, asp_1, asp_2 | 205 | return upper, lower, asp_1, asp_2 |
206 | |||
200 | def get_loan_term(self, page_num='0'): | 207 | def get_loan_term(self, page_num='0'): |
201 | loan_term = self.item.copy() | 208 | loan_term = self.item.copy() |
202 | all_text = '' | 209 | all_text = '' |
... | @@ -220,6 +227,7 @@ class Finder: | ... | @@ -220,6 +227,7 @@ class Finder: |
220 | loan_term['position'] = bbox | 227 | loan_term['position'] = bbox |
221 | loan_term['words'] = words | 228 | loan_term['words'] = words |
222 | return loan_term | 229 | return loan_term |
230 | |||
223 | def mergelist(self, text_list): | 231 | def mergelist(self, text_list): |
224 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 | 232 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 |
225 | mergeindex = -1 | 233 | mergeindex = -1 |
... | @@ -230,8 +238,10 @@ class Finder: | ... | @@ -230,8 +238,10 @@ class Finder: |
230 | if mergeindex == -1: | 238 | if mergeindex == -1: |
231 | return text_list | 239 | return text_list |
232 | else: | 240 | else: |
233 | new_text_list = text_list[:mergeindex] + [text_list[mergeindex] + text_list[mergeindex+1]] + text_list[mergeindex+2:] | 241 | new_text_list = text_list[:mergeindex] + [ |
242 | text_list[mergeindex] + text_list[mergeindex+1]] + text_list[mergeindex+2:] | ||
234 | return self.mergelist(new_text_list) | 243 | return self.mergelist(new_text_list) |
244 | |||
235 | def get_asp_details(self, page_num): | 245 | def get_asp_details(self, page_num): |
236 | asp_details_table_term = self.item.copy() | 246 | asp_details_table_term = self.item.copy() |
237 | asp_details_table = [] | 247 | asp_details_table = [] |
... | @@ -262,6 +272,7 @@ class Finder: | ... | @@ -262,6 +272,7 @@ class Finder: |
262 | if len(asp_details_table) > 0: | 272 | if len(asp_details_table) > 0: |
263 | asp_details_table_term['words'] = asp_details_table | 273 | asp_details_table_term['words'] = asp_details_table |
264 | return asp_details_table_term | 274 | return asp_details_table_term |
275 | |||
265 | def get_signature(self): | 276 | def get_signature(self): |
266 | signature = self.item.copy() | 277 | signature = self.item.copy() |
267 | for block in self.pdf_info['0']['blocks']: | 278 | for block in self.pdf_info['0']['blocks']: |
... | @@ -275,6 +286,7 @@ class Finder: | ... | @@ -275,6 +286,7 @@ class Finder: |
275 | signature['words'] = words | 286 | signature['words'] = words |
276 | signature['position'] = bbox | 287 | signature['position'] = bbox |
277 | return signature | 288 | return signature |
289 | |||
278 | def get_somebody(self, top, bottom): | 290 | def get_somebody(self, top, bottom): |
279 | # 指定上下边界后,返回上下边界内的客户信息 | 291 | # 指定上下边界后,返回上下边界内的客户信息 |
280 | _name = self.item.copy() | 292 | _name = self.item.copy() |
... | @@ -309,6 +321,7 @@ class Finder: | ... | @@ -309,6 +321,7 @@ class Finder: |
309 | _id['position'] = bbox | 321 | _id['position'] = bbox |
310 | _id['words'] = words | 322 | _id['words'] = words |
311 | return _name, _id | 323 | return _name, _id |
324 | |||
312 | def get_seller(self): | 325 | def get_seller(self): |
313 | seller = self.item.copy() | 326 | seller = self.item.copy() |
314 | # 先找到 key | 327 | # 先找到 key |
... | @@ -330,11 +343,12 @@ class Finder: | ... | @@ -330,11 +343,12 @@ class Finder: |
330 | for line in block['lines']: | 343 | for line in block['lines']: |
331 | for span in line['spans']: | 344 | for span in line['spans']: |
332 | bbox, text = span['bbox'], span['text'] | 345 | bbox, text = span['bbox'], span['text'] |
333 | if anchor_bbox[2]<np.mean(bbox[::2])<half_width and \ | 346 | if anchor_bbox[2] < np.mean(bbox[::2]) < half_width and \ |
334 | anchor_bbox[1]<np.mean(bbox[1::2])<anchor_bbox[3]: | 347 | anchor_bbox[1] < np.mean(bbox[1::2]) < anchor_bbox[3]: |
335 | seller['position'] = bbox | 348 | seller['position'] = bbox |
336 | seller['words'] = text | 349 | seller['words'] = text |
337 | return seller | 350 | return seller |
351 | |||
338 | def get_payback_account(self): | 352 | def get_payback_account(self): |
339 | account = self.item.copy() | 353 | account = self.item.copy() |
340 | account_name = self.item.copy() | 354 | account_name = self.item.copy() |
... | @@ -387,6 +401,7 @@ class Finder: | ... | @@ -387,6 +401,7 @@ class Finder: |
387 | account_bank['position'] = bbox | 401 | account_bank['position'] = bbox |
388 | account_bank['words'] = words | 402 | account_bank['words'] = words |
389 | return account, account_name, account_bank | 403 | return account, account_name, account_bank |
404 | |||
390 | def get_repayment_schedule(self): | 405 | def get_repayment_schedule(self): |
391 | repayment_schedule = self.item.copy() | 406 | repayment_schedule = self.item.copy() |
392 | # 只看第二页 | 407 | # 只看第二页 |
... | @@ -416,6 +431,7 @@ class Finder: | ... | @@ -416,6 +431,7 @@ class Finder: |
416 | if len(repayment_schedule_table) > 0: | 431 | if len(repayment_schedule_table) > 0: |
417 | repayment_schedule['words'] = repayment_schedule_table | 432 | repayment_schedule['words'] = repayment_schedule_table |
418 | return repayment_schedule | 433 | return repayment_schedule |
434 | |||
419 | def get_signature_role_1(self): | 435 | def get_signature_role_1(self): |
420 | signature_role_1 = self.init_item.copy() | 436 | signature_role_1 = self.init_item.copy() |
421 | # 先定位签字区域 | 437 | # 先定位签字区域 |
... | @@ -445,11 +461,13 @@ class Finder: | ... | @@ -445,11 +461,13 @@ class Finder: |
445 | else: | 461 | else: |
446 | words = '无' | 462 | words = '无' |
447 | boxes = np.array(boxes).reshape((-1, 2)) | 463 | boxes = np.array(boxes).reshape((-1, 2)) |
448 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 464 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
465 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
449 | signature_role_1['page_num'] = page_num | 466 | signature_role_1['page_num'] = page_num |
450 | signature_role_1['position'] = position | 467 | signature_role_1['position'] = position |
451 | signature_role_1['words'] = words | 468 | signature_role_1['words'] = words |
452 | return signature_role_1 | 469 | return signature_role_1 |
470 | |||
453 | def get_signature_role_2(self): | 471 | def get_signature_role_2(self): |
454 | signature_role_2 = self.init_item.copy() | 472 | signature_role_2 = self.init_item.copy() |
455 | # 先定位签字区域 | 473 | # 先定位签字区域 |
... | @@ -479,11 +497,13 @@ class Finder: | ... | @@ -479,11 +497,13 @@ class Finder: |
479 | else: | 497 | else: |
480 | words = '无' | 498 | words = '无' |
481 | boxes = np.array(boxes).reshape((-1, 2)) | 499 | boxes = np.array(boxes).reshape((-1, 2)) |
482 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 500 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
501 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
483 | signature_role_2['page_num'] = page_num | 502 | signature_role_2['page_num'] = page_num |
484 | signature_role_2['position'] = position | 503 | signature_role_2['position'] = position |
485 | signature_role_2['words'] = words | 504 | signature_role_2['words'] = words |
486 | return signature_role_2 | 505 | return signature_role_2 |
506 | |||
487 | def get_signature_role_3(self): | 507 | def get_signature_role_3(self): |
488 | signature_role_3 = self.init_item.copy() | 508 | signature_role_3 = self.init_item.copy() |
489 | # 先定位签字区域 | 509 | # 先定位签字区域 |
... | @@ -513,11 +533,13 @@ class Finder: | ... | @@ -513,11 +533,13 @@ class Finder: |
513 | else: | 533 | else: |
514 | words = '无' | 534 | words = '无' |
515 | boxes = np.array(boxes).reshape((-1, 2)) | 535 | boxes = np.array(boxes).reshape((-1, 2)) |
516 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 536 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
537 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
517 | signature_role_3['page_num'] = page_num | 538 | signature_role_3['page_num'] = page_num |
518 | signature_role_3['position'] = position | 539 | signature_role_3['position'] = position |
519 | signature_role_3['words'] = words | 540 | signature_role_3['words'] = words |
520 | return signature_role_3 | 541 | return signature_role_3 |
542 | |||
521 | def get_signature_role_4(self): | 543 | def get_signature_role_4(self): |
522 | signature_role_4 = self.init_item.copy() | 544 | signature_role_4 = self.init_item.copy() |
523 | # 先定位签字区域 | 545 | # 先定位签字区域 |
... | @@ -547,11 +569,13 @@ class Finder: | ... | @@ -547,11 +569,13 @@ class Finder: |
547 | else: | 569 | else: |
548 | words = '无' | 570 | words = '无' |
549 | boxes = np.array(boxes).reshape((-1, 2)) | 571 | boxes = np.array(boxes).reshape((-1, 2)) |
550 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 572 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
573 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
551 | signature_role_4['page_num'] = page_num | 574 | signature_role_4['page_num'] = page_num |
552 | signature_role_4['position'] = position | 575 | signature_role_4['position'] = position |
553 | signature_role_4['words'] = words | 576 | signature_role_4['words'] = words |
554 | return signature_role_4 | 577 | return signature_role_4 |
578 | |||
555 | def get_signature_role_5(self): | 579 | def get_signature_role_5(self): |
556 | signature_role_5 = self.init_item.copy() | 580 | signature_role_5 = self.init_item.copy() |
557 | # 先定位签字区域 | 581 | # 先定位签字区域 |
... | @@ -582,11 +606,13 @@ class Finder: | ... | @@ -582,11 +606,13 @@ class Finder: |
582 | else: | 606 | else: |
583 | words = '无' | 607 | words = '无' |
584 | boxes = np.array(boxes).reshape((-1, 2)) | 608 | boxes = np.array(boxes).reshape((-1, 2)) |
585 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | 609 | position = [min(boxes[:, 0]), min(boxes[:, 1]), |
610 | max(boxes[:, 0]), max(boxes[:, 1])] | ||
586 | signature_role_5['page_num'] = page_num | 611 | signature_role_5['page_num'] = page_num |
587 | signature_role_5['position'] = position | 612 | signature_role_5['position'] = position |
588 | signature_role_5['words'] = words | 613 | signature_role_5['words'] = words |
589 | return signature_role_5 | 614 | return signature_role_5 |
615 | |||
590 | def get_last_page_signature(self, page_num, top, bottom): | 616 | def get_last_page_signature(self, page_num, top, bottom): |
591 | signature_name = self.item.copy() | 617 | signature_name = self.item.copy() |
592 | signature_date = self.item.copy() | 618 | signature_date = self.item.copy() |
... | @@ -610,7 +636,7 @@ class Finder: | ... | @@ -610,7 +636,7 @@ class Finder: |
610 | for line in block['lines']: | 636 | for line in block['lines']: |
611 | for span in line['spans']: | 637 | for span in line['spans']: |
612 | bbox, text = span['bbox'], span['text'] | 638 | bbox, text = span['bbox'], span['text'] |
613 | if '签署日期' in text and int(anchor_top)<np.mean(bbox[1::2])<int(anchor_bottom): | 639 | if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom): |
614 | name = text.split(' ')[0] | 640 | name = text.split(' ')[0] |
615 | date = text.split(':')[-1] | 641 | date = text.split(':')[-1] |
616 | signature_name['words'] = name | 642 | signature_name['words'] = name |
... | @@ -618,6 +644,7 @@ class Finder: | ... | @@ -618,6 +644,7 @@ class Finder: |
618 | signature_date['words'] = date | 644 | signature_date['words'] = date |
619 | signature_date['position'] = bbox | 645 | signature_date['position'] = bbox |
620 | return signature_name, signature_date | 646 | return signature_name, signature_date |
647 | |||
621 | def get_info(self): | 648 | def get_info(self): |
622 | """ | 649 | """ |
623 | block['type'] == 0 : 表示该元素为图片 | 650 | block['type'] == 0 : 表示该元素为图片 |
... | @@ -672,22 +699,27 @@ class Finder: | ... | @@ -672,22 +699,27 @@ class Finder: |
672 | contract_no = self.get_contract_no(page_num='0') | 699 | contract_no = self.get_contract_no(page_num='0') |
673 | self.init_result['page_2']['合同编号'] = contract_no | 700 | self.init_result['page_2']['合同编号'] = contract_no |
674 | # 找借款人及抵押人(地址字段原本有空格) | 701 | # 找借款人及抵押人(地址字段原本有空格) |
675 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:') | 702 | borrower_name, borrower_id = self.get_somebody( |
703 | top='借款人及抵押人:', bottom='共同借款人:') | ||
676 | # 这是为了同时兼容 8.1 版本 | 704 | # 这是为了同时兼容 8.1 版本 |
677 | if borrower_name['words'] == None: | 705 | if borrower_name['words'] == None: |
678 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:') | 706 | borrower_name, borrower_id = self.get_somebody( |
707 | top='借款人及抵押人:', bottom='共同借款人及共同抵押人:') | ||
679 | self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name | 708 | self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name |
680 | self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id | 709 | self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id |
681 | # 找共同借款人及共同抵押人 | 710 | # 找共同借款人及共同抵押人 |
682 | co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:') | 711 | co_borrower_name, co_borrower_id = self.get_somebody( |
712 | top='共同借款人:', bottom='保证人1:') | ||
683 | self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name | 713 | self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name |
684 | self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id | 714 | self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id |
685 | # 保证人1 | 715 | # 保证人1 |
686 | first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:') | 716 | first_guarantor_name, first_guarantor_id = self.get_somebody( |
717 | top='保证人1:', bottom='保证人2:') | ||
687 | self.init_result['page_2']['保证人1']['name'] = first_guarantor_name | 718 | self.init_result['page_2']['保证人1']['name'] = first_guarantor_name |
688 | self.init_result['page_2']['保证人1']['id'] = first_guarantor_id | 719 | self.init_result['page_2']['保证人1']['id'] = first_guarantor_id |
689 | # 保证人2 | 720 | # 保证人2 |
690 | second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章') | 721 | second_guarantor_name, second_guarantor_id = self.get_somebody( |
722 | top='保证人2:', bottom='第一章') | ||
691 | self.init_result['page_2']['保证人2']['name'] = second_guarantor_name | 723 | self.init_result['page_2']['保证人2']['name'] = second_guarantor_name |
692 | self.init_result['page_2']['保证人2']['id'] = second_guarantor_id | 724 | self.init_result['page_2']['保证人2']['id'] = second_guarantor_id |
693 | # 所购车辆价格 | 725 | # 所购车辆价格 | ... | ... |
-
Please register or sign in to post a comment