add contract 8.5
Showing 3 changed files with 105 additions and 29 deletions.
... | @@ -2970,6 +2970,8 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True | ... | @@ -2970,6 +2970,8 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True |
2970 | auto_obj = auto_class.objects.filter(application_id=application_id, on_off=True).first() | 2970 | auto_obj = auto_class.objects.filter(application_id=application_id, on_off=True).first() |
2971 | if auto_obj is not None: | 2971 | if auto_obj is not None: |
2972 | auto_result = se_compare_auto(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, auto_obj) | 2972 | auto_result = se_compare_auto(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, auto_obj) |
2973 | else: | ||
2974 | auto_result = None | ||
2973 | 2975 | ||
2974 | full_result = se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms, auto_result) | 2976 | full_result = se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms, auto_result) |
2975 | 2977 | ... | ... |
... | @@ -6,28 +6,35 @@ | ... | @@ -6,28 +6,35 @@ |
6 | # @Description : | 6 | # @Description : |
7 | 7 | ||
8 | from .get_char import Finder | 8 | from .get_char import Finder |
9 | import numpy as np | ||
9 | 10 | ||
10 | 11 | ||
def predict(pdf_info):
    """Build a per-page OCR-style map from parsed PDF spans and run ``Finder``.

    Args:
        pdf_info: Mapping of page key -> page dict. Each page dict must have a
            ``'blocks'`` list; each block has ``'type'`` and ``'lines'``, each
            line has ``'spans'``, and each span has ``'bbox'`` (xmin, ymin,
            xmax, ymax) and ``'text'``.

    Returns:
        Whatever ``Finder(pdf_info, ocr_results=...).get_info()`` returns.
    """
    ocr_results = {}
    for pno in pdf_info:
        page_spans = []
        for block in pdf_info[pno]['blocks']:
            # Only type-0 blocks are read; presumably these are text blocks
            # (as in PyMuPDF's dict output) -- confirm against the producer.
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    # NOTE(review): as rendered here both arguments of the
                    # first replace() are the same ASCII colon, which makes it
                    # a no-op; presumably one side was a full-width colon --
                    # confirm against the real file. Kept byte-for-byte.
                    text = span['text'].replace(":", ":").replace(" ", "")
                    # Check emptiness AFTER normalisation so that spans made
                    # only of spaces are dropped too, instead of entering the
                    # map with an empty text.
                    if not text:
                        continue
                    xmin, ymin, xmax, ymax = span['bbox']
                    # 8-value polygon: top-left, top-right, bottom-right,
                    # bottom-left; coordinates are cast to int32.
                    polygon = np.array(
                        [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
                        dtype=np.int32,
                    ).tolist()
                    page_spans.append([polygon, text])
        # Order spans top-to-bottom by their first y coordinate (stable sort),
        # then key them 0..n-1 in that order.
        page_spans.sort(key=lambda item: item[0][1])
        ocr_results[pno] = dict(enumerate(page_spans))
    # Finder receives the information for the whole PDF.
    f = Finder(pdf_info, ocr_results=ocr_results)
    return f.get_info()
32 | 39 | ||
33 | 40 | ... | ... |
... | @@ -8,6 +8,7 @@ | ... | @@ -8,6 +8,7 @@ |
8 | import re | 8 | import re |
9 | import numpy as np | 9 | import numpy as np |
10 | from fuzzywuzzy import fuzz | 10 | from fuzzywuzzy import fuzz |
11 | from shapely.geometry import Polygon | ||
11 | 12 | ||
12 | 13 | ||
13 | class Finder: | 14 | class Finder: |
... | @@ -111,6 +112,28 @@ class Finder: | ... | @@ -111,6 +112,28 @@ class Finder: |
111 | }, | 112 | }, |
112 | } | 113 | } |
113 | 114 | ||
def get_top_iou(self, poly, ocr_result):
    """Find the OCR entry whose polygon best matches ``poly`` by IoU.

    Args:
        poly: Query polygon; any value that
            ``np.array(poly).reshape((-1, 2))`` turns into (x, y) vertices.
        ocr_result: Mapping of key -> ``[bbox, text]`` where ``bbox`` has the
            same layout as ``poly``.

    Returns:
        ``(iou, key)`` of the entry with the highest intersection-over-union
        (the last one wins on ties), or ``(-1, -1)`` when nothing could be
        compared (empty input, invalid query polygon, or no valid candidate).
    """
    if not ocr_result:
        return -1, -1

    # The query polygon does not depend on the candidate: build it once.
    query = Polygon(np.array(poly).reshape((-1, 2)))
    if not query.is_valid:
        return -1, -1

    best = None
    for key in ocr_result:
        bbox, _text = ocr_result[key]
        candidate = Polygon(np.array(bbox).reshape((-1, 2)))
        if not candidate.is_valid:
            continue
        inter = candidate.intersection(query).area
        union = candidate.area + query.area - inter
        if union <= 0:
            # Guard against a division by zero on degenerate shapes.
            continue
        iou = inter / union
        # ">=" keeps the last entry among equal scores, matching the previous
        # "stable sort, take the last element" behaviour.
        if best is None or iou >= best[0]:
            best = (iou, key)

    if best is None:
        return -1, -1
    return best
136 | |||
114 | def poly_to_rectangle(self, poly): | 137 | def poly_to_rectangle(self, poly): |
115 | xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax = poly | 138 | xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax = poly |
116 | bbox = [xmin, ymin, xmax, ymax] | 139 | bbox = [xmin, ymin, xmax, ymax] |
... | @@ -253,38 +276,67 @@ class Finder: | ... | @@ -253,38 +276,67 @@ class Finder: |
253 | if mergeindex == -1: | 276 | if mergeindex == -1: |
254 | return text_list | 277 | return text_list |
255 | else: | 278 | else: |
256 | new_text_list = text_list[:mergeindex] + [text_list[mergeindex] + text_list[mergeindex + 1]] + \ | 279 | new_text_list = text_list[:mergeindex] + [text_list[mergeindex] + text_list[mergeindex + 1]] + text_list[ |
257 | text_list[mergeindex + 2:] | 280 | mergeindex + 2:] |
258 | return self.mergelist(new_text_list) | 281 | return self.mergelist(new_text_list) |
259 | 282 | ||
def get_asp_details(self, page_num):
    """Extract the add-on product (ASP) financing detail table from one page.

    The three column headers are located by exact text match, then the
    method walks downwards from the item header one row at a time and uses
    ``get_top_iou`` to pick the cell of each column on that row.

    Args:
        page_num: Key of the page inside ``self.ocr_results``.

    Returns:
        A copy of ``self.item`` (via ``.copy()``) whose ``'words'`` entry is
        the table as a list of rows: the title row, the header row, one
        ``[item, total amount, loan principal]`` row per parsed line and,
        when it could be located, a final total row.
    """
    asp_details_table_term = self.item.copy()
    asp_details_table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]
    page_ocr = self.ocr_results[page_num]

    def column_anchor(column_bbox, row_bbox):
        # x coordinates come from the column header, y from the current row.
        return [column_bbox[0], row_bbox[1], column_bbox[2], row_bbox[3],
                column_bbox[4], row_bbox[5], column_bbox[6], row_bbox[7]]

    # Locate the header cells (and the total label) by exact text.
    bbox_xm = None
    bbox_ytzje = None
    bbox_dkbj = None
    bbox_total = None
    for key in page_ocr:
        cell_bbox, text = page_ocr[key]
        if text == '项目1':
            bbox_xm = cell_bbox
        if text == '用途总金额2':
            bbox_ytzje = cell_bbox
        if text == '贷款本金3':
            bbox_dkbj = cell_bbox
        if text == '附加产品融资贷款本':
            bbox_total = cell_bbox

    anchor_2 = None
    if bbox_xm:
        for _ in range(10):  # at most 10 steps down the item column
            # Shift the current item box down by 1.4 row heights and look
            # for the box that overlaps that position best.
            rh = abs(bbox_xm[1] - bbox_xm[-1])
            anchor = np.array(bbox_xm).reshape((-1, 2))
            anchor[:, 1] += int(rh * 1.4)
            _iou, _key = self.get_top_iou(poly=anchor, ocr_result=page_ocr)
            if _iou <= 0:
                break
            bbox, xm_text = page_ocr[_key]
            bbox_xm = bbox
            # Handle item descriptions that wrap onto a second line: text
            # without '所购' is appended to the previous row's first cell.
            if '所购' not in xm_text:
                asp_details_table[-1][0] += xm_text
                continue
            if not bbox_ytzje or not bbox_dkbj:
                # Column headers were not found, so the amount cells cannot
                # be located (previously a TypeError on None).
                break
            anchor_1 = column_anchor(bbox_ytzje, bbox)
            _iou, _key = self.get_top_iou(poly=anchor_1, ocr_result=page_ocr)
            if _key not in page_ocr:
                # get_top_iou signals "no match" with key -1.
                break
            bbox, ytzje_text = page_ocr[_key]
            anchor_2 = column_anchor(bbox_dkbj, bbox)
            _iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=page_ocr)
            if _key not in page_ocr:
                break
            bbox, dkbj_text = page_ocr[_key]
            if xm_text == ytzje_text:
                # Item and amount landed in the same text box: split them,
                # but only when the split really yields two parts.
                parts = xm_text.split(' ')
                if len(parts) == 2:
                    xm_text, ytzje_text = parts
            asp_details_table.append([xm_text, ytzje_text, dkbj_text])

    # NOTE(review): the previous code also built an anchor from the total
    # label's x-range and the last matched box's y-range but never used it;
    # the lookup below keeps reusing the last row's loan-principal anchor.
    # Confirm which region is actually intended for the total amount.
    if bbox_total and anchor_2 is not None:
        _iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=page_ocr)
        if _key in page_ocr:
            bbox, total_text = page_ocr[_key]
            asp_details_table.append(['附加产品融资贷款本金总金额:', '', total_text])

    # NOTE(review): assumed to be unconditional; the diff view this was
    # reconstructed from does not show whether it sat inside the
    # `if bbox_total:` branch -- confirm.
    asp_details_table_term['words'] = asp_details_table
    return asp_details_table_term
290 | 342 | ||
... | @@ -678,38 +730,48 @@ class Finder: | ... | @@ -678,38 +730,48 @@ class Finder: |
678 | if '附加产品融资贷款本金总金额' in text: | 730 | if '附加产品融资贷款本金总金额' in text: |
679 | self.is_asp = True | 731 | self.is_asp = True |
680 | self.gen_init_result(self.is_asp) | 732 | self.gen_init_result(self.is_asp) |
733 | if len(list(self.ocr_results.keys())) <= 8: # 8.5 版本客户提供的样本出现串页的情况,暂时无法识别 | ||
681 | # Page 1 | 734 | # Page 1 |
682 | # 找合同编号 | 735 | # 找合同编号 |
683 | contract_no = self.get_contract_no(page_num='0') | 736 | contract_no = self.get_contract_no(page_num='0') |
737 | # print(contract_no) | ||
684 | self.init_result['page_1']['合同编号'] = contract_no | 738 | self.init_result['page_1']['合同编号'] = contract_no |
685 | # 所购车辆价格 | 739 | # 所购车辆价格 |
686 | vehicle_price = self.get_vehicle_price() | 740 | vehicle_price = self.get_vehicle_price() |
741 | # print(vehicle_price) | ||
687 | self.init_result['page_1']['所购车辆价格'] = vehicle_price | 742 | self.init_result['page_1']['所购车辆价格'] = vehicle_price |
688 | # 车架号 | 743 | # 车架号 |
689 | vin = self.get_vin() | 744 | vin = self.get_vin() |
745 | # print(vin) | ||
690 | self.init_result['page_1']['车架号'] = vin | 746 | self.init_result['page_1']['车架号'] = vin |
691 | # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 | 747 | # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 |
692 | upper, lower, asp_1, asp_2 = self.get_loan_principal() | 748 | upper, lower, asp_1, asp_2 = self.get_loan_principal() |
749 | # print(upper, lower, asp_1, asp_2) | ||
693 | self.init_result['page_1']['贷款本金金额']['大写'] = upper | 750 | self.init_result['page_1']['贷款本金金额']['大写'] = upper |
694 | self.init_result['page_1']['贷款本金金额']['小写'] = lower | 751 | self.init_result['page_1']['贷款本金金额']['小写'] = lower |
695 | self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1 | 752 | self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1 |
696 | self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2 | 753 | self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2 |
697 | # 贷款期限 | 754 | # 贷款期限 |
698 | loan_term = self.get_loan_term() | 755 | loan_term = self.get_loan_term() |
756 | # print(loan_term) | ||
699 | self.init_result['page_1']['贷款期限'] = loan_term | 757 | self.init_result['page_1']['贷款期限'] = loan_term |
700 | # 附加产品融资贷款本金总金额明细(ASP-表格) | 758 | # 附加产品融资贷款本金总金额明细(ASP-表格) |
701 | asp_details_table = self.get_asp_details(page_num='0') | 759 | asp_details_table = self.get_asp_details(page_num='0') |
760 | # print(asp_details_table) | ||
702 | self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table | 761 | self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table |
703 | # 借款人签字及时间 | 762 | # 借款人签字及时间 |
704 | signature = self.get_signature() | 763 | signature = self.get_signature() |
764 | # print(signature) | ||
705 | self.init_result['page_1']['借款人签字及时间'] = signature | 765 | self.init_result['page_1']['借款人签字及时间'] = signature |
706 | ####################################### | 766 | ####################################### |
707 | # Page 2 | 767 | # Page 2 |
708 | # 找合同编号 | 768 | # 找合同编号 |
709 | contract_no = self.get_contract_no(page_num='0') | 769 | contract_no = self.get_contract_no(page_num='0') |
770 | # print(contract_no) | ||
710 | self.init_result['page_2']['合同编号'] = contract_no | 771 | self.init_result['page_2']['合同编号'] = contract_no |
711 | # 找借款人及抵押人(地址字段原本有空格) | 772 | # 找借款人及抵押人(地址字段原本有空格) |
712 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:') | 773 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:') |
774 | # print(borrower_name, borrower_id) | ||
713 | # 这是为了同时兼容 8.1 版本 | 775 | # 这是为了同时兼容 8.1 版本 |
714 | if borrower_name['words'] == None: | 776 | if borrower_name['words'] == None: |
715 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:') | 777 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:') |
... | @@ -717,6 +779,7 @@ class Finder: | ... | @@ -717,6 +779,7 @@ class Finder: |
717 | self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id | 779 | self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id |
718 | # 找共同借款人及共同抵押人 | 780 | # 找共同借款人及共同抵押人 |
719 | co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:') | 781 | co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:') |
782 | # print(co_borrower_name, co_borrower_id) | ||
720 | self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name | 783 | self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name |
721 | self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id | 784 | self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id |
722 | # 保证人1 | 785 | # 保证人1 |
... | @@ -738,6 +801,7 @@ class Finder: | ... | @@ -738,6 +801,7 @@ class Finder: |
738 | self.init_result['page_2']['经销商'] = seller | 801 | self.init_result['page_2']['经销商'] = seller |
739 | # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 | 802 | # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 |
740 | upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1') | 803 | upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1') |
804 | # print(upper, lower, asp_1, asp_2) | ||
741 | self.init_result['page_2']['贷款本金金额']['大写'] = upper | 805 | self.init_result['page_2']['贷款本金金额']['大写'] = upper |
742 | self.init_result['page_2']['贷款本金金额']['小写'] = lower | 806 | self.init_result['page_2']['贷款本金金额']['小写'] = lower |
743 | self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1 | 807 | self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1 |
... | @@ -750,6 +814,7 @@ class Finder: | ... | @@ -750,6 +814,7 @@ class Finder: |
750 | self.init_result['page_2']['标准利率'] = standard_rate | 814 | self.init_result['page_2']['标准利率'] = standard_rate |
751 | # 还款账户 | 815 | # 还款账户 |
752 | account, account_name, account_bank = self.get_payback_account() | 816 | account, account_name, account_bank = self.get_payback_account() |
817 | # print(account, account_name, account_bank) | ||
753 | self.init_result['page_2']['还款账户']['账号'] = account | 818 | self.init_result['page_2']['还款账户']['账号'] = account |
754 | self.init_result['page_2']['还款账户']['户名'] = account_name | 819 | self.init_result['page_2']['还款账户']['户名'] = account_name |
755 | self.init_result['page_2']['还款账户']['开户行'] = account_bank | 820 | self.init_result['page_2']['还款账户']['开户行'] = account_bank |
... | @@ -760,6 +825,7 @@ class Finder: | ... | @@ -760,6 +825,7 @@ class Finder: |
760 | self.init_result['page_3']['合同编号'] = contract_no | 825 | self.init_result['page_3']['合同编号'] = contract_no |
761 | # 还款计划表(表格) | 826 | # 还款计划表(表格) |
762 | repayment_schedule_table = self.get_repayment_schedule() | 827 | repayment_schedule_table = self.get_repayment_schedule() |
828 | # print(repayment_schedule_table) | ||
763 | self.init_result['page_3']['还款计划表'] = repayment_schedule_table | 829 | self.init_result['page_3']['还款计划表'] = repayment_schedule_table |
764 | ####################################### | 830 | ####################################### |
765 | # Page 4 | 831 | # Page 4 |
... | @@ -768,6 +834,7 @@ class Finder: | ... | @@ -768,6 +834,7 @@ class Finder: |
768 | self.init_result['page_4']['合同编号'] = contract_no | 834 | self.init_result['page_4']['合同编号'] = contract_no |
769 | # 附加产品融资贷款本金总金额明细(ASP-表格) | 835 | # 附加产品融资贷款本金总金额明细(ASP-表格) |
770 | asp_details_table = self.get_asp_details(page_num='3') | 836 | asp_details_table = self.get_asp_details(page_num='3') |
837 | # print(asp_details_table) | ||
771 | self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table | 838 | self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table |
772 | ####################################### | 839 | ####################################### |
773 | # Page 5 | 840 | # Page 5 | ... | ... |
-
Please register or sign in to post a comment