e2de024d by 周伟奇

Merge branch 'feature/fsm-contract' into fix/report_ca

2 parents dc481cd4 8d595a3e
...@@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10 ...@@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10
11 FIXED_APPLICATION_ID_PREFIX = 'CH-S' 11 FIXED_APPLICATION_ID_PREFIX = 'CH-S'
12 12
13 DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT'] 13 DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT']
14 DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] 14 DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT', 'OVP']
15 COMPARE_DOC_SCHEME_LIST = ['CA', 'SE'] 15 COMPARE_DOC_SCHEME_LIST = ['CA', 'SE']
16 16
17 HIL_PREFIX = 'HIL' 17 HIL_PREFIX = 'HIL'
......
...@@ -1476,7 +1476,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1476,7 +1476,8 @@ class Command(BaseCommand, LoggerMixin):
1476 1476
1477 # AFC合同 1477 # AFC合同
1478 if classify_1_str == str(consts.CONTRACT_CLASSIFY): 1478 if classify_1_str == str(consts.CONTRACT_CLASSIFY):
1479 ocr_result = afc_predict(pdf_handler.pdf_info) 1479 is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
1480 ocr_result = afc_predict(pdf_handler.pdf_info, is_fsm=is_fsm)
1480 page_res = {} 1481 page_res = {}
1481 for page_num, page_info in ocr_result.get('page_info', {}).items(): 1482 for page_num, page_info in ocr_result.get('page_info', {}).items():
1482 if isinstance(page_num, str) and page_num.startswith('page_'): 1483 if isinstance(page_num, str) and page_num.startswith('page_'):
...@@ -1499,8 +1500,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1499,8 +1500,9 @@ class Command(BaseCommand, LoggerMixin):
1499 } 1500 }
1500 # HIL合同 1501 # HIL合同
1501 elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP: 1502 elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP:
1503 is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
1502 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) 1504 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
1503 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) 1505 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1, is_fsm=is_fsm)
1504 rebuild_res_1 = {} 1506 rebuild_res_1 = {}
1505 page_res = {} 1507 page_res = {}
1506 for field_name, field_info in ocr_result_1.items(): 1508 for field_name, field_info in ocr_result_1.items():
...@@ -1526,8 +1528,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1526,8 +1528,8 @@ class Command(BaseCommand, LoggerMixin):
1526 'page_info': page_info 1528 'page_info': page_info
1527 } 1529 }
1528 # hmh 1530 # hmh
1529 else: 1531 # else:
1530 pass 1532 # pass
1531 1533
1532 1534
1533 contract_res = {} 1535 contract_res = {}
......
...@@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum): ...@@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum):
36 DOCUPLOAD = (3, 'Document Upload') 36 DOCUPLOAD = (3, 'Document Upload')
37 SUBMITING = (4, 'Submiting') 37 SUBMITING = (4, 'Submiting')
38 UPLOADING = (5, 'Uploading') 38 UPLOADING = (5, 'Uploading')
39 OVP = (6, 'OVP')
39 40
40 41
41 class FailureReason(NamedEnum): 42 class FailureReason(NamedEnum):
......
...@@ -602,12 +602,13 @@ class UploadDocView(GenericView, DocHandler): ...@@ -602,12 +602,13 @@ class UploadDocView(GenericView, DocHandler):
602 is_zip = False 602 is_zip = False
603 603
604 classify_1 = 0 604 classify_1 = 0
605 # 电子合同 605 # 电子合同 Econtract or OVP(FSM)
606 if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]: 606 if data_source == consts.DATA_SOURCE_LIST[2] or data_source == consts.DATA_SOURCE_LIST[3]:
607 for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): 607 if document_scheme == consts.DOC_SCHEME_LIST[1]:
608 if keyword in document_name: 608 for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix):
609 classify_1 = classify_1_tmp 609 if keyword in document_name:
610 break 610 classify_1 = classify_1_tmp
611 break
611 # FSM合同:WEP/MSI/SC 612 # FSM合同:WEP/MSI/SC
612 elif data_source == consts.DATA_SOURCE_LIST[0] and document_scheme == consts.DOC_SCHEME_LIST[0]: 613 elif data_source == consts.DATA_SOURCE_LIST[0] and document_scheme == consts.DOC_SCHEME_LIST[0]:
613 for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix): 614 for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix):
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
6 # @Description : 6 # @Description :
7 7
8 from .get_char import Finder 8 from .get_char import Finder
9 from .get_char_fsm import Finder as FSMFinder
9 import numpy as np 10 import numpy as np
10 11
11 12
...@@ -23,7 +24,7 @@ def extract_info(ocr_results): ...@@ -23,7 +24,7 @@ def extract_info(ocr_results):
23 return {'page_1': {'合同编号': contract_no}} 24 return {'page_1': {'合同编号': contract_no}}
24 25
25 26
26 def predict(pdf_info, is_qrs=False): 27 def predict(pdf_info, is_qrs=False, is_fsm=False):
27 ocr_results = {} 28 ocr_results = {}
28 for pno in pdf_info: 29 for pno in pdf_info:
29 ocr_results[pno] = {} 30 ocr_results[pno] = {}
...@@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False): ...@@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False):
50 results = extract_info(ocr_results) 51 results = extract_info(ocr_results)
51 else: 52 else:
52 # 输入是整个 PDF 中的信息 53 # 输入是整个 PDF 中的信息
53 f = Finder(pdf_info, ocr_results=ocr_results) 54 if is_fsm:
55 f = FSMFinder(pdf_info, ocr_results=ocr_results)
56 else:
57 f = Finder(pdf_info, ocr_results=ocr_results)
54 results = f.get_info() 58 results = f.get_info()
55 return results 59 return results
56 60
......
1 import re
2 import numpy as np
3 from fuzzywuzzy import fuzz
4 from shapely.geometry import Polygon
5
6
7 class Finder:
8
9 def __init__(self, pdf_info, ocr_results):
10 self.pdf_info = pdf_info
11 self.ocr_results = ocr_results
12 self.is_asp = False
13 self.item = {"words": None,
14 "position": None,
15 }
16
def gen_init_result(self, is_asp):
    """Build the empty result skeleton and store it in ``self.init_result``.

    The skeleton has one entry per page (``page_1`` .. ``page_8``); every
    leaf is a blank field of the form ``self.item``. Each leaf is an
    independent copy, so filling or mutating one field can never leak into
    another field or into the ``self.item`` template.

    Args:
        is_asp: Accepted for interface compatibility; not used here.
    """
    def blank():
        # A fresh blank field per slot instead of one shared dict.
        return self.item.copy()

    def principal():
        return {"大写": blank(),
                "小写": blank(),
                "车辆贷款本金金额": blank(),
                "附加产品融资贷款本金总金额": blank(),
                }

    def person():
        return {"name": blank(), "id": blank()}

    def bank_account():
        return {"账号": blank(), "户名": blank(), "开户行": blank()}

    def signed():
        return {"签字": blank(), "日期": blank()}

    # Format of the algorithm output.
    self.init_result = {
        "page_1": {"合同编号": blank(),
                   "所购车辆价格": blank(),
                   "车架号": blank(),
                   "贷款本金金额": principal(),
                   "贷款期限": blank(),
                   "附加产品融资贷款本金总金额明细": blank(),
                   "借款人签字及时间": blank(),
                   },
        "page_2": {"合同编号": blank(),
                   "借款人及抵押人": person(),
                   "共同借款人及共同抵押人": person(),
                   "保证人1": person(),
                   "保证人2": person(),
                   "所购车辆价格": blank(),
                   "车架号": blank(),
                   "经销商": blank(),
                   "贷款本金金额": principal(),
                   "贷款期限": blank(),
                   "标准利率": blank(),
                   "借款人收款账户": bank_account(),
                   "还款账户": bank_account(),
                   },
        "page_3": {"合同编号": blank(),
                   "还款计划表": blank(),
                   },
        "page_4": {"合同编号": blank(),
                   "附加产品融资贷款本金总金额明细": blank(),
                   },
        "page_5": {"合同编号": blank(),
                   },
        "page_6": {"合同编号": blank(),
                   },
    }
    self.init_result["page_7"] = {"合同编号": blank(),
                                  }
    self.init_result["page_8"] = {"合同编号": blank(),
                                  "主借人签字": signed(),
                                  "共借人签字": signed(),
                                  "保证人1签字": signed(),
                                  "保证人2签字": signed(),
                                  "见证人签字": signed(),
                                  }
93
def get_top_iou(self, poly, ocr_result):
    """Find the OCR entry whose polygon overlaps ``poly`` the most.

    Args:
        poly: Polygon coordinates; reshaped to an (N, 2) point array.
        ocr_result: Mapping of key to ``(bbox, text)`` where ``bbox`` holds
            polygon coordinates in the same layout as ``poly``.

    Returns:
        ``[iou, key]`` for the entry with the highest intersection-over-union
        (on ties the entry iterated last wins), or ``(-1, -1)`` when no entry
        forms a valid polygon pair with ``poly``.
    """
    best = None
    for key, (bbox, _text) in ocr_result.items():
        candidate = Polygon(np.array(bbox).reshape((-1, 2)))
        target = Polygon(np.array(poly).reshape((-1, 2)))
        if not (candidate.is_valid and target.is_valid):
            continue
        overlap = candidate.intersection(target).area
        iou = overlap / (candidate.area + target.area - overlap)
        # ``>=`` keeps the last of several equal scores, like a stable sort.
        if best is None or iou >= best[0]:
            best = [iou, key]
    if best is None:
        return -1, -1
    return best
115
def poly_to_rectangle(self, poly):
    """Collapse an 8-value quadrilateral into ``[xmin, ymin, xmax, ymax]``.

    The input is read as ``x0, y0, x1, y1, x2, y2, x3, y3``. The result takes
    ``x3`` as xmin, ``y1`` as ymin, ``x2`` as xmax and ``y3`` as ymax, which
    is the bounding box only for an axis-aligned rectangle.

    Raises:
        ValueError: If ``poly`` does not contain exactly eight values.
    """
    _, _, _, top, right, _, left, bottom = poly
    return [left, top, right, bottom]
120
def get_contract_no(self, page_num):
    """Read the contract number from the OCR results of one page.

    Args:
        page_num (str): Page key into ``self.ocr_results``.

    Returns:
        dict: Copy of ``self.item``. When an OCR text contains '合同编号:',
        ``words`` is the part after its last ':' and ``position`` is the
        rectangle of that entry; the last matching entry wins.
    """
    found = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if '合同编号:' not in text:
            continue
        found['words'] = text.rpartition(':')[2]
        found['position'] = self.poly_to_rectangle(bbox)
    return found
142
def get_vehicle_price(self, page_num='0'):
    """Read the purchased-vehicle price from the OCR results of one page.

    Args:
        page_num (str): Page key into ``self.ocr_results``.

    Returns:
        dict: Copy of ``self.item``. When an OCR text contains
        '所购车辆价格为人民币', ``words`` is the text after its last '币' and
        ``position`` is the rectangle of that entry; the last match wins.
    """
    price = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if '所购车辆价格为人民币' not in text:
            continue
        price['words'] = text.rpartition('币')[2]
        price['position'] = self.poly_to_rectangle(bbox)
    return price
155
def get_vin(self, page_num='0'):
    """Read the vehicle identification number from one page's OCR results.

    Args:
        page_num (str): Page key into ``self.ocr_results``.

    Returns:
        dict: Copy of ``self.item``. When an OCR text contains '车架号:',
        ``words`` is the part after its last ':' and ``position`` is the
        rectangle of that entry; the last matching entry wins.
    """
    vin = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if '车架号:' not in text:
            continue
        vin['words'] = text.rpartition(':')[2]
        vin['position'] = self.poly_to_rectangle(bbox)
    return vin
168
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from the text spans of one page.

    Args:
        page_num (str): Page key into ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``.
        ``upper`` is the last span whose fuzzy ratio against the Chinese
        numeral characters exceeds 15 (text after its last ':').
        ``lower`` is the last span containing '小写:¥' (text after '¥').
        ``asp_1`` / ``asp_2`` are the bracketed amounts of the
        '人民币:小写:' spans above / below the span that equals
        '附加产品融资贷款本金总金额'; they stay blank without that anchor.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_text = ''.join(chinese_keywords)
    amount_pattern = re.compile(r'人民币:小写:\[(.*)\]')
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()

    def text_spans():
        # Yield (bbox, text) for every span of the blocks with type 0.
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    anchor_bbox = None
    for bbox, text in text_spans():
        if fuzz.ratio(keyword_text, text) > 15:
            # The stripped value replaces ``text`` for the checks below,
            # exactly as the statement order has always done.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in text_spans():
            if '人民币:小写:' not in text:
                continue
            match = amount_pattern.search(text)
            if match is None:
                # Marker present but no "[amount]" part: skip the span
                # instead of failing on an empty findall result.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = match.group(1)
            if span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = match.group(1)
    return upper, lower, asp_1, asp_2
209
def get_loan_term(self, page_num='0'):
    """Extract the loan term in months from the text spans of one page.

    The span texts are concatenated and searched for '贷款期限<digits>个月'.
    The digits become ``words``; ``position`` is the bbox of the last span
    containing '<digits>个月'.

    Args:
        page_num (str): Page key into ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``, filled when the pattern and a span match.
    """
    loan_term = self.item.copy()
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    found = re.search(r'贷款期限(\d+)个月', ''.join(text for _, text in spans))
    if found:
        months = found.group(1)
        needle = f'{months}个月'
        for bbox, text in spans:
            if needle in text:
                loan_term['position'] = bbox
                loan_term['words'] = months
    return loan_term
233
def get_standard_rate(self, page_num='0'):
    """Extract the standard interest rate from the text spans of one page.

    Args:
        page_num (str): Page key into ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``. ``words`` is the value captured between
        '本合同当期的标准利率为' and '%/年' in the last matching span, and
        ``position`` is that span's bbox.
    """
    standard_rate = self.item.copy()
    text_blocks = (b for b in self.pdf_info[page_num]['blocks'] if b['type'] == 0)
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                hit = re.search(r'本合同当期的标准利率为(\S+)%/年', span['text'])
                if not hit:
                    continue
                standard_rate['position'] = span['bbox']
                standard_rate['words'] = hit.group(1)
    return standard_rate
247
def mergelist(self, text_list):
    """Join items containing '所购' with following items that hold Chinese text.

    While some element contains '所购' and the element right after it still
    contains at least one Chinese character, the two are concatenated into
    one element (the last such pair is merged first). A trailing element
    containing '所购' has no successor and is left as is.

    Args:
        text_list (list[str]): Text fragments in reading order.

    Returns:
        list[str]: ``text_list`` itself when nothing had to be merged,
        otherwise a new list with the fragments merged.
    """
    non_chinese = re.compile("[^\u4e00-\u9fa5]")  # anything that is not a Chinese character
    merged = text_list
    while True:
        target = -1
        # Stop one short of the end: the last element has nothing to merge with.
        for index in range(len(merged) - 1):
            if '所购' in merged[index] and non_chinese.sub('', merged[index + 1]):
                target = index
        if target == -1:
            return merged
        merged = (merged[:target]
                  + [merged[target] + merged[target + 1]]
                  + merged[target + 2:])
260
def get_asp_details(self, page_num):
    """Rebuild the add-on product (ASP) financing detail table of one page.

    Locates the header cells '项目1', '用途总金额2' and '贷款本金3' in the OCR
    results, then walks downwards from the '项目1' cell (at most ten steps)
    and, for each row, picks the best-overlapping OCR entries in the other
    two columns. A final row holding the total is appended when the total
    label is found.

    Args:
        page_num (str): Page key into ``self.ocr_results``.

    Returns:
        dict: Copy of ``self.item``; ``words`` holds the table as a list of
        rows, ``position`` is never set here.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # listing; confirm in particular where `if bbox_total:` and the final
    # `words` assignment sit relative to `if bbox_xm:`.
    asp_details_table_term = self.item.copy()

    # Title row and header row of the output table.
    asp_details_table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]

    bbox_xm = None      # header cell '项目1'
    bbox_ytzje = None   # header cell '用途总金额2'
    bbox_dkbj = None    # header cell '贷款本金3'
    bbox_total = None   # label of the total row (several OCR truncations accepted)
    for key in self.ocr_results[page_num]:
        bbox, text = self.ocr_results[page_num][key]
        if text == '项目1':
            bbox_xm = bbox
        if text == '用途总金额2':
            bbox_ytzje = bbox
        if text == '贷款本金3':
            bbox_dkbj = bbox
        if text in ['附加产品融资贷款本', '附加产品融资贷款本金', '附加产品融资贷']:
            bbox_total = bbox

    if bbox_xm:
        for i in range(10):
            # Shift the current first-column cell down by 1.4 row heights and
            # look for the OCR entry that overlaps the shifted box the most.
            rh = abs(bbox_xm[1]-bbox_xm[-1])
            anchor = np.array(bbox_xm).reshape((-1 ,2))
            anchor[:, 1] += int(rh*1.4)
            _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
            if _iou > 0:
                bbox, xm_text = self.ocr_results[page_num][_key]
                bbox_xm = bbox
                # Handle item descriptions that wrap onto a second line:
                # append the continuation to the first cell of the last row.
                if not '所购' in xm_text:
                    line = asp_details_table[-1]
                    line[0] += xm_text
                    asp_details_table[-1] = line
                    continue
                # Same row, column of '用途总金额2'.
                # NOTE(review): bbox_ytzje / bbox_dkbj are assumed to have been
                # found whenever bbox_xm was; a missing header cell fails here.
                anchor_1 = [bbox_ytzje[0], bbox[1], bbox_ytzje[2], bbox[3],
                            bbox_ytzje[4], bbox[5], bbox_ytzje[6], bbox[7]]
                _iou, _key = self.get_top_iou(poly=anchor_1, ocr_result=self.ocr_results[page_num])
                bbox, ytzje_text = self.ocr_results[page_num][_key]
                # Same row, column of '贷款本金3'.
                anchor_2 = [bbox_dkbj[0], bbox[1], bbox_dkbj[2], bbox[3],
                            bbox_dkbj[4], bbox[5], bbox_dkbj[6], bbox[7]]
                _iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=self.ocr_results[page_num])
                bbox, dkbj_text = self.ocr_results[page_num][_key]
                # Both lookups hit the same OCR entry: split it on the space
                # (exactly one space is required by the two-value unpacking).
                if xm_text == ytzje_text:
                    xm_text, ytzje_text = xm_text.split(' ')
                line = [xm_text, ytzje_text, dkbj_text]
                asp_details_table.append(line)
            else:
                # Nothing overlaps the shifted box: end of the table.
                break

    if bbox_total:
        # Total row: the cell in the '贷款本金3' column at the label's height.
        anchor = [bbox_dkbj[0], bbox_total[1], bbox_dkbj[2], bbox_total[3],
                  bbox_dkbj[4], bbox_total[5], bbox_dkbj[6], bbox_total[7]]
        _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
        bbox, total_text = self.ocr_results[page_num][_key]
        asp_details_table.append(['附加产品融资贷款本金总金额:', '', total_text])
        asp_details_table_term['words'] = asp_details_table

    return asp_details_table_term
323
def get_signature(self):
    """Find the signing-date span on page '0'.

    Returns:
        dict: Copy of ``self.item``. ``words`` is the full text of the last
        span containing '签署日期' and ``position`` is its bbox.
    """
    signature = self.item.copy()
    spans = (
        span
        for block in self.pdf_info['0']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        bbox, text = span['bbox'], span['text']
        if '签署日期' in text:
            signature.update(words=text, position=bbox)
    return signature
338
def get_somebody(self, top, bottom):
    """Return the party name and ID found between two markers on page '1'.

    The vertical band is bounded by ``bbox[3]`` of the last span containing
    ``top`` and of the last span containing ``bottom`` (both default to 0
    when not found). Inside the band, the text after the last ':' of the
    '姓名/名称' span is the name and that of the
    '自然人身份证件号码/法人执照号码' span is the ID.

    Args:
        top (str): Marker text opening the band.
        bottom (str): Marker text closing the band.

    Returns:
        tuple: ``(name, id)``, each a copy of ``self.item``.
    """
    name_field = self.item.copy()
    id_field = self.item.copy()
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    # First pass: fix the upper and lower boundary of the band.
    y_top = y_bottom = 0
    for bbox, text in spans:
        if top in text:
            y_top = bbox[3]
        if bottom in text:
            y_bottom = bbox[3]
    # Second pass: read the labelled values inside the band.
    for bbox, text in spans:
        if not y_top < bbox[3] < y_bottom:
            continue
        if '姓名/名称' in text:
            name_field['position'] = bbox
            name_field['words'] = text.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in text:
            id_field['position'] = bbox
            id_field['words'] = text.split(':')[-1]
    return name_field, id_field
373
def get_seller(self):
    """Find the dealer name printed next to its label on page '1'.

    The label is the last span whose text is exactly '经销商' or
    '车辆销售方'. The value is the last span whose horizontal centre lies
    between the label's right edge and half the page width, and whose
    vertical centre lies within the label's vertical extent.

    Returns:
        dict: Copy of ``self.item``, filled when label and value are found.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']
    spans = [
        (span['bbox'], span['text'])
        for block in page['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    # Locate the label first.
    label_bbox = None
    for bbox, text in spans:
        if text in ['经销商', '车辆销售方']:
            label_bbox = bbox
    if not label_bbox:
        return seller
    # Then match the value relative to the label.
    half_width = page['width'] * 0.5
    for bbox, text in spans:
        centre_x = np.mean(bbox[::2])
        if not label_bbox[2] < centre_x < half_width:
            continue
        if label_bbox[1] < np.mean(bbox[1::2]) < label_bbox[3]:
            seller['position'] = bbox
            seller['words'] = text
    return seller
400
def get_borrower_collection_account(self):
    """Extract the borrower's receiving account details from page '1'.

    Nothing is extracted unless the concatenated span text contains
    '借款人收款账户'. With spaces removed, the account number is the text
    between '账号:' and '户名', the holder between '户名:' and '开户行', and
    the bank between '开户行:' and '借款人'. Each value's ``position`` is the
    bbox of the last span containing the value.

    Returns:
        tuple: ``(account, account_name, account_bank)``, each a copy of
        ``self.item``.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    def text_spans():
        for block in self.pdf_info['1']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    all_text = ''.join(text for _, text in text_spans())
    # Only the layout that spells the account out is handled; the
    # "to be notified separately" layout is left blank.
    if '借款人收款账户' not in all_text:
        return account, account_name, account_bank

    # NOTE(review): both replace() calls are reproduced as they appear in the
    # listing; presumably one of them targeted a different whitespace
    # character - confirm against the repository file.
    compact = all_text.replace(' ', '').replace(' ', '')
    lookups = (
        (r'账号:(.*?)户名', account),
        (r'户名:(.*?)开户行', account_name),
        (r'开户行:(.*?)借款人', account_bank),
    )
    for pattern, field in lookups:
        hits = re.findall(pattern, compact)
        if not hits:
            continue
        value = hits[0]
        for bbox, text in text_spans():
            if value in text:
                field['position'] = bbox
                field['words'] = value
    return account, account_name, account_bank
453
def get_payback_account(self):
    """Extract the repayment account details from page '1'.

    Nothing is extracted unless the concatenated span text contains
    '(13) 还款账户'. Only the text after the last such marker is searched,
    with spaces removed: the account number lies between '账号:' and '户名',
    the holder between '户名:' and '开户行', and the bank between '开户行:'
    and ';'. The account and holder positions come from the last span
    containing the value; the bank position comes from the last span whose
    space-stripped text contains '开户行:<bank>;'.

    Returns:
        tuple: ``(account, account_name, account_bank)``, each a copy of
        ``self.item``.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    def text_spans():
        for block in self.pdf_info['1']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    all_text = ''.join(text for _, text in text_spans())
    marker = '(13) 还款账户'
    # Only the layout that spells the account out is handled; the
    # "to be notified separately" layout is left blank.
    if marker not in all_text:
        return account, account_name, account_bank

    # NOTE(review): both replace() calls are reproduced as they appear in the
    # listing; presumably one of them targeted a different whitespace
    # character - confirm against the repository file.
    tail = all_text.split(marker)[-1].replace(' ', '').replace(' ', '')

    number_hits = re.findall(r'账号:(.*?)户名', tail)
    if number_hits:
        value = number_hits[0]
        for bbox, text in text_spans():
            if value in text:
                account['position'] = bbox
                account['words'] = value

    holder_hits = re.findall(r'户名:(.*?)开户行', tail)
    if holder_hits:
        value = holder_hits[0]
        for bbox, text in text_spans():
            if value in text:
                account_name['position'] = bbox
                account_name['words'] = value

    bank_hits = re.findall(r'开户行:(.*?);', tail)
    if bank_hits:
        value = bank_hits[0]
        labelled = f'开户行:{value};'
        for bbox, text in text_spans():
            if labelled in text.replace(' ', ''):
                account_bank['position'] = bbox
                account_bank['words'] = value
    return account, account_name, account_bank
507
def get_repayment_schedule(self):
    """Rebuild the repayment schedule table from the text spans of page '2'.

    Span texts are collected from the span equal to '序号' (inclusive) up to
    the span containing '以上表格中所列的序号并非还款期数' (exclusive) and
    grouped into rows of five cells. Grouping stops at the first row whose
    second cell equals the row's 1-based position.

    Returns:
        dict: Copy of ``self.item``; ``words`` is the list of rows when at
        least one row was built, otherwise it stays ``None``.
    """
    repayment_schedule = self.item.copy()
    columns = 5  # the table has five columns
    cells = []
    inside_table = False
    for block in self.pdf_info['2']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                _bbox, text = span['bbox'], span['text']
                if text == '序号':
                    inside_table = True
                if '以上表格中所列的序号并非还款期数' in text:
                    inside_table = False
                if inside_table:
                    cells.append(text)

    rows = []
    for row_index in range(len(cells) // columns):
        start = row_index * columns
        row = cells[start:start + columns]
        if row[1] == str(row_index + 1):
            break
        rows.append(row)

    if rows:
        repayment_schedule['words'] = rows
    return repayment_schedule
542
def get_signature_role_1(self):
    """Report whether the borrower (mortgagor) signature area holds text.

    Walks the text spans of every page. Collection starts at a span
    containing '借款人(抵押人)' and stops at the next span containing '日期'
    (that span is not collected). More than four collected spans is reported
    as '有', otherwise '无'.

    Returns:
        dict: Copy of ``self.item`` with ``page_num`` (key of the page that
        contributed the last collected span, or ``None``), ``position``
        (``[min x, min y, max x, max y]`` over the collected boxes, or
        ``None`` when nothing was collected) and ``words``.
    """
    # The field template is ``self.item``; ``init_item`` is not assigned
    # anywhere in the visible class and raised AttributeError.
    signature_role_1 = self.item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '借款人(抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard an empty area failed on min() of an empty array.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_1['page_num'] = page_num
    signature_role_1['position'] = position
    signature_role_1['words'] = words
    return signature_role_1
577
def get_signature_role_2(self):
    """Report whether the co-borrower (co-mortgagor) signature area holds text.

    Walks the text spans of every page. Collection starts at a span
    containing '共同借款人(共同抵押人)' and stops at the next span containing
    '日期' (that span is not collected). More than four collected spans is
    reported as '有', otherwise '无'.

    Returns:
        dict: Copy of ``self.item`` with ``page_num`` (key of the page that
        contributed the last collected span, or ``None``), ``position``
        (``[min x, min y, max x, max y]`` over the collected boxes, or
        ``None`` when nothing was collected) and ``words``.
    """
    # The field template is ``self.item``; ``init_item`` is not assigned
    # anywhere in the visible class and raised AttributeError.
    signature_role_2 = self.item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard an empty area failed on min() of an empty array.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
612
def get_signature_role_3(self):
    """Report whether the first guarantor's signature area holds text.

    Walks the text spans of every page. Collection starts at a span
    containing '保证人1' on any page whose key is not 0 and stops at the next
    span containing '日期' (that span is not collected). More than four
    collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: Copy of ``self.item`` with ``page_num`` (key of the page that
        contributed the last collected span, or ``None``), ``position``
        (``[min x, min y, max x, max y]`` over the collected boxes, or
        ``None`` when nothing was collected) and ``words``.
    """
    # The field template is ``self.item``; ``init_item`` is not assigned
    # anywhere in the visible class and raised AttributeError.
    signature_role_3 = self.item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(page_key) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard an empty area failed on min() of an empty array.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
647
def get_signature_role_4(self):
    """Report whether the second guarantor's signature area holds text.

    Walks the text spans of every page. Collection starts at a span
    containing '保证人2' on any page whose key is not 0 and stops at the next
    span containing '日期' (that span is not collected). More than four
    collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: Copy of ``self.item`` with ``page_num`` (key of the page that
        contributed the last collected span, or ``None``), ``position``
        (``[min x, min y, max x, max y]`` over the collected boxes, or
        ``None`` when nothing was collected) and ``words``.
    """
    # The field template is ``self.item``; ``init_item`` is not assigned
    # anywhere in the visible class and raised AttributeError.
    signature_role_4 = self.item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(page_key) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard an empty area failed on min() of an empty array.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
682
def get_signature_role_5(self):
    """Report whether the witness signature area holds text.

    Walks the text spans of every page. Collection starts at a span
    containing '见证人签字' on any page whose key is not 0 and stops at the
    next span containing '年' (that span is not collected). More than four
    collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: Copy of ``self.item`` with ``page_num`` (key of the page that
        contributed the last collected span, or ``None``), ``position``
        (``[min x, min y, max x, max y]`` over the collected boxes, or
        ``None`` when nothing was collected) and ``words``.
    """
    # The field template is ``self.item``; ``init_item`` is not assigned
    # anywhere in the visible class and raised AttributeError.
    signature_role_5 = self.item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(page_key) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    # The leftover debug print of ``texts`` was dropped.
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard an empty area failed on min() of an empty array.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
718
def get_last_page_signature(self, page_num, top, bottom):
    """Read a signer's name and signing date between two markers on a page.

    The band is bounded by ``bbox[1]`` of the last span containing ``top``
    and of the last span containing ``bottom``; nothing is extracted unless
    both markers are found. Inside the band, the last span containing
    '签署日期' supplies the name (text before its first space) and the date
    (text after its last ':'); both fields share that span's bbox.

    Args:
        page_num (str): Page key into ``self.pdf_info``.
        top (str): Marker text opening the band.
        bottom (str): Marker text closing the band.

    Returns:
        tuple: ``(signature_name, signature_date)``, each a copy of
        ``self.item``.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    anchor_top = anchor_bottom = None
    for bbox, text in spans:
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]
    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date
    for bbox, text in spans:
        if '签署日期' not in text:
            continue
        if int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            signature_date['position'] = bbox
    return signature_name, signature_date
750
751 def get_info(self):
752 """
753 block['type'] == 0 : 表示该元素为图片
754
755 Returns:
756 dict: Description
757 """
758
759 # 先判断是否为 ASP 产品
760 # 只看第一页,判断是否有 '附加产品融资贷款本金总金额' 这一句话,若有则为 ASP 产品
761 # print(self.pdf_info['0']['blocks'])
762 # for block in self.pdf_info['0']['blocks']:
763 # if block['type'] != 0:
764 # continue
765 # for line in block['lines']:
766 # for span in line['spans']:
767 # bbox, text = span['bbox'], span['text']
768 # if '附加产品融资贷款本金总金额' == text:
769 # self.is_asp = True
770 for key in self.ocr_results['0']:
771 bbox, text = self.ocr_results['0'][key]
772 if '附加产品融资贷款本金总金额' in text:
773 self.is_asp = True
774
775 self.gen_init_result(self.is_asp)
776
777 if len(list(self.ocr_results.keys())) <= 8: # 8.5 版本客户提供的样本出现串页的情况,暂时无法识别
778 # Page 1
779 # 找合同编号
780 contract_no = self.get_contract_no(page_num='0')
781 # print(contract_no)
782 self.init_result['page_1']['合同编号'] = contract_no
783 # 所购车辆价格
784 vehicle_price = self.get_vehicle_price()
785 # print(vehicle_price)
786 self.init_result['page_1']['所购车辆价格'] = vehicle_price
787 # 车架号
788 vin = self.get_vin()
789 # print(vin)
790 self.init_result['page_1']['车架号'] = vin
791 # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
792 upper, lower, asp_1, asp_2 = self.get_loan_principal()
793 # print(upper, lower, asp_1, asp_2)
794 self.init_result['page_1']['贷款本金金额']['大写'] = upper
795 self.init_result['page_1']['贷款本金金额']['小写'] = lower
796 self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1
797 self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
798 # 贷款期限
799 loan_term = self.get_loan_term()
800 # print(loan_term)
801 self.init_result['page_1']['贷款期限'] = loan_term
802 # 附加产品融资贷款本金总金额明细(ASP-表格)
803 asp_details_table = self.get_asp_details(page_num='0')
804 # print(asp_details_table)
805 self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table
806 # 借款人签字及时间
807 signature = self.get_signature()
808 # print(signature)
809 self.init_result['page_1']['借款人签字及时间'] = signature
810 #######################################
811 # Page 2
812 # 找合同编号
813 contract_no = self.get_contract_no(page_num='0')
814 # print(contract_no)
815 self.init_result['page_2']['合同编号'] = contract_no
816 # 找借款人及抵押人(地址字段原本有空格)
817 borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:')
818 # 这是为了同时兼容 8.1 版本
819 if borrower_name['words'] == None:
820 borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
821 # 这是为了兼容车贷分离版本
822 if borrower_name['words'] == None:
823 borrower_name, borrower_id = self.get_somebody(top='借款人:', bottom='共同借款人及抵押人:')
824 # print(borrower_name, borrower_id)
825 self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name
826 self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id
827 # 找共同借款人及共同抵押人
828 co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:')
829 # print(co_borrower_name, co_borrower_id)
830 self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
831 self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
832 # 保证人1
833 first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:')
834 self.init_result['page_2']['保证人1']['name'] = first_guarantor_name
835 self.init_result['page_2']['保证人1']['id'] = first_guarantor_id
836 # 保证人2
837 second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章')
838 self.init_result['page_2']['保证人2']['name'] = second_guarantor_name
839 self.init_result['page_2']['保证人2']['id'] = second_guarantor_id
840 # 所购车辆价格
841 vehicle_price = self.get_vehicle_price(page_num='1')
842 # print(vehicle_price)
843 self.init_result['page_2']['所购车辆价格'] = vehicle_price
844 # 车架号
845 vin = self.get_vin(page_num='1')
846 # print(vin)
847 self.init_result['page_2']['车架号'] = vin
848 # 经销商
849 seller = self.get_seller()
850 # print(seller)
851 self.init_result['page_2']['经销商'] = seller
852 # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
853 upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
854 # print(upper, lower, asp_1, asp_2)
855 self.init_result['page_2']['贷款本金金额']['大写'] = upper
856 self.init_result['page_2']['贷款本金金额']['小写'] = lower
857 self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1
858 self.init_result['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
859 # 贷款期限
860 loan_term = self.get_loan_term(page_num='1')
861 # print(loan_term)
862 self.init_result['page_2']['贷款期限'] = loan_term
863 # 本合同当期的标准利率
864 standard_rate = self.get_standard_rate(page_num='1')
865 # print(standard_rate)
866 self.init_result['page_2']['标准利率'] = standard_rate
867 # 202212 release 新增借款人收款账户
868 account, account_name, account_bank = self.get_borrower_collection_account()
869 # print(account, account_name, account_bank)
870 self.init_result['page_2']['借款人收款账户']['账号'] = account
871 self.init_result['page_2']['借款人收款账户']['户名'] = account_name
872 self.init_result['page_2']['借款人收款账户']['开户行'] = account_bank
873 # 还款账户
874 account, account_name, account_bank = self.get_payback_account()
875 # print(account, account_name, account_bank)
876 self.init_result['page_2']['还款账户']['账号'] = account
877 self.init_result['page_2']['还款账户']['户名'] = account_name
878 self.init_result['page_2']['还款账户']['开户行'] = account_bank
879 #######################################
880 # Page 3
881 # 找合同编号
882 contract_no = self.get_contract_no(page_num='2')
883 self.init_result['page_3']['合同编号'] = contract_no
884 # 还款计划表(表格)
885 repayment_schedule_table = self.get_repayment_schedule()
886 # print(repayment_schedule_table)
887 self.init_result['page_3']['还款计划表'] = repayment_schedule_table
888 #######################################
889 # Page 4
890 # 找合同编号
891 contract_no = self.get_contract_no(page_num='3')
892 # print(contract_no)
893 self.init_result['page_4']['合同编号'] = contract_no
894 # 附加产品融资贷款本金总金额明细(ASP-表格)
895 asp_details_table = self.get_asp_details(page_num='3')
896 # print(asp_details_table)
897 self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table
898 #######################################
899 # Page 5
900 # 找合同编号
901 contract_no = self.get_contract_no(page_num='4')
902 # print(contract_no)
903 self.init_result['page_5']['合同编号'] = contract_no
904 #######################################
905 # Page 6
906 # 找合同编号
907 contract_no = self.get_contract_no(page_num='5')
908 # print(contract_no)
909 self.init_result['page_6']['合同编号'] = contract_no
910 # Page 7
911 # 找合同编号
912 contract_no = self.get_contract_no(page_num='6')
913 self.init_result['page_7']['合同编号'] = contract_no
914 # Page 8
915 # 找合同编号
916 contract_no = self.get_contract_no(page_num='7')
917 self.init_result['page_8']['合同编号'] = contract_no
918 signature_name, signature_date = self.get_last_page_signature(page_num='7',
919 top='合同编号', bottom='共同借款人')
920 if signature_name['words'] == None:
921 signature_name, signature_date = self.get_last_page_signature(page_num='7',
922 top='合同编号', bottom='共同借款人(抵押人)')
923 # print(signature_name, signature_date)
924 self.init_result['page_8']['主借人签字']['签字'] = signature_name
925 self.init_result['page_8']['主借人签字']['日期'] = signature_date
926 signature_name, signature_date = self.get_last_page_signature(page_num='7',
927 top='共同借款人', bottom='保证人1')
928 if signature_name['words'] == None:
929 signature_name, signature_date = self.get_last_page_signature(page_num='7',
930 top='共同借款人(抵押人)', bottom='保证人1')
931 # print(signature_name, signature_date)
932 self.init_result['page_8']['共借人签字']['签字'] = signature_name
933 self.init_result['page_8']['共借人签字']['日期'] = signature_date
934 signature_name, signature_date = self.get_last_page_signature(page_num='7',
935 top='保证人1', bottom='保证人2')
936 self.init_result['page_8']['保证人1签字']['签字'] = signature_name
937 self.init_result['page_8']['保证人1签字']['日期'] = signature_date
938 signature_name, signature_date = self.get_last_page_signature(page_num='7',
939 top='保证人2', bottom='在本人面前亲笔签署本合同')
940 self.init_result['page_8']['保证人2签字']['签字'] = signature_name
941 self.init_result['page_8']['保证人2签字']['日期'] = signature_date
942 signature_name, signature_date = self.get_last_page_signature(page_num='7',
943 top='在本人面前亲笔签署本合同', bottom='以下无正文')
944 # print(signature_name, signature_date)
945 self.init_result['page_8']['见证人签字']['签字'] = signature_name
946 self.init_result['page_8']['见证人签字']['日期'] = signature_date
947
948 # 重新定制输出
949 new_results = {"is_asp": self.is_asp,
950 "page_info": self.init_result
951 }
952 return new_results
...\ No newline at end of file ...\ No newline at end of file
1 import re
2 import numpy as np
3 from fuzzywuzzy import fuzz
4 from shapely.geometry import Polygon
5
def caculate_iou(g, p):
    """Return the intersection-over-union of two polygons.

    Args:
        g: Vertex coordinates of the first polygon, as a flat sequence
            ``[x1, y1, x2, y2, ...]`` or anything numpy can reshape to
            ``(-1, 2)``.
        p: Vertex coordinates of the second polygon, same layout as ``g``.

    Returns:
        ``intersection_area / union_area``. ``0.0`` is returned when the
        union has no area (both polygons degenerate) instead of raising
        ``ZeroDivisionError``.
    """
    poly_g = Polygon(np.array(g).reshape((-1, 2)))
    poly_p = Polygon(np.array(p).reshape((-1, 2)))
    inter = poly_g.intersection(poly_p).area
    union = poly_g.area + poly_p.area - inter
    # Two zero-area polygons have no meaningful overlap ratio.
    if union <= 0:
        return 0.0
    return inter / union
12
def get_table_info(bbox_1, bbox_2, ocr_result):
    """Return the text of the table cell at row ``bbox_1`` / column ``bbox_2``.

    Args:
        bbox_1: 8-value polygon whose y coordinates (odd indices) define the row.
        bbox_2: 8-value polygon whose x coordinates (even indices) define the
            column.
        ocr_result: Iterable of ``[polygon, text]`` entries.

    Returns:
        The text of the last entry overlapping the synthesized cell polygon,
        or ``''`` when nothing overlaps.
    """
    # Even indices are x (taken from the column), odd indices are y (from the row).
    cell = [bbox_2[k] if k % 2 == 0 else bbox_1[k] for k in range(8)]
    overlapping = [entry[1] for entry in ocr_result
                   if caculate_iou(cell, entry[0]) > 0]
    return overlapping[-1] if overlapping else ''
22
23 class Finder:
24
def __init__(self, pdf_info):
    """Store the parsed PDF and build the empty result templates.

    Args:
        pdf_info: Mapping of page key -> page dict; the extraction methods
            read ``pdf_info[page]['blocks']``.

    Attributes set:
        item: Template of one extracted field (``words`` / ``page`` /
            ``position``, all ``None``).
        init_result: Empty result for the main contract.
        init_result_1: Empty result for the vehicle disposal agreement.
        init_result_2: Empty result for the vehicle lease mortgage contract.

    Every field receives its own copy of ``item``; previously all fields
    shared one dict object, so an in-place write to one field would have
    shown up in every other field.
    """
    self.pdf_info = pdf_info
    self.item = {"words": None,
                 "page": None,
                 "position": None,
                 }

    # Fields of the main contract output.
    contract_fields = (
        "合同编号",
        "承租人-姓名",
        "承租人-证件号码",
        "承租人-法定代表人或授权代表",
        "共同承租人-姓名",
        "共同承租人-证件号码",
        "共同承租人-法定代表人或授权代表",
        "保证人1-姓名",
        "保证人1-证件号码",
        "保证人1-法定代表人或授权代表",
        "保证人2-姓名",
        "保证人2-证件号码",
        "保证人2-法定代表人或授权代表",
        "保证人3-姓名",
        "保证人3-证件号码",
        "保证人3-法定代表人或授权代表",
        "合同编号(正文)",
        "车辆识别代码",
        "车辆卖方(经销商)",
        "车辆原始销售价格(《机动车销售统一发票》所列金额)",
        "车辆附加产品明细表",
        "融资成本总额",
        "租期",
        "付款计划表",
        "承租人收款账户-户名",
        "承租人收款账户-银行账号",
        "承租人收款账户-开户行",
        "承租人扣款账户-户名",
        "承租人扣款账户-银行账号",
        "承租人扣款账户-开户行",
        "签字页-承租人姓名",
        "签字页-承租人签章",
        "签字页-共同承租人姓名",
        "签字页-共同承租人签章",
        "签字页-保证人1姓名",
        "签字页-保证人1签章",
        "签字页-保证人2姓名",
        "签字页-保证人2签章",
        "签字页-保证人3姓名",
        "签字页-保证人3签章",
    )
    self.init_result = {name: self.item.copy() for name in contract_fields}

    # Fields of the vehicle disposal agreement output (a different field set).
    disposal_fields = (
        "合同编号",
        "承租人-姓名",
        "承租人-证件号码",
        "销售经销商",
        "合同编号(正文)",
        "签字页-承租人姓名",
        "签字页-承租人证件号码",
        "签字页-承租人签章",
        "签字页-销售经销商",
        "签字页-销售经销商签章",
    )
    self.init_result_1 = {name: self.item.copy() for name in disposal_fields}

    # Fields of the vehicle lease mortgage contract output.
    mortgage_fields = (
        "合同编号",
        "合同编号(正文)",
        "抵押人姓名/名称",
        "抵押人证件号码",
        "抵押人配偶姓名/名称",
        "抵押人配偶证件号码",
        "车辆识别代码",
        "租金总额",
        "融资租赁期限",
        "签字页-抵押人姓名",
        "签字页-抵押人签章",
        "签字页-抵押人配偶姓名",
        "签字页-抵押人配偶签章",
    )
    self.init_result_2 = {name: self.item.copy() for name in mortgage_fields}
108
def get_contract_no(self, page_num):
    """Read the contract number printed next to the '合同编号:' label of a page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item`` with ``words`` / ``page`` / ``position``
        filled from the last matching span; all ``None`` when the label is
        not on the page.
    """
    result = self.item.copy()
    page_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # First pass: the value normally follows the label inside the same span.
    for box, content in page_spans:
        if '合同编号:' not in content:
            continue
        result['position'] = box
        result['page'] = page_num
        result['words'] = content.split(':')[-1]

    if result['words'] != '':
        return result

    # Second pass: the label was found but carried no value, so look for a
    # span containing 'CH' that starts above the bottom edge of the current hit.
    for box, content in page_spans:
        if box[1] < result['position'][3] and 'CH' in content:
            result['position'] = box
            result['page'] = page_num
            result['words'] = content
    return result
143
def get_vehicle_price(self, page_num='0'):
    """Extract the purchase price that follows '所购车辆价格为人民币' on a page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item`` whose ``words`` is the text after the last
        '币' of the matching span and whose ``position`` is that span's bbox.
        ``page`` is left untouched.
    """
    vehicle_price = self.item.copy()
    page_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for box, content in page_spans:
        if '所购车辆价格为人民币' not in content:
            continue
        vehicle_price['position'] = box
        vehicle_price['words'] = content.split('币')[-1]
    return vehicle_price
157
def get_contract_no_one(self):
    """Find the contract number quoted in the body text of the document.

    The text of each page is concatenated (spaces removed) so that a number
    broken across several spans or lines can still be matched. Pages are
    scanned in ``self.pdf_info`` order and the first page that matches wins.

    Returns:
        dict | None: Copy of ``self.item`` with ``words`` and ``page`` set
        (``position`` is always ``None``). For the ``合同编号:[...]`` form
        ``words`` is the whole matched expression including the label; for the
        ``编号为...的`` form it is the captured number with any ')' removed.
        ``None`` is returned when no page matches.
    """
    contract_no = self.item.copy()
    for pno in self.pdf_info:
        all_text = ''
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    all_text += span['text']
        all_text = all_text.replace(' ', '')

        match_obj = re.search(r'(合同编号:\[(.*?)\])', all_text)
        if match_obj:
            words = match_obj.group(1)
            contract_no['position'] = None
            contract_no['page'] = pno
            contract_no['words'] = re.sub(r"\s", "", words).replace(")", "")
            return contract_no

        # This pattern also covers the "编号为xxx)的" variant: the lazy group
        # stops at the first '的' and the trailing ')' is stripped below, so
        # no separate pattern for that variant is needed.
        match_obj = re.search(r'编号为(.*?)的', all_text)
        if match_obj:
            words = match_obj.group(1).strip()
            contract_no['position'] = None
            contract_no['page'] = pno
            contract_no['words'] = re.sub(r"\s", "", words).replace(")", "")
            return contract_no
    # No page matched: keep the original implicit result.
    return None
197
def get_key_value(self, key, page_num=None):
    """Return the value that follows ``key`` inside a text span.

    Args:
        key (str): Substring identifying the span of interest.
        page_num (str | None): Page key to restrict the search to; when
            ``None`` every page of ``self.pdf_info`` is scanned.

    Returns:
        dict: Copy of ``self.item`` filled from the last span containing
        ``key``: ``words`` is the text after the last ':' with '。' and all
        whitespace removed. All ``None`` when no span contains ``key``.
    """
    value = self.item.copy()
    pages = self.pdf_info if page_num is None else [page_num]
    for pno in pages:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if key in text:
                        words = text.split(':')[-1].replace("。", "")
                        value['position'] = bbox
                        value['page'] = pno
                        value['words'] = re.sub(r"\s", "", words)
    return value
230
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``
        with ``words`` / ``position`` filled when found:

        * ``upper`` - span whose text is fuzzy-similar (ratio > 15) to the
          Chinese capital-numeral characters; text after the last ':'.
        * ``lower`` - text after '¥' in a span containing '小写:¥'.
        * ``asp_1`` / ``asp_2`` - bracketed value of '人民币:小写:[...]' spans
          located above / below the '附加产品融资贷款本金总金额' anchor span.

    A '人民币:小写:' span without a bracketed value is skipped instead of
    raising ``IndexError``.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_text = ''.join(chinese_keywords)
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None
    page_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for bbox, text in page_spans:
        if fuzz.ratio(keyword_text, text) > 15:
            # The trimmed text is also what the two checks below see,
            # exactly as in the original implementation.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            words = text.split('¥')[-1].strip()
            lower['position'] = bbox
            lower['words'] = words
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox
    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        bracket_value = re.compile(r'人民币:小写:\[(.*)\]')
        for bbox, text in page_spans:
            if '人民币:小写:' not in text:
                continue
            found = bracket_value.findall(text)
            if not found:
                # Label present but no "[...]" value: nothing to extract.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = found[0]
            if span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = found[0]
    return upper, lower, asp_1, asp_2
271
def get_loan_term(self, page_num='0'):
    """Extract the loan term in months ('贷款期限N个月') from one page.

    The page text is concatenated first so the phrase may be split across
    spans; the span containing 'N个月' then provides the position.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item`` with ``words`` set to the digits and
        ``position`` set to the last span containing 'N个月'; all ``None``
        when the phrase is absent.
    """
    loan_term = self.item.copy()
    page_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    page_text = ''.join(content for _, content in page_spans)
    found = re.search(r'贷款期限(\d+)个月', page_text)
    if not found:
        return loan_term

    months = found.group(1)
    needle = f'{months}个月'
    for box, content in page_spans:
        if needle in content:
            loan_term['position'] = box
            loan_term['words'] = months
    return loan_term
295
def get_asp_details(self, page_num):
    """Rebuild the add-on product financing detail table of one page.

    Cells are collected from the span whose text equals
    '附加产品融资贷款本金总金额明细' (inclusive) until a span containing
    '第二条' or '征信管理'. The first cell becomes a one-column title row and
    the remaining cells are grouped three per row.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item`` whose ``words`` is the list of rows, or
        ``None`` when the table was not found.
    """
    result = self.item.copy()

    cells = []
    inside_table = False
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                _, content = span['bbox'], span['text']
                if content == '附加产品融资贷款本金总金额明细':
                    inside_table = True
                if '第二条' in content or '征信管理' in content:
                    inside_table = False
                if inside_table:
                    cells.append(content)

    row_count = (len(cells) + 2) // 3
    rows = []
    if row_count:
        # Title row holds only the table caption.
        rows.append([cells[0]])
    for row_idx in range(1, row_count):
        first = row_idx * 3 - 2
        rows.append(cells[first:first + 3])

    if rows:
        result['words'] = rows
    return result
329
def get_signature(self):
    """Return the last span on page '0' that contains '签署日期'.

    Returns:
        dict: Copy of ``self.item`` with ``words`` set to the full span text
        and ``position`` to its bbox; all ``None`` when no span matches.
        ``page`` is left untouched.
    """
    signature = self.item.copy()
    first_page_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['0']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for box, content in first_page_spans:
        if '签署日期' not in content:
            continue
        signature['words'] = content
        signature['position'] = box
    return signature
344
def get_somebody(self, top, bottom):
    """Return name and ID of the party listed between two marker texts.

    Only page '1' is inspected. The vertical window is delimited by the bottom
    edge (``bbox[3]``) of the last span containing ``top`` and of the last
    span containing ``bottom``; both default to 0 when a marker is missing.

    Args:
        top (str): Text marking the upper boundary.
        bottom (str): Text marking the lower boundary.

    Returns:
        tuple: ``(name, id)`` copies of ``self.item``; ``words`` is the text
        after the last ':' of the '姓名/名称' / '自然人身份证件号码/法人执照号码'
        span inside the window.
    """
    _name = self.item.copy()
    _id = self.item.copy()
    page_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Pass 1: locate the vertical window.
    y_top = y_bottom = 0
    for box, content in page_spans:
        if top in content:
            y_top = box[3]
        if bottom in content:
            y_bottom = box[3]

    # Pass 2: pick the labelled values inside the window.
    for box, content in page_spans:
        if not (y_top < box[3] < y_bottom):
            continue
        if '姓名/名称' in content:
            _name['position'] = box
            _name['words'] = content.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in content:
            _id['position'] = box
            _id['words'] = content.split(':')[-1]
    return _name, _id
378
def get_seller(self):
    """Return the dealer name printed to the right of the '经销商' label on page '1'.

    The value span must have its horizontal centre between the label's right
    edge and the middle of the page, and its vertical centre within the
    label's vertical extent.

    Returns:
        dict: Copy of ``self.item`` with ``words`` / ``position`` from the last
        qualifying span; all ``None`` when the label or the value is missing.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']
    page_spans = [
        (span['bbox'], span['text'])
        for block in page['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Locate the key (last span whose text is exactly the label).
    label_box = None
    for box, content in page_spans:
        if content == '经销商':
            label_box = box
    if not label_box:
        return seller

    # Match the value relative to the key.
    half_width = page['width'] * 0.5
    for box, content in page_spans:
        centre_x = np.mean(box[::2])
        centre_y = np.mean(box[1::2])
        right_of_label = label_box[2] < centre_x < half_width
        same_row = label_box[1] < centre_y < label_box[3]
        if right_of_label and same_row:
            seller['position'] = box
            seller['words'] = content
    return seller
405
def get_payback_account(self):
    """Extract the repayment account (number, holder, bank) from page '1'.

    Values are only returned when the '☑账号' option is ticked; the
    "notified separately" variant yields empty items. Each value is parsed
    from the space-stripped page text and then located in an individual span
    to obtain its position.

    Returns:
        tuple: ``(account, account_name, account_bank)`` copies of
        ``self.item``; ``words`` / ``position`` are set only when a span
        containing the parsed value is found.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    page_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    page_text = ''.join(content for _, content in page_spans)
    # Only the ticked-account layout is handled.
    if '☑账号' not in page_text:
        return account, account_name, account_bank
    page_text = page_text.replace(' ', '')

    def record(target, value, is_hit):
        # Store the value together with the bbox of the last span that hits.
        boxes = [box for box, content in page_spans if is_hit(content)]
        if boxes:
            target['position'] = boxes[-1]
            target['words'] = value

    numbers = re.findall(r'账号:(.*)户名', page_text)
    if numbers:
        number = numbers[0]
        record(account, number, lambda content: number in content)

    holders = re.findall(r'户名:(.*)开户行', page_text)
    if holders:
        holder = holders[0]
        record(account_name, holder, lambda content: holder in content)

    banks = re.findall(r'开户行:(.*);', page_text)
    if banks:
        bank = banks[0]
        record(account_bank, bank,
               lambda content: f'开户行:{bank};' in content.replace(' ', ''))
    return account, account_name, account_bank
458
def get_repayment_schedule(self):
    """Rebuild the repayment schedule as ``[index, rent]`` rows.

    Collection starts after the span whose stripped text is "61." (the last
    entry of the index column) and stops at a span containing
    '以上表格中所列序号'. While collecting, spans are dropped when they
    contain Chinese characters, contain exactly one digit group (the
    "1." - "61." index labels), or start outside the horizontal range between
    the "61." span and the right edge of the '剩余融资' header. The remaining
    cells are read four per row; only the third value (rent) is kept.

    Returns:
        dict: Copy of ``self.item`` whose ``words`` is
        ``[['序号', '租金'], ['1', rent_1], ...]`` and whose ``page`` is the
        page holding the "61." span (``None`` when it was never seen).
    """
    repayment_schedule = self.item.copy()

    # Compiled once; raw strings avoid the invalid "\d" escape warning.
    has_chinese = re.compile(r'[\u4e00-\u9fff]')
    digit_groups = re.compile(r"\d+")

    cells = []
    table = False
    page = None
    left = 0
    right = 0
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '剩余融资' in text:
                        right = bbox[2]
                    if '以上表格中所列序号' in text:
                        table = False
                    if table:
                        # Drop Chinese text.
                        if has_chinese.search(text):
                            continue
                        # Drop the "1." - "61." index labels.
                        if len(digit_groups.findall(text)) == 1:
                            continue
                        if not left < bbox[0] < right:
                            continue
                        cells.append(text)

                    if text.strip() == "61.":
                        page = pno
                        table = True
                        left = bbox[0]

    repayment_schedule_table = [['序号', '租金']]
    columns = 4  # cost, fee, rent, remaining cost
    for i in range(len(cells) // columns):
        # Keep only the row index and the rent column.
        repayment_schedule_table.append([f'{i+1}', cells[i * columns + 2]])

    repayment_schedule['words'] = repayment_schedule_table
    repayment_schedule['page'] = page
    return repayment_schedule
511
def get_signature_role_1(self):
    """Return the last span in the whole document that contains '签署日期'.

    Returns:
        dict: Copy of ``self.item`` with ``position``, ``page`` and ``words``
        (the full span text) set; all ``None`` when nothing matches.
    """
    signature_role_1 = self.item.copy()
    document_spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for pno, box, content in document_spans:
        if '签署日期' not in content:
            continue
        signature_role_1['position'] = box
        signature_role_1['page'] = pno
        signature_role_1['words'] = content
    return signature_role_1
526
def get_signature_role_2(self):
    """Report whether the co-borrower signature area contains a signature.

    The area starts at a span containing '共同借款人(共同抵押人)' and ends
    at the next span containing '日期'. More than four spans inside the area
    are taken as "signed".

    Returns:
        dict: Field item with ``words`` ('有' = present, '无' = absent),
        ``position`` (``[xmin, ymin, xmax, ymax]`` over all collected spans,
        or ``None`` when the area was not found) and the page stored under
        both ``page_num`` (historical key) and ``page`` (the key used by the
        item template).
    """
    # ``init_item`` is not set by this class's __init__; fall back to the
    # regular item template instead of raising AttributeError.
    signature_role_2 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # min()/max() on an empty array would raise, hence the guard.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_2['page_num'] = page_num
    signature_role_2['page'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
561
def get_signature_role_3(self):
    """Report whether the guarantor-1 signature area contains a signature.

    The area starts at a span containing '保证人1' on any page whose key is
    not 0 and ends at the next span containing '日期'. More than four spans
    inside the area are taken as "signed".

    Returns:
        dict: Field item with ``words`` ('有' = present, '无' = absent),
        ``position`` (``[xmin, ymin, xmax, ymax]`` over all collected spans,
        or ``None`` when the area was not found) and the page stored under
        both ``page_num`` (historical key) and ``page`` (the key used by the
        item template).
    """
    # ``init_item`` is not set by this class's __init__; fall back to the
    # regular item template instead of raising AttributeError.
    signature_role_3 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # min()/max() on an empty array would raise, hence the guard.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_3['page_num'] = page_num
    signature_role_3['page'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
596
def get_signature_role_4(self):
    """Report whether the guarantor-2 signature area contains a signature.

    The area starts at a span containing '保证人2' on any page whose key is
    not 0 and ends at the next span containing '日期'. More than four spans
    inside the area are taken as "signed".

    Returns:
        dict: Field item with ``words`` ('有' = present, '无' = absent),
        ``position`` (``[xmin, ymin, xmax, ymax]`` over all collected spans,
        or ``None`` when the area was not found) and the page stored under
        both ``page_num`` (historical key) and ``page`` (the key used by the
        item template).
    """
    # ``init_item`` is not set by this class's __init__; fall back to the
    # regular item template instead of raising AttributeError.
    signature_role_4 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # min()/max() on an empty array would raise, hence the guard.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_4['page_num'] = page_num
    signature_role_4['page'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
631
def get_signature_role_5(self):
    """Report whether the witness signature area contains a signature.

    The area starts at a span containing '见证人签字' on any page whose key
    is not 0 and ends at the next span containing '年'. More than four spans
    inside the area are taken as "signed".

    Returns:
        dict: Field item with ``words`` ('有' = present, '无' = absent),
        ``position`` (``[xmin, ymin, xmax, ymax]`` over all collected spans,
        or ``None`` when the area was not found) and the page stored under
        both ``page_num`` (historical key) and ``page`` (the key used by the
        item template).
    """
    # ``init_item`` is not set by this class's __init__; fall back to the
    # regular item template instead of raising AttributeError.
    signature_role_5 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(pno) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # min()/max() on an empty array would raise, hence the guard.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_5['page_num'] = page_num
    signature_role_5['page'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
667
def get_last_page_signature(self, page_num, top, bottom):
    """Extract signer name and signing date between two markers on a page.

    The vertical window runs from the top edge (``bbox[1]``) of the last span
    containing ``top`` to the top edge of the last span containing
    ``bottom``. Inside it, a span containing '签署日期' is split into the name
    (text before the first space) and the date (text after the last ':').

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.
        top (str): Text marking the upper boundary.
        bottom (str): Text marking the lower boundary.

    Returns:
        tuple: ``(signature_name, signature_date)`` copies of ``self.item``.
        Both carry the bbox of the signature span as ``position`` (the date
        item previously never received it); both stay empty when a marker or
        the signature span is missing.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()
    anchor_top = None
    anchor_bottom = None
    page_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for bbox, text in page_spans:
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]
    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in page_spans:
        if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            signature_date['position'] = bbox
    return signature_name, signature_date
698
def get_electronic_signature(self, top, bottom):
    """Find the '签署日期' span lying between two marker texts, document-wide.

    The window runs from the top edge (``bbox[1]``) of the last span
    containing ``top`` to the bottom edge (``bbox[3]``) of the last span
    containing ``bottom``. Page boundaries are not taken into account when
    comparing y coordinates.

    Args:
        top (str): Text marking the upper boundary.
        bottom (str): Text marking the lower boundary.

    Returns:
        dict: Copy of ``self.item`` with ``words`` (full span text), ``page``
        and ``position`` of the last qualifying span; all ``None`` otherwise.
    """
    signature = self.item.copy()
    document_spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    upper_y = lower_y = None
    for _, box, content in document_spans:
        if top in content:
            upper_y = box[1]
        if bottom in content:
            lower_y = box[3]
    if upper_y is None or lower_y is None:
        return signature

    for pno, box, content in document_spans:
        if '签署日期' not in content:
            continue
        if int(upper_y) < np.mean(box[1::2]) < int(lower_y):
            signature['words'] = content
            signature['page'] = pno
            signature['position'] = box
    return signature
730
def get_role_info(self, role_key, page_num='0'):
    """Extract name, ID number and representative of one contract party.

    The top-left corner of the last span starting with '保证人3' is the
    reference point: the four known roles are matched by the quadrant
    (relative to that point) in which the centre of their '证件号码:' and
    '法定代表人或授权代表:' spans lies. Nothing is extracted when that
    reference span is missing.

    Args:
        role_key (str): Pattern matched (``re.match``) at the start of the
            span holding the party name, e.g. '承租人:'.
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(name, id_num, representative)`` copies of ``self.item``;
        each ``words`` is the text after the last ':' of the matching span.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    page_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Reference point: top-left corner of the guarantor-3 span.
    anchor = None
    for box, content in page_spans:
        if re.match('保证人3', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (left of anchor?, above anchor?)
    quadrants = {
        '承租人:': (True, True),
        '保证人1:': (True, False),
        '保证人2:': (False, True),
        '保证人3:': (False, False),
    }
    quadrant = quadrants.get(role_key)

    def in_quadrant(box):
        wants_left, wants_above = quadrant
        centre_x = np.mean(box[::2])
        centre_y = np.mean(box[1::2])
        x_ok = centre_x < anchor[0] if wants_left else centre_x > anchor[0]
        y_ok = centre_y < anchor[1] if wants_above else centre_y > anchor[1]
        return x_ok and y_ok

    def fill(target, box, content):
        target['words'] = content.split(':')[-1]
        target['page'] = page_num
        target['position'] = box

    for box, content in page_spans:
        # Party name.
        if re.match(role_key, content) is not None:
            fill(name, box, content)
        if quadrant is None:
            continue
        is_id = re.match('证件号码:', content) is not None
        is_representative = re.match('法定代表人或授权代表:', content) is not None
        if not (is_id or is_representative):
            continue
        if not in_quadrant(box):
            continue
        if is_id:
            fill(id_num, box, content)
        if is_representative:
            fill(representative, box, content)
    return name, id_num, representative
814
def get_table_add_product(self):
    """Rebuild the vehicle add-on product table.

    The page holding the table is the last one with a span containing
    '车辆附加产品(明细见下表)'. All text spans of that page are converted to
    8-value polygons, the header cells are located by exact text, and the rows
    are then discovered by sliding an anchor polygon down the first column and
    reading the price / financed-amount cells through ``get_table_info``.

    Returns:
        dict: Copy of ``self.item`` whose ``words`` is a list of
        ``[item, purchase price, financed amount]`` rows (header row first,
        '总计' row last), ``page`` is the table page and ``position`` is
        ``None``.

    NOTE(review): when the table marker or one of the header cells
    ('项目', '购买价格', '实际融资金额', '总计') is missing, the lookups below
    are performed with ``None`` keys/indices and raise - confirm callers only
    invoke this on documents known to contain the table.
    """
    table_add_product = self.item.copy()

    # Find the page that announces the add-on product table (last match wins).
    add_product_page_num = None
    for pno in self.pdf_info:
        for block in self.pdf_info[f'{pno}']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '车辆附加产品(明细见下表)' in text:
                        add_product_page_num = pno
    # Collect every text span of that page as [polygon, text].
    ocr_results = []
    for block in self.pdf_info[f'{add_product_page_num}']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                xmin, ymin, xmax, ymax = bbox
                # Corner order: top-left, top-right, bottom-right, bottom-left.
                bbox = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
                ocr_results.append([bbox, text])

    lines = [['项目', '购买价格', '实际融资金额']]

    # Indices (into ocr_results) of the header cells and of the total row.
    key_xm = None
    key_gmjg = None
    key_sjrzje = None
    key_total = None

    for index, span in enumerate(ocr_results):
        if span[1] == '项目':
            key_xm = index
        if span[1] == '购买价格':
            key_gmjg = index
        if span[1] == '实际融资金额':
            key_sjrzje = index
        if span[1] == '总计':
            key_total = index

    bbox, text = ocr_results[key_xm]
    # Height of the '项目' header cell (ymin vs ymax of the polygon).
    rh = abs(bbox[1]-bbox[-1])
    # Start the anchor one row below the header, shifted right by two row heights.
    anchor = np.array(bbox).reshape((-1, 2))
    anchor[:, 0] += 2*rh
    anchor[:, 1] += rh

    # At most five passes; each hit moves the anchor one row below the hit.
    for i in range(5):
        for span in ocr_results:
            iou = caculate_iou(anchor, span[0])
            # A span consisting only of the '所购' prefix is not taken as a row.
            if iou > 0.01 and span[1].strip() != '所购':
                x = get_table_info(span[0], ocr_results[key_gmjg][0], ocr_results)
                y = get_table_info(span[0], ocr_results[key_sjrzje][0], ocr_results)
                line = [span[1].replace('\u3000', ' '), x, y]
                lines.append(line)
                anchor = np.array(span[0]).reshape((-1, 2))
                anchor[:, 1] += rh

    total = get_table_info(ocr_results[key_total][0], ocr_results[key_sjrzje][0], ocr_results)
    lines.append(['总计', '', total])

    # Product names may wrap onto a second line, so only the first part is
    # captured above. Observed wrapped forms:
    #   "所购 BMW悦然焕" / "新服务"
    #   "所购 BMW5年10" / "万公里长悦保养套餐"
    #   "所购 事故维修补偿" / "方案"
    #   "所购 BMW5年10万公里" / "长悦保养套餐"
    #   "所购 MINI4年6万公里长悦" / "保养套餐"

    # Keep header/product/total rows only and restore the full product names.
    filtered_lines = []
    for line in lines:
        if line[0][:2] not in ['所购', '项目', '总计']:
            continue
        if 'BMW悦然' in line[0]:
            line[0] = '所购 BMW悦然焕新服务'
        if 'BMW5年10' in line[0]:
            line[0] = '所购 BMW5年10万公里长悦保养套餐'
        if '事故维修补' in line[0]:
            line[0] = '所购 事故维修补偿方案'
        if 'MINI4年6万公里长悦' in line[0]:
            line[0] = '所购 MINI4年6万公里长悦保养套餐'
        filtered_lines.append(line)
    table_add_product['words'] = filtered_lines
    table_add_product['page'] = add_product_page_num
    table_add_product['position'] = None
    return table_add_product
909
def get_contract_no_dy(self):
    """Find the mortgage contract number printed beside its label.

    The label is the last span (document-wide) containing '抵押合同编号'; the
    number is the last span containing 'CH-' whose vertical centre lies
    within the label's vertical extent.

    Returns:
        dict: Copy of ``self.item`` with ``position``, ``page`` and ``words``
        of the number span; all ``None`` when label or number is missing.
    """
    contract_no = self.item.copy()
    document_spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    label_box = None
    for _, box, content in document_spans:
        if '抵押合同编号' in content:
            label_box = box
    if label_box is None:
        return contract_no

    for pno, box, content in document_spans:
        on_label_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
        if on_label_row and 'CH-' in content:
            contract_no['position'] = box
            contract_no['page'] = pno
            contract_no['words'] = content
    return contract_no
938
def get_dyr_name_id(self):
    """Extract the name and ID number listed under the '抵押人' heading.

    The heading is the last span (over all pages) whose text is exactly
    '抵押人'.  On the heading's own page, every span whose vertical centre
    lies strictly between the heading's top edge and three heading-heights
    below its bottom edge is inspected: a span containing '姓名' supplies
    the name, a span containing '证件号码' supplies the ID number.  The
    stored value is the text after the last ':' of the span.

    Returns:
        tuple: ``(name, _id)``, two copies of ``self.item``; each has
        'words', 'position' and 'page' filled in only when found.
    """
    name = self.item.copy()
    _id = self.item.copy()

    # Track the heading's page as well: coordinates are page-relative, so
    # comparing the heading band against other pages would pick up
    # unrelated spans that merely sit at the same height.
    key_box = None
    key_page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if span['text'] == '抵押人':
                        key_box = span['bbox']
                        key_page = pno

    if key_box is None:
        return name, _id

    rh = abs(key_box[1] - key_box[3])  # height of the heading box
    band_top = key_box[1]
    band_bottom = key_box[3] + rh * 3
    for block in self.pdf_info[key_page]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if not band_top < np.mean(bbox[1::2]) < band_bottom:
                    continue
                # The two checks are independent on purpose: one span may
                # satisfy both.
                if '姓名' in text:
                    name['position'] = bbox
                    name['page'] = key_page
                    name['words'] = text.split(':')[-1]
                if '证件号码' in text:
                    _id['position'] = bbox
                    _id['page'] = key_page
                    _id['words'] = text.split(':')[-1]
    return name, _id
974
def get_dyrpo_name_id(self):
    """Extract the name and ID number listed under '抵押人配偶(如适'.

    The heading is the last span (over all pages) whose text is exactly
    '抵押人配偶(如适'.  On the heading's own page, every span whose
    vertical centre lies strictly between the heading's top edge and three
    heading-heights below its bottom edge is inspected: a span containing
    '姓名' supplies the name, a span containing '证件号码' supplies the ID
    number.  The stored value is the text after the last ':' of the span.

    Returns:
        tuple: ``(name, _id)``, two copies of ``self.item``; each has
        'words', 'position' and 'page' filled in only when found.
    """
    name = self.item.copy()
    _id = self.item.copy()

    # Track the heading's page as well: coordinates are page-relative, so
    # the band test must not be applied to spans of other pages.
    key_box = None
    key_page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if span['text'] == '抵押人配偶(如适':
                        key_box = span['bbox']
                        key_page = pno

    if key_box is None:
        return name, _id

    rh = abs(key_box[1] - key_box[3])  # height of the heading box
    band_top = key_box[1]
    band_bottom = key_box[3] + rh * 3
    for block in self.pdf_info[key_page]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if not band_top < np.mean(bbox[1::2]) < band_bottom:
                    continue
                # Independent checks: one span may satisfy both.
                if '姓名' in text:
                    name['position'] = bbox
                    name['page'] = key_page
                    name['words'] = text.split(':')[-1]
                if '证件号码' in text:
                    _id['position'] = bbox
                    _id['page'] = key_page
                    _id['words'] = text.split(':')[-1]
    return name, _id
1010
def get_key_value_position(self, key):
    """Return the value printed to the right of a label, located by position.

    The label is the last span (over all pages) whose text equals ``key``.
    A span on the label's own page is accepted as the value when

    * its vertical centre lies strictly inside the label's vertical extent,
    * its left edge is to the right of the label's left edge, and
    * its left edge is closer than ten label-heights to the label's right
      edge.

    When several spans qualify, the last one wins.

    Args:
        key: Exact text of the label span.

    Returns:
        dict: A copy of ``self.item`` with 'words', 'position' and 'page'
        filled in when a value was found, otherwise left untouched.
    """
    value = self.item.copy()

    # Keep the label's page: coordinates are page-relative, so the row
    # test below is only valid against spans of that same page.
    key_box = None
    key_page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if span['text'] == key:
                        key_box = span['bbox']
                        key_page = pno

    if key_box is None:
        return value

    rh = abs(key_box[1] - key_box[3])  # height of the label box
    for block in self.pdf_info[key_page]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                same_row = key_box[1] < np.mean(bbox[1::2]) < key_box[3]
                right_of_key = key_box[0] < bbox[0]
                close_enough = abs(key_box[2] - bbox[0]) < rh * 10
                if same_row and right_of_key and close_enough:
                    value['position'] = bbox
                    value['page'] = key_page
                    value['words'] = text
    return value
1040
def get_role_info_3_3(self, role_key, page_num='0'):
    """Read one role's name, ID number and representative from a page.

    The page is split into four quadrants around the top-left corner of
    the last span whose text starts with '保证人2'.  Each supported role is
    tied to one quadrant; '证件号码:' and '法定代表人或授权代表:' spans are
    attributed to the role only when their box centre lies strictly inside
    that quadrant.  The name is taken from any span matching ``role_key``
    regardless of position.  Nothing is extracted when no anchor exists.

    Args:
        role_key: Pattern passed to ``re.match`` to find the role's name
            span.  Quadrant lookup is done for '承租人一:', '共同承租人:',
            '保证人1:' and '保证人2:' only.
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(name, id_num, representative)``, three copies of
        ``self.item``; each is filled in only when found.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    text_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Anchor: top-left corner of the (last) '保证人2' span.
    anchor = None
    for box, content in text_spans:
        if re.match('保证人2', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (centre is left of anchor, centre is above anchor)
    quadrant = {
        '承租人一:': (True, True),
        '共同承租人:': (True, False),
        '保证人1:': (False, True),
        '保证人2:': (False, False),
    }.get(role_key)

    def fill(target, box, content):
        target['words'] = content.split(':')[-1]
        target['page'] = page_num
        target['position'] = box

    for box, content in text_spans:
        if re.match(role_key, content) is not None:
            fill(name, box, content)

        if quadrant is None:
            continue
        is_id = re.match('证件号码:', content) is not None
        is_representative = re.match('法定代表人或授权代表:', content) is not None
        if not (is_id or is_representative):
            continue

        wants_left, wants_above = quadrant
        centre_x = np.mean(box[::2])
        centre_y = np.mean(box[1::2])
        x_ok = centre_x < anchor[0] if wants_left else centre_x > anchor[0]
        y_ok = centre_y < anchor[1] if wants_above else centre_y > anchor[1]
        if not (x_ok and y_ok):
            continue

        if is_id:
            fill(id_num, box, content)
        if is_representative:
            fill(representative, box, content)
    return name, id_num, representative
1124
def get_value_by_findall(self, prefix, suffix, page_num):
    """Extract the text enclosed by ``prefix`` and ``suffix`` on one page.

    All span texts of the page are concatenated and the shortest run
    between the first ``prefix`` and the following ``suffix`` is taken.
    The position reported is the box of the last span that contains the
    extracted text in full; when the text is spread over several spans no
    span contains it and the result stays empty.

    ``prefix`` and ``suffix`` are matched literally.  They are escaped
    before being placed in the pattern, because delimiters such as
    '(请授权代表签字并请盖章)' would otherwise be read as regex groups and
    make ``re.findall`` return tuples instead of strings.

    Args:
        prefix: Literal text that precedes the value.
        suffix: Literal text that follows the value.
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        dict: A copy of ``self.item`` with 'words', 'position' and 'page'
        filled in when found.  A page that is not present in
        ``self.pdf_info`` yields the untouched copy instead of raising.
    """
    value = self.item.copy()
    # Callers pass fixed page numbers; short documents may not have them.
    if page_num not in self.pdf_info:
        return value

    text_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    all_text = ''.join(text for _, text in text_spans)

    words_list = re.findall(f"{re.escape(prefix)}(.*?){re.escape(suffix)}", all_text)
    if not words_list:
        return value

    found = words_list[0]
    for bbox, text in text_spans:
        if found in text:
            value['position'] = bbox
            value['page'] = page_num
            value['words'] = found
    return value
1149
def get_info(self):
    """Fill ``self.init_result`` with every field of the main contract.

    Only blocks with ``block['type'] == 0`` are inspected on page '0';
    their 'lines'/'spans' carry the text that is searched.

    Two layouts are handled.  When page '0' contains '共同承租人:' the
    roles and signatures are read with the layout-specific keys directly.
    Otherwise the regular keys are tried first and the other layout's keys
    are used as a fallback when a lookup comes back empty.

    Returns:
        dict: ``self.init_result``; returned unchanged when
        ``self.pdf_info`` is empty.
    """
    if len(self.pdf_info) == 0:
        return self.init_result

    def role(key):
        return self.get_role_info(role_key=key, page_num='0')

    def role_3_3(key):
        return self.get_role_info_3_3(role_key=key, page_num='0')

    def save_role(label, info):
        person, id_num, representative = info
        self.init_result[label + '-姓名'] = person
        self.init_result[label + '-证件号码'] = id_num
        self.init_result[label + '-法定代表人或授权代表'] = representative

    def sign_block(top, bottom):
        # The signer's name key doubles as the upper bound of the
        # signature area.
        signer = self.get_key_value(key=top)
        stamp = self.get_electronic_signature(top=top, bottom=bottom)
        return signer, stamp

    def save_signature(label, signed):
        signer, stamp = signed
        self.init_result['签字页-' + label + '姓名'] = signer
        self.init_result['签字页-' + label + '签章'] = stamp

    # Contract number printed on the first page.
    self.init_result['合同编号'] = self.get_contract_no(page_num='0')

    # Rough layout check: is '共同承租人:' present on the first page?
    is_cdfl = any(
        '共同承租人:' in span['text']
        for block in self.pdf_info['0']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )

    # ---- roles on the first page -------------------------------------
    if not is_cdfl:
        info = role('承租人:')
        if info[0]["words"] is None:
            info = role_3_3('承租人一:')
        save_role('承租人', info)

        # (regular key, regular label, fallback key, fallback label)
        fallbacks = (
            ('保证人1:', '保证人1', '共同承租人:', '共同承租人'),
            ('保证人2:', '保证人2', '保证人1:', '保证人2'),
            ('保证人3:', '保证人3', '保证人2:', '保证人3'),
        )
        for key, label, alt_key, alt_label in fallbacks:
            info = role(key)
            save_role(label, info)
            if info[0]["words"] is None:
                save_role(alt_label, role_3_3(alt_key))
    else:
        direct = (
            ('承租人一:', '承租人'),
            ('共同承租人:', '共同承租人'),
            ('保证人1:', '保证人1'),
            ('保证人2:', '保证人2'),
        )
        for key, label in direct:
            save_role(label, role_3_3(key))

    # ---- contract body -----------------------------------------------
    # Contract number inside the body text (no position is reported).
    self.init_result['合同编号(正文)'] = self.get_contract_no_one()
    # Vehicle identification number.
    self.init_result['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
    # Dealer, with a shorter label as fallback.
    dealer = self.get_key_value(key='车辆卖方(经销商):')
    if dealer['words'] is None:
        dealer = self.get_key_value(key='车辆卖方:')
    self.init_result['车辆卖方(经销商)'] = dealer
    # Original sales price of the vehicle.
    self.init_result['车辆原始销售价格(《机动车销售统一发票》所列金额)'] = self.get_key_value(
        key='车辆原始销售价格(《机动车销售统一发票》所列金额):')
    # Table of additional products.
    self.init_result['车辆附加产品明细表'] = self.get_table_add_product()
    # Total financing cost.
    self.init_result['融资成本总额'] = self.get_key_value(key='融资成本总额:')
    # Lease term.
    self.init_result['租期'] = self.get_key_value(key='租期:')
    # Repayment schedule table.
    self.init_result['付款计划表'] = self.get_repayment_schedule()
    # Receiving account (page '4') and debit account (page '5').
    for page, label in (('4', '承租人收款账户'), ('5', '承租人扣款账户')):
        self.init_result[label + '-户名'] = self.get_key_value(key='户名:', page_num=page)
        self.init_result[label + '-银行账号'] = self.get_key_value(key='银行账号:', page_num=page)
        self.init_result[label + '-开户行'] = self.get_key_value(key='开户银行:', page_num=page)

    # ---- signature page ----------------------------------------------
    # NOTE(review): the fallbacks below test against None in some places
    # and "" in others, exactly as before; which sentinel get_key_value
    # returns is not visible here -- confirm before unifying.
    if not is_cdfl:
        signed = sign_block('承租人姓名:', '保证人1姓名:')
        if signed[0]["words"] is None:
            signed = sign_block('承租人一姓名:', '共同承租人名称:')
        save_signature('承租人', signed)

        signed = sign_block('保证人1姓名:', '保证人2姓名:')
        save_signature('保证人1', signed)
        if signed[0]["words"] == "":
            save_signature('共同承租人', sign_block('共同承租人名称:', '保证人1姓名:'))

        signed = sign_block('保证人2姓名:', '保证人3姓名:')
        save_signature('保证人2', signed)
        if signed[0]["words"] == "":
            save_signature('保证人1', sign_block('保证人1姓名:', '保证人2姓名:'))

        signed = sign_block('保证人3姓名:', '日期:')
        save_signature('保证人3', signed)
        if signed[0]["words"] is None:
            save_signature('保证人2', sign_block('保证人2姓名:', '日期:'))
    else:
        signature_rows = (
            ('承租人', '承租人一姓名:', '共同承租人名称:'),
            ('共同承租人', '共同承租人名称:', '保证人1姓名:'),
            ('保证人1', '保证人1姓名:', '保证人2姓名:'),
            ('保证人2', '保证人2姓名:', '保证人3姓名:'),
        )
        for label, top, bottom in signature_rows:
            save_signature(label, sign_block(top, bottom))

    return self.init_result
1343
def get_info_1(self):
    """Fill ``self.init_result_1`` with the fields of the second contract type.

    Looks up the contract number, the lessee's name and ID number and the
    dealer on page '0', the contract number inside the body, and the
    lessee/dealer entries of the signature page.  Where the key lookup for
    the dealer returns an empty string, the value is re-read from the
    concatenated page text between two literal delimiters.

    Returns:
        dict: ``self.init_result_1``; returned unchanged when
        ``self.pdf_info`` is empty.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_1

    # First page: contract number, lessee name and ID number.
    self.init_result_1['合同编号'] = self.get_contract_no(page_num='0')
    self.init_result_1['承租人-姓名'] = self.get_key_value(key='承租人:', page_num='0')
    self.init_result_1['承租人-证件号码'] = self.get_key_value(key='证件号码:', page_num='0')

    # Dealer on the first page, falling back to the text before '地址:'.
    dealer = self.get_key_value(key='销售经销商:', page_num='0')
    if dealer['words'] == "":
        dealer = self.get_value_by_findall('销售经销商:', '地址:', page_num='0')
    self.init_result_1['销售经销商'] = dealer

    # Contract number inside the body text.
    self.init_result_1['合同编号(正文)'] = self.get_contract_no_one()

    # Signature page: lessee name, ID number and signature.
    self.init_result_1['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称:')
    self.init_result_1['签字页-承租人证件号码'] = self.get_key_value(key='自然人身份证件号码/法人执照号码:')
    self.init_result_1['签字页-承租人签章'] = self.get_signature_role_1()

    # Signature page: dealer.  The fallback handles a line such as
    # "销售经销商:<dealer name>(请授权代表签字并请盖章)" on page '3'.
    signing_dealer = self.get_key_value(key='销售经销商:')
    if signing_dealer['words'] == "":
        signing_dealer = self.get_value_by_findall('销售经销商:', '(请授权代表签字并请盖章)', page_num='3')
    self.init_result_1['签字页-销售经销商'] = signing_dealer

    # The dealer's seal is not extracted.
    return self.init_result_1
1380
def get_info_2(self):
    """Fill ``self.init_result_2`` with the fields of the mortgage contract.

    Collects the contract numbers, the mortgagor and spouse name/ID pairs,
    the vehicle identification number, the total rent, the lease term and
    the two name/signature pairs of the signature page.

    Returns:
        dict: ``self.init_result_2``; returned unchanged when
        ``self.pdf_info`` is empty.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_2

    # Contract number next to its label, then the one inside the body.
    self.init_result_2['合同编号'] = self.get_contract_no_dy()
    self.init_result_2['合同编号(正文)'] = self.get_contract_no_one()

    # Mortgagor name and ID number.
    owner_name, owner_id = self.get_dyr_name_id()
    self.init_result_2['抵押人姓名/名称'] = owner_name
    self.init_result_2['抵押人证件号码'] = owner_id

    # Mortgagor's spouse name and ID number.
    spouse_name, spouse_id = self.get_dyrpo_name_id()
    self.init_result_2['抵押人配偶姓名/名称'] = spouse_name
    self.init_result_2['抵押人配偶证件号码'] = spouse_id

    # Vehicle identification number, total rent and lease term.
    self.init_result_2['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
    self.init_result_2['租金总额'] = self.get_key_value_position(key='租金总额')
    self.init_result_2['融资租赁期限'] = self.get_key_value_position(key='融资租赁期限')

    # Signature page: mortgagor name and signature.
    signer = self.get_key_value(key='抵押人姓名:')
    stamp = self.get_electronic_signature(top='抵押权人盖章', bottom='抵押人配偶姓名:')
    self.init_result_2['签字页-抵押人姓名'] = signer
    self.init_result_2['签字页-抵押人签章'] = stamp

    # Signature page: spouse name and signature.
    signer = self.get_key_value(key='抵押人配偶姓名:')
    stamp = self.get_electronic_signature(top='抵押人配偶姓名:', bottom='日期')
    self.init_result_2['签字页-抵押人配偶姓名'] = signer
    self.init_result_2['签字页-抵押人配偶签章'] = stamp

    return self.init_result_2
...\ No newline at end of file ...\ No newline at end of file
...@@ -6,9 +6,10 @@ ...@@ -6,9 +6,10 @@
6 # @Description : 6 # @Description :
7 7
8 from .get_char import Finder 8 from .get_char import Finder
9 from .get_char_fsm import Finder as FSMFinder
9 10
10 11
11 def predict(pdf_info, file_cls): 12 def predict(pdf_info, file_cls, is_fsm=False):
12 """Summary 13 """Summary
13 14
14 Args: 15 Args:
...@@ -58,7 +59,11 @@ def predict(pdf_info, file_cls): ...@@ -58,7 +59,11 @@ def predict(pdf_info, file_cls):
58 pdf_info = dict() 59 pdf_info = dict()
59 for pno, page_info in enumerate(pdf_info_1): 60 for pno, page_info in enumerate(pdf_info_1):
60 pdf_info[str(pno)] = page_info 61 pdf_info[str(pno)] = page_info
61 f = Finder(pdf_info) 62
63 if is_fsm:
64 f = FSMFinder(pdf_info)
65 else:
66 f = Finder(pdf_info)
62 if file_cls == 0: 67 if file_cls == 0:
63 results = f.get_info() 68 results = f.get_info()
64 if file_cls == 1: 69 if file_cls == 1:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!