Merge branch 'feature/fsm-contract' into fix/report_ca
Showing
8 changed files
with
2389 additions
and
9 deletions
... | @@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10 | ... | @@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10 |
11 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' | 11 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' |
12 | 12 | ||
13 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT'] | 13 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT'] |
14 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] | 14 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT', 'OVP'] |
15 | COMPARE_DOC_SCHEME_LIST = ['CA', 'SE'] | 15 | COMPARE_DOC_SCHEME_LIST = ['CA', 'SE'] |
16 | 16 | ||
17 | HIL_PREFIX = 'HIL' | 17 | HIL_PREFIX = 'HIL' | ... | ... |
... | @@ -1476,7 +1476,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1476,7 +1476,8 @@ class Command(BaseCommand, LoggerMixin): |
1476 | 1476 | ||
1477 | # AFC合同 | 1477 | # AFC合同 |
1478 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): | 1478 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): |
1479 | ocr_result = afc_predict(pdf_handler.pdf_info) | 1479 | is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3] |
1480 | ocr_result = afc_predict(pdf_handler.pdf_info, is_fsm=is_fsm) | ||
1480 | page_res = {} | 1481 | page_res = {} |
1481 | for page_num, page_info in ocr_result.get('page_info', {}).items(): | 1482 | for page_num, page_info in ocr_result.get('page_info', {}).items(): |
1482 | if isinstance(page_num, str) and page_num.startswith('page_'): | 1483 | if isinstance(page_num, str) and page_num.startswith('page_'): |
... | @@ -1499,8 +1500,9 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1499,8 +1500,9 @@ class Command(BaseCommand, LoggerMixin): |
1499 | } | 1500 | } |
1500 | # HIL合同 | 1501 | # HIL合同 |
1501 | elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP: | 1502 | elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP: |
1503 | is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3] | ||
1502 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | 1504 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) |
1503 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | 1505 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1, is_fsm=is_fsm) |
1504 | rebuild_res_1 = {} | 1506 | rebuild_res_1 = {} |
1505 | page_res = {} | 1507 | page_res = {} |
1506 | for field_name, field_info in ocr_result_1.items(): | 1508 | for field_name, field_info in ocr_result_1.items(): |
... | @@ -1526,8 +1528,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1526,8 +1528,8 @@ class Command(BaseCommand, LoggerMixin): |
1526 | 'page_info': page_info | 1528 | 'page_info': page_info |
1527 | } | 1529 | } |
1528 | # hmh | 1530 | # hmh |
1529 | else: | 1531 | # else: |
1530 | pass | 1532 | # pass |
1531 | 1533 | ||
1532 | 1534 | ||
1533 | contract_res = {} | 1535 | contract_res = {} | ... | ... |
... | @@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum): | ... | @@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum): |
36 | DOCUPLOAD = (3, 'Document Upload') | 36 | DOCUPLOAD = (3, 'Document Upload') |
37 | SUBMITING = (4, 'Submiting') | 37 | SUBMITING = (4, 'Submiting') |
38 | UPLOADING = (5, 'Uploading') | 38 | UPLOADING = (5, 'Uploading') |
39 | OVP = (6, 'OVP') | ||
39 | 40 | ||
40 | 41 | ||
41 | class FailureReason(NamedEnum): | 42 | class FailureReason(NamedEnum): | ... | ... |
... | @@ -602,8 +602,9 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -602,8 +602,9 @@ class UploadDocView(GenericView, DocHandler): |
602 | is_zip = False | 602 | is_zip = False |
603 | 603 | ||
604 | classify_1 = 0 | 604 | classify_1 = 0 |
605 | # 电子合同 | 605 | # 电子合同 Econtract or OVP(FSM) |
606 | if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]: | 606 | if data_source == consts.DATA_SOURCE_LIST[2] or data_source == consts.DATA_SOURCE_LIST[3]: |
607 | if document_scheme == consts.DOC_SCHEME_LIST[1]: | ||
607 | for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): | 608 | for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): |
608 | if keyword in document_name: | 609 | if keyword in document_name: |
609 | classify_1 = classify_1_tmp | 610 | classify_1 = classify_1_tmp | ... | ... |
... | @@ -6,6 +6,7 @@ | ... | @@ -6,6 +6,7 @@ |
6 | # @Description : | 6 | # @Description : |
7 | 7 | ||
8 | from .get_char import Finder | 8 | from .get_char import Finder |
9 | from .get_char_fsm import Finder as FSMFinder | ||
9 | import numpy as np | 10 | import numpy as np |
10 | 11 | ||
11 | 12 | ||
... | @@ -23,7 +24,7 @@ def extract_info(ocr_results): | ... | @@ -23,7 +24,7 @@ def extract_info(ocr_results): |
23 | return {'page_1': {'合同编号': contract_no}} | 24 | return {'page_1': {'合同编号': contract_no}} |
24 | 25 | ||
25 | 26 | ||
26 | def predict(pdf_info, is_qrs=False): | 27 | def predict(pdf_info, is_qrs=False, is_fsm=False): |
27 | ocr_results = {} | 28 | ocr_results = {} |
28 | for pno in pdf_info: | 29 | for pno in pdf_info: |
29 | ocr_results[pno] = {} | 30 | ocr_results[pno] = {} |
... | @@ -50,6 +51,9 @@ def predict(pdf_info, is_qrs=False): | ... | @@ -50,6 +51,9 @@ def predict(pdf_info, is_qrs=False): |
50 | results = extract_info(ocr_results) | 51 | results = extract_info(ocr_results) |
51 | else: | 52 | else: |
52 | # 输入是整个 PDF 中的信息 | 53 | # 输入是整个 PDF 中的信息 |
54 | if is_fsm: | ||
55 | f = FSMFinder(pdf_info, ocr_results=ocr_results) | ||
56 | else: | ||
53 | f = Finder(pdf_info, ocr_results=ocr_results) | 57 | f = Finder(pdf_info, ocr_results=ocr_results) |
54 | results = f.get_info() | 58 | results = f.get_info() |
55 | return results | 59 | return results | ... | ... |
1 | import re | ||
2 | import numpy as np | ||
3 | from fuzzywuzzy import fuzz | ||
4 | from shapely.geometry import Polygon | ||
5 | |||
6 | |||
7 | class Finder: | ||
8 | |||
9 | def __init__(self, pdf_info, ocr_results): | ||
10 | self.pdf_info = pdf_info | ||
11 | self.ocr_results = ocr_results | ||
12 | self.is_asp = False | ||
13 | self.item = {"words": None, | ||
14 | "position": None, | ||
15 | } | ||
16 | |||
def gen_init_result(self, is_asp):
    """Build ``self.init_result``, the blank per-page output skeleton.

    Every leaf value is the shared ``self.item`` placeholder (the very same
    object, not a copy). ``is_asp`` is accepted but not consulted here.

    Args:
        is_asp: Unused by this method.
    """
    blank = self.item

    def fields(*names):
        # Fresh dict whose listed keys all point at the shared placeholder.
        return {name: blank for name in names}

    def principal():
        return fields("大写", "小写", "车辆贷款本金金额", "附加产品融资贷款本金总金额")

    def party():
        return fields("name", "id")

    def bank_account():
        return fields("账号", "户名", "开户行")

    def signed():
        return fields("签字", "日期")

    skeleton = {}
    skeleton["page_1"] = {
        "合同编号": blank,
        "所购车辆价格": blank,
        "车架号": blank,
        "贷款本金金额": principal(),
        "贷款期限": blank,
        "附加产品融资贷款本金总金额明细": blank,
        "借款人签字及时间": blank,
    }
    skeleton["page_2"] = {
        "合同编号": blank,
        "借款人及抵押人": party(),
        "共同借款人及共同抵押人": party(),
        "保证人1": party(),
        "保证人2": party(),
        "所购车辆价格": blank,
        "车架号": blank,
        "经销商": blank,
        "贷款本金金额": principal(),
        "贷款期限": blank,
        "标准利率": blank,
        "借款人收款账户": bank_account(),
        "还款账户": bank_account(),
    }
    skeleton["page_3"] = fields("合同编号", "还款计划表")
    skeleton["page_4"] = fields("合同编号", "附加产品融资贷款本金总金额明细")
    for page_key in ("page_5", "page_6", "page_7"):
        skeleton[page_key] = fields("合同编号")
    skeleton["page_8"] = {
        "合同编号": blank,
        "主借人签字": signed(),
        "共借人签字": signed(),
        "保证人1签字": signed(),
        "保证人2签字": signed(),
        "见证人签字": signed(),
    }
    self.init_result = skeleton
93 | |||
def get_top_iou(self, poly, ocr_result):
    """Find the OCR entry whose polygon overlaps ``poly`` the most.

    Args:
        poly: Flat coordinate sequence (x0, y0, x1, y1, ...) of the query
            polygon; it is reshaped to (-1, 2) before use.
        ocr_result: Mapping of key -> (bbox, text), where bbox uses the same
            flat coordinate layout.

    Returns:
        ``[iou, key]`` for the entry with the highest intersection-over-union,
        or ``(-1, -1)`` when no entry could be scored.
    """
    if not ocr_result:
        return -1, -1
    # The query polygon never changes, so build and validate it only once.
    query = Polygon(np.array(poly).reshape((-1, 2)))
    if not query.is_valid:
        return -1, -1

    best = None
    for key in ocr_result:
        bbox, _text = ocr_result[key]
        candidate = Polygon(np.array(bbox).reshape((-1, 2)))
        if not candidate.is_valid:
            continue
        inter = candidate.intersection(query).area
        union = candidate.area + query.area - inter
        if union <= 0:
            # Zero-area shapes have no meaningful IoU; skipping them avoids a
            # ZeroDivisionError.
            continue
        iou = inter / union
        # ">=" keeps the later entry on ties, like taking the last element of
        # a stable ascending sort.
        if best is None or iou >= best[0]:
            best = [iou, key]
    if best is None:
        return -1, -1
    return best
115 | |||
def poly_to_rectangle(self, poly):
    """Return the axis-aligned bounding box of a four-point polygon.

    The earlier implementation unpacked into repeated names and therefore
    returned ``[poly[6], poly[3], poly[4], poly[7]]``. That equals the bounding
    box only for an axis-aligned rectangle listed from its top-left corner;
    taking min/max gives the same answer there and a correct box for skewed
    quadrilaterals.

    Args:
        poly: Eight numbers laid out as x0, y0, x1, y1, x2, y2, x3, y3.

    Returns:
        ``[xmin, ymin, xmax, ymax]``.

    Raises:
        ValueError: If ``poly`` does not hold exactly eight values.
    """
    coords = list(poly)
    if len(coords) != 8:
        raise ValueError(
            f'expected 8 coordinates (4 points), got {len(coords)}')
    xs = coords[0::2]
    ys = coords[1::2]
    return [min(xs), min(ys), max(xs), max(ys)]
120 | |||
def get_contract_no(self, page_num):
    """Extract the contract number from the OCR entries of one page.

    Every OCR entry of the page is inspected; when several contain the
    '合同编号:' marker, the last one wins.

    Args:
        page_num (str): Page key into ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item with 'words' (the text after the last
        ':') and 'position' (rectangle from ``poly_to_rectangle``) filled in
        when a match was found.
    """
    result = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if '合同编号:' not in text:
            continue
        number = text.split(':')[-1]
        rectangle = self.poly_to_rectangle(bbox)
        result['words'], result['position'] = number, rectangle
    return result
142 | |||
def get_vehicle_price(self, page_num='0'):
    """Extract the purchased-vehicle price from the OCR entries of one page.

    An entry matches when it contains '所购车辆价格为人民币'; the value is the
    text after the last '币'. The last matching entry wins.

    Args:
        page_num (str): Page key into ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item, filled in when a match was found.
    """
    result = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if '所购车辆价格为人民币' not in text:
            continue
        amount = text.split('币')[-1]
        rectangle = self.poly_to_rectangle(bbox)
        result['words'], result['position'] = amount, rectangle
    return result
155 | |||
def get_vin(self, page_num='0'):
    """Extract the vehicle identification number from one page's OCR entries.

    An entry matches when it contains '车架号:'; the value is the text after
    the last ':'. The last matching entry wins.

    Args:
        page_num (str): Page key into ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item, filled in when a match was found.
    """
    result = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if '车架号:' not in text:
            continue
        number = text.split(':')[-1]
        rectangle = self.poly_to_rectangle(bbox)
        result['words'], result['position'] = number, rectangle
    return result
168 | |||
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from the text spans of one page.

    Args:
        page_num (str): Page key into ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of the blank item.
        ``upper`` is the last span whose fuzzy ratio against the Chinese
        capital-numeral characters exceeds 15; ``lower`` is the last span
        containing '小写:¥'. ``asp_1`` / ``asp_2`` are bracketed
        '人民币:小写:[...]' amounts whose mean y is smaller / larger than that
        of the span reading exactly '附加产品融资贷款本金总金额'. They stay blank
        when that span is absent.
    """
    def text_spans():
        # Yield (bbox, text) for every span of the page's type-0 blocks.
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    # Characters used when an amount is written in Chinese capital numerals;
    # joined once instead of on every span.
    capital_chars = ''.join(['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                             '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'])
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None
    for bbox, text in text_spans():
        if fuzz.ratio(capital_chars, text) > 15:
            # The trimmed text deliberately replaces ``text`` for the checks
            # below, as the code has always done.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in text_spans():
            if '人民币:小写:' not in text:
                continue
            found = re.search(r'人民币:小写:\[(.*)\]', text)
            if found is None:
                # Marker present but no bracketed amount: skip the span rather
                # than raising IndexError.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = found.group(1)
            if span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = found.group(1)
    return upper, lower, asp_1, asp_2
209 | |||
def get_loan_term(self, page_num='0'):
    """Extract the loan term in months from the text spans of one page.

    The texts of all type-0 spans are concatenated and searched for
    '贷款期限<digits>个月'. The digits become 'words', and the position is the
    bbox of the last span containing '<digits>个月'.

    Args:
        page_num (str): Page key into ``self.pdf_info``.

    Returns:
        dict: Copy of the blank item, filled in when the term was found.
    """
    def page_spans():
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    loan_term = self.item.copy()
    joined = ''.join(text for _bbox, text in page_spans())
    found = re.search(r'贷款期限(\d+)个月', joined)
    if not found:
        return loan_term

    months = found.group(1)
    needle = f'{months}个月'
    for bbox, text in page_spans():
        if needle in text:
            loan_term['position'] = bbox
            loan_term['words'] = months
    return loan_term
233 | |||
def get_standard_rate(self, page_num='0'):
    """Extract the standard interest rate from the text spans of one page.

    A span matches when its own text contains
    '本合同当期的标准利率为<value>%/年'. The last matching span wins.

    Args:
        page_num (str): Page key into ``self.pdf_info``.

    Returns:
        dict: Copy of the blank item with 'words' set to the captured value
        and 'position' to the span bbox when a match was found.
    """
    standard_rate = self.item.copy()
    text_blocks = (b for b in self.pdf_info[page_num]['blocks'] if b['type'] == 0)
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                bbox = span['bbox']
                hit = re.search(r'本合同当期的标准利率为(\S+)%/年', span['text'])
                if hit is None:
                    continue
                standard_rate['position'] = bbox
                standard_rate['words'] = hit.group(1)
    return standard_rate
247 | |||
def mergelist(self, text_list):
    """Join each '所购...' entry with a following entry that continues it.

    An entry containing '所购' is merged with its successor whenever the
    successor contains at least one Chinese character. Merging repeats until
    no such pair remains; on each pass the last qualifying pair is merged.

    An entry containing '所购' in the final position has no successor and is
    left as is (this previously raised IndexError).

    Args:
        text_list (list[str]): Cell texts in reading order.

    Returns:
        list[str]: ``text_list`` itself when nothing was merged, otherwise a
        new list with the merged entries.
    """
    non_chinese = re.compile("[^\u4e00-\u9fa5]")  # anything that is not a Chinese character
    items = text_list
    while True:
        merge_at = -1
        # Stop one short of the end: the final entry has nothing to merge with.
        for index in range(len(items) - 1):
            if '所购' in items[index] and non_chinese.sub('', items[index + 1]):
                merge_at = index
        if merge_at == -1:
            return items
        merged = items[merge_at] + items[merge_at + 1]
        items = items[:merge_at] + [merged] + items[merge_at + 2:]
260 | |||
def get_asp_details(self, page_num):
    """Rebuild the add-on product financing details table from OCR boxes.

    The three column headers ('项目1', '用途总金额2', '贷款本金3') are located
    among the page's OCR entries. The method then walks down the first column
    row by row, using ``get_top_iou`` to pick the OCR entry that best overlaps
    a shifted copy of the previous cell, and reads the other two columns at
    the same vertical position.

    Args:
        page_num: Page key into ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item whose 'words' is the table as a list of
        rows. 'position' is never set here.

    NOTE(review): the source this was recovered from had lost its
    indentation, so the nesting below (in particular whether the final
    'words' assignment belongs inside ``if bbox_total``) was inferred and
    should be confirmed against the repository.
    """
    asp_details_table_term = self.item.copy()

    # Title row and header row are always present in the output table.
    asp_details_table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]

    # Header / total-row anchors, named after the header texts matched below:
    # xm = '项目1', ytzje = '用途总金额2', dkbj = '贷款本金3'.
    bbox_xm = None
    bbox_ytzje = None
    bbox_dkbj = None
    bbox_total = None
    for key in self.ocr_results[page_num]:
        bbox, text = self.ocr_results[page_num][key]
        if text == '项目1':
            bbox_xm = bbox
        if text == '用途总金额2':
            bbox_ytzje = bbox
        if text == '贷款本金3':
            bbox_dkbj = bbox
        # Several spellings are accepted for the total row's label.
        if text in ['附加产品融资贷款本', '附加产品融资贷款本金', '附加产品融资贷']:
            bbox_total = bbox

    if bbox_xm:
        # At most 10 steps down the first column.
        for i in range(10):
            # Difference between the second and the last coordinate of the
            # current cell, used as the row height.
            rh = abs(bbox_xm[1]-bbox_xm[-1])
            anchor = np.array(bbox_xm).reshape((-1 ,2))
            # Shift the y coordinates by 1.4 row heights to probe the next row.
            anchor[:, 1] += int(rh*1.4)
            _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
            if _iou > 0:
                bbox, xm_text = self.ocr_results[page_num][_key]
                bbox_xm = bbox
                # Handle an item description that wraps onto a second line:
                # text without '所购' is appended to the previous row's first cell.
                if not '所购' in xm_text:
                    line = asp_details_table[-1]
                    line[0] += xm_text
                    asp_details_table[-1] = line
                    continue
                # print(xm_text)
                # Same vertical extent as the item cell, horizontal extent of
                # the second header.
                # NOTE(review): bbox_ytzje / bbox_dkbj are still None if their
                # headers were not found, which would fail here; the _key
                # returned below is also used without checking for -1.
                anchor_1 = [bbox_ytzje[0], bbox[1], bbox_ytzje[2], bbox[3],
                            bbox_ytzje[4], bbox[5], bbox_ytzje[6], bbox[7]]
                _iou, _key = self.get_top_iou(poly=anchor_1, ocr_result=self.ocr_results[page_num])
                bbox, ytzje_text = self.ocr_results[page_num][_key]
                # print(ytzje_text)
                # Same again for the third header's column.
                anchor_2 = [bbox_dkbj[0], bbox[1], bbox_dkbj[2], bbox[3],
                            bbox_dkbj[4], bbox[5], bbox_dkbj[6], bbox[7]]
                _iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=self.ocr_results[page_num])
                bbox, dkbj_text = self.ocr_results[page_num][_key]
                # print(dkbj_text)
                # Both lookups returned the same text: the first two cells were
                # recognised as one entry, so split it on the space.
                # NOTE(review): the unpacking requires exactly one space.
                if xm_text == ytzje_text:
                    xm_text, ytzje_text = xm_text.split(' ')
                line = [xm_text, ytzje_text, dkbj_text]
                asp_details_table.append(line)
            else:
                # Nothing overlaps the probe, so stop walking.
                break

    if bbox_total:
        # Read the total from the third header's column at the total label's
        # vertical position.
        anchor = [bbox_dkbj[0], bbox_total[1], bbox_dkbj[2], bbox_total[3],
                  bbox_dkbj[4], bbox_total[5], bbox_dkbj[6], bbox_total[7]]
        _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
        bbox, total_text = self.ocr_results[page_num][_key]
        asp_details_table.append(['附加产品融资贷款本金总金额:', '', total_text])
    asp_details_table_term['words'] = asp_details_table

    return asp_details_table_term
323 | |||
def get_signature(self):
    """Find the signing-date text on page '0'.

    The last type-0 span whose text contains '签署日期' wins.

    Returns:
        dict: Copy of the blank item with 'words' set to the whole span text
        and 'position' to its bbox when a match was found.
    """
    signature = self.item.copy()
    text_blocks = [b for b in self.pdf_info['0']['blocks'] if b['type'] == 0]
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if '签署日期' not in text:
                    continue
                signature.update(words=text, position=bbox)
    return signature
338 | |||
def get_somebody(self, top, bottom):
    """Return the name and ID number found between two marker texts on page '1'.

    The vertical band is bounded by ``bbox[3]`` of the last span containing
    ``top`` and of the last span containing ``bottom`` (both default to 0 when
    absent). Only spans whose ``bbox[3]`` lies strictly inside that band are
    considered.

    Args:
        top (str): Text marking the upper boundary.
        bottom (str): Text marking the lower boundary.

    Returns:
        tuple: ``(_name, _id)``, copies of the blank item filled from spans
        containing '姓名/名称' and '自然人身份证件号码/法人执照号码' respectively.
    """
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # First pass: work out the band limits.
    y_top = y_bottom = 0
    for bbox, text in spans:
        if top in text:
            y_top = bbox[3]
        if bottom in text:
            y_bottom = bbox[3]

    # Second pass: pick the labelled values that sit inside the band.
    _name = self.item.copy()
    _id = self.item.copy()
    for bbox, text in spans:
        if not (y_top < bbox[3] < y_bottom):
            continue
        if '姓名/名称' in text:
            _name['position'] = bbox
            _name['words'] = text.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in text:
            _id['position'] = bbox
            _id['words'] = text.split(':')[-1]
    return _name, _id
373 | |||
def get_seller(self):
    """Find the seller value printed next to its label on page '1'.

    The label is the last span whose text is exactly '经销商' or '车辆销售方'.
    The value is the last span whose mean x lies between the label's
    ``bbox[2]`` and half the page width, and whose mean y lies between the
    label's ``bbox[1]`` and ``bbox[3]``.

    Returns:
        dict: Copy of the blank item, filled in when a value was found.
    """
    page = self.pdf_info['1']

    def page_spans():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    seller = self.item.copy()

    # Locate the label first.
    anchor_bbox = None
    for bbox, text in page_spans():
        if text in ['经销商', '车辆销售方']:
            anchor_bbox = bbox
    if not anchor_bbox:
        return seller

    # Then match the value against the label's position.
    half_width = page['width'] * 0.5
    for bbox, text in page_spans():
        x_ok = anchor_bbox[2] < np.mean(bbox[::2]) < half_width
        if not x_ok:
            continue
        if anchor_bbox[1] < np.mean(bbox[1::2]) < anchor_bbox[3]:
            seller['position'] = bbox
            seller['words'] = text
    return seller
400 | |||
def get_borrower_collection_account(self):
    """Extract the borrower's receiving account details from page '1'.

    Nothing is extracted unless the concatenated page text contains
    '借款人收款账户'. Each value is captured from the space-stripped text with a
    regular expression and then positioned at the last span whose text
    contains that value.

    Returns:
        tuple: ``(account, account_name, account_bank)``, copies of the blank
        item filled in where a value was found.
    """
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    all_text = ''.join(text for _bbox, text in spans)
    # Decide which account layout this is; only the layout that prints the
    # account inline (not "to be notified separately") is extracted.
    if '借款人收款账户' not in all_text:
        return account, account_name, account_bank

    all_text = all_text.replace(' ', '').replace(' ', '')
    targets = (
        (r'账号:(.*?)户名', account),
        (r'户名:(.*?)开户行', account_name),
        (r'开户行:(.*?)借款人', account_bank),
    )
    for pattern, field in targets:
        hits = re.findall(pattern, all_text)
        if not hits:
            continue
        value = hits[0]
        for bbox, text in spans:
            if value in text:
                field['position'] = bbox
                field['words'] = value
    return account, account_name, account_bank
453 | |||
def get_payback_account(self):
    """Extract the repayment account details from page '1'.

    Nothing is extracted unless the concatenated page text contains
    '(13) 还款账户'. Only the text after the last occurrence of that marker,
    with spaces removed, is searched.

    Returns:
        tuple: ``(account, account_name, account_bank)``, copies of the blank
        item filled in where a value was found. The number and the name are
        positioned at the last span containing the value; the bank is
        positioned at the last span whose space-stripped text contains
        '开户行:<bank>;'.
    """
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    all_text = ''.join(text for _bbox, text in spans)
    # Decide which account layout this is; only the layout that prints the
    # account inline (not "to be notified separately") is extracted.
    if '(13) 还款账户' not in all_text:
        return account, account_name, account_bank

    tail = all_text.split('(13) 还款账户')[-1]
    tail = tail.replace(' ', '').replace(' ', '')

    number_hits = re.findall(r'账号:(.*?)户名', tail)
    if number_hits:
        number = number_hits[0]
        for bbox, text in spans:
            if number in text:
                account['position'] = bbox
                account['words'] = number

    holder_hits = re.findall(r'户名:(.*?)开户行', tail)
    if holder_hits:
        holder = holder_hits[0]
        for bbox, text in spans:
            if holder in text:
                account_name['position'] = bbox
                account_name['words'] = holder

    bank_hits = re.findall(r'开户行:(.*?);', tail)
    if bank_hits:
        bank = bank_hits[0]
        labelled = f'开户行:{bank};'
        for bbox, text in spans:
            if labelled in text.replace(' ', ''):
                account_bank['position'] = bbox
                account_bank['words'] = bank
    return account, account_name, account_bank
507 | |||
def get_repayment_schedule(self):
    """Collect the repayment schedule table from page '2'.

    Span texts are gathered from the span reading exactly '序号' up to (not
    including) the span containing '以上表格中所列的序号并非还款期数'. They are
    then grouped into rows of five cells. Grouping stops at the first row
    whose second cell equals that row's 1-based index.

    Returns:
        dict: Copy of the blank item whose 'words' is the list of rows when at
        least one row was collected. 'position' is never set here.
    """
    repayment_schedule = self.item.copy()

    # Only page '2' is inspected.
    cells = []
    inside_table = False
    for block in self.pdf_info['2']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if text == '序号':
                    inside_table = True
                if '以上表格中所列的序号并非还款期数' in text:
                    inside_table = False
                if inside_table:
                    cells.append(text)

    columns = 5  # the table has five columns
    rows = []
    for row_no in range(len(cells) // columns):
        row = cells[row_no * columns:(row_no + 1) * columns]
        if row[1] == str(row_no + 1):
            break
        rows.append(row)

    if rows:
        repayment_schedule['words'] = rows
    return repayment_schedule
542 | |||
def get_signature_role_1(self):
    """Inspect the signature area that starts at '借款人(抵押人)'.

    Across all pages, type-0 spans are collected from a span containing
    '借款人(抵押人)' up to (not including) the next span containing '日期'.

    Returns:
        dict: Copy of the blank item with
        'words' = '有' when more than four spans were collected, else '无';
        'position' = [xmin, ymin, xmax, ymax] over the collected bboxes, or
        None when nothing was collected (this previously raised ValueError);
        'page_num' = key of the last page that contributed a span, or None.
    """
    # ``init_item`` is not assigned by ``__init__``; fall back to the blank
    # ``item`` template instead of raising AttributeError.
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_1 = template.copy()

    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in list(self.pdf_info.keys()):
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '借款人(抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_1['page_num'] = page_num
    signature_role_1['position'] = position
    signature_role_1['words'] = words
    return signature_role_1
577 | |||
def get_signature_role_2(self):
    """Inspect the signature area that starts at '共同借款人(共同抵押人)'.

    Across all pages, type-0 spans are collected from a span containing
    '共同借款人(共同抵押人)' up to (not including) the next span containing
    '日期'.

    Returns:
        dict: Copy of the blank item with
        'words' = '有' when more than four spans were collected, else '无';
        'position' = [xmin, ymin, xmax, ymax] over the collected bboxes, or
        None when nothing was collected (this previously raised ValueError);
        'page_num' = key of the last page that contributed a span, or None.
    """
    # ``init_item`` is not assigned by ``__init__``; fall back to the blank
    # ``item`` template instead of raising AttributeError.
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_2 = template.copy()

    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in list(self.pdf_info.keys()):
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
612 | |||
def get_signature_role_3(self):
    """Inspect the signature area that starts at '保证人1'.

    Across all pages, type-0 spans are collected from a span containing
    '保证人1' (ignored on the page whose key converts to 0) up to (not
    including) the next span containing '日期'.

    Returns:
        dict: Copy of the blank item with
        'words' = '有' when more than four spans were collected, else '无';
        'position' = [xmin, ymin, xmax, ymax] over the collected bboxes, or
        None when nothing was collected (this previously raised ValueError);
        'page_num' = key of the last page that contributed a span, or None.
    """
    # ``init_item`` is not assigned by ``__init__``; fall back to the blank
    # ``item`` template instead of raising AttributeError.
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_3 = template.copy()

    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in list(self.pdf_info.keys()):
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
647 | |||
def get_signature_role_4(self):
    """Inspect the signature area that starts at '保证人2'.

    Across all pages, type-0 spans are collected from a span containing
    '保证人2' (ignored on the page whose key converts to 0) up to (not
    including) the next span containing '日期'.

    Returns:
        dict: Copy of the blank item with
        'words' = '有' when more than four spans were collected, else '无';
        'position' = [xmin, ymin, xmax, ymax] over the collected bboxes, or
        None when nothing was collected (this previously raised ValueError);
        'page_num' = key of the last page that contributed a span, or None.
    """
    # ``init_item`` is not assigned by ``__init__``; fall back to the blank
    # ``item`` template instead of raising AttributeError.
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_4 = template.copy()

    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in list(self.pdf_info.keys()):
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
682 | |||
def get_signature_role_5(self):
    """Inspect the signature area that starts at '见证人签字'.

    Across all pages, type-0 spans are collected from a span containing
    '见证人签字' (ignored on the page whose key converts to 0) up to (not
    including) the next span containing '年'.

    Returns:
        dict: Copy of the blank item with
        'words' = '有' when more than four spans were collected, else '无';
        'position' = [xmin, ymin, xmax, ymax] over the collected bboxes, or
        None when nothing was collected (this previously raised ValueError);
        'page_num' = key of the last page that contributed a span, or None.
    """
    # ``init_item`` is not assigned by ``__init__``; fall back to the blank
    # ``item`` template instead of raising AttributeError.
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_5 = template.copy()

    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in list(self.pdf_info.keys()):
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(i) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    # The leftover debug print of ``texts`` was dropped.
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
718 | |||
def get_last_page_signature(self, page_num, top, bottom):
    """Read a signer name and signing date between two marker texts on a page.

    The band is bounded by ``bbox[1]`` of the last span containing ``top`` and
    of the last span containing ``bottom``; nothing is extracted unless both
    exist. Within the band (limits truncated to int, compared with the span's
    mean y), the last span containing '签署日期' supplies both values.

    Args:
        page_num: Page key into ``self.pdf_info``.
        top (str): Text marking the start of the band.
        bottom (str): Text marking the end of the band.

    Returns:
        tuple: ``(signature_name, signature_date)``, copies of the blank item.
        The name is the text before the first space, the date is the text
        after the last ':'; both carry the same span bbox.
    """
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    signature_name = self.item.copy()
    signature_date = self.item.copy()

    anchor_top = anchor_bottom = None
    for bbox, text in spans:
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]
    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in spans:
        if '签署日期' not in text:
            continue
        if not (int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom)):
            continue
        signature_name['words'] = text.split(' ')[0]
        signature_name['position'] = bbox
        signature_date['words'] = text.split(':')[-1]
        signature_date['position'] = bbox
    return signature_name, signature_date
750 | |||
def get_info(self):
    """Extract every supported field of the contract and assemble the result.

    Sets ``self.is_asp`` when the OCR text of the first page contains
    '附加产品融资贷款本金总金额', builds ``self.init_result`` through
    ``gen_init_result`` and, for documents with at most 8 OCR pages, fills
    the per-page entries by calling the individual ``get_*`` extractors.

    The extractors visible in this file read ``block['lines']`` only from
    blocks with ``block['type'] == 0``, so type 0 is treated as a text
    block (PyMuPDF's convention - presumably the origin of ``pdf_info``).

    Returns:
        dict: ``{"is_asp": self.is_asp, "page_info": self.init_result}``.
    """
    # An ASP product is recognised by this sentence on the first page.
    for _, text in self.ocr_results['0'].values():
        if '附加产品融资贷款本金总金额' in text:
            self.is_asp = True
            break

    self.gen_init_result(self.is_asp)

    # Version 8.5 samples from the customer have text running across pages
    # and cannot be recognised yet, so longer documents are skipped.
    # NOTE(review): documents with fewer than 8 pages also enter this branch
    # and the fixed page keys below ('1' .. '7') are not guarded.
    if len(self.ocr_results) <= 8:
        # ---------------- Page 1 ----------------
        self.init_result['page_1']['合同编号'] = self.get_contract_no(page_num='0')
        self.init_result['page_1']['所购车辆价格'] = self.get_vehicle_price()
        self.init_result['page_1']['车架号'] = self.get_vin()
        # For ASP products the loan principal also carries the vehicle loan
        # principal and the additional-product financing principal.
        upper, lower, asp_1, asp_2 = self.get_loan_principal()
        self.init_result['page_1']['贷款本金金额']['大写'] = upper
        self.init_result['page_1']['贷款本金金额']['小写'] = lower
        self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1
        self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
        self.init_result['page_1']['贷款期限'] = self.get_loan_term()
        # Additional-product financing details (ASP table).
        self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='0')
        # Borrower signature and date.
        self.init_result['page_1']['借款人签字及时间'] = self.get_signature()

        # ---------------- Page 2 ----------------
        # NOTE(review): the page-2 contract number is read from page '0',
        # unlike pages 3-8 which use their own page - confirm it is intended.
        self.init_result['page_2']['合同编号'] = self.get_contract_no(page_num='0')
        # Borrower / mortgagor.
        borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:')
        # Fallback kept for compatibility with version 8.1.
        if borrower_name['words'] is None:
            borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
        # Fallback for the version where vehicle and loan are separated.
        if borrower_name['words'] is None:
            borrower_name, borrower_id = self.get_somebody(top='借款人:', bottom='共同借款人及抵押人:')
        self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name
        self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id
        # Co-borrower / co-mortgagor.
        co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:')
        self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
        self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
        # Guarantor 1.
        first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:')
        self.init_result['page_2']['保证人1']['name'] = first_guarantor_name
        self.init_result['page_2']['保证人1']['id'] = first_guarantor_id
        # Guarantor 2.
        second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章')
        self.init_result['page_2']['保证人2']['name'] = second_guarantor_name
        self.init_result['page_2']['保证人2']['id'] = second_guarantor_id
        self.init_result['page_2']['所购车辆价格'] = self.get_vehicle_price(page_num='1')
        self.init_result['page_2']['车架号'] = self.get_vin(page_num='1')
        # Dealer.
        self.init_result['page_2']['经销商'] = self.get_seller()
        upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
        self.init_result['page_2']['贷款本金金额']['大写'] = upper
        self.init_result['page_2']['贷款本金金额']['小写'] = lower
        self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1
        self.init_result['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
        self.init_result['page_2']['贷款期限'] = self.get_loan_term(page_num='1')
        # Standard interest rate of the contract for the current period.
        self.init_result['page_2']['标准利率'] = self.get_standard_rate(page_num='1')
        # Borrower collection account (added in the 202212 release).
        account, account_name, account_bank = self.get_borrower_collection_account()
        self.init_result['page_2']['借款人收款账户']['账号'] = account
        self.init_result['page_2']['借款人收款账户']['户名'] = account_name
        self.init_result['page_2']['借款人收款账户']['开户行'] = account_bank
        # Repayment account.
        account, account_name, account_bank = self.get_payback_account()
        self.init_result['page_2']['还款账户']['账号'] = account
        self.init_result['page_2']['还款账户']['户名'] = account_name
        self.init_result['page_2']['还款账户']['开户行'] = account_bank

        # ---------------- Page 3 ----------------
        self.init_result['page_3']['合同编号'] = self.get_contract_no(page_num='2')
        # Repayment schedule (table).
        self.init_result['page_3']['还款计划表'] = self.get_repayment_schedule()

        # ---------------- Page 4 ----------------
        self.init_result['page_4']['合同编号'] = self.get_contract_no(page_num='3')
        self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='3')

        # ---------------- Pages 5-7 ----------------
        self.init_result['page_5']['合同编号'] = self.get_contract_no(page_num='4')
        self.init_result['page_6']['合同编号'] = self.get_contract_no(page_num='5')
        self.init_result['page_7']['合同编号'] = self.get_contract_no(page_num='6')

        # ---------------- Page 8 (signature page) ----------------
        self.init_result['page_8']['合同编号'] = self.get_contract_no(page_num='7')
        # Main borrower; the second call covers the alternative label.
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='合同编号', bottom='共同借款人')
        if signature_name['words'] is None:
            signature_name, signature_date = self.get_last_page_signature(
                page_num='7', top='合同编号', bottom='共同借款人(抵押人)')
        self.init_result['page_8']['主借人签字']['签字'] = signature_name
        self.init_result['page_8']['主借人签字']['日期'] = signature_date
        # Co-borrower.
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='共同借款人', bottom='保证人1')
        if signature_name['words'] is None:
            signature_name, signature_date = self.get_last_page_signature(
                page_num='7', top='共同借款人(抵押人)', bottom='保证人1')
        self.init_result['page_8']['共借人签字']['签字'] = signature_name
        self.init_result['page_8']['共借人签字']['日期'] = signature_date
        # Guarantor 1.
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='保证人1', bottom='保证人2')
        self.init_result['page_8']['保证人1签字']['签字'] = signature_name
        self.init_result['page_8']['保证人1签字']['日期'] = signature_date
        # Guarantor 2.
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='保证人2', bottom='在本人面前亲笔签署本合同')
        self.init_result['page_8']['保证人2签字']['签字'] = signature_name
        self.init_result['page_8']['保证人2签字']['日期'] = signature_date
        # Witness.
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='在本人面前亲笔签署本合同', bottom='以下无正文')
        self.init_result['page_8']['见证人签字']['签字'] = signature_name
        self.init_result['page_8']['见证人签字']['日期'] = signature_date

    # Final output layout.
    new_results = {"is_asp": self.is_asp,
                   "page_info": self.init_result
                   }
    return new_results
... | \ No newline at end of file | ... | \ No newline at end of file |
1 | import re | ||
2 | import numpy as np | ||
3 | from fuzzywuzzy import fuzz | ||
4 | from shapely.geometry import Polygon | ||
5 | |||
def caculate_iou(g, p):
    """Return the intersection-over-union of two polygons.

    Args:
        g: flat coordinate sequence ``[x0, y0, x1, y1, ...]`` of the first
            polygon (anything ``np.array(...).reshape((-1, 2))`` accepts).
        p: flat coordinate sequence of the second polygon.

    Returns:
        float: intersection area divided by union area, or 0.0 when the
        union area is zero (degenerate polygons) instead of raising
        ZeroDivisionError.
    """
    poly_g = Polygon(np.array(g).reshape((-1, 2)))
    poly_p = Polygon(np.array(p).reshape((-1, 2)))
    inter = poly_g.intersection(poly_p).area
    union = poly_g.area + poly_p.area - inter
    if union == 0:
        return 0.0
    return inter / union
12 | |||
def get_table_info(bbox_1, bbox_2, ocr_result):
    """Return the text of the cell where bbox_1's row meets bbox_2's column.

    Args:
        bbox_1: 8-value box supplying the y coordinates of the anchor.
        bbox_2: 8-value box supplying the x coordinates of the anchor.
        ocr_result: iterable of entries whose item 0 is a box and item 1
            is the recognised text.

    Returns:
        The text of the last entry overlapping the anchor (IoU > 0), or ''
        when no entry overlaps.
    """
    # Even indices (x) come from bbox_2, odd indices (y) from bbox_1.
    anchor = [bbox_1[k] if k % 2 else bbox_2[k] for k in range(8)]
    overlapping = [entry[1] for entry in ocr_result
                   if caculate_iou(anchor, entry[0]) > 0]
    return overlapping[-1] if overlapping else ''
22 | |||
23 | class Finder: | ||
24 | |||
25 | def __init__(self, pdf_info): | ||
26 | self.pdf_info = pdf_info | ||
27 | self.item = {"words": None, | ||
28 | "page": None, | ||
29 | "position": None, | ||
30 | } | ||
31 | # 格式化算法输出 | ||
32 | self.init_result = {"合同编号": self.item, | ||
33 | "承租人-姓名": self.item, | ||
34 | "承租人-证件号码": self.item, | ||
35 | "承租人-法定代表人或授权代表": self.item, | ||
36 | |||
37 | "共同承租人-姓名": self.item, | ||
38 | "共同承租人-证件号码": self.item, | ||
39 | "共同承租人-法定代表人或授权代表": self.item, | ||
40 | |||
41 | "保证人1-姓名": self.item, | ||
42 | "保证人1-证件号码": self.item, | ||
43 | "保证人1-法定代表人或授权代表": self.item, | ||
44 | |||
45 | "保证人2-姓名": self.item, | ||
46 | "保证人2-证件号码": self.item, | ||
47 | "保证人2-法定代表人或授权代表": self.item, | ||
48 | "保证人3-姓名": self.item, | ||
49 | "保证人3-证件号码": self.item, | ||
50 | "保证人3-法定代表人或授权代表": self.item, | ||
51 | "合同编号(正文)": self.item, | ||
52 | "车辆识别代码": self.item, | ||
53 | "车辆卖方(经销商)": self.item, | ||
54 | "车辆原始销售价格(《机动车销售统一发票》所列金额)": self.item, | ||
55 | "车辆附加产品明细表": self.item, | ||
56 | "融资成本总额": self.item, | ||
57 | "租期": self.item, | ||
58 | "付款计划表": self.item, | ||
59 | "承租人收款账户-户名": self.item, | ||
60 | "承租人收款账户-银行账号": self.item, | ||
61 | "承租人收款账户-开户行": self.item, | ||
62 | "承租人扣款账户-户名": self.item, | ||
63 | "承租人扣款账户-银行账号": self.item, | ||
64 | "承租人扣款账户-开户行": self.item, | ||
65 | "签字页-承租人姓名": self.item, | ||
66 | "签字页-承租人签章": self.item, | ||
67 | |||
68 | "签字页-共同承租人姓名": self.item, | ||
69 | "签字页-共同承租人签章": self.item, | ||
70 | |||
71 | "签字页-保证人1姓名": self.item, | ||
72 | "签字页-保证人1签章": self.item, | ||
73 | |||
74 | "签字页-保证人2姓名": self.item, | ||
75 | "签字页-保证人2签章": self.item, | ||
76 | "签字页-保证人3姓名": self.item, | ||
77 | "签字页-保证人3签章": self.item, | ||
78 | } | ||
79 | |||
80 | # 格式化输出 车辆处置协议 要是别的字段 | ||
81 | self.init_result_1 = {"合同编号": self.item, | ||
82 | "承租人-姓名": self.item, | ||
83 | "承租人-证件号码": self.item, | ||
84 | "销售经销商": self.item, | ||
85 | "合同编号(正文)": self.item, | ||
86 | "签字页-承租人姓名": self.item, | ||
87 | "签字页-承租人证件号码": self.item, | ||
88 | "签字页-承租人签章": self.item, | ||
89 | "签字页-销售经销商": self.item, | ||
90 | "签字页-销售经销商签章": self.item, | ||
91 | } | ||
92 | |||
93 | # 格式化输出 车辆租赁抵押合同 | ||
94 | self.init_result_2 = {"合同编号": self.item, | ||
95 | "合同编号(正文)": self.item, | ||
96 | "抵押人姓名/名称": self.item, | ||
97 | "抵押人证件号码": self.item, | ||
98 | "抵押人配偶姓名/名称": self.item, | ||
99 | "抵押人配偶证件号码": self.item, | ||
100 | "车辆识别代码": self.item, | ||
101 | "租金总额": self.item, | ||
102 | "融资租赁期限": self.item, | ||
103 | "签字页-抵押人姓名": self.item, | ||
104 | "签字页-抵押人签章": self.item, | ||
105 | "签字页-抵押人配偶姓名": self.item, | ||
106 | "签字页-抵押人配偶签章": self.item, | ||
107 | } | ||
108 | |||
def get_contract_no(self, page_num):
    """Read the contract number printed in the header of one page.

    Args:
        page_num (str): key of the page in ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item``. 'words' is the text after the last ':'
        of the span containing '合同编号:'. When that text is empty, the
        number is taken from a span containing 'CH' whose top edge lies
        above the bottom edge of the current match.
    """
    contract_no = self.item.copy()
    spans = [
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    for span in spans:
        if '合同编号:' in span['text']:
            contract_no['position'] = span['bbox']
            contract_no['page'] = page_num
            contract_no['words'] = span['text'].split(':')[-1]

    if contract_no['words'] != '':
        return contract_no

    # The label was found without a value: look for the number near it.
    for span in spans:
        # The reference box is re-read each time, so it follows the latest hit.
        if span['bbox'][1] < contract_no['position'][3] and 'CH' in span['text']:
            contract_no['position'] = span['bbox']
            contract_no['page'] = page_num
            contract_no['words'] = span['text']
    return contract_no
143 | |||
def get_vehicle_price(self, page_num='0'):
    """Read the purchase price of the vehicle from one page.

    Args:
        page_num (str): key of the page in ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item``. 'words' is the text after the last
        '币' of the last span containing '所购车辆价格为人民币', and
        'position' is that span's box.
    """
    vehicle_price = self.item.copy()
    text_blocks = (b for b in self.pdf_info[page_num]['blocks'] if b['type'] == 0)
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                if '所购车辆价格为人民币' not in span['text']:
                    continue
                vehicle_price['position'] = span['bbox']
                vehicle_price['words'] = span['text'].split('币')[-1]
    return vehicle_price
157 | |||
def get_contract_no_one(self):
    """Find the contract number quoted in the body text.

    The text of each page is concatenated with spaces removed, because the
    number may be broken across spans or lines. Pages are tried in order
    and the first page that matches wins. Two forms are recognised:

    * '合同编号:[...]' - 'words' is the whole matched phrase;
    * '编号为...的'    - 'words' is the text between '编号为' and '的'.

    Whitespace and closing parentheses are removed from 'words'.

    Returns:
        dict: copy of ``self.item`` with 'words' and 'page' filled and
        'position' left as None; the untouched template when no page
        matches.
    """
    contract_no = self.item.copy()
    for pno in self.pdf_info:
        all_text = ''.join(
            span['text']
            for block in self.pdf_info[pno]['blocks']
            if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        ).replace(' ', '')

        match = re.search(r'(合同编号:\[(.*?)\])', all_text)
        if match is None:
            # This pattern also covers the '编号为...)的' form; the trailing
            # parenthesis is stripped below. The separate third pattern was
            # unreachable and, as written, not a valid regular expression.
            match = re.search(r'编号为(.*?)的', all_text)
        if match is None:
            continue

        words = re.sub(r"\s", "", match.group(1).strip())
        contract_no['position'] = None
        contract_no['page'] = pno
        contract_no['words'] = words.replace(")", "").replace("）", "")
        return contract_no
    return contract_no
197 | |||
def get_key_value(self, key, page_num=None):
    """Return the value printed after ``key`` in the document.

    Args:
        key (str): label to look for inside span texts.
        page_num: key of the single page to search; when None every page
            of ``self.pdf_info`` is searched in order.

    Returns:
        dict: copy of ``self.item``. For the last span containing ``key``,
        'words' is the text after the last ':' with '。' and all whitespace
        removed, and 'page' / 'position' locate that span.
    """
    value = self.item.copy()
    pages = [page_num] if page_num is not None else list(self.pdf_info)
    for pno in pages:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = span['text']
                    if key not in text:
                        continue
                    words = text.split(':')[-1].replace("。", "")
                    value['position'] = span['bbox']
                    value['page'] = pno
                    value['words'] = re.sub(r"\s", "", words)
    return value
230 | |||
def get_loan_principal(self, page_num='0'):
    """Read the loan principal amounts from one page.

    Args:
        page_num (str): key of the page in ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``:
            upper - amount in Chinese capital numerals (span whose fuzzy
                    ratio against the numeral characters exceeds 15);
            lower - amount in digits, taken after '¥';
            asp_1 - bracketed '人民币:小写:[...]' amount above the
                    '附加产品融资贷款本金总金额' heading;
            asp_2 - the same kind of amount below that heading.
        asp_1 / asp_2 stay empty when the heading is absent.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_string = ''.join(chinese_keywords)
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()

    spans = [
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    anchor_bbox = None
    for span in spans:
        bbox, text = span['bbox'], span['text']
        if fuzz.ratio(keyword_string, text) > 15:
            # NOTE(review): text is rebound here, so the two checks below
            # see the trimmed value for such spans (kept from the original).
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        amount_pattern = re.compile(r'人民币:小写:\[(.*)\]')
        for span in spans:
            bbox, text = span['bbox'], span['text']
            if '人民币:小写:' not in text:
                continue
            match = amount_pattern.search(text)
            if match is None:
                # Label without a bracketed amount: previously an IndexError.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = match.group(1)
            if span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = match.group(1)
    return upper, lower, asp_1, asp_2
271 | |||
def get_loan_term(self, page_num='0'):
    """Read the loan term in months from one page.

    Args:
        page_num (str): key of the page in ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item``. The number is matched with
        '贷款期限(\\d+)个月' on the concatenated page text. 'words' and
        'position' are set from the last span containing '<number>个月',
        and stay None when no span contains it.
    """
    loan_term = self.item.copy()
    spans = [
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    page_text = ''.join(span['text'] for span in spans)
    found = re.search(r'贷款期限(\d+)个月', page_text)
    if not found:
        return loan_term

    months = found.group(1)
    needle = f'{months}个月'
    for span in spans:
        if needle in span['text']:
            loan_term['position'] = span['bbox']
            loan_term['words'] = months
    return loan_term
295 | |||
def get_asp_details(self, page_num):
    """Collect the additional-product financing detail table of one page.

    Span texts are gathered from the span equal to
    '附加产品融资贷款本金总金额明细' (inclusive) up to the first span
    containing '第二条' or '征信管理' (exclusive).

    Args:
        page_num (str): key of the page in ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item``. 'words' is a list of rows: the first
        row holds the title alone, each further row holds three consecutive
        texts, and a trailing incomplete row is dropped. 'words' stays None
        when nothing was collected.
    """
    result = self.item.copy()

    cells = []
    inside_table = False
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                content = span['text']
                if '附加产品融资贷款本金总金额明细' == content:
                    inside_table = True
                if '第二条' in content or '征信管理' in content:
                    inside_table = False
                if inside_table:
                    cells.append(content)

    rows = []
    if cells:
        rows.append([cells[0]])
        # Full rows of three start at index 1; partial rows are ignored.
        rows.extend(cells[start:start + 3]
                    for start in range(1, len(cells) - 2, 3))

    if rows:
        result['words'] = rows
    return result
329 | |||
def get_signature(self):
    """Read the borrower signature line from page '0'.

    Returns:
        dict: copy of ``self.item``. 'words' is the full text and
        'position' the box of the last span containing '签署日期'.
    """
    signature = self.item.copy()
    text_blocks = [b for b in self.pdf_info['0']['blocks'] if b['type'] == 0]
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                if '签署日期' not in span['text']:
                    continue
                signature['words'] = span['text']
                signature['position'] = span['bbox']
    return signature
344 | |||
def get_somebody(self, top, bottom):
    """Return the name and ID printed between two labels on page '1'.

    Args:
        top (str): text marking the upper boundary of the party's section.
        bottom (str): text marking the lower boundary of the section.

    Returns:
        tuple: ``(_name, _id)``, both copies of ``self.item``. They are
        filled from spans whose bottom edge lies strictly between the
        bottom edges of the two boundary spans: the text after the last ':'
        of '姓名/名称' and of '自然人身份证件号码/法人执照号码'.
    """
    _name = self.item.copy()
    _id = self.item.copy()

    spans = [
        span
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # First pass: locate the vertical band delimited by the two labels.
    y_top = 0
    y_bottom = 0
    for span in spans:
        if top in span['text']:
            y_top = span['bbox'][3]
        if bottom in span['text']:
            y_bottom = span['bbox'][3]

    # Second pass: pick the fields that fall inside the band.
    for span in spans:
        box, content = span['bbox'], span['text']
        if not y_top < box[3] < y_bottom:
            continue
        if '姓名/名称' in content:
            _name['position'] = box
            _name['words'] = content.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in content:
            _id['position'] = box
            _id['words'] = content.split(':')[-1]
    return _name, _id
378 | |||
def get_seller(self):
    """Read the dealer name printed next to the '经销商' label on page '1'.

    Returns:
        dict: copy of ``self.item``. It is filled from the last span whose
        horizontal centre lies between the label's right edge and the
        middle of the page, and whose vertical centre lies within the
        label's vertical extent. Left empty when the label is absent.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']
    spans = [
        span
        for block in page['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Locate the key; the last exact match is used.
    anchor_bbox = None
    for span in spans:
        if span['text'] == '经销商':
            anchor_bbox = span['bbox']
    if not anchor_bbox:
        return seller

    # Match the value by its position relative to the key.
    half_width = page['width'] * 0.5
    for span in spans:
        box = span['bbox']
        in_columns = anchor_bbox[2] < np.mean(box[::2]) < half_width
        if in_columns and anchor_bbox[1] < np.mean(box[1::2]) < anchor_bbox[3]:
            seller['position'] = box
            seller['words'] = span['text']
    return seller
405 | |||
def get_payback_account(self):
    """Read the repayment account details from page '1'.

    Only the ticked '☑账号' form is handled; the "to be notified
    separately" form yields empty results.

    Returns:
        tuple: ``(account, account_name, account_bank)``, each a copy of
        ``self.item``. Values are matched on the page text with spaces
        removed ('账号:...户名', '户名:...开户行', '开户行:...;') and located
        in the last span containing them.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [
        span
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    all_text = ''.join(span['text'] for span in spans)

    # Decide which account form is present before parsing anything.
    if '☑账号' not in all_text:
        return account, account_name, account_bank
    all_text = all_text.replace(' ', '')

    numbers = re.findall(r'账号:(.*)户名', all_text)
    if numbers:
        number = numbers[0]
        for span in spans:
            if f'{number}' in span['text']:
                account['position'] = span['bbox']
                account['words'] = number

    holders = re.findall(r'户名:(.*)开户行', all_text)
    if holders:
        holder = holders[0]
        for span in spans:
            if f'{holder}' in span['text']:
                account_name['position'] = span['bbox']
                account_name['words'] = holder

    banks = re.findall(r'开户行:(.*);', all_text)
    if banks:
        bank = banks[0]
        for span in spans:
            # The bank is located on the space-stripped span text.
            if f'开户行:{bank};' in span['text'].replace(' ', ''):
                account_bank['position'] = span['bbox']
                account_bank['words'] = bank
    return account, account_name, account_bank
458 | |||
def get_repayment_schedule(self):
    """Extract the payment schedule as a two-column table.

    Spans of all pages are visited in order. Table cells are collected
    after the span whose stripped text is '61.' and until a span containing
    '以上表格中所列序号'. The left bound of the table is the x position of
    the '61.' span, the right bound the right edge of the span containing
    '剩余融资'. Within the table, spans containing Chinese characters,
    spans with exactly one run of digits (row numbers such as '1.') and
    spans starting outside the bounds are skipped.

    Returns:
        dict: copy of ``self.item``. 'words' is ``[['序号', '租金'], ...]``
        with one ``[index, rent]`` row for every four collected cells (the
        rent is the third cell of each group), and 'page' is the key of the
        page holding the '61.' span.
    """
    repayment_schedule = self.item.copy()

    # Compiled once; raw strings avoid the invalid '\d' escape warning.
    chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
    digits_pattern = re.compile(r"\d+")

    cells = []
    table = False
    page = None
    left = 0
    right = 0
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '剩余融资' in text:
                        right = bbox[2]
                    if '以上表格中所列序号' in text:
                        table = False
                    if table:
                        # Skip anything containing Chinese characters.
                        if chinese_pattern.search(text):
                            continue
                        # Skip the row numbers '1.' - '61.'.
                        if len(digits_pattern.findall(text)) == 1:
                            continue
                        if not left < bbox[0] < right:
                            continue
                        cells.append(text)

                    if text.strip() == "61.":
                        page = pno
                        table = True
                        left = bbox[0]

    repayment_schedule_table = [['序号', '租金']]
    # Each row has four value columns; only the rent (third) is kept.
    for row_index in range(len(cells) // 4):
        row = cells[row_index * 4:row_index * 4 + 4]
        repayment_schedule_table.append([f'{row_index + 1}', row[2]])

    repayment_schedule['words'] = repayment_schedule_table
    repayment_schedule['page'] = page
    return repayment_schedule
511 | |||
def get_signature_role_1(self):
    """Locate the signing-date line across all pages.

    Returns:
        dict: copy of ``self.item`` with 'position', 'page' and 'words'
        taken from the last span, in page order, containing '签署日期'.
    """
    signature_role_1 = self.item.copy()
    for pno, page in self.pdf_info.items():
        text_blocks = (b for b in page['blocks'] if b['type'] == 0)
        for block in text_blocks:
            for line in block['lines']:
                for span in line['spans']:
                    if '签署日期' not in span['text']:
                        continue
                    signature_role_1['position'] = span['bbox']
                    signature_role_1['page'] = pno
                    signature_role_1['words'] = span['text']
    return signature_role_1
526 | |||
def get_signature_role_2(self):
    """Report whether the co-borrower signature area looks filled in.

    All text spans of every page are visited in order. Collection starts at
    a span containing '共同借款人(共同抵押人)' and stops at the next span
    containing '日期'. More than four collected spans are taken to mean
    that a signature is present.

    Returns:
        dict: copy of ``self.init_item`` (or of ``self.item`` when the
        instance has no ``init_item``) with
            'page_num' - key of the last page that contributed a span, or None;
            'position' - [x_min, y_min, x_max, y_max] over the collected
                         boxes, or None when nothing was collected;
            'words'    - '有' (present) or '无' (absent).
        When the template has a 'page' key it receives the page as well.
    """
    # init_item is not guaranteed to exist on this class; fall back to the
    # generic item template instead of raising AttributeError.
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_2 = template.copy()

    texts = []
    boxes = []
    page_num = None
    collecting = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = span['text']
                    if '共同借款人(共同抵押人)' in text:
                        collecting = True
                    if '日期' in text:
                        collecting = False
                    if collecting:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(span['bbox'])

    words = '有' if len(texts) > 4 else '无'

    # Without this guard an unmatched region crashed on min() of an empty array.
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [points[:, 0].min(), points[:, 1].min(),
                    points[:, 0].max(), points[:, 1].max()]

    signature_role_2['page_num'] = page_num
    if 'page' in signature_role_2:
        signature_role_2['page'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
561 | |||
def get_signature_role_3(self):
    """Report whether the guarantor-1 signature area looks filled in.

    All text spans of every page are visited in order. Collection starts at
    a span containing '保证人1' on any page other than page 0 and stops at
    the next span containing '日期'. More than four collected spans are
    taken to mean that a signature is present.

    Returns:
        dict: copy of ``self.init_item`` (or of ``self.item`` when the
        instance has no ``init_item``) with
            'page_num' - key of the last page that contributed a span, or None;
            'position' - [x_min, y_min, x_max, y_max] over the collected
                         boxes, or None when nothing was collected;
            'words'    - '有' (present) or '无' (absent).
        When the template has a 'page' key it receives the page as well.
    """
    # init_item is not guaranteed to exist on this class; fall back to the
    # generic item template instead of raising AttributeError.
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_3 = template.copy()

    texts = []
    boxes = []
    page_num = None
    collecting = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = span['text']
                    if '保证人1' in text and int(page_key) != 0:
                        collecting = True
                    if '日期' in text:
                        collecting = False
                    if collecting:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(span['bbox'])

    words = '有' if len(texts) > 4 else '无'

    # Without this guard an unmatched region crashed on min() of an empty array.
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [points[:, 0].min(), points[:, 1].min(),
                    points[:, 0].max(), points[:, 1].max()]

    signature_role_3['page_num'] = page_num
    if 'page' in signature_role_3:
        signature_role_3['page'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
596 | |||
def get_signature_role_4(self):
    """Report whether the guarantor-2 signature area contains content.

    Walks every text span (blocks with ``type == 0``) of ``self.pdf_info``
    in page order. Collection is switched on by a span containing '保证人2'
    on any page whose key is not 0, and switched off by a span containing
    '日期'; every span seen while collection is on belongs to the area.

    Returns:
        dict: copy of ``self.init_item`` with these keys filled in:
            'page_num' - key of the last page that contributed a span,
                         or None when nothing was collected;
            'position' - [xmin, ymin, xmax, ymax] enclosing all collected
                         spans, or None when nothing was collected;
            'words'    - '有' when more than 4 spans were collected,
                         otherwise '无'.
    """
    signature_role_4 = self.init_item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for page_key in list(self.pdf_info.keys()):
        for block in self.pdf_info[page_key]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(page_key) != 0:
                        in_region = True
                    # The end marker is checked after the start marker, so a
                    # span holding both leaves collection switched off.
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without this guard an absent signature area raised ValueError, because
    # min()/max() were applied to an empty array.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
631 | |||
def get_signature_role_5(self):
    """Report whether the witness signature area contains content.

    Walks every text span (blocks with ``type == 0``) of ``self.pdf_info``
    in page order. Collection is switched on by a span containing
    '见证人签字' on any page whose key is not 0, and switched off by a span
    containing '年'; every span seen while collection is on belongs to the
    area.

    Returns:
        dict: copy of ``self.init_item`` with these keys filled in:
            'page_num' - key of the last page that contributed a span,
                         or None when nothing was collected;
            'position' - [xmin, ymin, xmax, ymax] enclosing all collected
                         spans, or None when nothing was collected;
            'words'    - '有' when more than 4 spans were collected,
                         otherwise '无'.
    """
    signature_role_5 = self.init_item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for page_key in list(self.pdf_info.keys()):
        for block in self.pdf_info[page_key]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(page_key) != 0:
                        in_region = True
                    # The end marker is checked after the start marker, so a
                    # span holding both leaves collection switched off.
                    if '年' in text:
                        in_region = False
                    if in_region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without this guard an absent signature area raised ValueError, because
    # min()/max() were applied to an empty array.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
667 | |||
def get_last_page_signature(self, page_num, top, bottom):
    """Extract signer name and signing date from one page.

    Two anchor spans on page ``page_num`` delimit a vertical band: the last
    span containing ``top`` and the last span containing ``bottom`` (the top
    y-coordinate of each is used). Inside that band, a span containing
    '签署日期' is split into the name (text before the first space) and the
    date (text after the last ':').

    Args:
        page_num: key of the page in ``self.pdf_info``.
        top: text that marks the upper edge of the band.
        bottom: text that marks the lower edge of the band.

    Returns:
        tuple[dict, dict]: ``(signature_name, signature_date)``, both copies
        of ``self.item`` with 'words' and 'position' filled when a matching
        span was found; untouched copies otherwise.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()
    anchor_top = None
    anchor_bottom = None
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if top in text:
                    anchor_top = bbox[1]
                if bottom in text:
                    anchor_bottom = bbox[1]
    if anchor_top is not None and anchor_bottom is not None:
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The vertical centre of the span must fall strictly
                    # between the two anchors.
                    if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
                        signature_name['words'] = text.split(' ')[0]
                        signature_name['position'] = bbox
                        signature_date['words'] = text.split(':')[-1]
                        # Previously this line wrote to signature_name a
                        # second time, leaving the date without a position.
                        signature_date['position'] = bbox
    return signature_name, signature_date
698 | |||
def get_electronic_signature(self, top, bottom):
    """Find the electronic-signature span lying between two anchor texts.

    All pages are scanned. The upper bound is the top y of the last span
    containing ``top``; the lower bound is the bottom y of the last span
    containing ``bottom``. The last span containing '签署日期' whose vertical
    centre lies strictly between the two bounds is reported.

    Args:
        top: text marking the upper bound.
        bottom: text marking the lower bound.

    Returns:
        dict: copy of ``self.item`` with 'words', 'page' and 'position'
        filled when a match exists; an untouched copy otherwise.
    """
    def text_spans():
        # Yield (page key, bbox, text) for every span of every type-0 block.
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] == 0:
                    for ln in blk['lines']:
                        for sp in ln['spans']:
                            yield page_key, sp['bbox'], sp['text']

    signature = self.item.copy()

    upper = None
    lower = None
    for _, box, content in text_spans():
        if top in content:
            upper = box[1]
        if bottom in content:
            lower = box[3]

    if upper is None or lower is None:
        return signature

    for page_key, box, content in text_spans():
        if '签署日期' not in content:
            continue
        if int(upper) < np.mean(box[1::2]) < int(lower):
            signature['words'] = content
            signature['page'] = page_key
            signature['position'] = box
    return signature
730 | |||
def get_role_info(self, role_key, page_num='0'):
    """Read name, ID number and representative of one role from a page.

    The top-left corner of the last span starting with '保证人3' splits the
    page into four quadrants. Each supported role owns one quadrant, and a
    '证件号码:' / '法定代表人或授权代表:' span is attributed to the role only
    when its centre lies strictly inside that quadrant. The name comes from
    any span that ``re.match``-es ``role_key``.

    Args:
        role_key: role label, also used as the regex for the name span.
        page_num: key of the page in ``self.pdf_info``.

    Returns:
        tuple[dict, dict, dict]: ``(name, id_num, representative)``, copies
        of ``self.item`` with 'words', 'page' and 'position' filled for
        whatever was found. All three are untouched when the anchor span is
        missing.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    # role label -> (field is right of the anchor, field is below the anchor)
    quadrants = {
        '承租人:': (False, False),
        '保证人1:': (False, True),
        '保证人2:': (True, False),
        '保证人3:': (True, True),
    }

    spans = [
        (sp['bbox'], sp['text'])
        for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    ]

    # Anchor: top-left corner of the '保证人3' label (last occurrence wins).
    anchor = None
    for box, content in spans:
        if re.match('保证人3', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    def fill(item, content, box):
        item['words'] = content.split(':')[-1]
        item['page'] = page_num
        item['position'] = box

    wanted = quadrants.get(role_key)
    labelled_targets = (
        ('证件号码:', id_num),
        ('法定代表人或授权代表:', representative),
    )

    for box, content in spans:
        # Role name
        if re.match(role_key, content) is not None:
            fill(name, content, box)
        if wanted is None:
            continue
        right_of, below = wanted
        for label, target in labelled_targets:
            if re.match(label, content) is None:
                continue
            cx = np.mean(box[::2])
            cy = np.mean(box[1::2])
            x_ok = cx > anchor[0] if right_of else cx < anchor[0]
            y_ok = cy > anchor[1] if below else cy < anchor[1]
            if x_ok and y_ok:
                fill(target, content, box)
    return name, id_num, representative
814 | |||
def get_table_add_product(self):
    """Extract the vehicle add-on product table as a list of rows.

    Steps, as implemented below:
      1. find the page holding the caption '车辆附加产品(明细见下表)';
      2. collect every text span of that page as ``[8-value box, text]``;
      3. locate the header cells '项目' / '购买价格' / '实际融资金额' and the
         '总计' cell;
      4. walk down the '项目' column from the header, reading the two amount
         columns for every row hit, then append the '总计' row;
      5. keep only rows whose first cell starts with '所购', '项目' or '总计'
         and rewrite known wrapped product names to their full form.

    Returns:
        dict: copy of ``self.item`` with 'words' set to the list of rows
        (each ``[item, purchase price, financed amount]``), 'page' set to the
        table's page key and 'position' set to None.

    NOTE(review): when the caption or any header cell is missing, the
    corresponding index stays None and the lookups below
    (``self.pdf_info['None']``, ``ocr_results[None]``) will raise - confirm
    callers handle that.
    """
    table_add_product = self.item.copy()

    # Step 1: last page whose text contains the table caption.
    add_product_page_num = None
    for pno in self.pdf_info:
        for block in self.pdf_info[f'{pno}']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '车辆附加产品(明细见下表)' in text:
                        add_product_page_num = pno
    # Step 2: every span of that page, with the 4-value bbox expanded to the
    # four corner points (top-left, top-right, bottom-right, bottom-left).
    ocr_results = []
    for block in self.pdf_info[f'{add_product_page_num}']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                xmin, ymin, xmax, ymax = bbox
                bbox = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
                ocr_results.append([bbox, text])

    # The header row is always emitted first.
    lines = [['项目', '购买价格', '实际融资金额']]

    # Step 3: indices (into ocr_results) of the header cells and the total
    # cell; the last exact match wins.
    key_xm = None
    key_gmjg = None
    key_sjrzje = None
    key_total = None

    for index, span in enumerate(ocr_results):
        if span[1] == '项目':
            key_xm = index
        if span[1] == '购买价格':
            key_gmjg = index
        if span[1] == '实际融资金额':
            key_sjrzje = index
        if span[1] == '总计':
            key_total = index

    # Step 4: start from the '项目' header box, shifted right by two row
    # heights and down by one row height, as the probe for the first row.
    bbox, text = ocr_results[key_xm]
    rh = abs(bbox[1]-bbox[-1])
    anchor = np.array(bbox).reshape((-1, 2))
    anchor[:, 0] += 2*rh
    anchor[:, 1] += rh

    # Five passes over all spans. Whenever a span overlaps the probe
    # (presumably caculate_iou returns an overlap ratio - defined elsewhere),
    # a row is recorded and the probe moves one row height below that span.
    # The probe is replaced while the inner loop is still running, so the
    # statement order here matters.
    for i in range(5):
        for span in ocr_results:
            iou = caculate_iou(anchor, span[0])
            if iou > 0.01 and span[1].strip() != '所购':
                # presumably get_table_info returns the cell text at the
                # intersection of this row and the given header column -
                # TODO confirm against its definition
                x = get_table_info(span[0], ocr_results[key_gmjg][0], ocr_results)
                y = get_table_info(span[0], ocr_results[key_sjrzje][0], ocr_results)
                line = [span[1].replace('\u3000', ' '), x, y]
                lines.append(line)
                anchor = np.array(span[0]).reshape((-1, 2))
                anchor[:, 1] += rh

    total = get_table_info(ocr_results[key_total][0], ocr_results[key_sjrzje][0], ocr_results)
    lines.append(['总计', '', total])

    # Product names can wrap onto a second span, for example:
    #   所购 BMW悦然焕          / 新服务
    #   所购 BMW5年10           / 万公里长悦保养套餐
    #   所购 事故维修补偿       / 方案
    #   所购 BMW5年10万公里     / 长悦保养套餐
    #   所购 MINI4年6万公里长悦 / 保养套餐
    # Step 5 therefore replaces a recognised first fragment with the full name.

    filtered_lines = []
    for line in lines:
        if line[0][:2] not in ['所购', '项目', '总计']:
            continue
        if 'BMW悦然' in line[0]:
            line[0] = '所购 BMW悦然焕新服务'
        if 'BMW5年10' in line[0]:
            line[0] = '所购 BMW5年10万公里长悦保养套餐'
        if '事故维修补' in line[0]:
            line[0] = '所购 事故维修补偿方案'
        if 'MINI4年6万公里长悦' in line[0]:
            line[0] = '所购 MINI4年6万公里长悦保养套餐'
        filtered_lines.append(line)
    table_add_product['words'] = filtered_lines
    table_add_product['page'] = add_product_page_num
    table_add_product['position'] = None
    return table_add_product
909 | |||
def get_contract_no_dy(self):
    """Find the mortgage contract number.

    The last span (over all pages) containing '抵押合同编号' serves as the
    label. The value is the last span, on any page, that contains 'CH-' and
    whose vertical centre lies strictly between the label's top and bottom.

    Returns:
        dict: copy of ``self.item`` with 'position', 'page' and 'words'
        filled when a value was found; an untouched copy otherwise.
    """
    contract_no = self.item.copy()

    spans = [
        (page_key, sp['bbox'], sp['text'])
        for page_key in self.pdf_info
        for blk in self.pdf_info[page_key]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    ]

    label_box = None
    for _, box, content in spans:
        if '抵押合同编号' in content:
            label_box = box
    if label_box is None:
        return contract_no

    for page_key, box, content in spans:
        on_label_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
        if on_label_row and 'CH-' in content:
            contract_no['position'] = box
            contract_no['page'] = page_key
            contract_no['words'] = content
    return contract_no
938 | |||
def get_dyr_name_id(self):
    """Read the mortgagor's name and ID number.

    The last span whose text is exactly '抵押人' is the heading. Spans whose
    vertical centre lies strictly between the heading's top and three heading
    heights below its bottom are inspected: one containing '姓名' gives the
    name, one containing '证件号码' gives the ID number (text after the last
    ':' in both cases; the last match wins).

    Returns:
        tuple[dict, dict]: ``(name, _id)`` copies of ``self.item`` with
        'position', 'page' and 'words' filled for whatever was found.
    """
    name = self.item.copy()
    _id = self.item.copy()

    spans = [
        (page_key, sp['bbox'], sp['text'])
        for page_key in self.pdf_info
        for blk in self.pdf_info[page_key]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    ]

    heading = None
    for _, box, content in spans:
        if content == '抵押人':
            heading = box
    if heading is None:
        return name, _id

    row_height = abs(heading[1]-heading[3])
    for page_key, box, content in spans:
        in_band = heading[1] < np.mean(box[1::2]) < heading[3]+row_height*3
        if not in_band:
            continue
        for keyword, target in (('姓名', name), ('证件号码', _id)):
            if keyword in content:
                value = content.split(':')[-1]
                target['position'] = box
                target['page'] = page_key
                target['words'] = value
    return name, _id
974 | |||
def get_dyrpo_name_id(self):
    """Read the name and ID number of the mortgagor's spouse.

    The last span whose text is exactly '抵押人配偶(如适' is the heading.
    Spans whose vertical centre lies strictly between the heading's top and
    three heading heights below its bottom are inspected: one containing
    '姓名' gives the name, one containing '证件号码' gives the ID number (text
    after the last ':' in both cases; the last match wins).

    Returns:
        tuple[dict, dict]: ``(name, _id)`` copies of ``self.item`` with
        'position', 'page' and 'words' filled for whatever was found.
    """
    name = self.item.copy()
    _id = self.item.copy()

    spans = [
        (page_key, sp['bbox'], sp['text'])
        for page_key in self.pdf_info
        for blk in self.pdf_info[page_key]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    ]

    heading = None
    for _, box, content in spans:
        if content == '抵押人配偶(如适':
            heading = box
    if heading is None:
        return name, _id

    row_height = abs(heading[1]-heading[3])
    for page_key, box, content in spans:
        in_band = heading[1] < np.mean(box[1::2]) < heading[3]+row_height*3
        if not in_band:
            continue
        for keyword, target in (('姓名', name), ('证件号码', _id)):
            if keyword in content:
                value = content.split(':')[-1]
                target['position'] = box
                target['page'] = page_key
                target['words'] = value
    return name, _id
1010 | |||
def get_key_value_position(self, key):
    """Return the span printed to the right of a key label on the same row.

    The label is the last span (over all pages) whose text equals ``key``.
    A span qualifies as the value when its vertical centre lies strictly
    inside the label's vertical extent, its left edge is to the right of the
    label's left edge, and the gap between the label's right edge and its
    left edge is below ten label heights. The last qualifying span wins.

    Args:
        key: exact text of the label span.

    Returns:
        dict: copy of ``self.item`` with 'position', 'page' and 'words'
        filled when a value was found; an untouched copy otherwise.
    """
    value = self.item.copy()

    spans = [
        (page_key, sp['bbox'], sp['text'])
        for page_key in self.pdf_info
        for blk in self.pdf_info[page_key]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    ]

    label = None
    for _, box, content in spans:
        if content == key:
            label = box
    if label is None:
        return value

    row_height = abs(label[1]-label[3])
    for page_key, box, content in spans:
        if not (label[1] < np.mean(box[1::2]) < label[3]):
            continue
        if not (label[0] < box[0]):
            continue
        if abs(label[2]-box[0]) < row_height*10:
            value['position'] = box
            value['page'] = page_key
            value['words'] = content
    return value
1040 | |||
def get_role_info_3_3(self, role_key, page_num='0'):
    """Read name, ID number and representative of one role (3_3 layout).

    The top-left corner of the last span starting with '保证人2' splits the
    page into four quadrants. Each supported role owns one quadrant, and a
    '证件号码:' / '法定代表人或授权代表:' span is attributed to the role only
    when its centre lies strictly inside that quadrant. The name comes from
    any span that ``re.match``-es ``role_key``.

    Args:
        role_key: role label, also used as the regex for the name span.
        page_num: key of the page in ``self.pdf_info``.

    Returns:
        tuple[dict, dict, dict]: ``(name, id_num, representative)``, copies
        of ``self.item`` with 'words', 'page' and 'position' filled for
        whatever was found. All three are untouched when the anchor span is
        missing.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    # role label -> (field is right of the anchor, field is below the anchor)
    quadrants = {
        '承租人一:': (False, False),
        '共同承租人:': (False, True),
        '保证人1:': (True, False),
        '保证人2:': (True, True),
    }

    spans = [
        (sp['bbox'], sp['text'])
        for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    ]

    # Anchor: top-left corner of the '保证人2' label (last occurrence wins).
    anchor = None
    for box, content in spans:
        if re.match('保证人2', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    def fill(item, content, box):
        item['words'] = content.split(':')[-1]
        item['page'] = page_num
        item['position'] = box

    wanted = quadrants.get(role_key)
    labelled_targets = (
        ('证件号码:', id_num),
        ('法定代表人或授权代表:', representative),
    )

    for box, content in spans:
        # Role name
        if re.match(role_key, content) is not None:
            fill(name, content, box)
        if wanted is None:
            continue
        right_of, below = wanted
        for label, target in labelled_targets:
            if re.match(label, content) is None:
                continue
            cx = np.mean(box[::2])
            cy = np.mean(box[1::2])
            x_ok = cx > anchor[0] if right_of else cx < anchor[0]
            y_ok = cy > anchor[1] if below else cy < anchor[1]
            if x_ok and y_ok:
                fill(target, content, box)
    return name, id_num, representative
1124 | |||
def get_value_by_findall(self, prefix, suffix, page_num):
    """Extract the text found between ``prefix`` and ``suffix`` on a page.

    The texts of all spans on the page are concatenated, and the shortest
    run between the literal ``prefix`` and the literal ``suffix`` is taken
    (first occurrence). The last span that contains that run supplies the
    position.

    Args:
        prefix: literal text preceding the value.
        suffix: literal text following the value.
        page_num: key of the page in ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item`` with 'position', 'page' and 'words'
        filled when the run was found inside a single span; an untouched
        copy otherwise (including when the run straddles several spans).
    """
    value = self.item.copy()
    pno = page_num

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[pno]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    all_text = ''.join(text for _, text in spans)

    # prefix/suffix are literal text. Unescaped, a suffix such as
    # '(请授权代表签字并请盖章)' became a second capture group, so findall
    # returned tuples and the ``in`` test below raised TypeError.
    pattern = f"{re.escape(prefix)}(.*?){re.escape(suffix)}"
    words_list = re.findall(pattern, all_text)
    if words_list:
        found = words_list[0]
        for bbox, text in spans:
            if found in text:
                value['position'] = bbox
                value['page'] = pno
                value['words'] = found
    return value
1149 | |||
def get_info(self):
    """Collect all fields of the main contract into ``self.init_result``.

    Only blocks with ``block['type'] == 0`` are scanned: those are the blocks
    whose 'lines' / 'spans' text is read below (presumably the text blocks of
    a PyMuPDF page dict - TODO confirm).

    Two layouts are handled. When page '0' contains '共同承租人:' the
    contract is treated as the 3_3 layout and every role is read with
    ``get_role_info_3_3``; otherwise ``get_role_info`` is tried first and
    the 3_3 reader is used as a per-role fallback.

    Nothing is extracted when ``self.pdf_info`` is empty.

    Returns:
        dict: ``self.init_result`` (mutated in place).
    """
    if len(self.pdf_info) > 0:
        # Contract number printed on page 1 (page key '0').
        contract_no = self.get_contract_no(page_num='0')
        self.init_result['合同编号'] = contract_no

        # Rough check for the '车贷分离' contract version: page '0' mentions
        # a co-lessee ('共同承租人:').
        is_cdfl = False
        for block in self.pdf_info['0']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同承租人:' in text:
                        is_cdfl = True

        if is_cdfl == False:
            # Read name and ID number of the four roles from the first page.
            name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0')

            # No lessee found: retry with the 3_3 label.
            if name["words"] == None:
                name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
            self.init_result['承租人-姓名'] = name
            self.init_result['承租人-证件号码'] = id_num
            self.init_result['承租人-法定代表人或授权代表'] = representative

            name, id_num, representative = self.get_role_info(role_key='保证人1:', page_num='0')
            self.init_result['保证人1-姓名'] = name
            self.init_result['保证人1-证件号码'] = id_num
            self.init_result['保证人1-法定代表人或授权代表'] = representative
            # Fallback for the 3_3 layout: store the co-lessee instead.
            if name["words"] == None:
                name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
                self.init_result['共同承租人-姓名'] = name
                self.init_result['共同承租人-证件号码'] = id_num
                self.init_result['共同承租人-法定代表人或授权代表'] = representative

            name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0')
            self.init_result['保证人2-姓名'] = name
            self.init_result['保证人2-证件号码'] = id_num
            self.init_result['保证人2-法定代表人或授权代表'] = representative
            # Fallback for the 3_3 layout.
            # NOTE(review): the 3_3 result for '保证人1:' is stored under the
            # '保证人2-*' keys - confirm this slot mapping is intended.
            if name["words"] == None:
                name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
                self.init_result['保证人2-姓名'] = name
                self.init_result['保证人2-证件号码'] = id_num
                self.init_result['保证人2-法定代表人或授权代表'] = representative

            name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0')
            self.init_result['保证人3-姓名'] = name
            self.init_result['保证人3-证件号码'] = id_num
            self.init_result['保证人3-法定代表人或授权代表'] = representative
            # NOTE(review): likewise, the 3_3 result for '保证人2:' is stored
            # under the '保证人3-*' keys.
            if name["words"] == None:
                name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
                self.init_result['保证人3-姓名'] = name
                self.init_result['保证人3-证件号码'] = id_num
                self.init_result['保证人3-法定代表人或授权代表'] = representative
        else:
            name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
            self.init_result['承租人-姓名'] = name
            self.init_result['承租人-证件号码'] = id_num
            self.init_result['承租人-法定代表人或授权代表'] = representative

            name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
            self.init_result['共同承租人-姓名'] = name
            self.init_result['共同承租人-证件号码'] = id_num
            self.init_result['共同承租人-法定代表人或授权代表'] = representative

            name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
            self.init_result['保证人1-姓名'] = name
            self.init_result['保证人1-证件号码'] = id_num
            self.init_result['保证人1-法定代表人或授权代表'] = representative

            name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
            self.init_result['保证人2-姓名'] = name
            self.init_result['保证人2-证件号码'] = id_num
            self.init_result['保证人2-法定代表人或授权代表'] = representative

        # Contract number inside the body text (Part 2, main finance-lease
        # terms and payment plan), searched on all pages. Per the original
        # note it may wrap across lines, so no position is output for now.
        contract_no = self.get_contract_no_one()
        self.init_result['合同编号(正文)'] = contract_no
        # Vehicle identification number.
        vin = self.get_key_value(key='车辆识别代码:')
        self.init_result['车辆识别代码'] = vin
        # Vehicle seller (dealer); a shorter label is tried when the first
        # one yields nothing.
        seller = self.get_key_value(key='车辆卖方(经销商):')
        if seller['words'] == None:
            seller = self.get_key_value(key='车辆卖方:')
        self.init_result['车辆卖方(经销商)'] = seller
        # Original vehicle sales price.
        vehicle_price = self.get_key_value(key='车辆原始销售价格(《机动车销售统一发票》所列金额):')
        self.init_result['车辆原始销售价格(《机动车销售统一发票》所列金额)'] = vehicle_price
        # Vehicle add-on product detail table.
        table_add_product = self.get_table_add_product()
        self.init_result['车辆附加产品明细表'] = table_add_product
        # Total financing cost.
        financing_cost = self.get_key_value(key='融资成本总额:')
        self.init_result['融资成本总额'] = financing_cost
        # Lease term.
        lease_term = self.get_key_value(key='租期:')
        self.init_result['租期'] = lease_term
        # Repayment schedule table.
        repayment_schedule = self.get_repayment_schedule()
        self.init_result['付款计划表'] = repayment_schedule
        # Lessee receiving account: holder, account number, bank (page '4').
        name = self.get_key_value(key='户名:', page_num='4')
        self.init_result['承租人收款账户-户名'] = name
        account = self.get_key_value(key='银行账号:', page_num='4')
        self.init_result['承租人收款账户-银行账号'] = account
        bank = self.get_key_value(key='开户银行:', page_num='4')
        self.init_result['承租人收款账户-开户行'] = bank
        # Lessee debit account: holder, account number, bank (page '5').
        name = self.get_key_value(key='户名:', page_num='5')
        self.init_result['承租人扣款账户-户名'] = name
        account = self.get_key_value(key='银行账号:', page_num='5')
        self.init_result['承租人扣款账户-银行账号'] = account
        bank = self.get_key_value(key='开户银行:', page_num='5')
        self.init_result['承租人扣款账户-开户行'] = bank

        # Fields on the signature page.
        # Lessee name and electronic signature.
        if is_cdfl == False:
            name = self.get_key_value(key='承租人姓名:')
            electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:')

            if name["words"] == None:
                name = self.get_key_value(key='承租人一姓名:')
                electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:')

            self.init_result['签字页-承租人姓名'] = name
            self.init_result['签字页-承租人签章'] = electronic_signature
            # Guarantor 1 name and electronic signature.
            name = self.get_key_value(key='保证人1姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
            self.init_result['签字页-保证人1姓名'] = name
            self.init_result['签字页-保证人1签章'] = electronic_signature
            # Note: this check compares against "" rather than None.
            if name["words"] == "":
                name = self.get_key_value(key='共同承租人名称:')
                electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:')
                self.init_result['签字页-共同承租人姓名'] = name
                self.init_result['签字页-共同承租人签章'] = electronic_signature
            # Guarantor 2 name and electronic signature.
            name = self.get_key_value(key='保证人2姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:')
            self.init_result['签字页-保证人2姓名'] = name
            self.init_result['签字页-保证人2签章'] = electronic_signature
            # Fallback for the 3_3 layout.
            if name["words"] == "":
                name = self.get_key_value(key='保证人1姓名:')
                electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
                self.init_result['签字页-保证人1姓名'] = name
                self.init_result['签字页-保证人1签章'] = electronic_signature
            # Guarantor 3 name and electronic signature.
            name = self.get_key_value(key='保证人3姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:')
            self.init_result['签字页-保证人3姓名'] = name
            self.init_result['签字页-保证人3签章'] = electronic_signature
            # Fallback for the 3_3 layout.
            if name["words"] == None:
                name = self.get_key_value(key='保证人2姓名:')
                electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='日期:')
                self.init_result['签字页-保证人2姓名'] = name
                self.init_result['签字页-保证人2签章'] = electronic_signature
        else:
            name = self.get_key_value(key='承租人一姓名:')
            electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:')
            self.init_result['签字页-承租人姓名'] = name
            self.init_result['签字页-承租人签章'] = electronic_signature

            name = self.get_key_value(key='共同承租人名称:')
            electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:')
            self.init_result['签字页-共同承租人姓名'] = name
            self.init_result['签字页-共同承租人签章'] = electronic_signature

            name = self.get_key_value(key='保证人1姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
            self.init_result['签字页-保证人1姓名'] = name
            self.init_result['签字页-保证人1签章'] = electronic_signature

            name = self.get_key_value(key='保证人2姓名:')
            electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:')
            self.init_result['签字页-保证人2姓名'] = name
            self.init_result['签字页-保证人2签章'] = electronic_signature

    return self.init_result
1343 | |||
def get_info_1(self):
    """Collect the fields of the second contract type into ``self.init_result_1``.

    Reads the contract number, lessee name / ID number and selling dealer
    from page '0', the contract number from the body text, and the lessee
    name, ID number, signature and selling dealer from the signature page.
    When a dealer lookup yields an empty string, the dealer is re-extracted
    with ``get_value_by_findall``.

    Nothing is extracted when ``self.pdf_info`` is empty.

    Returns:
        dict: ``self.init_result_1`` (mutated in place).
    """
    if len(self.pdf_info) == 0:
        return self.init_result_1

    # Cover page (page key '0')
    self.init_result_1['合同编号'] = self.get_contract_no(page_num='0')
    self.init_result_1['承租人-姓名'] = self.get_key_value(key='承租人:', page_num='0')
    self.init_result_1['承租人-证件号码'] = self.get_key_value(key='证件号码:', page_num='0')

    dealer = self.get_key_value(key='销售经销商:', page_num='0')
    if dealer['words'] == "":
        dealer = self.get_value_by_findall('销售经销商:', '地址:', page_num='0')
    self.init_result_1['销售经销商'] = dealer

    # Contract number in the body text
    self.init_result_1['合同编号(正文)'] = self.get_contract_no_one()

    # Signature page: lessee name, ID number and signature
    self.init_result_1['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称:')
    self.init_result_1['签字页-承租人证件号码'] = self.get_key_value(key='自然人身份证件号码/法人执照号码:')
    self.init_result_1['签字页-承租人签章'] = self.get_signature_role_1()

    # Signature page: selling dealer. The label and the dealer name may share
    # a span, e.g.
    # 销售经销商:深圳市宝创汽车贸易有限公司南山分公司(请授权代表签字并请盖章)
    dealer = self.get_key_value(key='销售经销商:')
    if dealer['words'] == "":
        dealer = self.get_value_by_findall('销售经销商:', '(请授权代表签字并请盖章)', page_num='3')
    self.init_result_1['签字页-销售经销商'] = dealer

    # The dealer signature is not extracted.
    return self.init_result_1
1380 | |||
def get_info_2(self):
    """Fill ``self.init_result_2`` from the parsed PDF and return it.

    When ``self.pdf_info`` is empty no lookup helper is called and
    ``self.init_result_2`` is returned exactly as it currently stands.

    Returns:
        ``self.init_result_2``, with the entries listed below overwritten
        by whatever the lookup helpers returned.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_2

    # Contract number, then the contract number found in the body text.
    self.init_result_2['合同编号'] = self.get_contract_no_dy()
    self.init_result_2['合同编号(正文)'] = self.get_contract_no_one()

    # Mortgagor name and ID number.
    mortgagor, mortgagor_id = self.get_dyr_name_id()
    self.init_result_2['抵押人姓名/名称'] = mortgagor
    self.init_result_2['抵押人证件号码'] = mortgagor_id

    # Mortgagor's spouse name and ID number.
    spouse, spouse_id = self.get_dyrpo_name_id()
    self.init_result_2['抵押人配偶姓名/名称'] = spouse
    self.init_result_2['抵押人配偶证件号码'] = spouse_id

    # Vehicle identification number.
    self.init_result_2['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')

    # Total rent and finance-lease term: the result key is also the
    # lookup key for the position-based helper.
    for field in ('租金总额', '融资租赁期限'):
        self.init_result_2[field] = self.get_key_value_position(key=field)

    # Signature page: name and electronic signature for the mortgagor
    # and then the mortgagor's spouse.
    # Columns: name label, signature top anchor, signature bottom anchor,
    # result key for the name, result key for the signature.
    signature_rows = (
        ('抵押人姓名:', '抵押权人盖章', '抵押人配偶姓名:',
         '签字页-抵押人姓名', '签字页-抵押人签章'),
        ('抵押人配偶姓名:', '抵押人配偶姓名:', '日期',
         '签字页-抵押人配偶姓名', '签字页-抵押人配偶签章'),
    )
    for label, upper, lower, name_field, seal_field in signature_rows:
        signer = self.get_key_value(key=label)
        seal = self.get_electronic_signature(top=upper, bottom=lower)
        self.init_result_2[name_field] = signer
        self.init_result_2[seal_field] = seal

    return self.init_result_2
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -6,9 +6,10 @@ | ... | @@ -6,9 +6,10 @@ |
6 | # @Description : | 6 | # @Description : |
7 | 7 | ||
8 | from .get_char import Finder | 8 | from .get_char import Finder |
9 | from .get_char_fsm import Finder as FSMFinder | ||
9 | 10 | ||
10 | 11 | ||
11 | def predict(pdf_info, file_cls): | 12 | def predict(pdf_info, file_cls, is_fsm=False): |
12 | """Summary | 13 | """Summary |
13 | 14 | ||
14 | Args: | 15 | Args: |
... | @@ -58,6 +59,10 @@ def predict(pdf_info, file_cls): | ... | @@ -58,6 +59,10 @@ def predict(pdf_info, file_cls): |
58 | pdf_info = dict() | 59 | pdf_info = dict() |
59 | for pno, page_info in enumerate(pdf_info_1): | 60 | for pno, page_info in enumerate(pdf_info_1): |
60 | pdf_info[str(pno)] = page_info | 61 | pdf_info[str(pno)] = page_info |
62 | |||
63 | if is_fsm: | ||
64 | f = FSMFinder(pdf_info) | ||
65 | else: | ||
61 | f = Finder(pdf_info) | 66 | f = Finder(pdf_info) |
62 | if file_cls == 0: | 67 | if file_cls == 0: |
63 | results = f.get_info() | 68 | results = f.get_info() | ... | ... |
-
Please register or sign in to post a comment