add FSM AFC/HIL Contract
Showing
8 changed files
with
2395 additions
and
15 deletions
| ... | @@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10 | ... | @@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10 |
| 11 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' | 11 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' |
| 12 | 12 | ||
| 13 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT'] | 13 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT'] |
| 14 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] | 14 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT', 'OVP'] |
| 15 | COMPARE_DOC_SCHEME_LIST = ['CA', 'SE'] | 15 | COMPARE_DOC_SCHEME_LIST = ['CA', 'SE'] |
| 16 | 16 | ||
| 17 | HIL_PREFIX = 'HIL' | 17 | HIL_PREFIX = 'HIL' | ... | ... |
| ... | @@ -1476,7 +1476,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1476,7 +1476,8 @@ class Command(BaseCommand, LoggerMixin): |
| 1476 | 1476 | ||
| 1477 | # AFC合同 | 1477 | # AFC合同 |
| 1478 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): | 1478 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): |
| 1479 | ocr_result = afc_predict(pdf_handler.pdf_info) | 1479 | is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3] |
| 1480 | ocr_result = afc_predict(pdf_handler.pdf_info, is_fsm=is_fsm) | ||
| 1480 | page_res = {} | 1481 | page_res = {} |
| 1481 | for page_num, page_info in ocr_result.get('page_info', {}).items(): | 1482 | for page_num, page_info in ocr_result.get('page_info', {}).items(): |
| 1482 | if isinstance(page_num, str) and page_num.startswith('page_'): | 1483 | if isinstance(page_num, str) and page_num.startswith('page_'): |
| ... | @@ -1499,8 +1500,9 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1499,8 +1500,9 @@ class Command(BaseCommand, LoggerMixin): |
| 1499 | } | 1500 | } |
| 1500 | # HIL合同 | 1501 | # HIL合同 |
| 1501 | elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP: | 1502 | elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP: |
| 1503 | is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3] | ||
| 1502 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | 1504 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) |
| 1503 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | 1505 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1, is_fsm=is_fsm) |
| 1504 | rebuild_res_1 = {} | 1506 | rebuild_res_1 = {} |
| 1505 | page_res = {} | 1507 | page_res = {} |
| 1506 | for field_name, field_info in ocr_result_1.items(): | 1508 | for field_name, field_info in ocr_result_1.items(): |
| ... | @@ -1526,8 +1528,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1526,8 +1528,8 @@ class Command(BaseCommand, LoggerMixin): |
| 1526 | 'page_info': page_info | 1528 | 'page_info': page_info |
| 1527 | } | 1529 | } |
| 1528 | # hmh | 1530 | # hmh |
| 1529 | else: | 1531 | # else: |
| 1530 | pass | 1532 | # pass |
| 1531 | 1533 | ||
| 1532 | 1534 | ||
| 1533 | contract_res = {} | 1535 | contract_res = {} | ... | ... |
| ... | @@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum): | ... | @@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum): |
| 36 | DOCUPLOAD = (3, 'Document Upload') | 36 | DOCUPLOAD = (3, 'Document Upload') |
| 37 | SUBMITING = (4, 'Submiting') | 37 | SUBMITING = (4, 'Submiting') |
| 38 | UPLOADING = (5, 'Uploading') | 38 | UPLOADING = (5, 'Uploading') |
| 39 | OVP = (6, 'OVP') | ||
| 39 | 40 | ||
| 40 | 41 | ||
| 41 | class FailureReason(NamedEnum): | 42 | class FailureReason(NamedEnum): | ... | ... |
| ... | @@ -590,12 +590,13 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -590,12 +590,13 @@ class UploadDocView(GenericView, DocHandler): |
| 590 | is_zip = False | 590 | is_zip = False |
| 591 | 591 | ||
| 592 | classify_1 = 0 | 592 | classify_1 = 0 |
| 593 | # 电子合同 | 593 | # 电子合同 Econtract or OVP(FSM) |
| 594 | if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]: | 594 | if data_source == consts.DATA_SOURCE_LIST[2] or data_source == consts.DATA_SOURCE_LIST[3]: |
| 595 | for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): | 595 | if document_scheme == consts.DOC_SCHEME_LIST[1]: |
| 596 | if keyword in document_name: | 596 | for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): |
| 597 | classify_1 = classify_1_tmp | 597 | if keyword in document_name: |
| 598 | break | 598 | classify_1 = classify_1_tmp |
| 599 | break | ||
| 599 | # FSM合同:WEP/MSI/SC | 600 | # FSM合同:WEP/MSI/SC |
| 600 | elif data_source == consts.DATA_SOURCE_LIST[0] and document_scheme == consts.DOC_SCHEME_LIST[0]: | 601 | elif data_source == consts.DATA_SOURCE_LIST[0] and document_scheme == consts.DOC_SCHEME_LIST[0]: |
| 601 | for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix): | 602 | for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix): | ... | ... |
| ... | @@ -6,6 +6,7 @@ | ... | @@ -6,6 +6,7 @@ |
| 6 | # @Description : | 6 | # @Description : |
| 7 | 7 | ||
| 8 | from .get_char import Finder | 8 | from .get_char import Finder |
| 9 | from .get_char_fsm import Finder as FSMFinder | ||
| 9 | import numpy as np | 10 | import numpy as np |
| 10 | 11 | ||
| 11 | 12 | ||
| ... | @@ -23,7 +24,7 @@ def extract_info(ocr_results): | ... | @@ -23,7 +24,7 @@ def extract_info(ocr_results): |
| 23 | return {'page_1': {'合同编号': contract_no}} | 24 | return {'page_1': {'合同编号': contract_no}} |
| 24 | 25 | ||
| 25 | 26 | ||
| 26 | def predict(pdf_info, is_qrs=False): | 27 | def predict(pdf_info, is_qrs=False, is_fsm=False): |
| 27 | ocr_results = {} | 28 | ocr_results = {} |
| 28 | for pno in pdf_info: | 29 | for pno in pdf_info: |
| 29 | ocr_results[pno] = {} | 30 | ocr_results[pno] = {} |
| ... | @@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False): | ... | @@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False): |
| 50 | results = extract_info(ocr_results) | 51 | results = extract_info(ocr_results) |
| 51 | else: | 52 | else: |
| 52 | # 输入是整个 PDF 中的信息 | 53 | # 输入是整个 PDF 中的信息 |
| 53 | f = Finder(pdf_info, ocr_results=ocr_results) | 54 | if is_fsm: |
| 55 | f = FSMFinder(pdf_info, ocr_results=ocr_results) | ||
| 56 | else: | ||
| 57 | f = Finder(pdf_info, ocr_results=ocr_results) | ||
| 54 | results = f.get_info() | 58 | results = f.get_info() |
| 55 | return results | 59 | return results |
| 56 | 60 | ... | ... |
| 1 | import re | ||
| 2 | import numpy as np | ||
| 3 | from fuzzywuzzy import fuzz | ||
| 4 | from shapely.geometry import Polygon | ||
| 5 | |||
| 6 | |||
| 7 | class Finder: | ||
| 8 | |||
| 9 | def __init__(self, pdf_info, ocr_results): | ||
| 10 | self.pdf_info = pdf_info | ||
| 11 | self.ocr_results = ocr_results | ||
| 12 | self.is_asp = False | ||
| 13 | self.item = {"words": None, | ||
| 14 | "position": None, | ||
| 15 | } | ||
| 16 | |||
def gen_init_result(self, is_asp):
    """Build ``self.init_result``, the blank per-page output skeleton.

    Every leaf is an independent copy of ``self.item``. Previously all leaves
    referenced the very same dict, so an in-place write to one field leaked
    into every other field and into ``self.item`` itself.

    Args:
        is_asp: Accepted for interface compatibility; it is not used when
            building the skeleton.
    """
    def blank():
        # A fresh, unshared copy of the blank field template.
        return self.item.copy()

    def group(*names):
        # A sub-dictionary whose fields are all blank, in the given order.
        return {name: blank() for name in names}

    def principal():
        return group("大写", "小写", "车辆贷款本金金额", "附加产品融资贷款本金总金额")

    def party():
        return group("name", "id")

    def bank_account():
        return group("账号", "户名", "开户行")

    def signed():
        return group("签字", "日期")

    self.init_result = {
        "page_1": {
            "合同编号": blank(),
            "所购车辆价格": blank(),
            "车架号": blank(),
            "贷款本金金额": principal(),
            "贷款期限": blank(),
            "附加产品融资贷款本金总金额明细": blank(),
            "借款人签字及时间": blank(),
        },
        "page_2": {
            "合同编号": blank(),
            "借款人及抵押人": party(),
            "共同借款人及共同抵押人": party(),
            "保证人1": party(),
            "保证人2": party(),
            "所购车辆价格": blank(),
            "车架号": blank(),
            "经销商": blank(),
            "贷款本金金额": principal(),
            "贷款期限": blank(),
            "标准利率": blank(),
            "借款人收款账户": bank_account(),
            "还款账户": bank_account(),
        },
        "page_3": {
            "合同编号": blank(),
            "还款计划表": blank(),
        },
        "page_4": {
            "合同编号": blank(),
            "附加产品融资贷款本金总金额明细": blank(),
        },
        "page_5": {"合同编号": blank()},
        "page_6": {"合同编号": blank()},
    }
    self.init_result["page_7"] = {"合同编号": blank()}
    self.init_result["page_8"] = {
        "合同编号": blank(),
        "主借人签字": signed(),
        "共借人签字": signed(),
        "保证人1签字": signed(),
        "保证人2签字": signed(),
        "见证人签字": signed(),
    }
| 93 | |||
def get_top_iou(self, poly, ocr_result):
    """Find the OCR entry whose polygon overlaps ``poly`` the most.

    Args:
        poly: Flat or nested coordinates of the query polygon; reshaped to
            ``(-1, 2)`` points.
        ocr_result: Mapping of key -> ``(bbox, text)`` for one page.

    Returns:
        ``[iou, key]`` for the entry with the highest intersection-over-union
        (the last one wins on ties), or ``(-1, -1)`` when no entry could be
        scored.
    """
    if not ocr_result:
        return -1, -1
    # The query polygon never changes, so build and validate it once.
    target = Polygon(np.array(poly).reshape((-1, 2)))
    if not target.is_valid:
        return -1, -1

    best = None
    for key in ocr_result:
        bbox, _text = ocr_result[key]
        candidate = Polygon(np.array(bbox).reshape((-1, 2)))
        if not candidate.is_valid:
            continue
        inter = candidate.intersection(target).area
        union = candidate.area + target.area - inter
        if union <= 0:
            # Both shapes are degenerate (zero area): IoU is undefined.
            continue
        iou = inter / union
        if best is None or iou >= best[0]:
            best = [iou, key]
    if best is None:
        return -1, -1
    return best
| 115 | |||
def poly_to_rectangle(self, poly):
    """Return the axis-aligned bounding box of a flat coordinate list.

    The previous implementation unpacked the eight values into repeated
    names, which only yields a correct box when the corners arrive as
    top-left, top-right, bottom-right, bottom-left of an axis-aligned
    rectangle. Taking min/max works for any corner order and for skewed
    quadrilaterals, and gives the same result for the former case.

    Args:
        poly: Flat sequence ``[x0, y0, x1, y1, ...]``.

    Returns:
        ``[xmin, ymin, xmax, ymax]``.

    Raises:
        ValueError: If ``poly`` is empty.
    """
    xs = poly[0::2]
    ys = poly[1::2]
    bbox = [min(xs), min(ys), max(xs), max(ys)]
    return bbox
| 120 | |||
def get_contract_no(self, page_num):
    """Read the contract number from the OCR entries of one page.

    Args:
        page_num (string): Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item; ``words`` is the text after the last
        ``:`` and ``position`` the rectangle of the matching entry. When
        several entries match, the last one wins; with no match both stay
        ``None``.
    """
    marker = '合同编号:'
    result = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if marker not in text:
            continue
        number = text.split(':')[-1]
        rectangle = self.poly_to_rectangle(bbox)
        result['words'], result['position'] = number, rectangle
    return result
| 142 | |||
def get_vehicle_price(self, page_num='0'):
    """Read the purchased-vehicle price from the OCR entries of one page.

    Args:
        page_num: Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item; ``words`` is the text after the last
        '币' and ``position`` the rectangle of the matching entry (last
        match wins, ``None`` values when nothing matches).
    """
    marker = '所购车辆价格为人民币'
    result = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if marker not in text:
            continue
        amount = text.split('币')[-1]
        rectangle = self.poly_to_rectangle(bbox)
        result['words'], result['position'] = amount, rectangle
    return result
| 155 | |||
def get_vin(self, page_num='0'):
    """Read the vehicle identification number from one page's OCR entries.

    Args:
        page_num: Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item; ``words`` is the text after the last
        ``:`` and ``position`` the rectangle of the matching entry (last
        match wins, ``None`` values when nothing matches).
    """
    marker = '车架号:'
    result = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if marker not in text:
            continue
        number = text.split(':')[-1]
        rectangle = self.poly_to_rectangle(bbox)
        result['words'], result['position'] = number, rectangle
    return result
| 168 | |||
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from the text spans of one page.

    Args:
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)`` item dicts holding the amount
        in capital Chinese numerals, the amount in digits, and the bracketed
        amounts found above and below the '附加产品融资贷款本金总金额' anchor.
        Fields that are not found keep ``None`` values.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_text = ''.join(chinese_keywords)  # loop-invariant
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None

    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    for bbox, text in spans:
        if fuzz.ratio(keyword_text, text) > 15:
            # NOTE: ``text`` is deliberately overwritten; the checks below
            # see the stripped value, exactly as before.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in spans:
            if '人民币:小写:' not in text:
                continue
            amounts = re.findall(r'人民币:小写:\[(.*)\]', text)
            if not amounts:
                # Marker present but no bracketed amount; this used to
                # raise IndexError on ``[0]``.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = amounts[0]
            if span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = amounts[0]
    return upper, lower, asp_1, asp_2
| 209 | |||
def get_loan_term(self, page_num='0'):
    """Extract the loan term (number of months) from one page.

    All span texts of the page are concatenated and searched for
    '贷款期限<digits>个月'; the position is taken from the last span that
    contains '<digits>个月'.

    Args:
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of the blank item with ``words`` set to the digits and
        ``position`` to the span bbox, or ``None`` values when not found.
    """
    loan_term = self.item.copy()
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]
    page_text = ''.join(text for _, text in spans)

    found = re.search(r'贷款期限(\d+)个月', page_text)
    if not found:
        return loan_term

    months = found.group(1)
    needle = f'{months}个月'
    for bbox, text in spans:
        if needle in text:
            loan_term['position'] = bbox
            loan_term['words'] = months
    return loan_term
| 233 | |||
def get_standard_rate(self, page_num='0'):
    """Extract the standard annual interest rate from one page.

    Args:
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of the blank item; ``words`` is the value captured between
        '本合同当期的标准利率为' and '%/年' and ``position`` the bbox of that
        span (last matching span wins, ``None`` values when absent).
    """
    rate_pattern = r'本合同当期的标准利率为(\S+)%/年'
    standard_rate = self.item.copy()
    text_blocks = (b for b in self.pdf_info[page_num]['blocks'] if b['type'] == 0)
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                hit = re.search(rate_pattern, text)
                if hit is None:
                    continue
                standard_rate['position'] = bbox
                standard_rate['words'] = hit.group(1)
    return standard_rate
| 247 | |||
def mergelist(self, text_list):
    """Join every '所购' entry with the entry after it while that entry has Chinese text.

    An element containing '所购' is concatenated with its successor as long as
    the successor contains at least one Chinese character; the process repeats
    until no such pair is left.

    The last element is no longer compared against a non-existent successor
    (this used to raise IndexError when the final element contained '所购').

    Args:
        text_list: List of strings.

    Returns:
        list: ``text_list`` itself when nothing was merged, otherwise a new
        list with the merged entries.
    """
    non_chinese = re.compile("[^\u4e00-\u9fa5]")  # matches any non-Chinese character
    while True:
        merge_index = -1
        # Skip the final element: it has nothing after it to merge with.
        for index, item in enumerate(text_list[:-1]):
            if '所购' in item and non_chinese.sub('', text_list[index + 1]):
                merge_index = index
        if merge_index == -1:
            return text_list
        text_list = (text_list[:merge_index]
                     + [text_list[merge_index] + text_list[merge_index + 1]]
                     + text_list[merge_index + 2:])
| 260 | |||
def get_asp_details(self, page_num):
    """Rebuild the add-on product financing detail table of one page.

    Starting from the OCR boxes of the three column headers ('项目1',
    '用途总金额2', '贷款本金3'), the method steps downwards row by row (at
    most 10 steps) and, via ``get_top_iou``, picks the OCR entry that best
    overlaps the expected cell position in each column.

    NOTE(review): the nesting below was reconstructed from a rendering that
    lost indentation — confirm against the real file.

    Args:
        page_num: Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of the blank item whose ``words`` is the table as a list
        of rows; ``position`` is left untouched.
    """
    asp_details_table_term = self.item.copy()

    # Title row and header row are fixed; data rows are appended below.
    asp_details_table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]

    # Boxes of the three column headers and of the "total" label.
    bbox_xm = None
    bbox_ytzje = None
    bbox_dkbj = None
    bbox_total = None
    for key in self.ocr_results[page_num]:
        bbox, text = self.ocr_results[page_num][key]
        if text == '项目1':
            bbox_xm = bbox
        if text == '用途总金额2':
            bbox_ytzje = bbox
        if text == '贷款本金3':
            bbox_dkbj = bbox
        if text in ['附加产品融资贷款本', '附加产品融资贷款本金', '附加产品融资贷']:
            bbox_total = bbox

    if bbox_xm:
        for i in range(10):
            # presumably the box height (2nd vs. last coordinate) — TODO confirm
            rh = abs(bbox_xm[1]-bbox_xm[-1])
            # Shift the current first-column box down by 1.4x that height to
            # probe the next row.
            anchor = np.array(bbox_xm).reshape((-1 ,2))
            anchor[:, 1] += int(rh*1.4)
            _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
            if _iou > 0:
                bbox, xm_text = self.ocr_results[page_num][_key]
                bbox_xm = bbox
                # Handle item descriptions that wrap onto a second line:
                # text without '所购' is appended to the previous row's first cell.
                if not '所购' in xm_text:
                    line = asp_details_table[-1]
                    line[0] += xm_text
                    asp_details_table[-1] = line
                    continue
                # Probe the second column: x from its header box, y from this row.
                # NOTE(review): bbox_ytzje / bbox_dkbj are None when their headers
                # were not found, and _key is -1 when get_top_iou finds nothing;
                # both cases would raise here.
                anchor_1 = [bbox_ytzje[0], bbox[1], bbox_ytzje[2], bbox[3],
                            bbox_ytzje[4], bbox[5], bbox_ytzje[6], bbox[7]]
                _iou, _key = self.get_top_iou(poly=anchor_1, ocr_result=self.ocr_results[page_num])
                bbox, ytzje_text = self.ocr_results[page_num][_key]
                # Probe the third column the same way.
                anchor_2 = [bbox_dkbj[0], bbox[1], bbox_dkbj[2], bbox[3],
                            bbox_dkbj[4], bbox[5], bbox_dkbj[6], bbox[7]]
                _iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=self.ocr_results[page_num])
                bbox, dkbj_text = self.ocr_results[page_num][_key]
                # Same text in columns one and two means OCR merged both cells
                # into one entry; split it on the space (requires exactly one).
                if xm_text == ytzje_text:
                    xm_text, ytzje_text = xm_text.split(' ')
                line = [xm_text, ytzje_text, dkbj_text]
                asp_details_table.append(line)
            else:
                # Nothing overlaps the probed position: end of the table.
                break

    if bbox_total:
        # Total row: x from the third column header, y from the total label.
        anchor = [bbox_dkbj[0], bbox_total[1], bbox_dkbj[2], bbox_total[3],
                  bbox_dkbj[4], bbox_total[5], bbox_dkbj[6], bbox_total[7]]
        _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
        bbox, total_text = self.ocr_results[page_num][_key]
        asp_details_table.append(['附加产品融资贷款本金总金额:', '', total_text])
    asp_details_table_term['words'] = asp_details_table

    return asp_details_table_term
| 323 | |||
def get_signature(self):
    """Locate the signing-date text on page ``'0'``.

    Returns:
        dict: Copy of the blank item; ``words`` is the full text of the last
        span containing '签署日期' and ``position`` its bbox (``None`` values
        when no span matches).
    """
    signature = self.item.copy()
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info['0']['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]
    for bbox, text in spans:
        if '签署日期' not in text:
            continue
        signature['words'] = text
        signature['position'] = bbox
    return signature
| 338 | |||
def get_somebody(self, top, bottom):
    """Return the name and ID found between two marker texts on page ``'1'``.

    The vertical band is delimited by ``bbox[3]`` of the last span containing
    ``top`` and of the last span containing ``bottom``; only spans whose
    ``bbox[3]`` lies strictly inside that band are inspected.

    Args:
        top: Text marking the upper boundary.
        bottom: Text marking the lower boundary.

    Returns:
        tuple: ``(name_item, id_item)``; each keeps ``None`` values when the
        corresponding label is not found inside the band.
    """
    name_item = self.item.copy()
    id_item = self.item.copy()
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info['1']['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    # First pass: fix the upper and lower boundaries.
    upper_y = lower_y = 0
    for bbox, text in spans:
        if top in text:
            upper_y = bbox[3]
        if bottom in text:
            lower_y = bbox[3]

    # Second pass: pick the labelled values inside the band.
    for bbox, text in spans:
        if not (upper_y < bbox[3] < lower_y):
            continue
        if '姓名/名称' in text:
            name_item['position'] = bbox
            name_item['words'] = text.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in text:
            id_item['position'] = bbox
            id_item['words'] = text.split(':')[-1]
    return name_item, id_item
| 373 | |||
def get_seller(self):
    """Find the dealer / vehicle seller value on page ``'1'``.

    The label span ('经销商' or '车辆销售方') is located first; the value is
    the last span whose centre lies to the right of the label, left of the
    page's horizontal midpoint, and vertically within the label's box.

    Returns:
        dict: Copy of the blank item with ``words`` / ``position`` of the
        value span, or ``None`` values when the label or value is missing.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']
    spans = [(span['bbox'], span['text'])
             for block in page['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    # Step 1: find the key (label) span.
    anchor_bbox = None
    for bbox, text in spans:
        if text in ('经销商', '车辆销售方'):
            anchor_bbox = bbox
    if not anchor_bbox:
        return seller

    # Step 2: with the key found, match the value by position.
    half_width = page['width'] * 0.5
    label_right, label_top, label_bottom = anchor_bbox[2], anchor_bbox[1], anchor_bbox[3]
    for bbox, text in spans:
        inside_x = label_right < np.mean(bbox[::2]) < half_width
        if inside_x and label_top < np.mean(bbox[1::2]) < label_bottom:
            seller['position'] = bbox
            seller['words'] = text
    return seller
| 400 | |||
def get_borrower_collection_account(self):
    """Extract the borrower's receiving account from page ``'1'``.

    The page text is concatenated; when it mentions '借款人收款账户', spaces
    are removed and the account number, account name and bank are captured
    with regular expressions. Each captured value is then located in the
    original spans (the last span containing it supplies the position).

    Returns:
        tuple: ``(account, account_name, account_bank)`` item dicts; a field
        keeps ``None`` values when it is not captured or not found in a span.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info['1']['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]
    all_text = ''.join(text for _, text in spans)

    # First decide which kind of account section this is; only the format
    # that spells the account out is extracted.
    if '借款人收款账户' in all_text:
        all_text = all_text.replace(' ', '').replace(' ', '')
        lookups = (
            (account, r'账号:(.*?)户名'),
            (account_name, r'户名:(.*?)开户行'),
            (account_bank, r'开户行:(.*?)借款人'),
        )
        for target, pattern in lookups:
            captured = re.findall(pattern, all_text)
            if not captured:
                continue
            words = captured[0]
            for bbox, text in spans:
                if f'{words}' in text:
                    target['position'] = bbox
                    target['words'] = words
    return account, account_name, account_bank
| 453 | |||
def get_payback_account(self):
    """Extract the repayment account from page ``'1'``.

    When the concatenated page text contains '(13) 还款账户', only the text
    after that marker is kept, spaces are removed, and the account number,
    account name and bank are captured with regular expressions. Each value
    is then located in the original spans; the last matching span supplies
    the position.

    Returns:
        tuple: ``(account, account_name, account_bank)`` item dicts; a field
        keeps ``None`` values when it is not captured or not found in a span.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info['1']['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]
    all_text = ''.join(text for _, text in spans)

    # First decide which kind of account section this is; only the format
    # that spells the account out is extracted.
    if '(13) 还款账户' in all_text:
        all_text = all_text.split('(13) 还款账户')[-1]
        all_text = all_text.replace(' ', '').replace(' ', '')

        number_hits = re.findall(r'账号:(.*?)户名', all_text)
        if number_hits:
            words = number_hits[0]
            for bbox, text in spans:
                if f'{words}' in text:
                    account['position'] = bbox
                    account['words'] = words

        name_hits = re.findall(r'户名:(.*?)开户行', all_text)
        if name_hits:
            words = name_hits[0]
            for bbox, text in spans:
                if f'{words}' in text:
                    account_name['position'] = bbox
                    account_name['words'] = words

        bank_hits = re.findall(r'开户行:(.*?);', all_text)
        if bank_hits:
            words = bank_hits[0]
            needle = f'开户行:{words};'
            for bbox, text in spans:
                # The bank is matched together with its label and terminator.
                if needle in text.replace(' ', ''):
                    account_bank['position'] = bbox
                    account_bank['words'] = words
    return account, account_name, account_bank
| 507 | |||
def get_repayment_schedule(self):
    """Rebuild the repayment schedule table from page ``'2'``.

    Span texts are collected from the span equal to '序号' up to (but not
    including) the span containing '以上表格中所列的序号并非还款期数', then
    cut into rows of five cells. Row collection stops at the first row whose
    second cell equals its 1-based row index.

    Returns:
        dict: Copy of the blank item; ``words`` is the list of rows when at
        least one row was collected, otherwise it stays ``None``.
    """
    columns = 5  # the table has five columns
    repayment_schedule = self.item.copy()

    cells = []
    inside_table = False
    for block in self.pdf_info['2']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if text == '序号':
                    inside_table = True
                if '以上表格中所列的序号并非还款期数' in text:
                    inside_table = False
                if inside_table:
                    cells.append(text)

    rows = []
    for row_index in range(len(cells) // columns):
        start = row_index * columns
        row = cells[start:start + columns]
        if row[1] == str(row_index + 1):
            break
        rows.append(row)

    if rows:
        repayment_schedule['words'] = rows
    return repayment_schedule
| 542 | |||
def get_signature_role_1(self):
    """Detect whether the borrower (mortgagor) signature area is filled in.

    Spans are collected across all pages from the one containing
    '借款人(抵押人)' up to the next span containing '日期'. More than four
    collected spans count as a present signature.

    Fixes over the previous version:
      * ``self.init_item`` is not set by ``__init__``; fall back to
        ``self.item`` instead of raising AttributeError.
      * When no span is collected, ``position`` stays ``None`` instead of
        raising ValueError from ``min()`` on an empty array.

    Returns:
        dict: Item with ``words`` ('有' or '无'), ``position``
        (``[xmin, ymin, xmax, ymax]`` over the collected spans, or ``None``)
        and ``page_num`` (key of the last page contributing a span, or
        ``None``).
    """
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_1 = template.copy()

    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '借款人(抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]
    signature_role_1['page_num'] = page_num
    signature_role_1['position'] = position
    signature_role_1['words'] = words
    return signature_role_1
| 577 | |||
def get_signature_role_2(self):
    """Detect whether the co-borrower (co-mortgagor) signature area is filled in.

    Spans are collected across all pages from the one containing
    '共同借款人(共同抵押人)' up to the next span containing '日期'. More than
    four collected spans count as a present signature.

    Fixes over the previous version:
      * ``self.init_item`` is not set by ``__init__``; fall back to
        ``self.item`` instead of raising AttributeError.
      * When no span is collected, ``position`` stays ``None`` instead of
        raising ValueError from ``min()`` on an empty array.

    Returns:
        dict: Item with ``words`` ('有' or '无'), ``position``
        (``[xmin, ymin, xmax, ymax]`` over the collected spans, or ``None``)
        and ``page_num`` (key of the last page contributing a span, or
        ``None``).
    """
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_2 = template.copy()

    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]
    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
| 612 | |||
def get_signature_role_3(self):
    """Detect whether the guarantor 1 signature area is filled in.

    Spans are collected from the one containing '保证人1' (ignored on the
    page whose key converts to 0) up to the next span containing '日期'.
    More than four collected spans count as a present signature.

    Fixes over the previous version:
      * ``self.init_item`` is not set by ``__init__``; fall back to
        ``self.item`` instead of raising AttributeError.
      * When no span is collected, ``position`` stays ``None`` instead of
        raising ValueError from ``min()`` on an empty array.

    Returns:
        dict: Item with ``words`` ('有' or '无'), ``position``
        (``[xmin, ymin, xmax, ymax]`` over the collected spans, or ``None``)
        and ``page_num`` (key of the last page contributing a span, or
        ``None``).
    """
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_3 = template.copy()

    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]
    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
| 647 | |||
def get_signature_role_4(self):
    """Detect whether the guarantor 2 signature area is filled in.

    Spans are collected from the one containing '保证人2' (ignored on the
    page whose key converts to 0) up to the next span containing '日期'.
    More than four collected spans count as a present signature.

    Fixes over the previous version:
      * ``self.init_item`` is not set by ``__init__``; fall back to
        ``self.item`` instead of raising AttributeError.
      * When no span is collected, ``position`` stays ``None`` instead of
        raising ValueError from ``min()`` on an empty array.

    Returns:
        dict: Item with ``words`` ('有' or '无'), ``position``
        (``[xmin, ymin, xmax, ymax]`` over the collected spans, or ``None``)
        and ``page_num`` (key of the last page contributing a span, or
        ``None``).
    """
    template = getattr(self, 'init_item', None)
    if template is None:
        template = self.item
    signature_role_4 = template.copy()

    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]
    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
| 682 | |||
def get_signature_role_5(self):
    """Locate the witness signature area ('见证人签字').

    Every text span of every page is visited in order.  Collection is
    switched on by a span containing '见证人签字' on a page whose key is not 0
    and switched off by a span containing '年'; the spans seen while it is on
    make up the signature area.

    Returns:
        dict: a copy of ``self.init_item`` with three keys filled in:
            'page_num': key of the page of the last collected span, or None
                when nothing was collected.
            'position': [x_min, y_min, x_max, y_max] over all collected
                boxes, or None when nothing was collected.
            'words': '有' when more than four spans were collected,
                otherwise '无'.
    """
    signature_role_5 = self.init_item.copy()
    # First locate the signature area.
    texts = []
    boxes = []
    page_num = None
    position = None
    region = False
    for i, page in self.pdf_info.items():
        for block in page['blocks']:
            # Only type-0 blocks carry 'lines'/'spans' with text.
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(i) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    # The leftover debug print of the collected texts was removed here.
    words = '有' if len(texts) > 4 else '无'

    # Without this guard min()/max() raise ValueError on an empty array when
    # the marker never appears in the document.
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
| 718 | |||
def get_last_page_signature(self, page_num, top, bottom):
    """Read one signer's name and signing date from a signature page.

    The vertical band is delimited by the top edges (``bbox[1]``) of the last
    spans containing ``top`` and ``bottom``.  Inside that band a span
    containing '签署日期' supplies both values: the text before the first
    space is the name, the text after the last ':' is the date.

    Args:
        page_num: key of the page in ``self.pdf_info``.
        top: marker text for the upper edge of the band.
        bottom: marker text for the lower edge of the band.

    Returns:
        tuple: (signature_name, signature_date), two copies of ``self.item``
        whose 'words' and 'position' are filled when a match exists.
    """
    name_item = self.item.copy()
    date_item = self.item.copy()

    # Flatten the text spans of the page once; both passes walk them in order.
    spans = [
        (piece['bbox'], piece['text'])
        for blk in self.pdf_info[page_num]['blocks']
        if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]

    y_top = None
    y_bottom = None
    for box, content in spans:
        if top in content:
            y_top = box[1]
        if bottom in content:
            y_bottom = box[1]

    # Without both anchors there is no band to search in.
    if y_top is None or y_bottom is None:
        return name_item, date_item

    for box, content in spans:
        if '签署日期' not in content:
            continue
        if not int(y_top) < np.mean(box[1::2]) < int(y_bottom):
            continue
        name_item['words'] = content.split(' ')[0]
        name_item['position'] = box
        date_item['words'] = content.split(':')[-1]
        date_item['position'] = box
    return name_item, date_item
| 750 | |||
def get_info(self):
    """Extract every supported contract field, page by page.

    The extraction helpers of this class read text from blocks whose
    ``block['type'] == 0`` and skip every other block type.

    The contract is flagged as an ASP product when any OCR entry of the
    first page contains '附加产品融资贷款本金总金额'.  Field extraction only
    runs when the OCR result has at most 8 pages; otherwise the template
    produced by ``gen_init_result`` is returned untouched.

    Returns:
        dict: ``{"is_asp": self.is_asp, "page_info": self.init_result}``.
    """
    # Decide first whether this is an ASP product: only the first page is
    # checked for the ASP marker sentence.
    for key in self.ocr_results['0']:
        bbox, text = self.ocr_results['0'][key]
        if '附加产品融资贷款本金总金额' in text:
            self.is_asp = True

    self.gen_init_result(self.is_asp)

    # Version 8.5 samples supplied by the customer have content spilling
    # over onto extra pages; those cannot be recognised yet and are skipped.
    # NOTE(review): pages '0'..'7' are all accessed below, so this presumably
    # expects exactly 8 pages - confirm what shorter documents should do.
    if len(self.ocr_results) <= 8:
        #######################################
        # Page 1
        page = self.init_result['page_1']
        # Contract number
        page['合同编号'] = self.get_contract_no(page_num='0')
        # Price of the purchased vehicle
        page['所购车辆价格'] = self.get_vehicle_price()
        # VIN
        page['车架号'] = self.get_vin()
        # Loan principal; for an ASP product it additionally carries the
        # vehicle loan principal and the add-on financing principal.
        upper, lower, asp_1, asp_2 = self.get_loan_principal()
        page['贷款本金金额']['大写'] = upper
        page['贷款本金金额']['小写'] = lower
        page['贷款本金金额']['车辆贷款本金金额'] = asp_1
        page['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
        # Loan term
        page['贷款期限'] = self.get_loan_term()
        # Add-on financing principal breakdown (ASP table)
        page['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='0')
        # Borrower signature and signing time
        page['借款人签字及时间'] = self.get_signature()

        #######################################
        # Page 2
        page = self.init_result['page_2']
        # Contract number.  This used to read page '0' again, which made the
        # page-2 value a copy of page 1; every other page-2 field reads '1'.
        page['合同编号'] = self.get_contract_no(page_num='1')
        # Borrower and mortgagor
        borrower_name, borrower_id = self.get_somebody(
            top='借款人及抵押人:', bottom='共同借款人:')
        # Fallback for the 8.1 layout
        if borrower_name['words'] is None:
            borrower_name, borrower_id = self.get_somebody(
                top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
        # Fallback for the layout where vehicle and loan are separated
        if borrower_name['words'] is None:
            borrower_name, borrower_id = self.get_somebody(
                top='借款人:', bottom='共同借款人及抵押人:')
        page['借款人及抵押人']['name'] = borrower_name
        page['借款人及抵押人']['id'] = borrower_id
        # Co-borrower and co-mortgagor
        co_borrower_name, co_borrower_id = self.get_somebody(
            top='共同借款人:', bottom='保证人1:')
        page['共同借款人及共同抵押人']['name'] = co_borrower_name
        page['共同借款人及共同抵押人']['id'] = co_borrower_id
        # Guarantor 1
        first_guarantor_name, first_guarantor_id = self.get_somebody(
            top='保证人1:', bottom='保证人2:')
        page['保证人1']['name'] = first_guarantor_name
        page['保证人1']['id'] = first_guarantor_id
        # Guarantor 2
        second_guarantor_name, second_guarantor_id = self.get_somebody(
            top='保证人2:', bottom='第一章')
        page['保证人2']['name'] = second_guarantor_name
        page['保证人2']['id'] = second_guarantor_id
        # Price of the purchased vehicle
        page['所购车辆价格'] = self.get_vehicle_price(page_num='1')
        # VIN
        page['车架号'] = self.get_vin(page_num='1')
        # Dealer
        page['经销商'] = self.get_seller()
        # Loan principal (see page 1)
        upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
        page['贷款本金金额']['大写'] = upper
        page['贷款本金金额']['小写'] = lower
        page['贷款本金金额']['车辆贷款本金金额'] = asp_1
        page['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
        # Loan term
        page['贷款期限'] = self.get_loan_term(page_num='1')
        # Standard interest rate of the current contract period
        page['标准利率'] = self.get_standard_rate(page_num='1')
        # Borrower collection account (added in the 202212 release)
        account, account_name, account_bank = self.get_borrower_collection_account()
        page['借款人收款账户']['账号'] = account
        page['借款人收款账户']['户名'] = account_name
        page['借款人收款账户']['开户行'] = account_bank
        # Repayment account
        account, account_name, account_bank = self.get_payback_account()
        page['还款账户']['账号'] = account
        page['还款账户']['户名'] = account_name
        page['还款账户']['开户行'] = account_bank

        #######################################
        # Page 3
        page = self.init_result['page_3']
        page['合同编号'] = self.get_contract_no(page_num='2')
        # Repayment schedule (table)
        page['还款计划表'] = self.get_repayment_schedule()

        #######################################
        # Page 4
        page = self.init_result['page_4']
        page['合同编号'] = self.get_contract_no(page_num='3')
        # Add-on financing principal breakdown (ASP table)
        page['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='3')

        #######################################
        # Pages 5-8 only need their contract number here.
        for index in range(4, 8):
            self.init_result[f'page_{index + 1}']['合同编号'] = \
                self.get_contract_no(page_num=str(index))

        #######################################
        # Page 8 signatures
        page = self.init_result['page_8']
        # Main borrower; the second attempt covers the alternative label.
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='合同编号', bottom='共同借款人')
        if signature_name['words'] is None:
            signature_name, signature_date = self.get_last_page_signature(
                page_num='7', top='合同编号', bottom='共同借款人(抵押人)')
        page['主借人签字']['签字'] = signature_name
        page['主借人签字']['日期'] = signature_date
        # Co-borrower
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='共同借款人', bottom='保证人1')
        if signature_name['words'] is None:
            signature_name, signature_date = self.get_last_page_signature(
                page_num='7', top='共同借款人(抵押人)', bottom='保证人1')
        page['共借人签字']['签字'] = signature_name
        page['共借人签字']['日期'] = signature_date
        # Guarantor 1
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='保证人1', bottom='保证人2')
        page['保证人1签字']['签字'] = signature_name
        page['保证人1签字']['日期'] = signature_date
        # Guarantor 2
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='保证人2', bottom='在本人面前亲笔签署本合同')
        page['保证人2签字']['签字'] = signature_name
        page['保证人2签字']['日期'] = signature_date
        # Witness
        signature_name, signature_date = self.get_last_page_signature(
            page_num='7', top='在本人面前亲笔签署本合同', bottom='以下无正文')
        page['见证人签字']['签字'] = signature_name
        page['见证人签字']['日期'] = signature_date

    # Re-shape the output.
    new_results = {"is_asp": self.is_asp,
                   "page_info": self.init_result
                   }
    return new_results
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | import re | ||
| 2 | import numpy as np | ||
| 3 | from fuzzywuzzy import fuzz | ||
| 4 | from shapely.geometry import Polygon | ||
| 5 | |||
def caculate_iou(g, p):
    """Return the intersection-over-union of two polygons.

    Args:
        g: flat coordinate sequence (x0, y0, x1, y1, ...) or any array-like
            that reshapes to (-1, 2) vertex pairs.
        p: second polygon, same layout as ``g``.

    Returns:
        float: intersection area divided by union area, or 0.0 when the
        union has zero area (both polygons degenerate).
    """
    poly_g = Polygon(np.array(g).reshape((-1, 2)))
    poly_p = Polygon(np.array(p).reshape((-1, 2)))
    inter = poly_g.intersection(poly_p).area
    union = poly_g.area + poly_p.area - inter
    # Two zero-area polygons would otherwise raise ZeroDivisionError.
    if union == 0:
        return 0.0
    return inter / union
| 12 | |||
def get_table_info(bbox_1, bbox_2, ocr_result):
    """Return the text of the OCR span that overlaps a synthesised cell box.

    The cell is an 8-value polygon whose even-index coordinates come from
    ``bbox_2`` and whose odd-index coordinates come from ``bbox_1``.

    Args:
        bbox_1: 8-value polygon supplying the odd-index coordinates.
        bbox_2: 8-value polygon supplying the even-index coordinates.
        ocr_result: iterable of entries whose item 0 is a polygon and whose
            item 1 is the recognised text.

    Returns:
        The text of the last entry whose IoU with the cell is positive, or
        '' when no entry overlaps.
    """
    anchor = [bbox_2[k] if k % 2 == 0 else bbox_1[k] for k in range(8)]
    overlapping = [entry[1] for entry in ocr_result
                   if caculate_iou(anchor, entry[0]) > 0]
    return overlapping[-1] if overlapping else ''
| 22 | |||
| 23 | class Finder: | ||
| 24 | |||
def __init__(self, pdf_info):
    """Store the parsed PDF and build the empty result templates.

    Args:
        pdf_info: parsed PDF content; the extraction methods index it by
            page key and read each page's 'blocks'.

    Attributes set:
        item: blank field record with 'words', 'page' and 'position'.
        init_result: template for the main contract.
        init_result_1: template for the vehicle disposal agreement.
        init_result_2: template for the vehicle lease mortgage contract.

    Each template field gets its own copy of ``item``.  Previously every
    field referenced the same dict, so an in-place write to one field
    silently changed all of them.
    """
    self.pdf_info = pdf_info
    self.item = {"words": None,
                 "page": None,
                 "position": None,
                 }

    def blank_result(field_names):
        # One independent blank record per field, in the given order.
        return {name: self.item.copy() for name in field_names}

    # Output template of the algorithm (main contract).
    self.init_result = blank_result((
        "合同编号",
        "承租人-姓名",
        "承租人-证件号码",
        "承租人-法定代表人或授权代表",

        "共同承租人-姓名",
        "共同承租人-证件号码",
        "共同承租人-法定代表人或授权代表",

        "保证人1-姓名",
        "保证人1-证件号码",
        "保证人1-法定代表人或授权代表",

        "保证人2-姓名",
        "保证人2-证件号码",
        "保证人2-法定代表人或授权代表",
        "保证人3-姓名",
        "保证人3-证件号码",
        "保证人3-法定代表人或授权代表",
        "合同编号(正文)",
        "车辆识别代码",
        "车辆卖方(经销商)",
        "车辆原始销售价格(《机动车销售统一发票》所列金额)",
        "车辆附加产品明细表",
        "融资成本总额",
        "租期",
        "付款计划表",
        "承租人收款账户-户名",
        "承租人收款账户-银行账号",
        "承租人收款账户-开户行",
        "承租人扣款账户-户名",
        "承租人扣款账户-银行账号",
        "承租人扣款账户-开户行",
        "签字页-承租人姓名",
        "签字页-承租人签章",

        "签字页-共同承租人姓名",
        "签字页-共同承租人签章",

        "签字页-保证人1姓名",
        "签字页-保证人1签章",

        "签字页-保证人2姓名",
        "签字页-保证人2签章",
        "签字页-保证人3姓名",
        "签字页-保证人3签章",
    ))

    # Output template for the vehicle disposal agreement, which needs a
    # different set of fields.
    self.init_result_1 = blank_result((
        "合同编号",
        "承租人-姓名",
        "承租人-证件号码",
        "销售经销商",
        "合同编号(正文)",
        "签字页-承租人姓名",
        "签字页-承租人证件号码",
        "签字页-承租人签章",
        "签字页-销售经销商",
        "签字页-销售经销商签章",
    ))

    # Output template for the vehicle lease mortgage contract.
    self.init_result_2 = blank_result((
        "合同编号",
        "合同编号(正文)",
        "抵押人姓名/名称",
        "抵押人证件号码",
        "抵押人配偶姓名/名称",
        "抵押人配偶证件号码",
        "车辆识别代码",
        "租金总额",
        "融资租赁期限",
        "签字页-抵押人姓名",
        "签字页-抵押人签章",
        "签字页-抵押人配偶姓名",
        "签字页-抵押人配偶签章",
    ))
| 108 | |||
def get_contract_no(self, page_num):
    """Read the contract number printed on the given page.

    A span containing '合同编号:' supplies the number as the text after its
    last ':'.  When that text is empty, a second pass takes a span that
    contains 'CH' and whose top edge lies above the bottom edge of the
    position stored so far.

    Args:
        page_num (str): key of the page in ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item`` with 'position', 'page' and 'words'
        filled in when something was found.
    """
    found = self.item.copy()

    def iter_spans():
        # Yield (bbox, text) for every span of the page's type-0 blocks.
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    for box, content in iter_spans():
        if '合同编号:' in content:
            found.update(position=box, page=page_num,
                         words=content.split(':')[-1])

    if found['words'] != '':
        return found

    # The label was present but empty: look for the number in another span.
    for box, content in iter_spans():
        if box[1] < found['position'][3] and 'CH' in content:
            found.update(position=box, page=page_num, words=content)
    return found
| 143 | |||
def get_vehicle_price(self, page_num='0'):
    """Return the vehicle purchase price stated on a page.

    The last span containing '所购车辆价格为人民币' wins; the price is the
    text after the last '币' in that span.

    Args:
        page_num: key of the page in ``self.pdf_info`` (default '0').

    Returns:
        dict: copy of ``self.item`` with 'position' and 'words' filled in
        when the marker is present.
    """
    price = self.item.copy()
    marker = '所购车辆价格为人民币'
    text_blocks = (blk for blk in self.pdf_info[page_num]['blocks']
                   if blk['type'] == 0)
    for blk in text_blocks:
        for row in blk['lines']:
            for piece in row['spans']:
                box, content = piece['bbox'], piece['text']
                if marker not in content:
                    continue
                price['position'] = box
                price['words'] = content.split('币')[-1]
    return price
| 157 | |||
def get_contract_no_one(self):
    """Find the contract number quoted in the body text.

    The number may be broken across spans, so each page's span texts are
    concatenated (spaces removed) before matching.  Pages are tried in
    order and the first page that matches wins.

    Two patterns are tried per page:
      1. '合同编号:[...]'
      2. '编号为...的'

    The former third pattern, '编号为...)的', was written with an unbalanced
    parenthesis and raised ``re.error`` on every page the first two patterns
    missed.  Any text it could match is already matched by pattern 2, whose
    result has ')' stripped, so it has been dropped.

    Returns:
        dict | None: copy of ``self.item`` with 'words' (whitespace and ')'
        removed), 'page' (page key) and 'position' (always None), or None
        when no page matches.
    """
    contract_no = self.item.copy()
    bracket_pattern = re.compile(r'(合同编号:\[(.*?)\])')
    inline_pattern = re.compile(r'编号为(.*?)的')
    whitespace = re.compile(r"\s")

    for pno in self.pdf_info:
        all_text = ''.join(
            span['text']
            for block in self.pdf_info[pno]['blocks']
            if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        ).replace(' ', '')

        matchObj = bracket_pattern.search(all_text)
        if matchObj:
            # NOTE(review): group(1) is the whole '合同编号:[...]' match, label
            # included - confirm whether the bracket content was intended.
            words = matchObj.group(1)
        else:
            matchObj = inline_pattern.search(all_text)
            if not matchObj:
                continue
            words = matchObj.group(1).strip()

        contract_no['position'] = None
        contract_no['page'] = pno
        contract_no['words'] = whitespace.sub("", words).replace(")", "")
        return contract_no

    # Nothing matched on any page (kept as None, as before).
    return None
| 197 | |||
def get_key_value(self, key, page_num=None):
    """Return the value written after ``key`` in the document.

    The last span containing ``key`` wins.  Its value is the text after the
    last ':' with '。' and all whitespace removed.

    Args:
        key: text to look for inside span texts.
        page_num: page key to restrict the search to; when None every page
            of ``self.pdf_info`` is scanned in order.

    Returns:
        dict: copy of ``self.item`` with 'position', 'page' and 'words'
        filled in when the key is present.
    """
    value = self.item.copy()
    whitespace = re.compile(r"\s")
    # One scan serves both modes; only the set of pages differs.
    pages = self.pdf_info if page_num is None else [page_num]
    for pno in pages:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if key in text:
                        words = text.split(':')[-1].replace("。", "")
                        value['position'] = bbox
                        value['page'] = pno
                        value['words'] = whitespace.sub("", words)
    return value
| 230 | |||
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from a page.

    First pass over the page's spans:
      * a span whose fuzzy similarity to the concatenated Chinese capital
        numeral characters exceeds 15 is taken as the amount in words,
      * a span containing '小写:¥' supplies the amount in digits,
      * a span equal to '附加产品融资贷款本金总金额' is remembered as anchor.

    When the anchor exists, a second pass reads the bracketed value of spans
    containing '人民币:小写:': a span above the anchor fills the first ASP
    amount, a span below it fills the second.

    Args:
        page_num: key of the page in ``self.pdf_info`` (default '0').

    Returns:
        tuple: (upper, lower, asp_1, asp_2), four copies of ``self.item``
        with 'position' and 'words' filled in where a value was found.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_text = ''.join(chinese_keywords)  # loop-invariant, built once
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if fuzz.ratio(keyword_text, text) > 15:
                    # text is rebound here, so the two checks below see the
                    # stripped value (behaviour kept from the original).
                    text = text.split(':')[-1].strip()
                    upper['position'] = bbox
                    upper['words'] = text
                if '小写:¥' in text:
                    words = text.split('¥')[-1].strip()
                    lower['position'] = bbox
                    lower['words'] = words
                if '附加产品融资贷款本金总金额' == text:
                    anchor_bbox = bbox
    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        amount_pattern = re.compile(r'人民币:小写:\[(.*)\]')
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '人民币:小写:' not in text:
                        continue
                    amounts = amount_pattern.findall(text)
                    # The label can appear without a bracketed amount;
                    # indexing [0] unconditionally raised IndexError then.
                    if not amounts:
                        continue
                    span_y = np.mean(bbox[1::2])
                    if span_y < anchor_y:
                        asp_1['position'] = bbox
                        asp_1['words'] = amounts[0]
                    if span_y > anchor_y:
                        asp_2['position'] = bbox
                        asp_2['words'] = amounts[0]
    return upper, lower, asp_1, asp_2
| 271 | |||
def get_loan_term(self, page_num='0'):
    """Return the loan term in months stated on a page.

    The span texts of the page are concatenated and searched for
    '贷款期限<digits>个月'.  The digits become 'words'; 'position' is the box
    of the last span containing '<digits>个月'.

    Args:
        page_num: key of the page in ``self.pdf_info`` (default '0').

    Returns:
        dict: copy of ``self.item``, filled in when the term was found.
    """
    term = self.item.copy()
    spans = [
        (piece['bbox'], piece['text'])
        for blk in self.pdf_info[page_num]['blocks']
        if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]
    page_text = ''.join(content for _, content in spans)
    found = re.search(r'贷款期限(\d+)个月', page_text)
    if not found:
        return term

    months = found.group(1)
    needle = f'{months}个月'
    for box, content in spans:
        if needle in content:
            term['position'] = box
            term['words'] = months
    return term
| 295 | |||
def get_asp_details(self, page_num):
    """Collect the ASP breakdown table found on a page.

    Collection starts at the span equal to '附加产品融资贷款本金总金额明细'
    and stops at a span containing '第二条' or '征信管理'.  The first
    collected text forms a one-cell row; the remaining texts are grouped
    three per row, and an incomplete trailing group is dropped.

    Args:
        page_num: key of the page in ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item`` whose 'words' is the list of rows, left
        untouched when nothing was collected.
    """
    details = self.item.copy()

    cells = []
    collecting = False
    for blk in self.pdf_info[page_num]['blocks']:
        if blk['type'] != 0:
            continue
        for row in blk['lines']:
            for piece in row['spans']:
                content = piece['text']
                if '附加产品融资贷款本金总金额明细' == content:
                    collecting = True
                if '第二条' in content or '征信管理' in content:
                    collecting = False
                if collecting:
                    cells.append(content)

    rows = []
    if cells:
        rows.append([cells[0]])
        full_rows = (len(cells) - 1) // 3
        rows.extend(cells[start:start + 3]
                    for start in range(1, 3 * full_rows + 1, 3))

    if rows:
        details['words'] = rows
    return details
| 329 | |||
def get_signature(self):
    """Return the signing-date span of page '0'.

    The last span on page '0' containing '签署日期' wins; its full text is
    stored unchanged.

    Returns:
        dict: copy of ``self.item`` with 'words' and 'position' filled in
        when such a span exists.
    """
    result = self.item.copy()
    candidates = [
        piece
        for blk in self.pdf_info['0']['blocks']
        if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]
    for piece in candidates:
        box, content = piece['bbox'], piece['text']
        if '签署日期' in content:
            result['words'] = content
            result['position'] = box
    return result
| 344 | |||
def get_somebody(self, top, bottom):
    """Return the name and ID of the party listed between two markers.

    Works on page '1' only.  The band is delimited by the bottom edges
    (``bbox[3]``) of the last spans containing ``top`` and ``bottom``; both
    limits stay 0 when their marker is absent.  Inside the band, the text
    after the last ':' of a '姓名/名称' span is the name and that of a
    '自然人身份证件号码/法人执照号码' span is the ID.

    Args:
        top: marker text for the upper limit.
        bottom: marker text for the lower limit.

    Returns:
        tuple: (name, id), two copies of ``self.item`` with 'position' and
        'words' filled in where found.
    """
    name_item = self.item.copy()
    id_item = self.item.copy()

    spans = [
        (piece['bbox'], piece['text'])
        for blk in self.pdf_info['1']['blocks']
        if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]

    # First fix the upper and lower limits of the band.
    upper_limit = 0
    lower_limit = 0
    for box, content in spans:
        if top in content:
            upper_limit = box[3]
        if bottom in content:
            lower_limit = box[3]

    labelled = (('姓名/名称', name_item),
                ('自然人身份证件号码/法人执照号码', id_item))
    for box, content in spans:
        if not upper_limit < box[3] < lower_limit:
            continue
        for label, target in labelled:
            if label in content:
                target['position'] = box
                target['words'] = content.split(':')[-1]
    return name_item, id_item
| 378 | |||
def get_seller(self):
    """Return the dealer name printed next to the '经销商' label on page '1'.

    The last span whose text equals '经销商' is the anchor.  The value is the
    last span whose horizontal centre lies between the anchor's right edge
    and half the page width, and whose vertical centre lies between the
    anchor's top and bottom edges.

    Returns:
        dict: copy of ``self.item`` with 'position' and 'words' filled in
        when both label and value exist.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']

    def iter_spans():
        # Yield (bbox, text) for every span of the page's type-0 blocks.
        for blk in page['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    # Find the key first.
    anchor = None
    for box, content in iter_spans():
        if '经销商' == content:
            anchor = box

    if not anchor:
        return seller

    # With the key located, match the value sitting to its right.
    centre_line = page['width'] * 0.5
    for box, content in iter_spans():
        in_column = anchor[2] < np.mean(box[::2]) < centre_line
        if in_column and anchor[1] < np.mean(box[1::2]) < anchor[3]:
            seller['position'] = box
            seller['words'] = content
    return seller
| 405 | |||
def get_payback_account(self):
    """Extract the repayment account printed on page '1'.

    Only the ticked '☑账号' variant is handled.  The span texts of the page
    are concatenated, spaces are removed, and three greedy patterns pull out
    the account number ('账号:...户名'), the account name ('户名:...开户行')
    and the bank ('开户行:...;').  Each value is stored together with the box
    of the last span that contains it.

    Returns:
        tuple: (account, account_name, account_bank), three copies of
        ``self.item`` with 'position' and 'words' filled in where a matching
        span exists.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [
        (piece['bbox'], piece['text'])
        for blk in self.pdf_info['1']['blocks']
        if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]
    merged = ''.join(content for _, content in spans)

    # First determine which kind of account info this is; only the format
    # that is not "to be notified separately" is output.
    if '☑账号' not in merged:
        return account, account_name, account_bank
    merged = merged.replace(' ', '')

    def fill(target, value, is_hit):
        # Store value and box for every span accepted by is_hit; last wins.
        for box, content in spans:
            if is_hit(content):
                target['position'] = box
                target['words'] = value

    numbers = re.findall(r'账号:(.*)户名', merged)
    if numbers:
        number = numbers[0]
        fill(account, number, lambda content: f'{number}' in content)

    holders = re.findall(r'户名:(.*)开户行', merged)
    if holders:
        holder = holders[0]
        fill(account_name, holder, lambda content: f'{holder}' in content)

    banks = re.findall(r'开户行:(.*);', merged)
    if banks:
        bank = banks[0]
        fill(account_bank, bank,
             lambda content: f'开户行:{bank};' in content.replace(' ', ''))

    return account, account_name, account_bank
| 458 | |||
def get_repayment_schedule(self):
    """Extract the payment schedule as rows of [sequence number, rent].

    All pages are scanned in order.  Collection starts after the span whose
    stripped text is "61." (its left edge becomes the left limit) and stops
    at a span containing '以上表格中所列序号'.  The right limit is the right
    edge of the last span containing '剩余融资'.  While collecting, a span is
    skipped when it
      * contains a CJK character,
      * contains exactly one run of digits (the "1." - "61." row labels), or
      * does not start strictly between the left and right limits.

    The surviving texts are read as four columns per row; only the third
    column (rent) is kept, paired with a running sequence number.

    Returns:
        dict: copy of ``self.item`` with 'words' set to the table (header
        row ['序号', '租金'] first) and 'page' set to the key of the page
        holding the "61." span (None when it never appears).
    """
    repayment_schedule = self.item.copy()
    # Compiled once instead of per span; raw strings avoid the invalid
    # "\d" escape of the former plain literals.
    has_cjk = re.compile(r'[\u4e00-\u9fff]')
    digit_runs = re.compile(r'\d+')

    cells = []
    in_table = False
    page = None
    left = 0
    right = 0
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '剩余融资' in text:
                        right = bbox[2]
                    if '以上表格中所列序号' in text:
                        in_table = False
                    if in_table:
                        # Filter out Chinese text.
                        if has_cjk.search(text):
                            continue
                        # Filter out the "1." - "61." row labels.
                        if len(digit_runs.findall(text)) == 1:
                            continue
                        if not left < bbox[0] < right:
                            continue
                        cells.append(text)

                    if text.strip() == "61.":
                        page = pno
                        in_table = True
                        left = bbox[0]

    # Full column set in the document: sequence number, financing cost,
    # financing fee, rent, remaining financing cost.  Only sequence number
    # and rent are kept.
    repayment_schedule_table = [['序号', '租金']]
    # 4 is the number of value columns per row.
    for row_index in range(len(cells) // 4):
        row = cells[row_index * 4:row_index * 4 + 4]
        repayment_schedule_table.append([str(row_index + 1), row[2]])

    repayment_schedule['words'] = repayment_schedule_table
    repayment_schedule['page'] = page
    return repayment_schedule
| 511 | |||
def get_signature_role_1(self):
    """Find the span that contains '签署日期' (signing date).

    All pages are scanned and blocks whose ``type`` is not 0 are skipped.
    When several spans match, the last one visited wins.

    Returns:
        dict: a copy of ``self.item`` with 'position', 'page' and 'words'
        taken from the matching span; untouched when nothing matches.
    """
    found = self.item.copy()
    for page_key in self.pdf_info:
        for blk in self.pdf_info[page_key]['blocks']:
            if blk['type'] == 0:
                for row in blk['lines']:
                    for piece in row['spans']:
                        box, content = piece['bbox'], piece['text']
                        if '签署日期' not in content:
                            continue
                        found['position'] = box
                        found['page'] = page_key
                        found['words'] = content
    return found
| 526 | |||
def get_signature_role_2(self):
    """Report whether the '共同借款人(共同抵押人)' signing area holds content.

    A region is opened by a span containing '共同借款人(共同抵押人)' and
    closed by a span containing '日期'. Every span seen while the region is
    open (the opening span included) is collected. More than four collected
    spans is reported as '有', otherwise '无'.

    Returns:
        dict: a copy of ``self.init_item`` with 'page_num' (page key of the
        last collected span), 'position' ([min x, min y, max x, max y] over
        the collected boxes) and 'words' ('有' or '无'). When no span was
        collected, 'page_num' and 'position' are None instead of the
        previous ValueError from ``min()`` on an empty sequence.
    """
    signature_role_2 = self.init_item.copy()
    # First locate the signing region.
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    position = None
    if boxes:
        # Each box contributes two (x, y) corners; take their bounding box.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
| 561 | |||
def get_signature_role_3(self):
    """Report whether the '保证人1' signing area holds content.

    A region is opened by a span containing '保证人1' on any page whose key
    is not 0 when converted with ``int``, and closed by a span containing
    '日期'. Every span seen while the region is open (the opening span
    included) is collected. More than four collected spans is reported as
    '有', otherwise '无'.

    Returns:
        dict: a copy of ``self.init_item`` with 'page_num' (page key of the
        last collected span), 'position' ([min x, min y, max x, max y] over
        the collected boxes) and 'words' ('有' or '无'). When no span was
        collected, 'page_num' and 'position' are None instead of the
        previous ValueError from ``min()`` on an empty sequence.
    """
    signature_role_3 = self.init_item.copy()
    # First locate the signing region.
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(i) != 0:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    position = None
    if boxes:
        # Each box contributes two (x, y) corners; take their bounding box.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
| 596 | |||
def get_signature_role_4(self):
    """Report whether the '保证人2' signing area holds content.

    A region is opened by a span containing '保证人2' on any page whose key
    is not 0 when converted with ``int``, and closed by a span containing
    '日期'. Every span seen while the region is open (the opening span
    included) is collected. More than four collected spans is reported as
    '有', otherwise '无'.

    Returns:
        dict: a copy of ``self.init_item`` with 'page_num' (page key of the
        last collected span), 'position' ([min x, min y, max x, max y] over
        the collected boxes) and 'words' ('有' or '无'). When no span was
        collected, 'page_num' and 'position' are None instead of the
        previous ValueError from ``min()`` on an empty sequence.
    """
    signature_role_4 = self.init_item.copy()
    # First locate the signing region.
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(i) != 0:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    position = None
    if boxes:
        # Each box contributes two (x, y) corners; take their bounding box.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
| 631 | |||
def get_signature_role_5(self):
    """Report whether the '见证人签字' (witness signature) area holds content.

    A region is opened by a span containing '见证人签字' on any page whose
    key is not 0 when converted with ``int``, and closed by a span
    containing '年'. Every span seen while the region is open (the opening
    span included) is collected. More than four collected spans is reported
    as '有', otherwise '无'.

    Returns:
        dict: a copy of ``self.init_item`` with 'page_num' (page key of the
        last collected span), 'position' ([min x, min y, max x, max y] over
        the collected boxes) and 'words' ('有' or '无'). When no span was
        collected, 'page_num' and 'position' are None instead of the
        previous ValueError from ``min()`` on an empty sequence.
    """
    signature_role_5 = self.init_item.copy()
    # First locate the signing region.
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(i) != 0:
                        in_region = True
                    if '年' in text:
                        in_region = False
                    if in_region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    position = None
    if boxes:
        # Each box contributes two (x, y) corners; take their bounding box.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]

    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
| 667 | |||
def get_last_page_signature(self, page_num, top, bottom):
    """Read the signer name and signing date between two anchor texts.

    On page ``page_num`` the top edge (``bbox[1]``) of the last span
    containing ``top`` and of the last span containing ``bottom`` bound a
    vertical band. Within that band, a span containing '签署日期' is split:
    the part before the first space is the name, the part after the last
    ':' is the date.

    Args:
        page_num: key of the page in ``self.pdf_info`` to inspect.
        top: text marking the upper edge of the band.
        bottom: text marking the lower edge of the band.

    Returns:
        tuple: ``(signature_name, signature_date)``, two copies of
        ``self.item`` with 'words' and 'position' filled when a span
        matched; untouched otherwise.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()
    anchor_top = None
    anchor_bottom = None
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if top in text:
                    anchor_top = bbox[1]
                if bottom in text:
                    anchor_bottom = bbox[1]
    if anchor_top is not None and anchor_bottom is not None:
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # Vertical centre of the span must lie inside the band.
                    if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
                        signature_name['words'] = text.split(' ')[0]
                        signature_name['position'] = bbox
                        signature_date['words'] = text.split(':')[-1]
                        # Previously assigned to signature_name a second
                        # time, leaving the date without a position.
                        signature_date['position'] = bbox
    return signature_name, signature_date
| 698 | |||
def get_electronic_signature(self, top, bottom):
    """Find the '签署日期' span lying vertically between two anchor texts.

    All pages are scanned. The upper bound is the top edge (``bbox[1]``) of
    the last span containing ``top``; the lower bound is the bottom edge
    (``bbox[3]``) of the last span containing ``bottom``. Only vertical
    coordinates are compared, on every page.

    Args:
        top: text marking the upper edge of the band.
        bottom: text marking the lower edge of the band.

    Returns:
        dict: a copy of ``self.item`` with 'words', 'page' and 'position'
        of the last matching span; untouched when an anchor is missing or
        nothing matches.
    """
    signature = self.item.copy()

    def text_spans():
        # Yield (page key, bbox, text) for every span of every type-0 block.
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    upper = None
    lower = None
    for _, box, content in text_spans():
        if top in content:
            upper = box[1]
        if bottom in content:
            lower = box[3]

    if upper is None or lower is None:
        return signature

    for page_key, box, content in text_spans():
        if '签署日期' in content and int(upper) < np.mean(box[1::2]) < int(lower):
            signature['words'] = content
            signature['page'] = page_key
            signature['position'] = box
    return signature
| 730 | |||
def get_role_info(self, role_key, page_num='0'):
    """Read a role's name, ID number and representative from one page.

    The top-left corner of the last span starting with '保证人3' serves as
    an anchor that splits the page into four quadrants. The name comes from
    any span starting with ``role_key``. The '证件号码:' and
    '法定代表人或授权代表:' spans are accepted only when their centre lies
    in the quadrant assigned to the role (strict comparisons).

    Args:
        role_key: one of '承租人:', '保证人1:', '保证人2:', '保证人3:';
            it is also used as the ``re.match`` pattern for the name span.
        page_num: key of the page in ``self.pdf_info`` (default '0').

    Returns:
        tuple: ``(name, id_num, representative)``, three copies of
        ``self.item``; all untouched when the anchor is not found.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    # role -> (centre is left of the anchor, centre is above the anchor)
    quadrant_by_role = {
        '承租人:': (True, True),
        '保证人1:': (True, False),
        '保证人2:': (False, True),
        '保证人3:': (False, False),
    }

    def text_spans():
        # Yield (bbox, text) for every span of every type-0 block.
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    # Use the top-left corner of '保证人3' as the reference point.
    anchor = None
    for box, content in text_spans():
        if re.match('保证人3', content) is not None:
            anchor = [box[0], box[1]]

    if anchor is None:
        return name, id_num, representative

    def in_quadrant(box, left, above):
        centre_x = np.mean(box[::2])
        x_ok = centre_x < anchor[0] if left else centre_x > anchor[0]
        if not x_ok:
            return False
        centre_y = np.mean(box[1::2])
        return bool(centre_y < anchor[1] if above else centre_y > anchor[1])

    def fill(target, box, content):
        target['words'] = content.split(':')[-1]
        target['page'] = page_num
        target['position'] = box

    for box, content in text_spans():
        # Role name.
        if re.match(role_key, content) is not None:
            fill(name, box, content)
        if role_key not in quadrant_by_role:
            continue
        left, above = quadrant_by_role[role_key]
        # ID number located in the role's quadrant.
        if re.match('证件号码:', content) is not None and in_quadrant(box, left, above):
            fill(id_num, box, content)
        # Legal or authorised representative located in the role's quadrant.
        if re.match('法定代表人或授权代表:', content) is not None and in_quadrant(box, left, above):
            fill(representative, box, content)
    return name, id_num, representative
| 814 | |||
def get_table_add_product(self):
    """Extract the vehicle add-on product table.

    The table page is the last page with a span containing
    '车辆附加产品(明细见下表)'. On that page the header cells '项目',
    '购买价格', '实际融资金额' and the '总计' cell are located by exact
    text. Rows are then found by stepping an anchor box downwards from the
    '项目' header and looking up the two value columns with
    ``get_table_info``.

    Returns:
        dict: a copy of ``self.item``. On success 'words' is a list of
        rows [item, purchase price, financed amount] (header first, '总计'
        last), 'page' is the table page key and 'position' is None.
        When no page carries the table marker the copy is returned
        untouched; when the '项目', '实际融资金额' or '总计' cell is missing
        only 'page' is set. Both cases previously raised (KeyError on the
        page key 'None', or TypeError on a ``None`` list index).
    """
    table_add_product = self.item.copy()

    add_product_page_num = None
    for pno in self.pdf_info:
        for block in self.pdf_info[f'{pno}']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if '车辆附加产品(明细见下表)' in span['text']:
                        add_product_page_num = pno

    # Contract without an add-on table: nothing to extract.
    if add_product_page_num is None:
        return table_add_product

    # Flatten the page into [corner box, text] pairs; the box is expanded
    # from (xmin, ymin, xmax, ymax) to four (x, y) corners.
    ocr_results = []
    for block in self.pdf_info[f'{add_product_page_num}']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                xmin, ymin, xmax, ymax = bbox
                bbox = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
                ocr_results.append([bbox, text])

    lines = [['项目', '购买价格', '实际融资金额']]

    key_xm = None
    key_gmjg = None
    key_sjrzje = None
    key_total = None

    for index, span in enumerate(ocr_results):
        if span[1] == '项目':
            key_xm = index
        if span[1] == '购买价格':
            key_gmjg = index
        if span[1] == '实际融资金额':
            key_sjrzje = index
        if span[1] == '总计':
            key_total = index

    # These three cells are always dereferenced below; without them the
    # table cannot be read.
    if key_xm is None or key_sjrzje is None or key_total is None:
        table_add_product['page'] = add_product_page_num
        return table_add_product

    bbox, text = ocr_results[key_xm]
    rh = abs(bbox[1]-bbox[-1])  # height of the header row
    # Start from the '项目' header, shifted right by two row heights and
    # down by one row height.
    anchor = np.array(bbox).reshape((-1, 2))
    anchor[:, 0] += 2*rh
    anchor[:, 1] += rh

    for i in range(5):
        for span in ocr_results:
            # presumably an overlap ratio between the two boxes; defined
            # elsewhere in the project
            iou = caculate_iou(anchor, span[0])
            if iou > 0.01 and span[1].strip() != '所购':
                # NOTE(review): a matched row still needs the '购买价格'
                # header; key_gmjg being None raises TypeError as before.
                x = get_table_info(span[0], ocr_results[key_gmjg][0], ocr_results)
                y = get_table_info(span[0], ocr_results[key_sjrzje][0], ocr_results)
                line = [span[1].replace('\u3000', ' '), x, y]
                lines.append(line)
                # NOTE(review): the anchor is assumed to move one row below
                # the matched cell only when a row matched - confirm
                # against the original indentation.
                anchor = np.array(span[0]).reshape((-1, 2))
                anchor[:, 1] += rh

    total = get_table_info(ocr_results[key_total][0], ocr_results[key_sjrzje][0], ocr_results)
    lines.append(['总计', '', total])

    # Product names can wrap onto a second line in the PDF, e.g.
    #   所购 BMW悦然焕 / 新服务
    #   所购 BMW5年10 / 万公里长悦保养套餐
    #   所购 事故维修补偿 / 方案
    #   所购 BMW5年10万公里 / 长悦保养套餐
    #   所购 MINI4年6万公里长悦 / 保养套餐
    # so known prefixes are normalised to the full product name.
    filtered_lines = []
    for line in lines:
        if line[0][:2] not in ['所购', '项目', '总计']:
            continue
        if 'BMW悦然' in line[0]:
            line[0] = '所购 BMW悦然焕新服务'
        if 'BMW5年10' in line[0]:
            line[0] = '所购 BMW5年10万公里长悦保养套餐'
        if '事故维修补' in line[0]:
            line[0] = '所购 事故维修补偿方案'
        if 'MINI4年6万公里长悦' in line[0]:
            line[0] = '所购 MINI4年6万公里长悦保养套餐'
        filtered_lines.append(line)
    table_add_product['words'] = filtered_lines
    table_add_product['page'] = add_product_page_num
    table_add_product['position'] = None
    return table_add_product
| 909 | |||
def get_contract_no_dy(self):
    """Find the mortgage contract number printed beside its label.

    The label is the last span, on any page, containing '抵押合同编号'.
    The number is the last span, on any page, that contains 'CH-' and
    whose vertical centre lies strictly between the label's top and bottom
    edges.

    Returns:
        dict: a copy of ``self.item`` with 'position', 'page' and 'words'
        of the matching span; untouched when the label or number is
        missing.
    """
    contract_no = self.item.copy()

    def text_spans():
        # Yield (page key, bbox, text) for every span of every type-0 block.
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    label_box = None
    for _, box, content in text_spans():
        if '抵押合同编号' in content:
            label_box = box

    if label_box is None:
        return contract_no

    for page_key, box, content in text_spans():
        on_label_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
        if on_label_row and 'CH-' in content:
            contract_no['position'] = box
            contract_no['page'] = page_key
            contract_no['words'] = content
    return contract_no
| 938 | |||
def get_dyr_name_id(self):
    """Read the mortgagor's name and ID number near the '抵押人' label.

    The label is the last span, on any page, whose text is exactly
    '抵押人'. Candidate spans are those whose vertical centre lies strictly
    between the label's top edge and three label heights below its bottom
    edge (checked on every page). Among them, a span containing '姓名'
    gives the name and one containing '证件号码' gives the ID; the value is
    the text after the last ':'.

    Returns:
        tuple: ``(name, _id)``, two copies of ``self.item``; untouched
        when the label or the fields are missing.
    """
    name = self.item.copy()
    _id = self.item.copy()

    def text_spans():
        # Yield (page key, bbox, text) for every span of every type-0 block.
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    key_box = None
    for _, box, content in text_spans():
        if content == '抵押人':
            key_box = box

    if key_box is None:
        return name, _id

    label_height = abs(key_box[1] - key_box[3])
    for page_key, box, content in text_spans():
        centre_y = np.mean(box[1::2])
        if not key_box[1] < centre_y < key_box[3] + label_height * 3:
            continue
        for label, target in (('姓名', name), ('证件号码', _id)):
            if label in content:
                target['position'] = box
                target['page'] = page_key
                target['words'] = content.split(':')[-1]
    return name, _id
| 974 | |||
def get_dyrpo_name_id(self):
    """Read the name and ID number near the '抵押人配偶(如适' label.

    The label is the last span, on any page, whose text is exactly
    '抵押人配偶(如适'. Candidate spans are those whose vertical centre lies
    strictly between the label's top edge and three label heights below
    its bottom edge (checked on every page). Among them, a span containing
    '姓名' gives the name and one containing '证件号码' gives the ID; the
    value is the text after the last ':'.

    Returns:
        tuple: ``(name, _id)``, two copies of ``self.item``; untouched
        when the label or the fields are missing.
    """
    name = self.item.copy()
    _id = self.item.copy()

    def text_spans():
        # Yield (page key, bbox, text) for every span of every type-0 block.
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    key_box = None
    for _, box, content in text_spans():
        if content == '抵押人配偶(如适':
            key_box = box

    if key_box is None:
        return name, _id

    label_height = abs(key_box[1] - key_box[3])
    for page_key, box, content in text_spans():
        centre_y = np.mean(box[1::2])
        if not key_box[1] < centre_y < key_box[3] + label_height * 3:
            continue
        for label, target in (('姓名', name), ('证件号码', _id)):
            if label in content:
                target['position'] = box
                target['page'] = page_key
                target['words'] = content.split(':')[-1]
    return name, _id
| 1010 | |||
def get_key_value_position(self, key):
    """Find the value printed to the right of a label, on the same row.

    The label is the last span, on any page, whose text equals ``key``.
    A span is taken as the value when its vertical centre lies strictly
    inside the label's vertical extent, its left edge is to the right of
    the label's left edge, and the gap between the label's right edge and
    its left edge is under ten label heights. The last such span wins.

    Args:
        key: exact text of the label span.

    Returns:
        dict: a copy of ``self.item`` with 'position', 'page' and 'words'
        of the value span; untouched when nothing qualifies.
    """
    value = self.item.copy()

    def text_spans():
        # Yield (page key, bbox, text) for every span of every type-0 block.
        for page_key in self.pdf_info:
            for blk in self.pdf_info[page_key]['blocks']:
                if blk['type'] != 0:
                    continue
                for row in blk['lines']:
                    for piece in row['spans']:
                        yield page_key, piece['bbox'], piece['text']

    key_box = None
    for _, box, content in text_spans():
        if content == key:
            key_box = box

    if key_box is None:
        return value

    label_height = abs(key_box[1] - key_box[3])
    for page_key, box, content in text_spans():
        if not key_box[1] < np.mean(box[1::2]) < key_box[3]:
            continue
        if not key_box[0] < box[0]:
            continue
        if abs(key_box[2] - box[0]) < label_height * 10:
            value['position'] = box
            value['page'] = page_key
            value['words'] = content
    return value
| 1040 | |||
def get_role_info_3_3(self, role_key, page_num='0'):
    """Read a role's name, ID number and representative (3_3 layout).

    The top-left corner of the last span starting with '保证人2' serves as
    an anchor that splits the page into four quadrants. The name comes from
    any span starting with ``role_key``. The '证件号码:' and
    '法定代表人或授权代表:' spans are accepted only when their centre lies
    in the quadrant assigned to the role (strict comparisons).

    Args:
        role_key: one of '承租人一:', '共同承租人:', '保证人1:',
            '保证人2:'; it is also used as the ``re.match`` pattern for
            the name span.
        page_num: key of the page in ``self.pdf_info`` (default '0').

    Returns:
        tuple: ``(name, id_num, representative)``, three copies of
        ``self.item``; all untouched when the anchor is not found.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    # role -> (centre is left of the anchor, centre is above the anchor)
    quadrant_by_role = {
        '承租人一:': (True, True),
        '共同承租人:': (True, False),
        '保证人1:': (False, True),
        '保证人2:': (False, False),
    }

    def text_spans():
        # Yield (bbox, text) for every span of every type-0 block.
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    yield piece['bbox'], piece['text']

    # Use the top-left corner of '保证人2' as the reference point.
    anchor = None
    for box, content in text_spans():
        if re.match('保证人2', content) is not None:
            anchor = [box[0], box[1]]

    if anchor is None:
        return name, id_num, representative

    def in_quadrant(box, left, above):
        centre_x = np.mean(box[::2])
        x_ok = centre_x < anchor[0] if left else centre_x > anchor[0]
        if not x_ok:
            return False
        centre_y = np.mean(box[1::2])
        return bool(centre_y < anchor[1] if above else centre_y > anchor[1])

    def fill(target, box, content):
        target['words'] = content.split(':')[-1]
        target['page'] = page_num
        target['position'] = box

    for box, content in text_spans():
        # Role name.
        if re.match(role_key, content) is not None:
            fill(name, box, content)
        if role_key not in quadrant_by_role:
            continue
        left, above = quadrant_by_role[role_key]
        # ID number located in the role's quadrant.
        if re.match('证件号码:', content) is not None and in_quadrant(box, left, above):
            fill(id_num, box, content)
        # Legal or authorised representative located in the role's quadrant.
        if re.match('法定代表人或授权代表:', content) is not None and in_quadrant(box, left, above):
            fill(representative, box, content)
    return name, id_num, representative
| 1124 | |||
def get_value_by_findall(self, prefix, suffix, page_num):
    """Find the shortest text between ``prefix`` and ``suffix`` on a page.

    The texts of all spans in type-0 blocks of the page are concatenated
    and searched with the pattern ``prefix(.*?)suffix``; ``prefix`` and
    ``suffix`` are inserted into the pattern as given, without escaping.
    The first match is then looked up span by span, and the last span
    containing it supplies the position.

    Args:
        prefix: pattern text preceding the value.
        suffix: pattern text following the value.
        page_num: key of the page in ``self.pdf_info``.

    Returns:
        dict: a copy of ``self.item`` with 'position', 'page' and 'words'
        filled; untouched when the pattern does not match or no single
        span contains the matched text.
    """
    value = self.item.copy()
    pno = page_num

    spans = [
        (piece['bbox'], piece['text'])
        for blk in self.pdf_info[pno]['blocks']
        if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]
    all_text = ''.join(content for _, content in spans)

    matches = re.findall(f"{prefix}(.*?){suffix}", all_text)
    if not matches:
        return value

    wanted = matches[0]
    for box, content in spans:
        if wanted in content:
            value['position'] = box
            value['page'] = pno
            value['words'] = wanted
    return value
| 1149 | |||
| 1150 | def get_info(self): | ||
| 1151 | """ | ||
| 1152 | block['type'] == 0 : 表示该元素为图片 | ||
| 1153 | |||
| 1154 | Returns: | ||
| 1155 | dict: Description | ||
| 1156 | """ | ||
| 1157 | if len(self.pdf_info) > 0: | ||
| 1158 | # 取 Page 1 上的合同编号 | ||
| 1159 | contract_no = self.get_contract_no(page_num='0') | ||
| 1160 | self.init_result['合同编号'] = contract_no | ||
| 1161 | |||
| 1162 | # 粗略判断是否是 ‘车贷分离版本’ 的合同 | ||
| 1163 | is_cdfl = False | ||
| 1164 | for block in self.pdf_info['0']['blocks']: | ||
| 1165 | if block['type'] != 0: | ||
| 1166 | continue | ||
| 1167 | for line in block['lines']: | ||
| 1168 | for span in line['spans']: | ||
| 1169 | bbox, text = span['bbox'], span['text'] | ||
| 1170 | if '共同承租人:' in text: | ||
| 1171 | is_cdfl = True | ||
| 1172 | |||
| 1173 | if is_cdfl == False: | ||
| 1174 | # 从第一页上取四个角色的姓名和证件号码 | ||
| 1175 | name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0') | ||
| 1176 | |||
| 1177 | if name["words"] == None: | ||
| 1178 | name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0') | ||
| 1179 | self.init_result['承租人-姓名'] = name | ||
| 1180 | self.init_result['承租人-证件号码'] = id_num | ||
| 1181 | self.init_result['承租人-法定代表人或授权代表'] = representative | ||
| 1182 | |||
| 1183 | name, id_num, representative = self.get_role_info(role_key='保证人1:', page_num='0') | ||
| 1184 | self.init_result['保证人1-姓名'] = name | ||
| 1185 | self.init_result['保证人1-证件号码'] = id_num | ||
| 1186 | self.init_result['保证人1-法定代表人或授权代表'] = representative | ||
| 1187 | # if条件判别 对应3_3版本 | ||
| 1188 | if name["words"] == None: | ||
| 1189 | name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0') | ||
| 1190 | self.init_result['共同承租人-姓名'] = name | ||
| 1191 | self.init_result['共同承租人-证件号码'] = id_num | ||
| 1192 | self.init_result['共同承租人-法定代表人或授权代表'] = representative | ||
| 1193 | |||
| 1194 | name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0') | ||
| 1195 | self.init_result['保证人2-姓名'] = name | ||
| 1196 | self.init_result['保证人2-证件号码'] = id_num | ||
| 1197 | self.init_result['保证人2-法定代表人或授权代表'] = representative | ||
| 1198 | # if条件判别 对应3_3版本 | ||
| 1199 | if name["words"] == None: | ||
| 1200 | name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0') | ||
| 1201 | self.init_result['保证人2-姓名'] = name | ||
| 1202 | self.init_result['保证人2-证件号码'] = id_num | ||
| 1203 | self.init_result['保证人2-法定代表人或授权代表'] = representative | ||
| 1204 | |||
| 1205 | name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0') | ||
| 1206 | self.init_result['保证人3-姓名'] = name | ||
| 1207 | self.init_result['保证人3-证件号码'] = id_num | ||
| 1208 | self.init_result['保证人3-法定代表人或授权代表'] = representative | ||
| 1209 | if name["words"] == None: | ||
| 1210 | name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0') | ||
| 1211 | self.init_result['保证人3-姓名'] = name | ||
| 1212 | self.init_result['保证人3-证件号码'] = id_num | ||
| 1213 | self.init_result['保证人3-法定代表人或授权代表'] = representative | ||
| 1214 | else: | ||
| 1215 | name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0') | ||
| 1216 | self.init_result['承租人-姓名'] = name | ||
| 1217 | self.init_result['承租人-证件号码'] = id_num | ||
| 1218 | self.init_result['承租人-法定代表人或授权代表'] = representative | ||
| 1219 | |||
| 1220 | name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0') | ||
| 1221 | self.init_result['共同承租人-姓名'] = name | ||
| 1222 | self.init_result['共同承租人-证件号码'] = id_num | ||
| 1223 | self.init_result['共同承租人-法定代表人或授权代表'] = representative | ||
| 1224 | |||
| 1225 | name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0') | ||
| 1226 | self.init_result['保证人1-姓名'] = name | ||
| 1227 | self.init_result['保证人1-证件号码'] = id_num | ||
| 1228 | self.init_result['保证人1-法定代表人或授权代表'] = representative | ||
| 1229 | |||
| 1230 | name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0') | ||
| 1231 | self.init_result['保证人2-姓名'] = name | ||
| 1232 | self.init_result['保证人2-证件号码'] = id_num | ||
| 1233 | self.init_result['保证人2-法定代表人或授权代表'] = representative | ||
| 1234 | |||
| 1235 | # 在所有页面中找正文中(第二部分 融资租赁主要条款及付款计划)的那个编号,因为存在换行的情况所以暂时不带位置输出 | ||
| 1236 | contract_no = self.get_contract_no_one() | ||
| 1237 | self.init_result['合同编号(正文)'] = contract_no | ||
| 1238 | # 找到车辆识别代码 | ||
| 1239 | vin = self.get_key_value(key='车辆识别代码:') | ||
| 1240 | self.init_result['车辆识别代码'] = vin | ||
| 1241 | # 找到经销商(车辆卖方(经销商)) | ||
| 1242 | seller = self.get_key_value(key='车辆卖方(经销商):') | ||
| 1243 | if seller['words'] == None: | ||
| 1244 | seller = self.get_key_value(key='车辆卖方:') | ||
| 1245 | self.init_result['车辆卖方(经销商)'] = seller | ||
| 1246 | # 找到 —— 车辆原始销售价格 | ||
| 1247 | vehicle_price = self.get_key_value(key='车辆原始销售价格(《机动车销售统一发票》所列金额):') | ||
| 1248 | self.init_result['车辆原始销售价格(《机动车销售统一发票》所列金额)'] = vehicle_price | ||
| 1249 | # 找车辆附加产品明细(表) | ||
| 1250 | table_add_product = self.get_table_add_product() | ||
| 1251 | self.init_result['车辆附加产品明细表'] = table_add_product | ||
| 1252 | # 找融资成本总额 | ||
| 1253 | financing_cost = self.get_key_value(key='融资成本总额:') | ||
| 1254 | self.init_result['融资成本总额'] = financing_cost | ||
| 1255 | # 找租期 | ||
| 1256 | lease_term = self.get_key_value(key='租期:') | ||
| 1257 | self.init_result['租期'] = lease_term | ||
| 1258 | # 找还款计划(表) | ||
| 1259 | repayment_schedule = self.get_repayment_schedule() | ||
| 1260 | self.init_result['付款计划表'] = repayment_schedule | ||
| 1261 | # 找承租人收款账户户名、银行账号、银行 | ||
| 1262 | name = self.get_key_value(key='户名:', page_num='4') | ||
| 1263 | self.init_result['承租人收款账户-户名'] = name | ||
| 1264 | account = self.get_key_value(key='银行账号:', page_num='4') | ||
| 1265 | self.init_result['承租人收款账户-银行账号'] = account | ||
| 1266 | bank = self.get_key_value(key='开户银行:', page_num='4') | ||
| 1267 | self.init_result['承租人收款账户-开户行'] = bank | ||
| 1268 | # 找承租人扣款账户户名、银行账号、银行 | ||
| 1269 | name = self.get_key_value(key='户名:', page_num='5') | ||
| 1270 | self.init_result['承租人扣款账户-户名'] = name | ||
| 1271 | account = self.get_key_value(key='银行账号:', page_num='5') | ||
| 1272 | self.init_result['承租人扣款账户-银行账号'] = account | ||
| 1273 | bank = self.get_key_value(key='开户银行:', page_num='5') | ||
| 1274 | self.init_result['承租人扣款账户-开户行'] = bank | ||
| 1275 | |||
| 1276 | # 找签字页上的系列信息 | ||
| 1277 | # 承租人姓名、签章 | ||
| 1278 | if is_cdfl == False: | ||
| 1279 | name = self.get_key_value(key='承租人姓名:') | ||
| 1280 | electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:') | ||
| 1281 | |||
| 1282 | if name["words"] == None: | ||
| 1283 | name = self.get_key_value(key='承租人一姓名:') | ||
| 1284 | electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:') | ||
| 1285 | |||
| 1286 | self.init_result['签字页-承租人姓名'] = name | ||
| 1287 | self.init_result['签字页-承租人签章'] = electronic_signature | ||
| 1288 | # 保证人1姓名、签章 | ||
| 1289 | name = self.get_key_value(key='保证人1姓名:') | ||
| 1290 | electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:') | ||
| 1291 | self.init_result['签字页-保证人1姓名'] = name | ||
| 1292 | self.init_result['签字页-保证人1签章'] = electronic_signature | ||
| 1293 | # 这里用的是 name["words"] == "" | ||
| 1294 | if name["words"] == "": | ||
| 1295 | name = self.get_key_value(key='共同承租人名称:') | ||
| 1296 | electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:') | ||
| 1297 | self.init_result['签字页-共同承租人姓名'] = name | ||
| 1298 | self.init_result['签字页-共同承租人签章'] = electronic_signature | ||
| 1299 | # 保证人2姓名、签章 | ||
| 1300 | name = self.get_key_value(key='保证人2姓名:') | ||
| 1301 | electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:') | ||
| 1302 | self.init_result['签字页-保证人2姓名'] = name | ||
| 1303 | self.init_result['签字页-保证人2签章'] = electronic_signature | ||
| 1304 | # if判断条件对应3_3版本 | ||
| 1305 | if name["words"] == "": | ||
| 1306 | name = self.get_key_value(key='保证人1姓名:') | ||
| 1307 | electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:') | ||
| 1308 | self.init_result['签字页-保证人1姓名'] = name | ||
| 1309 | self.init_result['签字页-保证人1签章'] = electronic_signature | ||
| 1310 | # 保证人3姓名、签章 | ||
| 1311 | name = self.get_key_value(key='保证人3姓名:') | ||
| 1312 | electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:') | ||
| 1313 | self.init_result['签字页-保证人3姓名'] = name | ||
| 1314 | self.init_result['签字页-保证人3签章'] = electronic_signature | ||
| 1315 | # if判断条件对应3_3版本 | ||
| 1316 | if name["words"] == None: | ||
| 1317 | name = self.get_key_value(key='保证人2姓名:') | ||
| 1318 | electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='日期:') | ||
| 1319 | self.init_result['签字页-保证人2姓名'] = name | ||
| 1320 | self.init_result['签字页-保证人2签章'] = electronic_signature | ||
| 1321 | else: | ||
| 1322 | name = self.get_key_value(key='承租人一姓名:') | ||
| 1323 | electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:') | ||
| 1324 | self.init_result['签字页-承租人姓名'] = name | ||
| 1325 | self.init_result['签字页-承租人签章'] = electronic_signature | ||
| 1326 | |||
| 1327 | name = self.get_key_value(key='共同承租人名称:') | ||
| 1328 | electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:') | ||
| 1329 | self.init_result['签字页-共同承租人姓名'] = name | ||
| 1330 | self.init_result['签字页-共同承租人签章'] = electronic_signature | ||
| 1331 | |||
| 1332 | name = self.get_key_value(key='保证人1姓名:') | ||
| 1333 | electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:') | ||
| 1334 | self.init_result['签字页-保证人1姓名'] = name | ||
| 1335 | self.init_result['签字页-保证人1签章'] = electronic_signature | ||
| 1336 | |||
| 1337 | name = self.get_key_value(key='保证人2姓名:') | ||
| 1338 | electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:') | ||
| 1339 | self.init_result['签字页-保证人2姓名'] = name | ||
| 1340 | self.init_result['签字页-保证人2签章'] = electronic_signature | ||
| 1341 | |||
| 1342 | return self.init_result | ||
| 1343 | |||
def get_info_1(self):
    """Populate and return ``self.init_result_1``.

    When ``self.pdf_info`` is empty nothing is looked up and
    ``self.init_result_1`` is returned as it currently stands. Otherwise
    each field below is filled from the matching helper call, in order.

    NOTE(review): indentation was lost in the reviewed copy of this file;
    the final ``return`` is assumed to sit at method level (outside the
    ``pdf_info`` check) -- confirm against the real source.

    Returns:
        ``self.init_result_1`` after the fields have been written.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_1

    # --- Fields looked up with page_num='0' ---
    # Contract number.
    self.init_result_1['合同编号'] = self.get_contract_no(page_num='0')
    # Lessee name.
    self.init_result_1['承租人-姓名'] = self.get_key_value(key='承租人:', page_num='0')
    # Lessee ID number.
    self.init_result_1['承租人-证件号码'] = self.get_key_value(key='证件号码:', page_num='0')
    # Selling dealer: when the keyed lookup yields an empty string, retry
    # with the two-marker lookup (dealer key .. address key).
    dealer = self.get_key_value(key='销售经销商:', page_num='0')
    if dealer['words'] == "":
        dealer = self.get_value_by_findall('销售经销商:', '地址:', page_num='0')
    self.init_result_1['销售经销商'] = dealer

    # --- Contract number as found in the body text ---
    self.init_result_1['合同编号(正文)'] = self.get_contract_no_one()

    # --- Signature-page fields (no page_num passed to the keyed lookups) ---
    # Lessee name.
    self.init_result_1['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称:')
    # Lessee ID number.
    self.init_result_1['签字页-承租人证件号码'] = self.get_key_value(key='自然人身份证件号码/法人执照号码:')
    # Lessee signature / seal.
    self.init_result_1['签字页-承租人签章'] = self.get_signature_role_1()
    # Selling dealer on the signature page. On an empty result, fall back to
    # the two-marker lookup with page_num='3'; the second marker is the
    # "(authorised representative, please sign and seal)" hint.
    # Presumably the helper returns the text between the two markers --
    # confirm against get_value_by_findall.
    dealer = self.get_key_value(key='销售经销商:')
    if dealer['words'] == "":
        dealer = self.get_value_by_findall('销售经销商:', '(请授权代表签字并请盖章)', page_num='3')
    self.init_result_1['签字页-销售经销商'] = dealer

    # Dealer signature / seal: not extracted here (placeholder in the original).
    return self.init_result_1
| 1380 | |||
def get_info_2(self):
    """Populate and return ``self.init_result_2``.

    When ``self.pdf_info`` is empty nothing is looked up and
    ``self.init_result_2`` is returned as it currently stands. Otherwise
    each field below is filled from the matching helper call, in order.

    NOTE(review): indentation was lost in the reviewed copy of this file;
    the final ``return`` is assumed to sit at method level (outside the
    ``pdf_info`` check) -- confirm against the real source.

    Returns:
        ``self.init_result_2`` after the fields have been written.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_2

    # Contract number, then the contract number found in the body text.
    self.init_result_2['合同编号'] = self.get_contract_no_dy()
    self.init_result_2['合同编号(正文)'] = self.get_contract_no_one()

    # Mortgagor name and ID number (the helper returns a pair).
    mortgagor_name, mortgagor_id = self.get_dyr_name_id()
    self.init_result_2['抵押人姓名/名称'] = mortgagor_name
    self.init_result_2['抵押人证件号码'] = mortgagor_id

    # Mortgagor's spouse: name and ID number (the helper returns a pair).
    spouse_name, spouse_id = self.get_dyrpo_name_id()
    self.init_result_2['抵押人配偶姓名/名称'] = spouse_name
    self.init_result_2['抵押人配偶证件号码'] = spouse_id

    # Vehicle identification code.
    self.init_result_2['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
    # Total rent.
    self.init_result_2['租金总额'] = self.get_key_value_position(key='租金总额')
    # Finance-lease term.
    self.init_result_2['融资租赁期限'] = self.get_key_value_position(key='融资租赁期限')

    # Signature page -- mortgagor name and signature. Both lookups run
    # before either result is stored, as in the original.
    signer = self.get_key_value(key='抵押人姓名:')
    signature = self.get_electronic_signature(top='抵押权人盖章', bottom='抵押人配偶姓名:')
    self.init_result_2['签字页-抵押人姓名'] = signer
    self.init_result_2['签字页-抵押人签章'] = signature

    # Signature page -- mortgagor's spouse name and signature.
    signer = self.get_key_value(key='抵押人配偶姓名:')
    signature = self.get_electronic_signature(top='抵押人配偶姓名:', bottom='日期')
    self.init_result_2['签字页-抵押人配偶姓名'] = signer
    self.init_result_2['签字页-抵押人配偶签章'] = signature

    return self.init_result_2
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -6,9 +6,10 @@ | ... | @@ -6,9 +6,10 @@ |
| 6 | # @Description : | 6 | # @Description : |
| 7 | 7 | ||
| 8 | from .get_char import Finder | 8 | from .get_char import Finder |
| 9 | from .get_char_fsm import Finder as FSMFinder | ||
| 9 | 10 | ||
| 10 | 11 | ||
| 11 | def predict(pdf_info, file_cls): | 12 | def predict(pdf_info, file_cls, is_fsm=False): |
| 12 | """Summary | 13 | """Summary |
| 13 | 14 | ||
| 14 | Args: | 15 | Args: |
| ... | @@ -58,7 +59,11 @@ def predict(pdf_info, file_cls): | ... | @@ -58,7 +59,11 @@ def predict(pdf_info, file_cls): |
| 58 | pdf_info = dict() | 59 | pdf_info = dict() |
| 59 | for pno, page_info in enumerate(pdf_info_1): | 60 | for pno, page_info in enumerate(pdf_info_1): |
| 60 | pdf_info[str(pno)] = page_info | 61 | pdf_info[str(pno)] = page_info |
| 61 | f = Finder(pdf_info) | 62 | |
| 63 | if is_fsm: | ||
| 64 | f = FSMFinder(pdf_info) | ||
| 65 | else: | ||
| 66 | f = Finder(pdf_info) | ||
| 62 | if file_cls == 0: | 67 | if file_cls == 0: |
| 63 | results = f.get_info() | 68 | results = f.get_info() |
| 64 | if file_cls == 1: | 69 | if file_cls == 1: | ... | ... |
-
Please register or sign in to post a comment