# -*- coding: utf-8 -*-
# @Author        : lk
# @Email         : 9428.al@gmail.com
# @Create Date   : 2021-07-20 16:42:41
# @Last Modified : 2021-10-28 17:41:00
# @Description   :

import re

import numpy as np

try:
    from fuzzywuzzy import fuzz
except ImportError:  # only get_loan_principal needs it; fail there, not here
    fuzz = None


class Finder:
    """Locate contract fields inside a parsed PDF.

    ``pdf_info`` maps page keys (the code uses the strings ``'0'``, ``'1'``,
    ...) to page dicts with a ``'blocks'`` list (and, for ``get_seller``, a
    ``'width'``).  Only blocks whose ``'type'`` is ``0`` are read; each has
    ``'lines'`` -> ``'spans'`` -> ``{'bbox', 'text'}``.  Indices 0/2 of a bbox
    are treated as horizontal and 1/3 as vertical coordinates.
    NOTE(review): this looks like PyMuPDF ``get_text("dict")`` output, where
    type 0 is a text block -- confirm against the caller.

    Every extracted field is a dict shaped like ``self.item``:
    ``{"words": ..., "page": ..., "position": ...}``.
    """

    # Output fields of the main lease contract (get_info).
    _FIELDS = (
        "合同编号",
        "承租人-姓名", "承租人-证件号码", "承租人-法定代表人或授权代表",
        "共同承租人-姓名", "共同承租人-证件号码", "共同承租人-法定代表人或授权代表",
        "保证人1-姓名", "保证人1-证件号码", "保证人1-法定代表人或授权代表",
        "保证人2-姓名", "保证人2-证件号码", "保证人2-法定代表人或授权代表",
        "保证人3-姓名", "保证人3-证件号码", "保证人3-法定代表人或授权代表",
        "合同编号(正文)",
        "车辆识别代码",
        "车辆卖方(经销商)",
        "车辆原始销售价格(《机动车销售统一发票》所列金额)",
        "车辆附加产品明细表",
        "融资成本总额",
        "租期",
        "付款计划表",
        "银行账户-户名", "银行账户-银行账号", "银行账户-开户行",
        "签字页-承租人姓名", "签字页-承租人签章",
        "签字页-共同承租人姓名", "签字页-共同承租人签章",
        "签字页-保证人1姓名", "签字页-保证人1签章",
        "签字页-保证人2姓名", "签字页-保证人2签章",
        "签字页-保证人3姓名", "签字页-保证人3签章",
    )

    # Output fields of the vehicle disposal agreement (get_info_1).
    _FIELDS_1 = (
        "合同编号", "承租人-姓名", "承租人-证件号码", "销售经销商",
        "合同编号(正文)",
        "签字页-承租人姓名", "签字页-承租人证件号码", "签字页-承租人签章",
        "签字页-销售经销商", "签字页-销售经销商签章",
    )

    # Output fields of the vehicle lease mortgage contract (get_info_2).
    _FIELDS_2 = (
        "合同编号", "合同编号(正文)",
        "抵押人姓名/名称", "抵押人证件号码",
        "抵押人配偶姓名/名称", "抵押人配偶证件号码",
        "车辆识别代码", "租金总额", "融资租赁期限",
        "签字页-抵押人姓名", "签字页-抵押人签章",
        "签字页-抵押人配偶姓名", "签字页-抵押人配偶签章",
    )

    # Patterns tried in order by get_contract_no_one:
    # (regex, whether ")" is stripped from the captured text).
    _BODY_CONTRACT_NO_PATTERNS = (
        (r'(合同编号:\[(.*?)\])', True),
        (r'编号为(.*?)的', True),
        # The literal ")" must be escaped, otherwise compiling the pattern
        # raises re.error.  Anything it matches is already matched by the
        # previous pattern, so it only acts as a harmless fallback.
        (r'编号为(.*?)\)的', False),
    )

    # role_key -> (left of anchor?, above anchor?) for the four-role layout
    # anchored on the "保证人3" span.
    _ROLE_QUADRANTS = {
        '承租人:': (True, True),
        '保证人1:': (True, False),
        '保证人2:': (False, True),
        '保证人3:': (False, False),
    }

    # Same idea for the "3_3" layout, anchored on the "保证人2" span.
    _ROLE_QUADRANTS_3_3 = {
        '承租人一:': (True, True),
        '共同承租人:': (True, False),
        '保证人1:': (False, True),
        '保证人2:': (False, False),
    }

    _CJK_CHAR = re.compile(r'[\u4e00-\u9fff]')

    def __init__(self, pdf_info):
        """Store the parsed PDF and build the three empty result templates."""
        self.pdf_info = pdf_info
        self.item = {"words": None, "page": None, "position": None}
        # Each field gets its own copy so that mutating one returned field
        # can never leak into the others (or into self.item).
        self.init_result = {key: self.item.copy() for key in self._FIELDS}
        self.init_result_1 = {key: self.item.copy() for key in self._FIELDS_1}
        self.init_result_2 = {key: self.item.copy() for key in self._FIELDS_2}

    # ------------------------------------------------------------------ #
    # Shared helpers
    # ------------------------------------------------------------------ #
    def _iter_spans(self, page_num):
        """Yield ``(bbox, text)`` for every span in the type-0 blocks of a page."""
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    def _iter_all_spans(self):
        """Yield ``(page_key, bbox, text)`` for every span of every page, in order."""
        for pno in self.pdf_info:
            for bbox, text in self._iter_spans(pno):
                yield pno, bbox, text

    @staticmethod
    def _x_center(bbox):
        """Return the mean of the horizontal bbox coordinates."""
        return np.mean(bbox[::2])

    @staticmethod
    def _y_center(bbox):
        """Return the mean of the vertical bbox coordinates."""
        return np.mean(bbox[1::2])

    def _find_last_bbox(self, predicate):
        """Return the bbox of the last span (over all pages) whose text satisfies ``predicate``."""
        found = None
        for _, bbox, text in self._iter_all_spans():
            if predicate(text):
                found = bbox
        return found

    # ------------------------------------------------------------------ #
    # Single-field extractors
    # ------------------------------------------------------------------ #
    def get_contract_no(self, page_num):
        """Read the contract number printed after '合同编号:' on a page.

        Args:
            page_num: Page key to search.

        Returns:
            dict: Field item.  When the label is found with nothing after the
            colon, a span containing 'CH' whose top lies above the bottom of
            the current match is used instead.
        """
        contract_no = self.item.copy()
        for bbox, text in self._iter_spans(page_num):
            if '合同编号:' in text:
                contract_no['position'] = bbox
                contract_no['page'] = page_num
                contract_no['words'] = text.split(':')[-1]

        if contract_no['words'] == '':
            for bbox, text in self._iter_spans(page_num):
                # 'position' is re-read every iteration on purpose: each hit
                # moves the reference box, exactly as before.
                if bbox[1] < contract_no['position'][3] and 'CH' in text:
                    contract_no['position'] = bbox
                    contract_no['page'] = page_num
                    contract_no['words'] = text
        return contract_no

    def get_vehicle_price(self, page_num='0'):
        """Return the text following '币' in the span holding '所购车辆价格为人民币'."""
        vehicle_price = self.item.copy()
        for bbox, text in self._iter_spans(page_num):
            if '所购车辆价格为人民币' in text:
                vehicle_price['position'] = bbox
                vehicle_price['words'] = text.split('币')[-1]
        return vehicle_price

    def get_contract_no_one(self):
        """Find the contract number quoted in the body text.

        The number may wrap across spans, so each page's text is concatenated
        (spaces removed) before matching; for that reason no position is
        reported.  The first page with a match wins.

        NOTE(review): for the first pattern the whole '合同编号:[...]' match is
        returned, not just the bracket contents -- kept as is.
        """
        contract_no = self.item.copy()
        for pno in self.pdf_info:
            all_text = ''.join(text for _, text in self._iter_spans(pno))
            all_text = all_text.replace(' ', '')
            for pattern, drop_paren in self._BODY_CONTRACT_NO_PATTERNS:
                match = re.search(pattern, all_text)
                if not match:
                    continue
                words = re.sub(r"\s", "", match.group(1))
                if drop_paren:
                    words = words.replace(")", "")
                contract_no['position'] = None
                contract_no['page'] = pno
                contract_no['words'] = words
                return contract_no
        return contract_no

    def get_key_value(self, key, page_num=None):
        """Return the value after the last ':' of the last span containing ``key``.

        Args:
            key: Substring to look for in span text.
            page_num: Restrict the search to this page; all pages when None.

        Returns:
            dict: Field item; '。' and all whitespace are removed from the value.
        """
        value = self.item.copy()
        pages = self.pdf_info if page_num is None else [page_num]
        for pno in pages:
            for bbox, text in self._iter_spans(pno):
                if key in text:
                    words = text.split(':')[-1].replace("。", "")
                    value['position'] = bbox
                    value['page'] = pno
                    value['words'] = re.sub(r"\s", "", words)
        return value

    def get_loan_principal(self, page_num='0'):
        """Extract the loan principal (upper/lower case) and the two add-on amounts.

        Returns:
            tuple: ``(upper, lower, asp_1, asp_2)`` field items.  ``asp_1`` /
            ``asp_2`` are the '人民币:小写:[...]' amounts above / below the
            '附加产品融资贷款本金总金额' span.

        Raises:
            ImportError: If ``fuzzywuzzy`` is not installed.
        """
        if fuzz is None:
            raise ImportError('fuzzywuzzy is required by get_loan_principal')

        chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖',
                            '拾', '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
        keyword_text = ''.join(chinese_keywords)
        upper = self.item.copy()
        lower = self.item.copy()
        asp_1 = self.item.copy()
        asp_2 = self.item.copy()

        anchor_bbox = None
        for bbox, text in self._iter_spans(page_num):
            if fuzz.ratio(keyword_text, text) > 15:
                # The trimmed text is deliberately what the checks below see.
                text = text.split(':')[-1].strip()
                upper['position'] = bbox
                upper['words'] = text
            if '小写:¥' in text:
                lower['position'] = bbox
                lower['words'] = text.split('¥')[-1].strip()
            if '附加产品融资贷款本金总金额' == text:
                anchor_bbox = bbox

        if anchor_bbox:
            anchor_y = self._y_center(anchor_bbox)
            for bbox, text in self._iter_spans(page_num):
                if '人民币:小写:' not in text:
                    continue
                match = re.search(r'人民币:小写:\[(.*)\]', text)
                if match is None:
                    # Label without a bracketed amount: nothing to extract.
                    continue
                span_y = self._y_center(bbox)
                if span_y < anchor_y:
                    asp_1['position'] = bbox
                    asp_1['words'] = match.group(1)
                if span_y > anchor_y:
                    asp_2['position'] = bbox
                    asp_2['words'] = match.group(1)
        return upper, lower, asp_1, asp_2

    def get_loan_term(self, page_num='0'):
        """Return the number of months from '贷款期限<N>个月' on a page."""
        loan_term = self.item.copy()
        all_text = ''.join(text for _, text in self._iter_spans(page_num))
        match = re.search(r'贷款期限(\d+)个月', all_text)
        if match:
            words = match.group(1)
            for bbox, text in self._iter_spans(page_num):
                if f'{words}个月' in text:
                    loan_term['position'] = bbox
                    loan_term['words'] = words
        return loan_term

    def get_asp_details(self, page_num):
        """Collect the add-on product detail table of a page.

        Spans are gathered from the one equal to '附加产品融资贷款本金总金额明细'
        up to (excluding) the first containing '第二条' or '征信管理'.  The first
        cell forms a row of its own, the rest are grouped three per row.
        """
        asp_details_table_term = self.item.copy()
        cells = []
        in_table = False
        for _, text in self._iter_spans(page_num):
            if '附加产品融资贷款本金总金额明细' == text:
                in_table = True
            if '第二条' in text or '征信管理' in text:
                in_table = False
            if in_table:
                cells.append(text)

        rows = []
        for i in range((len(cells) + 2) // 3):
            if i == 0:
                rows.append([cells[0]])
            else:
                rows.append(cells[i * 3 - 2:i * 3 + 1])

        if rows:
            asp_details_table_term['words'] = rows
        return asp_details_table_term

    def get_signature(self):
        """Return the last span on page '0' that contains '签署日期'."""
        signature = self.item.copy()
        for bbox, text in self._iter_spans('0'):
            if '签署日期' in text:
                signature['words'] = text
                signature['position'] = bbox
        return signature

    def get_somebody(self, top, bottom):
        """Return (name, id) items found on page '1' between two marker spans.

        Args:
            top: Text marking the upper boundary.
            bottom: Text marking the lower boundary.
        """
        _name = self.item.copy()
        _id = self.item.copy()

        # First pass: fix the vertical band from the bottoms of both markers.
        y_top = 0
        y_bottom = 0
        for bbox, text in self._iter_spans('1'):
            if top in text:
                y_top = bbox[3]
            if bottom in text:
                y_bottom = bbox[3]

        for bbox, text in self._iter_spans('1'):
            if not y_top < bbox[3] < y_bottom:
                continue
            if '姓名/名称' in text:
                _name['position'] = bbox
                _name['words'] = text.split(':')[-1]
            if '自然人身份证件号码/法人执照号码' in text:
                _id['position'] = bbox
                _id['words'] = text.split(':')[-1]
        return _name, _id

    def get_seller(self):
        """Return the span right of the '经销商' label on page '1' (left half only)."""
        seller = self.item.copy()
        anchor_bbox = None
        for bbox, text in self._iter_spans('1'):
            if '经销商' == text:
                anchor_bbox = bbox

        # With the key located, the value is the span on the same row,
        # to its right, whose centre is still in the left half of the page.
        if anchor_bbox:
            half_width = self.pdf_info['1']['width'] * 0.5
            for bbox, text in self._iter_spans('1'):
                same_column = anchor_bbox[2] < self._x_center(bbox) < half_width
                same_row = anchor_bbox[1] < self._y_center(bbox) < anchor_bbox[3]
                if same_column and same_row:
                    seller['position'] = bbox
                    seller['words'] = text
        return seller

    def get_payback_account(self):
        """Return (account number, account name, bank) items from page '1'.

        Only the ticked '☑账号' variant is parsed; otherwise all three items
        stay empty.
        """
        account = self.item.copy()
        account_name = self.item.copy()
        account_bank = self.item.copy()

        spans = list(self._iter_spans('1'))
        all_text = ''.join(text for _, text in spans)
        if '☑账号' not in all_text:
            return account, account_name, account_bank
        all_text = all_text.replace(' ', '')

        found = re.findall(r'账号:(.*)户名', all_text)
        if found:
            words = found[0]
            for bbox, text in spans:
                if words in text:
                    account['position'] = bbox
                    account['words'] = words

        found = re.findall(r'户名:(.*)开户行', all_text)
        if found:
            words = found[0]
            for bbox, text in spans:
                if words in text:
                    account_name['position'] = bbox
                    account_name['words'] = words

        found = re.findall(r'开户行:(.*);', all_text)
        if found:
            words = found[0]
            for bbox, text in spans:
                if f'开户行:{words};' in text.replace(' ', ''):
                    account_bank['position'] = bbox
                    account_bank['words'] = words
        return account, account_name, account_bank

    def get_repayment_schedule(self):
        """Build the repayment table as ``[['序号', '租金'], [n, rent], ...]``.

        Collection starts after the span '61.' and stops at the span
        containing '以上表格中所列序号'.  Cells are read four per row and only
        the third one is kept as the rent.
        """
        repayment_schedule = self.item.copy()
        cells = []
        in_table = False
        page = None
        left = 0
        right = 0
        for pno, bbox, text in self._iter_all_spans():
            if '剩余融资' in text:
                right = bbox[2]
            if '以上表格中所列序号' in text:
                in_table = False
            if in_table:
                # Drop anything containing Chinese characters.
                if self._CJK_CHAR.search(text):
                    continue
                # Drop the "1." .. "61." row labels (exactly one digit run).
                if len(re.findall(r"\d+", text)) == 1:
                    continue
                if not left < bbox[0] < right:
                    continue
                cells.append(text)
            if text.strip() == "61.":
                page = pno
                in_table = True
                left = bbox[0]

        table = [['序号', '租金']]
        for i in range(len(cells) // 4):
            table.append([str(i + 1), cells[i * 4 + 2]])

        repayment_schedule['words'] = table
        repayment_schedule['page'] = page
        return repayment_schedule

    # ------------------------------------------------------------------ #
    # Signature regions
    # ------------------------------------------------------------------ #
    def get_signature_role_1(self):
        """Return the last span on any page that contains '签署日期'."""
        signature_role_1 = self.item.copy()
        for pno, bbox, text in self._iter_all_spans():
            if '签署日期' in text:
                signature_role_1['position'] = bbox
                signature_role_1['page'] = pno
                signature_role_1['words'] = text
        return signature_role_1

    def _signature_region(self, start_key, stop_key, skip_first_page):
        """Summarise the spans between a start marker and a stop marker.

        Returns a field item whose 'words' is '有' when more than four spans
        were collected and '无' otherwise, 'position' the bounding box of the
        collected spans (None when nothing was collected) and 'page' the page
        of the last collected span.
        """
        result = self.item.copy()
        texts = []
        boxes = []
        page_num = None
        in_region = False
        for pno, bbox, text in self._iter_all_spans():
            if start_key in text and (not skip_first_page or int(pno) != 0):
                in_region = True
            if stop_key in text:
                in_region = False
            if in_region:
                page_num = pno
                texts.append(text)
                boxes.append(bbox)

        position = None
        if boxes:
            xs = [value for box in boxes for value in box[0::2]]
            ys = [value for box in boxes for value in box[1::2]]
            position = [min(xs), min(ys), max(xs), max(ys)]

        result['page'] = page_num
        result['position'] = position
        result['words'] = '有' if len(texts) > 4 else '无'
        return result

    def get_signature_role_2(self):
        """Signature region from '共同借款人(共同抵押人)' to the next '日期'."""
        return self._signature_region('共同借款人(共同抵押人)', '日期', False)

    def get_signature_role_3(self):
        """Signature region from '保证人1' (not on page 0) to the next '日期'."""
        return self._signature_region('保证人1', '日期', True)

    def get_signature_role_4(self):
        """Signature region from '保证人2' (not on page 0) to the next '日期'."""
        return self._signature_region('保证人2', '日期', True)

    def get_signature_role_5(self):
        """Signature region from '见证人签字' (not on page 0) to the next '年'."""
        return self._signature_region('见证人签字', '年', True)

    def get_last_page_signature(self, page_num, top, bottom):
        """Return (name, date) items of the '签署日期' span between two markers.

        Args:
            page_num: Page key to search.
            top: Text of the upper marker (its top edge is the boundary).
            bottom: Text of the lower marker (its top edge is the boundary).
        """
        signature_name = self.item.copy()
        signature_date = self.item.copy()
        anchor_top = None
        anchor_bottom = None
        for bbox, text in self._iter_spans(page_num):
            if top in text:
                anchor_top = bbox[1]
            if bottom in text:
                anchor_bottom = bbox[1]

        if anchor_top is not None and anchor_bottom is not None:
            for bbox, text in self._iter_spans(page_num):
                if '签署日期' in text and \
                        int(anchor_top) < self._y_center(bbox) < int(anchor_bottom):
                    signature_name['words'] = text.split(' ')[0]
                    signature_name['position'] = bbox
                    signature_date['words'] = text.split(':')[-1]
                    signature_date['position'] = bbox
        return signature_name, signature_date

    def get_electronic_signature(self, top, bottom):
        """Return the '签署日期' span lying vertically between two markers.

        The markers are searched over all pages (last hit wins); the band runs
        from the top edge of ``top`` to the bottom edge of ``bottom``.
        """
        signature = self.item.copy()
        anchor_top = None
        anchor_bottom = None
        for _, bbox, text in self._iter_all_spans():
            if top in text:
                anchor_top = bbox[1]
            if bottom in text:
                anchor_bottom = bbox[3]

        if anchor_top is not None and anchor_bottom is not None:
            for pno, bbox, text in self._iter_all_spans():
                if '签署日期' in text and \
                        int(anchor_top) < self._y_center(bbox) < int(anchor_bottom):
                    signature['words'] = text
                    signature['page'] = pno
                    signature['position'] = bbox
        return signature

    # ------------------------------------------------------------------ #
    # Role blocks on the first page
    # ------------------------------------------------------------------ #
    def _role_info(self, role_key, page_num, anchor_prefix, quadrants):
        """Return (name, id, representative) items for one role.

        The top-left corner of the last span starting with ``anchor_prefix``
        splits the page into four quadrants; ``quadrants`` says which one
        belongs to ``role_key``.  Nothing is extracted without an anchor.
        """
        name = self.item.copy()
        id_num = self.item.copy()
        representative = self.item.copy()

        anchor = None
        for bbox, text in self._iter_spans(page_num):
            if text.startswith(anchor_prefix):
                anchor = [bbox[0], bbox[1]]
        if anchor is None:
            return name, id_num, representative

        quadrant = quadrants.get(role_key)
        for bbox, text in self._iter_spans(page_num):
            # role_key is applied as a regular expression, as before.
            if re.match(role_key, text) is not None:
                name['words'] = text.split(':')[-1]
                name['page'] = page_num
                name['position'] = bbox

            if quadrant is None:
                continue
            if text.startswith('证件号码:'):
                target = id_num
            elif text.startswith('法定代表人或授权代表:'):
                target = representative
            else:
                continue

            left_of_anchor, above_anchor = quadrant
            x, y = self._x_center(bbox), self._y_center(bbox)
            x_ok = x < anchor[0] if left_of_anchor else x > anchor[0]
            y_ok = y < anchor[1] if above_anchor else y > anchor[1]
            if x_ok and y_ok:
                target['words'] = text.split(':')[-1]
                target['page'] = page_num
                target['position'] = bbox
        return name, id_num, representative

    def get_role_info(self, role_key, page_num='0'):
        """Role info for the layout anchored on the top-left of '保证人3'.

        Args:
            role_key: One of '承租人:', '保证人1:', '保证人2:', '保证人3:'.
            page_num: Page key to search.

        Returns:
            tuple: ``(name, id_num, representative)`` field items.
        """
        return self._role_info(role_key, page_num, '保证人3', self._ROLE_QUADRANTS)

    def get_role_info_3_3(self, role_key, page_num='0'):
        """Role info for the "3_3" layout anchored on the top-left of '保证人2'.

        Args:
            role_key: One of '承租人一:', '共同承租人:', '保证人1:', '保证人2:'.
            page_num: Page key to search.

        Returns:
            tuple: ``(name, id_num, representative)`` field items.
        """
        return self._role_info(role_key, page_num, '保证人2', self._ROLE_QUADRANTS_3_3)

    def get_table_add_product(self):
        """Build the add-on product table.

        Spans are collected from the one containing '总计' up to the note
        '注:出租人向承租人购买租赁车辆的对价'.  The first two cells are the
        totals; the rest are rows of three cells (four when a '租赁利率' span
        exists, of which the first three are kept).
        """
        table_add_product = self.item.copy()
        items = []
        collecting = False
        page = None
        greater_equal_v35 = False
        for pno in self.pdf_info:
            for _, text in self._iter_spans(f'{pno}'):
                if text == '租赁利率':
                    greater_equal_v35 = True
                if '总计' in text:
                    collecting = True
                if '注:出租人向承租人购买租赁车辆的对价' in text:
                    page = pno
                    collecting = False
                if collecting:
                    items.append(text)

        lines = [['项目', '购买价格', '实际融资金额']]
        stride = 4 if greater_equal_v35 else 3
        for i in range(len(items) // stride):
            row = items[2 + i * stride:2 + i * stride + 3]
            # A short slice means the row would have indexed past the end.
            if len(row) == 3:
                lines.append(row)
        if items:
            lines.append([items[0], '', items[1] if len(items) > 1 else ''])

        table_add_product['words'] = lines
        table_add_product['page'] = page
        table_add_product['position'] = None
        return table_add_product

    # ------------------------------------------------------------------ #
    # Mortgage contract helpers
    # ------------------------------------------------------------------ #
    def get_contract_no_dy(self):
        """Return the 'CH-' span on the same row as the '抵押合同编号' label."""
        contract_no = self.item.copy()
        key_box = self._find_last_bbox(lambda text: '抵押合同编号' in text)
        if key_box is not None:
            for pno, bbox, text in self._iter_all_spans():
                if key_box[1] < self._y_center(bbox) < key_box[3] and 'CH-' in text:
                    contract_no['position'] = bbox
                    contract_no['page'] = pno
                    contract_no['words'] = text
        return contract_no

    def _name_id_near(self, anchor_text):
        """Return (name, id) items found just below the span equal to ``anchor_text``.

        The search band starts at the anchor's top edge and extends three
        anchor heights below its bottom edge.
        """
        name = self.item.copy()
        _id = self.item.copy()
        key_box = self._find_last_bbox(lambda text: text == anchor_text)
        if key_box is not None:
            rh = abs(key_box[1] - key_box[3])
            for pno, bbox, text in self._iter_all_spans():
                if not key_box[1] < self._y_center(bbox) < key_box[3] + rh * 3:
                    continue
                if '姓名' in text:
                    name['position'] = bbox
                    name['page'] = pno
                    name['words'] = text.split(':')[-1]
                if '证件号码' in text:
                    _id['position'] = bbox
                    _id['page'] = pno
                    _id['words'] = text.split(':')[-1]
        return name, _id

    def get_dyr_name_id(self):
        """Return (name, id) of the mortgagor, located via the '抵押人' span."""
        return self._name_id_near('抵押人')

    def get_dyrpo_name_id(self):
        """Return (name, id) of the mortgagor's spouse, located via '抵押人配偶(如适'."""
        return self._name_id_near('抵押人配偶(如适')

    def get_key_value_position(self, key):
        """Return the span on the same row as, and closely right of, the span equal to ``key``.

        "Closely" means the span starts right of the key's left edge and
        within ten key heights of the key's right edge.
        """
        value = self.item.copy()
        key_box = self._find_last_bbox(lambda text: text == key)
        if key_box is not None:
            rh = abs(key_box[1] - key_box[3])
            for pno, bbox, text in self._iter_all_spans():
                if key_box[1] < self._y_center(bbox) < key_box[3] \
                        and key_box[0] < bbox[0] \
                        and abs(key_box[2] - bbox[0]) < rh * 10:
                    value['position'] = bbox
                    value['page'] = pno
                    value['words'] = text
        return value

    # ------------------------------------------------------------------ #
    # Entry points
    # ------------------------------------------------------------------ #
    def _store_role(self, prefix, role):
        """Write a (name, id, representative) triple into ``init_result``."""
        name, id_num, representative = role
        self.init_result[f'{prefix}-姓名'] = name
        self.init_result[f'{prefix}-证件号码'] = id_num
        self.init_result[f'{prefix}-法定代表人或授权代表'] = representative

    def _store_signature(self, who, name, signature):
        """Write a signature-page (name, seal) pair into ``init_result``."""
        self.init_result[f'签字页-{who}姓名'] = name
        self.init_result[f'签字页-{who}签章'] = signature

    def get_info(self):
        """Extract every field of the main lease contract.

        Each role is first looked up with the four-role layout and, when that
        yields nothing, with the "3_3" layout.

        Returns:
            dict: ``self.init_result`` (updated in place); left at its
            defaults when ``pdf_info`` is empty.
        """
        if len(self.pdf_info) > 0:
            # Contract number on the first page.
            self.init_result['合同编号'] = self.get_contract_no(page_num='0')

            # Names and ID numbers of the roles on the first page.
            role = self.get_role_info(role_key='承租人:', page_num='0')
            if role[0]["words"] is None:
                role = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
            self._store_role('承租人', role)

            role = self.get_role_info(role_key='保证人1:', page_num='0')
            self._store_role('保证人1', role)
            if role[0]["words"] is None:  # "3_3" layout
                role = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
                self._store_role('共同承租人', role)

            # NOTE(review): in the two fallbacks below the "3_3" guarantor N
            # is stored under the guarantor N+1 keys -- kept as found, please
            # confirm this shift is intended.
            role = self.get_role_info(role_key='保证人2:', page_num='0')
            self._store_role('保证人2', role)
            if role[0]["words"] is None:  # "3_3" layout
                role = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
                self._store_role('保证人2', role)

            role = self.get_role_info(role_key='保证人3:', page_num='0')
            self._store_role('保证人3', role)
            if role[0]["words"] is None:
                role = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
                self._store_role('保证人3', role)

            # Contract number quoted in the body; may wrap, so no position.
            self.init_result['合同编号(正文)'] = self.get_contract_no_one()
            self.init_result['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
            self.init_result['车辆卖方(经销商)'] = self.get_key_value(key='车辆卖方(经销商):')
            self.init_result['车辆原始销售价格(《机动车销售统一发票》所列金额)'] = \
                self.get_key_value(key='车辆原始销售价格(《机动车销售统一发票》所列金额):')
            self.init_result['车辆附加产品明细表'] = self.get_table_add_product()
            self.init_result['融资成本总额'] = self.get_key_value(key='融资成本总额:')
            self.init_result['租期'] = self.get_key_value(key='租期:')
            self.init_result['付款计划表'] = self.get_repayment_schedule()

            # Bank account: holder, number, bank.
            self.init_result['银行账户-户名'] = self.get_key_value(key='户名:')
            self.init_result['银行账户-银行账号'] = self.get_key_value(key='银行账号:')
            self.init_result['银行账户-开户行'] = self.get_key_value(key='开户银行:')

            # Signature page: lessee.
            name = self.get_key_value(key='承租人姓名:')
            signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:')
            if name["words"] is None:
                name = self.get_key_value(key='承租人一姓名:')
                signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:')
            self._store_signature('承租人', name, signature)

            # Signature page: guarantor 1.  The fallback tests for an empty
            # string (label present, no value), not for None.
            name = self.get_key_value(key='保证人1姓名:')
            signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
            self._store_signature('保证人1', name, signature)
            if name["words"] == "":
                name = self.get_key_value(key='共同承租人名称:')
                signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:')
                self._store_signature('共同承租人', name, signature)

            # Signature page: guarantor 2.
            name = self.get_key_value(key='保证人2姓名:')
            signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:')
            self._store_signature('保证人2', name, signature)
            if name["words"] == "":  # "3_3" layout
                name = self.get_key_value(key='保证人1姓名:')
                signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
                self._store_signature('保证人1', name, signature)

            # Signature page: guarantor 3.
            name = self.get_key_value(key='保证人3姓名:')
            signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:')
            self._store_signature('保证人3', name, signature)
            if name["words"] is None:  # "3_3" layout
                name = self.get_key_value(key='保证人2姓名:')
                signature = self.get_electronic_signature(top='保证人2姓名:', bottom='日期:')
                self._store_signature('保证人2', name, signature)

        return self.init_result

    def get_info_1(self):
        """Extract every field of the vehicle disposal agreement.

        Returns:
            dict: ``self.init_result_1`` (updated in place).  The dealer seal
            ('签字页-销售经销商签章') is not extracted and keeps its default.
        """
        if len(self.pdf_info) > 0:
            result = self.init_result_1
            result['合同编号'] = self.get_contract_no(page_num='0')
            result['承租人-姓名'] = self.get_key_value(key='承租人:', page_num='0')
            result['承租人-证件号码'] = self.get_key_value(key='证件号码:', page_num='0')
            result['销售经销商'] = self.get_key_value(key='销售经销商:', page_num='0')
            result['合同编号(正文)'] = self.get_contract_no_one()
            result['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称:')
            result['签字页-承租人证件号码'] = self.get_key_value(key='自然人身份证件号码/法人执照号码:')
            result['签字页-承租人签章'] = self.get_signature_role_1()
            result['签字页-销售经销商'] = self.get_key_value(key='销售经销商:')
        return self.init_result_1

    def get_info_2(self):
        """Extract every field of the vehicle lease mortgage contract.

        Returns:
            dict: ``self.init_result_2`` (updated in place).
        """
        if len(self.pdf_info) > 0:
            result = self.init_result_2
            result['合同编号'] = self.get_contract_no_dy()
            result['合同编号(正文)'] = self.get_contract_no_one()

            name, _id = self.get_dyr_name_id()
            result['抵押人姓名/名称'] = name
            result['抵押人证件号码'] = _id

            name, _id = self.get_dyrpo_name_id()
            result['抵押人配偶姓名/名称'] = name
            result['抵押人配偶证件号码'] = _id

            result['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
            result['租金总额'] = self.get_key_value_position(key='租金总额')
            result['融资租赁期限'] = self.get_key_value_position(key='融资租赁期限')

            # Signature page: mortgagor.
            result['签字页-抵押人姓名'] = self.get_key_value(key='抵押人姓名:')
            result['签字页-抵押人签章'] = self.get_electronic_signature(
                top='抵押权人盖章', bottom='抵押人配偶姓名:')

            # Signature page: mortgagor's spouse.
            result['签字页-抵押人配偶姓名'] = self.get_key_value(key='抵押人配偶姓名:')
            result['签字页-抵押人配偶签章'] = self.get_electronic_signature(
                top='抵押人配偶姓名:', bottom='日期')
        return self.init_result_2