def pdf_info_rebuild(pdf_info, fix_bbox=True, file_type=0):
    """Regroup per-page block data into text entries and image entries.

    ``pdf_info`` maps a page key to a dict holding a ``'blocks'`` list.
    Blocks with ``type == 0`` are read as text blocks (``'lines'`` ->
    ``'spans'``, each span providing ``'bbox'``, ``'text'`` and ``'size'``);
    blocks with ``type == 1`` are read as image blocks providing ``'bbox'``.
    Blocks of any other type are ignored.
    NOTE(review): this layout looks like PyMuPDF's ``get_text("dict")``
    output -- confirm against the caller.

    Args:
        pdf_info: Mapping of page key -> page dict as described above.
        fix_bbox: When true, a span whose bbox height (``bbox[-1] - bbox[1]``)
            is smaller than the span's ``'size'`` gets ``'size'`` added to
            its bottom coordinate.
        file_type: When equal to 3, duplicate texts inside a block are kept;
            for any other value, repeated texts within one block are dropped.

    Returns:
        A ``(pdf_text_info, pdf_img_info)`` tuple. ``pdf_text_info`` maps a
        page key to a list of ``[bbox, text]`` entries, where ``bbox`` is a
        fresh list and ``text`` is the stripped span text. ``pdf_img_info``
        maps a page key to a list of ``(bbox, '有')`` tuples. Pages that
        contribute no entry are absent from the respective dict.
    """
    pdf_text_info = {}
    pdf_img_info = {}
    # Per the original note, de-duplication is switched off for file type 3
    # (car-sales contract supplementary agreements), because identical
    # total-price strings would otherwise be filtered out.
    dedupe = file_type != 3

    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            block_type = block['type']

            if block_type == 1:
                pdf_img_info.setdefault(pno_str, []).append((block['bbox'], '有'))
                continue
            if block_type != 0:
                continue

            # Text may be repeated inside a block; the "seen" set is scoped
            # to a single block, so repeats across blocks are kept.
            seen_texts = set()
            for line in block['lines']:
                for span in line['spans']:
                    text = span['text'].strip()
                    if not text or text in seen_texts:
                        continue
                    if dedupe:
                        seen_texts.add(text)

                    # Copy the bbox so the height correction below never
                    # writes into the caller's data (and so tuple bboxes,
                    # which do not support item assignment, are accepted).
                    bbox = list(span['bbox'])
                    # The reported bbox height is unreliable: if it is
                    # shorter than the font size, extend the bottom edge.
                    if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                        bbox[-1] = bbox[-1] + span['size']

                    pdf_text_info.setdefault(pno_str, []).append([bbox, text])

    return pdf_text_info, pdf_img_info