# tools.py (1.17 KB)
def pdf_info_rebuild(pdf_info, fix_bbox=True, file_type=0):
    """Split per-page PDF layout data into text entries and image markers.

    Args:
        pdf_info: Mapping of page key to a page dict. Each page dict holds a
            ``'blocks'`` list. A block whose ``'type'`` is 0 is read as text
            (``'lines'`` -> ``'spans'``, each span providing ``'bbox'``,
            ``'text'`` and ``'size'``); a block whose ``'type'`` is 1 is read
            as an image and only its ``'bbox'`` is used. Blocks of any other
            type are ignored. (Presumably this is PyMuPDF's "dict" text
            layout serialised per page -- confirm against the producer.)
        fix_bbox: When true, a span whose box height (``bbox[-1] - bbox[1]``)
            is smaller than the span's ``'size'`` gets its last coordinate
            increased by that size.
        file_type: When equal to 3, repeated texts inside a block are kept;
            for any other value only the first occurrence of a stripped text
            within a block is kept.

    Returns:
        A ``(pdf_text_info, pdf_img_info)`` tuple. ``pdf_text_info`` maps a
        page key to a list of ``[bbox, text]`` pairs; ``pdf_img_info`` maps a
        page key to a list of ``(bbox, '有')`` tuples. Pages that contribute
        no entry are absent from the respective dict.

    The input is never modified: a height-corrected bbox is returned as a new
    list, while an uncorrected bbox is the same object found in ``pdf_info``.
    """
    pdf_text_info = {}
    pdf_img_info = {}
    # Car-sales-contract supplementary agreements (file_type 3) legitimately
    # repeat the same total price, so duplicate filtering is disabled there.
    dedupe = file_type != 3

    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            block_type = block['type']
            if block_type == 1:
                pdf_img_info.setdefault(pno_str, []).append((block['bbox'], '有'))
                continue
            if block_type != 0:
                continue

            # The same text can show up more than once inside one block;
            # duplicates are tracked per block, not per page.
            seen_texts = set()
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text'].strip()
                    if not text or text in seen_texts:
                        continue
                    if dedupe:
                        seen_texts.add(text)
                    # The reported bbox height is unreliable: when it is
                    # smaller than the font size, extend the box by that size.
                    if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                        # Work on a copy so the caller's data stays intact
                        # (and so tuple bboxes do not raise TypeError).
                        adjusted = list(bbox)
                        adjusted[-1] = adjusted[-1] + span['size']
                        bbox = adjusted
                    pdf_text_info.setdefault(pno_str, []).append([bbox, text])
    return pdf_text_info, pdf_img_info