# tools.py (1.03 KB)
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Split per-page block data into text-span entries and image markers.

    Every page in ``pdf_info`` is processed, and the input structure is never
    modified.

    Args:
        pdf_info: Mapping of page key to a page dict holding a ``'blocks'``
            list. Blocks with ``'type' == 0`` are read as text blocks
            (``'lines'`` -> ``'spans'``, each span providing ``'bbox'``,
            ``'text'`` and ``'size'``); blocks with ``'type' == 1`` are read
            as image blocks providing ``'bbox'``. Any other block type is
            ignored.
            NOTE(review): this looks like PyMuPDF's ``get_text("dict")``
            layout keyed by page number -- confirm against the caller.
        fix_bbox: When true, a span whose bbox height
            (``bbox[-1] - bbox[1]``) is smaller than its ``'size'`` gets its
            last coordinate increased by ``'size'``.

    Returns:
        A ``(pdf_text_info, pdf_img_info)`` tuple of dicts keyed by page key.
        ``pdf_text_info`` values are lists of ``[bbox, text]`` with the text
        stripped; empty texts and texts already seen in the same block are
        skipped. ``pdf_img_info`` values are lists of ``(bbox, '有')``.
        Pages that contribute no entries have no key in the respective dict.
    """
    pdf_text_info = {}
    pdf_img_info = {}
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            if block['type'] == 0:
                # The same text can show up more than once inside a block;
                # only its first occurrence is kept.
                seen_texts = set()
                for line in block['lines']:
                    for span in line['spans']:
                        bbox, text = span['bbox'], span['text'].strip()
                        if not text or text in seen_texts:
                            continue
                        seen_texts.add(text)
                        # The reported bbox height can be inaccurate (smaller
                        # than the font size); extend the last coordinate.
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            # Build a new list rather than assigning into the
                            # span's bbox: keeps the caller's data untouched
                            # and also works when the bbox is a tuple.
                            bbox = [*bbox[:-1], bbox[-1] + span['size']]
                        pdf_text_info.setdefault(pno_str, []).append([bbox, text])
            elif block['type'] == 1:
                pdf_img_info.setdefault(pno_str, []).append((block['bbox'], '有'))

    # Return only after all pages have been visited.
    return pdf_text_info, pdf_img_info