"""Render PDF pages to images and extract embedded images with PyMuPDF.

NOTE(review): this module uses the legacy camelCase PyMuPDF names
(``getPixmap``, ``pageCount``, ``getPageImageList`` ...). Confirm they are
still available in the PyMuPDF version pinned by the project.
"""
import os
from io import BytesIO

import fitz
from PIL import Image


class PdfHandler:
    """Helper around a single PDF file identified by its path."""

    def __init__(self, pdf_path):
        """Remember the PDF path and derive the base name used for outputs.

        Args:
            pdf_path: Path of the PDF file to work on.
        """
        self.pdf_path = pdf_path
        # File name without directory and extension; used as output prefix.
        self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
        """Render every page to ``<pdf_name>_<page number>.png``.

        Args:
            save_dir_path: Existing directory that receives the images.
            zoom_x: Horizontal zoom factor of the render matrix.
            zoom_y: Vertical zoom factor of the render matrix.
        """
        trans = fitz.Matrix(zoom_x, zoom_y).preRotate(0)
        with fitz.Document(self.pdf_path) as pdf:
            for page in pdf:
                pm = page.getPixmap(matrix=trans, alpha=False)
                save_path = os.path.join(
                    save_dir_path,
                    '{0}_{1}.png'.format(self.pdf_name, page.number))
                pm.writeImage(save_path)

    def page_to_svg_img(self, save_dir_path):
        """Export every page to ``<pdf_name>_<page number>.svg``.

        Args:
            save_dir_path: Existing directory that receives the SVG files.
        """
        with fitz.Document(self.pdf_path) as pdf:
            for page in pdf:
                svg = page.getSVGimage(matrix=fitz.Identity)
                save_path = os.path.join(
                    save_dir_path,
                    '{0}_{1}.svg'.format(self.pdf_name, page.number))
                # The SVG is a UTF-8 string; write it with an explicit
                # encoding so the result does not depend on the platform
                # default.
                with open(save_path, 'w', encoding='utf-8') as f:
                    f.write(svg)

    @staticmethod
    def getimage(pix):
        """Return ``pix`` unchanged unless its colorspace has 4 components.

        Pixmaps whose colorspace has four components are converted to RGB.
        """
        if pix.colorspace.n != 4:
            return pix
        return fitz.Pixmap(fitz.csRGB, pix)

    @staticmethod
    def _smask_matches(pix1, pix2):
        """Tell whether ``pix2`` can be used as the alpha channel of ``pix1``.

        Both must cover the same rectangle, neither may already carry alpha,
        and the mask must have exactly one component per pixel.
        """
        return (pix1.irect == pix2.irect
                and pix1.alpha == pix2.alpha == 0
                and pix2.n == 1)

    def recoverpix(self, doc, item):
        """Rebuild one embedded image, merging its /SMask when present.

        Args:
            doc: The open ``fitz.Document``.
            item: One entry of ``doc.getPageImageList(...)``. Index 0 is the
                image xref, index 1 the xref of its /SMask (0 if none) and
                index 5 the colorspace name.

        Returns:
            The dict produced by ``doc.extractImage`` for a 'DeviceRGB' image
            without /SMask, otherwise a ``fitz.Pixmap``.
        """
        xref = item[0]   # xref of the PDF image
        smask = item[1]  # xref of its /SMask
        is_rgb = item[5] == 'DeviceRGB'

        if is_rgb:
            if smask == 0:
                # No mask to merge: hand back the raw embedded image.
                return doc.extractImage(xref)

            # Reconstruct the alpha channel from the /SMask.
            pix1 = fitz.Pixmap(doc, xref)
            pix2 = fitz.Pixmap(doc, smask)
            if not self._smask_matches(pix1, pix2):
                pix2 = None
                return self.getimage(pix1)

            pix = fitz.Pixmap(pix1)
            pix.setAlpha(pix2.samples)  # mask samples become alpha values
            pix1 = pix2 = None  # free temporary pixmaps
            return self.getimage(pix)

        # Any other colorspace (GRAY/CMYK per the original comment).
        pix1 = fitz.Pixmap(doc, xref)
        pix = fitz.Pixmap(pix1)
        if smask != 0:
            pix2 = fitz.Pixmap(doc, smask)
            if not self._smask_matches(pix1, pix2):
                pix2 = None
                return self.getimage(pix1)
            pix.setAlpha(pix2.samples)  # mask samples become alpha values
            pix1 = pix2 = None  # free temporary pixmaps
        pix = fitz.Pixmap(fitz.csRGB, pix)  # convert to RGB
        return self.getimage(pix)

    def extract_images(self, save_dir_path):
        """Write every distinct embedded image to ``save_dir_path``.

        Raw images are stored as ``img-<xref>.<ext>``, reconstructed pixmaps
        as ``img-<xref>.png``. An xref that appears on several pages is
        written only once. No size-based filtering is applied.

        Args:
            save_dir_path: Existing directory that receives the images.
        """
        seen_xrefs = set()  # set: O(1) membership test per image
        with fitz.Document(self.pdf_path) as pdf:
            for pno in range(pdf.pageCount):
                for img in pdf.getPageImageList(pno):
                    print(img)
                    xref = img[0]
                    if xref in seen_xrefs:
                        continue
                    width = img[2]
                    height = img[3]
                    print(xref, width, height)

                    pix = self.recoverpix(pdf, img)
                    if isinstance(pix, dict):  # raw image
                        imgdata = pix["image"]
                        imgfile = os.path.join(
                            save_dir_path,
                            "img-%i.%s" % (xref, pix["ext"]))
                    else:  # pixmap
                        imgdata = pix.getPNGData()
                        imgfile = os.path.join(
                            save_dir_path, "img-%i.png" % xref)

                    # Context manager closes the file even if write() fails.
                    with open(imgfile, "wb") as fout:
                        fout.write(imgdata)
                    seen_xrefs.add(xref)

    @staticmethod
    def split_il(il):
        """Split a page image list into runs that form one stitched image.

        Entries are tuples laid out as ``(xref, smask, width, height, ...)``.
        Walking the list in order:

        * an entry whose width differs from the previous entry starts a new
          group;
        * an entry whose height differs from the previous entry of the same
          group is kept as the final member of that group.

        Unlike the previous implementation, the last entry of the list is
        also subject to the width rule, so a trailing image of a different
        width is no longer merged into the preceding group.

        Args:
            il: Image list of one page, already ordered as desired.

        Returns:
            A list of groups, each a non-empty list of entries from ``il``.
        """
        groups = []
        current = []
        for img in il:
            if current and img[2] != current[-1][2]:
                # Width changed: close the running group, start a new one.
                groups.append(current)
                current = [img]
                continue
            current.append(img)
            if len(current) > 1 and img[3] != current[-2][3]:
                # Height changed: this entry ends the group.
                groups.append(current)
                current = []
        if current:
            groups.append(current)
        return groups

    def extract_images_pro(self, save_dir_path):
        """Extract images page by page, stitching grouped strips vertically.

        Each page's image list is sorted by xref and grouped with
        ``split_il``. A group with one image is saved as is; a larger group
        is pasted top to bottom onto one canvas. Files are named
        ``page_<page>_img_<group index>.<ext>`` where ``<ext>`` comes from
        the first image of the group.

        Args:
            save_dir_path: Existing directory that receives the images.
        """
        with fitz.Document(self.pdf_path) as pdf:
            print('----------------------------')
            print(self.pdf_name)
            print(pdf.metadata)
            # TODO: de-duplicate images across pages.
            for pno in range(pdf.pageCount):
                print('========================')
                il = pdf.getPageImageList(pno)
                # Entry layout: (xref, smask, width, height, bpc, colorspace,
                # alt.colorspace, name, filter, invoker)
                il.sort(key=lambda entry: entry[0])
                img_il_list = self.split_il(il)
                il = None
                print(img_il_list)
                print(len(img_il_list))
                # TODO: when a page holds too many images, render the whole
                # page to an image instead.
                for img_count, img_il in enumerate(img_il_list):
                    print(img_il)
                    height_sum = 0
                    im_list = []
                    for img in img_il:
                        width = img[2]
                        height = img[3]
                        pix = self.recoverpix(pdf, img)
                        if isinstance(pix, dict):  # raw image
                            ext = pix["ext"]
                            img_data = pix["image"]
                        else:  # pixmap
                            ext = 'png'
                            img_data = pix.getPNGData()
                        im = Image.open(BytesIO(img_data))
                        im_list.append((width, height, im, ext))
                        height_sum += height
                    print(im_list)

                    save_path = os.path.join(
                        save_dir_path,
                        'page_{0}_img_{1}.{2}'.format(
                            pno, img_count, im_list[0][3]))
                    if len(im_list) == 1:
                        # Single image: save it directly.
                        im_list[0][2].save(save_path)
                    else:
                        # Several images: stack them vertically on a canvas
                        # sized from the first image's mode and width.
                        res = Image.new(im_list[0][2].mode,
                                        (im_list[0][0], height_sum))
                        h_now = 0
                        for _, h, m, _ in im_list:
                            res.paste(m, box=(0, h_now))
                            h_now += h
                        res.save(save_path)


if __name__ == '__main__':
    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
    pdf_list = os.listdir(dir_path)
    for path in pdf_list:
        if path == '.DS_Store':
            continue
        pdf_handler = PdfHandler(os.path.join(dir_path, path))
        save_path = os.path.join('/Users/clay/Desktop/biz/pdf_test/', 'test',
                                 os.path.splitext(os.path.basename(path))[0])
        # makedirs + exist_ok: works on re-runs and when 'test' is missing.
        os.makedirs(save_path, exist_ok=True)
        pdf_handler.extract_images_pro(save_path)

    # Alternative invocations kept for reference:
    # pdf_handler = PdfHandler('/path/to/some.pdf')
    # pdf_handler.page_to_pix_img('/Users/clay/Desktop/biz/pdf_test/', 3.0, 3.0)
    # pdf_handler.page_to_svg_img('/Users/clay/Desktop/biz/pdf_test/')
    # pdf_handler.extract_images_pro('/Users/clay/Desktop/biz/pdf_test/test')

    # Converting a pixmap read from an image file into a PIL image:
    # pix = fitz.Pixmap(sys.argv[1])  # read image file
    # rgb = "RGB"  # set PIL parameter
    # if pix.alpha:  # JPEG cannot have alpha!
    #     pix0 = fitz.Pixmap(pix, 0)  # drop alpha channel
    #     pix = pix0  # rename pixmap
    #
    # img = Image.frombuffer(rgb, [pix.width, pix.height], pix.samples, "raw", rgb, 0, 1)
    # img.save(outputFileName)