# pdf_to_img.py 11.8 KB
import os
import fitz
import signal
from PIL import Image
from io import BytesIO

from django.core.management import BaseCommand
from common.mixins import LoggerMixin


class Command(BaseCommand, LoggerMixin):

    def __init__(self, *args, **kwargs):
        """Set up logging prefix, run flag, render matrix and SIGTERM handling.

        Any positional/keyword arguments are forwarded unchanged to the parent
        constructor, so the command can still be created with no arguments
        while also accepting the options the base command class supports.
        """
        super().__init__(*args, **kwargs)
        self.log_base = '[pdf to img]'
        # File-processing switch; set to False by ``signal_handler``.
        self.switch = True
        # Matrix used when rendering a whole PDF page to an image.
        self.zoom_x = 2.0
        self.zoom_y = 2.0
        self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0)  # zoom factor 2 in each dimension
        # Graceful-exit signal (SIGTERM, 15): route it to ``signal_handler``
        # instead of the default action.
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        """Signal callback: clear the processing switch.

        Both arguments supplied by the signal machinery (``sig`` and
        ``frame``) are ignored; the only effect is setting ``self.switch``
        to ``False``.
        """
        self.switch = False  # stop processing files

    @staticmethod
    def getimage(pix):
        """Return ``pix`` unchanged unless its colorspace has four components.

        A pixmap whose ``colorspace.n`` equals 4 is converted into a new
        pixmap in ``fitz.csRGB``; every other pixmap is handed back as is.
        """
        four_component = pix.colorspace.n == 4
        return fitz.Pixmap(fitz.csRGB, pix) if four_component else pix

    def recoverpix(self, doc, item):
        """Return the image described by ``item`` as raw data or as a pixmap.

        ``item`` is one entry of a page image list; this method reads
        ``item[0]`` (xref of the image), ``item[1]`` (xref of its /SMask,
        0 when there is none) and ``item[5]`` (colorspace name).

        * 'DeviceRGB' image without a mask: the result of
          ``doc.extractImage(xref)`` is returned directly.
        * 'DeviceRGB' image with a mask: the mask samples are applied as the
          alpha channel of a copy of the image pixmap.
        * Any other colorspace: the image pixmap is copied, the mask (if any)
          is applied as alpha, and the copy is converted to ``fitz.csRGB``.

        Whenever the mask fails the sanity check, the unmasked image pixmap is
        returned instead. All pixmap results go through ``self.getimage``.
        """
        xref, smask_xref = item[0], item[1]
        has_smask = smask_xref != 0

        def mask_fits(base, mask):
            # Sanity check: same rectangle, neither pixmap has alpha, and the
            # mask has exactly one component.
            return base.irect == mask.irect and base.alpha == mask.alpha == 0 and mask.n == 1

        if item[5] == 'DeviceRGB':
            if not has_smask:
                return doc.extractImage(xref)

            # The alpha channel has to be rebuilt from the /SMask entry.
            base = fitz.Pixmap(doc, xref)
            mask = fitz.Pixmap(doc, smask_xref)
            if not mask_fits(base, mask):
                mask = None
                return self.getimage(base)

            # Copy of the base pixmap (per the original author's note, this
            # copy carries an added alpha channel).
            with_alpha = fitz.Pixmap(base)
            with_alpha.setAlpha(mask.samples)  # mask samples become the alpha values
            base = mask = None  # release the temporary pixmaps
            return self.getimage(with_alpha)

        # Remaining colorspaces (the original author labels these GRAY/CMYK).
        base = fitz.Pixmap(doc, xref)
        converted = fitz.Pixmap(base)
        mask = None

        if has_smask:
            mask = fitz.Pixmap(doc, smask_xref)
            if not mask_fits(base, mask):
                mask = None
                return self.getimage(base)
            converted.setAlpha(mask.samples)  # mask samples become the alpha values

        base = mask = None  # release the temporary pixmaps before converting

        converted = fitz.Pixmap(fitz.csRGB, converted)  # convert to RGB
        return self.getimage(converted)

    @staticmethod
    def get_img_data(pix):
        if type(pix) is dict:  # we got a raw image
            ext = pix["ext"]
            img_data = pix["image"]
        else:  # we got a pixmap
            ext = 'png'
            img_data = pix.getPNGData()
        return ext, img_data

    @staticmethod
    def split_il(il):
        small_img_il_list = []
        big_img_il_list = []
        start = 0
        index = 0
        length = len(il)
        for i in range(length):
            if il[i][2] >= 700 and il[i][3] >= 647:
                if start < i:
                    small_img_il_list.append((il[start: i], index))
                    index += 1
                else:
                    start += 1
                big_img_il_list.append((il[i], index))
                index += 1
                continue
            if i == start:
                if i == length - 1:
                    small_img_il_list.append((il[start: length], index))
                continue
            elif i == length - 1:
                if il[i][2] == il[i - 1][2]:
                    small_img_il_list.append((il[start: length], index))
                else:
                    small_img_il_list.append((il[start: i], index))
                    small_img_il_list.append((il[i: length], index+1))
                continue
            if il[i][2] != il[i - 1][2]:
                small_img_il_list.append((il[start: i], index))
                index += 1
                start = i
            elif il[i][3] != il[i - 1][3] and il[i][2] < 1200:
                small_img_il_list.append((il[start: i + 1], index))
                index += 1
                start = i + 1
        return small_img_il_list, big_img_il_list

    def handle(self, *args, **kwargs):
        """Convert PDFs from the hard-coded directory into image files.

        For every selected PDF a sibling directory named after the file
        (without extension) is created, and each page is either rendered as a
        whole or has its embedded images extracted (see ``_process_page``).

        Processing stops before the next file or page once ``self.switch``
        has been cleared by the SIGTERM handler.
        """
        pdf_dir = '/Users/clay/Desktop/问题PDF'
        img_dir = '/Users/clay/Desktop/问题PDF'
        for d in os.listdir(pdf_dir):
            # Graceful exit: the signal handler only flips the switch, so it
            # has to be polled here for the run to actually stop.
            if not self.switch:
                break
            # NOTE(review): only this single file is processed — looks like a
            # debugging filter; confirm before running on a full directory.
            if d != 'CH-B008487944.pdf':
                continue
            pdf_path = os.path.join(pdf_dir, d)
            if not os.path.isfile(pdf_path):
                continue
            # splitext instead of slicing off four characters, so the output
            # directory name is right for any extension length.
            img_save_path = os.path.join(img_dir, os.path.splitext(d)[0])
            os.makedirs(img_save_path, exist_ok=True)
            with fitz.Document(pdf_path) as pdf:
                self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format(
                    self.log_base, pdf_path, pdf.metadata))
                # xrefs already written out for this document, so the same
                # embedded image is not extracted again on a later page.
                xref_set = set()
                for pno in range(pdf.pageCount):
                    if not self.switch:
                        break
                    self._process_page(pdf, pno, img_save_path, xref_set)

    def _render_page(self, pdf, pno, img_save_path):
        """Render page ``pno`` with ``self.trans`` and save it as ``page_<n>_img_0.png``."""
        page = pdf.loadPage(pno)
        pm = page.getPixmap(matrix=self.trans, alpha=False)
        save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
        pm.writePNG(save_path)

    def _save_image(self, pdf, img, img_save_path, pno, img_index):
        """Extract one embedded image and write it as ``page_<pno>_img_<img_index>.<ext>``."""
        pix = self.recoverpix(pdf, img)
        ext, img_data = self.get_img_data(pix)
        save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))
        with open(save_path, "wb") as f:
            f.write(img_data)

    def _stitch_images(self, pdf, img_il, img_save_path, pno, img_index):
        """Extract the images in ``img_il`` and save them stacked vertically as one file."""
        im_list = []
        for img in img_il:
            pix = self.recoverpix(pdf, img)
            ext, img_data = self.get_img_data(pix)
            # img[3] is the height recorded in the page image list entry.
            im_list.append((img[3], Image.open(BytesIO(img_data)), ext))

        # Output extension and mode follow the first image; the canvas is as
        # wide as the first entry and as tall as all entries together.
        save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
            pno, img_index, im_list[0][2]))
        height_sum = sum(h for h, _, _ in im_list)
        res = Image.new(im_list[0][1].mode, (img_il[0][2], height_sum))
        h_now = 0
        for h, m, _ in im_list:
            res.paste(m, box=(0, h_now))
            h_now += h
        res.save(save_path)

    def _process_page(self, pdf, pno, img_save_path, xref_set):
        """Write the image file(s) for a single page.

        * No embedded image: the page is rendered.
        * One embedded image: skipped when its colorspace or last tuple
          element is empty; rendered as a page when smaller than 500x500;
          otherwise extracted unless its xref was already handled.
        * Several embedded images: sorted by xref and grouped by ``split_il``;
          more than two small groups means the page is rendered, otherwise
          big images are extracted and small groups are rendered, extracted
          or stitched.

        ``xref_set`` is updated in place with the xrefs that were extracted
        individually.
        """
        print('---------------------------------------')
        il = pdf.getPageImageList(pno)
        # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
        print(il)

        if not il:
            self._render_page(pdf, pno, img_save_path)
            return

        if len(il) == 1:
            img = il[0]
            width, height, colorspace, adobe_filter = img[2], img[3], img[5], img[-1]
            if colorspace == '' or adobe_filter == '':
                return
            if width < 500 and height < 500:  # small image
                self._render_page(pdf, pno, img_save_path)
            elif img[0] not in xref_set:  # large image
                self._save_image(pdf, img, img_save_path, pno, 0)
                xref_set.add(img[0])
            return

        il.sort(key=lambda x: x[0])
        small_img_il_list, big_img_il_list = self.split_il(il)
        print(small_img_il_list)
        print(big_img_il_list)
        print('+++++++++++++++++++++++++++++++++++')

        # Too many irregular small-image groups on one page: render the page instead.
        if len(small_img_il_list) > 2:
            self._render_page(pdf, pno, img_save_path)
            return

        # Otherwise extract the images.
        for img, img_index in big_img_il_list:
            if img[0] in xref_set:
                continue
            self._save_image(pdf, img, img_save_path, pno, img_index)
            xref_set.add(img[0])

        for img_il, img_index in small_img_il_list:
            single = len(img_il) == 1
            if single and img_il[0][2] < 500 and img_il[0][3] < 500:  # small image
                self._render_page(pdf, pno, img_save_path)
            elif single and img_il[0][0] not in xref_set:  # exactly one image: simple path
                self._save_image(pdf, img_il[0], img_save_path, pno, img_index)
                xref_set.add(img_il[0][0])
            else:
                # Several images: stitch vertically.
                # NOTE(review): a single image whose xref was already
                # extracted also ends up here and is written again — confirm
                # whether it should be skipped instead.
                self._stitch_images(pdf, img_il, img_save_path, pno, img_index)