# pdf_to_img.py 8.08 KB
import os
import fitz
import signal
from PIL import Image
from io import BytesIO

from django.core.management import BaseCommand
from common.mixins import LoggerMixin


class Command(BaseCommand, LoggerMixin):

    def __init__(self):
        """Initialise the log prefix, the page-render matrix and the SIGTERM handler."""
        super().__init__()
        self.log_base = '[pdf to img]'
        # File-processing switch; signal_handler sets it to False.
        self.switch = True
        # Zoom factors applied when a PDF page is rendered to an image
        # (factor 2 in each dimension).
        self.zoom_x = self.zoom_y = 2.0
        scale = fitz.Matrix(self.zoom_x, self.zoom_y)
        self.trans = scale.preRotate(0)
        # Graceful-exit signal: SIGTERM (15).
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        self.switch = False  # 停止处理文件

    @staticmethod
    def getimage(pix):
        if pix.colorspace.n != 4:
            return pix
        tpix = fitz.Pixmap(fitz.csRGB, pix)
        return tpix

    def recoverpix(self, doc, item):
        x = item[0]  # xref of PDF image
        s = item[1]  # xref of its /SMask
        is_rgb = True if item[5] == 'DeviceRGB' else False

        # RGB
        if is_rgb:
            if s == 0:
                return doc.extractImage(x)
            # we need to reconstruct the alpha channel with the smask
            pix1 = fitz.Pixmap(doc, x)
            pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

            # sanity check
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                pix2 = None
                return self.getimage(pix1)

            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
            pix1 = pix2 = None  # free temp pixmaps
            return self.getimage(pix)

        # GRAY/CMYK
        pix1 = fitz.Pixmap(doc, x)
        pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added

        if s != 0:
            pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

            # sanity check
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                pix2 = None
                return self.getimage(pix1)

            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value

        pix1 = pix2 = None  # free temp pixmaps

        pix = fitz.Pixmap(fitz.csRGB, pix)  # GRAY/CMYK to RGB
        return self.getimage(pix)

    @staticmethod
    def get_img_data(pix):
        if type(pix) is dict:  # we got a raw image
            ext = pix["ext"]
            img_data = pix["image"]
        else:  # we got a pixmap
            ext = 'png'
            img_data = pix.getPNGData()
        return ext, img_data

    @staticmethod
    def split_il(il):
        img_il_list = []
        start = 0
        length = len(il)
        for i in range(length):
            if i == start:
                if i == length - 1:
                    img_il_list.append(il[start: length])
                continue
            elif i == length - 1:
                img_il_list.append(il[start: length])
                continue
            if il[i][2] != il[i - 1][2]:
                img_il_list.append(il[start: i])
                start = i
            elif il[i][3] != il[i - 1][3]:
                img_il_list.append(il[start: i + 1])
                start = i + 1
        return img_il_list

    def handle(self, *args, **kwargs):
        """Convert every PDF file directly under the source directory to images.

        For each file in ``pdf_dir`` a directory with the same name is created
        under ``img_dir`` and filled with the images of that PDF. Entries that
        are not regular files, the names listed in ``skipped_names`` and PDFs
        whose output directory already exists are skipped.

        ``self.switch`` is checked before each PDF: once the signal handler has
        cleared it, no further PDF is started. The check is done between PDFs
        (not between pages) because an existing output directory makes later
        runs skip that PDF, so a PDF that was started is always finished.
        """
        pdf_dir = '/Users/clay/Desktop/普通打印-部分无线/竖版-无表格-农业银行'
        img_dir = '/Users/clay/Desktop/普通打印-部分无线_img/竖版-无表格-农业银行'
        skipped_names = ('.DS_Store', '1竖版-无表格-农业银行样例.PNG')
        os.makedirs(img_dir, exist_ok=True)
        for d in os.listdir(pdf_dir):
            if not self.switch:
                self.cronjob_log.info('{0} [stop signal received, exiting]'.format(self.log_base))
                break
            if d in skipped_names:
                continue
            pdf_path = os.path.join(pdf_dir, d)
            if not os.path.isfile(pdf_path):
                continue
            img_save_path = os.path.join(img_dir, d)
            if os.path.exists(img_save_path):
                continue
            os.makedirs(img_save_path, exist_ok=True)
            self._convert_pdf(pdf_path, img_save_path)

    def _convert_pdf(self, pdf_path, img_save_path):
        """Write the images of every page of one PDF into ``img_save_path``."""
        with fitz.Document(pdf_path) as pdf:
            self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format(
                self.log_base, pdf_path, pdf.metadata))
            for pno in range(pdf.pageCount):
                il = pdf.getPageImageList(pno)
                il.sort(key=lambda x: x[0])
                img_il_list = self.split_il(il)
                del il

                print(img_il_list)
                if len(img_il_list) > 3:
                    # Too many irregular small images on one page:
                    # render the whole page to a single image instead.
                    self._save_page_as_png(pdf, pno, img_save_path)
                    continue

                # Otherwise extract the embedded images group by group.
                for img_index, img_il in enumerate(img_il_list):
                    if len(img_il) == 1:
                        # A single image needs no stitching.
                        self._save_single_image(pdf, img_il[0], pno, img_index, img_save_path)
                    else:
                        # Several images: stitch them vertically.
                        self._save_stitched_images(pdf, img_il, pno, img_index, img_save_path)

    def _save_page_as_png(self, pdf, pno, img_save_path):
        """Render page ``pno`` with ``self.trans`` and save it as one PNG file."""
        page = pdf.loadPage(pno)
        pm = page.getPixmap(matrix=self.trans, alpha=False)
        save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
        pm.writePNG(save_path)

    def _save_single_image(self, pdf, img, pno, img_index, img_save_path):
        """Recover one embedded image and write its bytes to a file."""
        pix = self.recoverpix(pdf, img)
        ext, img_data = self.get_img_data(pix)
        save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
            pno, img_index, ext))
        with open(save_path, "wb") as f:
            f.write(img_data)

    def _save_stitched_images(self, pdf, img_il, pno, img_index, img_save_path):
        """Recover a group of images and save them stacked top to bottom.

        The canvas width is ``img_il[0][2]`` and its height the sum of every
        entry's ``[3]``; the mode and file extension of the first image are
        used for the result.
        """
        width = img_il[0][2]
        height_sum = 0
        im_list = []
        for img in img_il:
            height = img[3]
            pix = self.recoverpix(pdf, img)
            ext, img_data = self.get_img_data(pix)

            im = Image.open(BytesIO(img_data))
            im_list.append((height, im, ext))
            height_sum += height

        save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
            pno, img_index, im_list[0][2]))
        res = Image.new(im_list[0][1].mode, (width, height_sum))
        h_now = 0
        for h, m, _ in im_list:
            res.paste(m, box=(0, h_now))
            h_now += h
        res.save(save_path)