prune extract mode

周伟奇
Showing 3 changed files with 8 additions and 209 deletions
README.md
pdf_to_img.py
requirements.txt
--- a/README.md
View file @94794bd
+++ b/README.md
View file @94794bd
 # PDF转图片脚本

-## 2种转化方式
+## 转化方式
 - 保存整个页面为png图片
- 提取PDF页面中的图片对象
-  - 图片对象数目为0(如电子账单)，保存整个页面为png图片
-  - 图片对象数目为1
-      - 大图，保存图片对象
-      - 小图(如电子账单盖章)，保存整个页面为png图片
-  - 图片对象数目大于1
-      - 多整图，保存图片对象
-      - 多碎图，根据宽高突变位置分组，拼接合并后保存
-  - 其他特殊情况：保存整个页面为png图片
-  
-## 已知问题
- 提取图片对象方式下，整图与碎图通过宽高阈值区分，无法满足所有PDF。个别PDF中，整图很小时会被当做碎图合并，碎图很大时会被当做整图不合并
  
 ## 用法
 - python3.6+
 - `pip install -r requirements.txt`
- - `python pdf_to_img.py [-h] -i INPUT [-o OUTPUT] [-e]`
+ - `python pdf_to_img.py [-h] -i INPUT [-o OUTPUT]`
    ```
    可选参数:
      -h, --help                  查看帮助信息并退出
      -i INPUT, --input INPUT     PDF文件或目录路径，必要参数
      -o OUTPUT, --output OUTPUT  输出图片保存路径，非必要参数，缺省值为PDF文件路径
-      -e, --extract               默认采用整个页面保存png图片的方式，增加该选项选择提取图片方式转化图片
    ```
\ No newline at end of file
--- a/pdf_to_img.py
View file @94794bd
+++ b/pdf_to_img.py
View file @94794bd
@@ -2,8 +2,6 @@ import os
 import sys
 import fitz
 import argparse
-from PIL import Image
-from io import BytesIO

 if sys.version_info[0] < 3:
    raise Exception("This program requires at least python3.6")
@@ -11,7 +9,6 @@ if sys.version_info[0] < 3:
 parser = argparse.ArgumentParser(description='PDF转图片')
 parser.add_argument('-i', '--input', help='PDF文件或目录路径，必要参数', required=True)
 parser.add_argument('-o', '--output', help='输出图片保存路径，非必要参数，缺省值为PDF文件路径')
-parser.add_argument('-e', '--extract', help='默认采用整个页面保存png图片的方式，增加该选项选择提取图片方式转化图片', action="store_true")
 args = parser.parse_args()

 LOG_BASE = '[pdf to img]'
@@ -20,16 +17,6 @@ LOG_BASE = '[pdf to img]'
 ZOOM_X = ZOOM_Y = 2.0
 trans = fitz.Matrix(ZOOM_X, ZOOM_X).preRotate(0)  # zoom factor 2 in each dimension

-# 特殊filter处理
-ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
-
-# 宽高阈值组合
-WH_COUPLE_1 = (500, 500)
-WH_COUPLE_2 = (700, 647)
-WH_COUPLE_3 = (100, 100)
-WH_COUPLE_4 = (100, 300)
-WH_COUPLE_5 = (100, 200)
-

 class PDFHandler:

@@ -46,194 +33,20 @@ class PDFHandler:
        img_save_path = self.get_img_save_path(page.number)
        pm.writePNG(img_save_path)

-    @staticmethod
-    def getimage(pix):
-        # RGB
-        if pix.colorspace.n != 4:
-            return pix
-        # GRAY/CMYK
-        tpix = fitz.Pixmap(fitz.csRGB, pix)
-        return tpix
-
-    def recover_pix(self, doc, xref, smask, colorspace):
-        if smask != 0:
-            # we need to reconstruct the alpha channel with the smask
-            pix1 = fitz.Pixmap(doc, xref)
-            pix2 = fitz.Pixmap(doc, smask)  # create pixmap of the /SMask entry
-
-            # sanity check
-            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
-                pix2 = None
-                return self.getimage(pix1)
-
-            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
-            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
-            pix1 = pix2 = None  # free temp pixmaps
-            return self.getimage(pix)
-        elif colorspace in {'Separation', 'DeviceCMYK'}:
-            pix = fitz.Pixmap(doc, xref)
-            tpix = fitz.Pixmap(fitz.csRGB, pix)
-            return tpix
-        else:
-            return doc.extractImage(xref)
-
-    @staticmethod
-    def get_img_data(pix):
-        if type(pix) is dict:  # we got a raw image
-            ext = pix["ext"]
-            img_data = pix["image"]
-        else:  # we got a pixmap
-            ext = 'png'
-            img_data = pix.getPNGData()
-        return ext, img_data
-
-    def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
-        pix = self.recover_pix(pdf, xref, smask, colorspace)
-        ext, img_data = self.get_img_data(pix)
-        img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
-        with open(img_save_path, "wb") as f:
-            f.write(img_data)
-        self.xref_set.add(xref)
-
-    @staticmethod
-    def split_il(il):
-        broken_il = []
-        start = 0
-        length = len(il)
-        page_to_png = None
-        for i in range(length):
-            # 当图片对象含有特殊filter时，特殊处理：整个页面保存为png图片
-            if il[i][-1] in ADOBE_FILTER_SET:
-                page_to_png = True
-                break
-        else:
-            for i in range(length):
-                # 当图片对象够大时，不作碎图合并处理，而是单纯提取
-                if il[i][2] >= WH_COUPLE_2[0] and il[i][3] >= WH_COUPLE_2[1]:
-                    break
-                if i == start:
-                    if i == length - 1:
-                        broken_il.append(il[start: length])
-                    continue
-                elif i == length - 1:
-                    if il[i][2] == il[i - 1][2]:
-                        broken_il.append(il[start: length])
-                    else:
-                        broken_il.append(il[start: i])
-                        broken_il.append(il[i: length])
-                    continue
-                if il[i][2] != il[i - 1][2]:
-                    broken_il.append(il[start: i])
-                    start = i
-                elif il[i][3] != il[i - 1][3]:
-                    broken_il.append(il[start: i + 1])
-                    start = i + 1
-            else:
-                # 碎图分组结果
-                return broken_il
-        return page_to_png
-
-    def merge_il(self, pdf, pno, il):
-        # 尝试碎图合并前的分组
-        il.sort(key=lambda x: x[0])
-        broken_il = self.split_il(il)
-        print('broken_il: {0}'.format(broken_il))
-
-        page_to_png = True
-        # 3.1 当图片对象够大时，不作碎图合并处理，而是单纯提取
-        if broken_il is None:
-            page_to_png = False
-            for img_index, img in enumerate(il):
-                xref, smask, width, height, _, colorspace, _, _, adobe_filter = img
-                if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]:  # 过滤小图(如二维码)
-                    continue
-                elif xref not in self.xref_set:
-                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
-        # 3.2 碎图按照分组合并
-        elif isinstance(broken_il, list) and len(broken_il) <= 2:
-            for img_index, img_il in enumerate(broken_il):
-                # 3.2.1 仅一张碎图，过滤或直接提取
-                if len(img_il) == 1:
-                    xref, smask, width, height, _, colorspace, _, _, adobe_filter = img_il[0]
-                    # 过滤小图(如二维码)
-                    if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \
-                            (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]):
-                        continue
-                    elif xref not in self.xref_set:
-                        self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
-                        page_to_png = False
-                # 3.2.2 多张碎图，竖向拼接
-                else:
-                    height_sum = sum([img[3] for img in img_il])
-                    width = img_il[0][2]
-                    # 过滤小图和不常规大图
-                    if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \
-                            (width > 1000 and height_sum > width * 3):
-                        continue
-                    im_list = []
-                    for img in img_il:
-                        xref, smask, _, height, _, colorspace, _, _, adobe_filter = img
-                        pix = self.recover_pix(pdf, xref, smask, colorspace)
-                        ext, img_data = self.get_img_data(pix)
-                        im = Image.open(BytesIO(img_data))
-                        im_list.append((height, im, ext))
-                    new_img = Image.new(im_list[0][1].mode, (width, height_sum))
-                    h_now = 0
-                    for h, m, _ in im_list:
-                        new_img.paste(m, box=(0, h_now))
-                        h_now += h
-                    img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
-                    new_img.save(img_save_path)
-                    page_to_png = False
-
-        # 3.3 碎图分组大于2、全过滤、含特殊filter，特殊处理：整个页面保存为png图片
-        if page_to_png:
-            page = pdf.loadPage(pno)
-            self.page_to_png(page)
-
-    def extract_image(self, is_extract):
+    def extract_image(self):
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            print('++++++++++' * 5)
            print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata))
            for pno in range(pdf.pageCount):
-                il = pdf.getPageImageList(pno) if is_extract else [] # 获取页面图片对象
-                # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
-                print('---------- page: {0} ----------'.format(pno))
-                print('img_object_list: {0}'.format(il))
-
-                # 单纯提取页面图片对象
-                # for img_index, img in enumerate(il):
-                #     pix = self.recover_pix(pdf, img[0], img[1], img[5])
-                #     ext, img_data = self.get_img_data(pix)
-                #     img_save_path = self.get_img_save_path(pno, img_index, ext)
-                #     with open(img_save_path, "wb") as f:
-                #         f.write(img_data)
-
-                # 1.页面图片对象数目为0时，保存整个页面为png图片
-                if len(il) == 0:
                page = pdf.loadPage(pno)
                self.page_to_png(page)
-                # 2.页面图片对象数目为1时：
-                # 小图(如电子账单的盖章)：保存整个页面为png图片
-                # 大图：提取图片对象
-                elif len(il) == 1:
-                    xref, smask, width, height, _, colorspace, _, _, _ = il[0]
-                    # 小图
-                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
-                        page = pdf.loadPage(pno)
-                        self.page_to_png(page)
-                    # 大图
-                    elif xref not in self.xref_set:
-                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
-                # 3.页面图片对象数目大于1时，特殊处理
-                else:
-                    self.merge_il(pdf, pno, il)
+            print('{0} [end] [pdf_path={1}] [img_save_path={2}]'.format(LOG_BASE, self.path, self.img_dir_path))


-def extract_image(pdf_path, target_path, is_extract):
+def extract_image(pdf_path, target_path):
    pdf_handler = PDFHandler(pdf_path, target_path)
-    pdf_handler.extract_image(is_extract)
+    pdf_handler.extract_image()


 def main():
@@ -253,7 +66,7 @@ def main():
                    continue
                pdf_file_path = os.path.join(parent, pdf_file)
                try:
-                    extract_image(pdf_file_path, target_path, args.extract)
+                    extract_image(pdf_file_path, target_path)
                except Exception as e:
                    print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path))
                    failed_list.append(pdf_file_path)
@@ -267,7 +80,7 @@ def main():
        # 图片保存目录
        target_path = os.path.realpath(args.output) if args.output else os.path.dirname(pdf_path)
        try:
-            extract_image(pdf_path, target_path, args.extract)
+            extract_image(pdf_path, target_path)
        except Exception as e:
            print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path))
        else:
--- a/requirements.txt
View file @94794bd
+++ b/requirements.txt
View file @94794bd
-Pillow==7.2.0
 PyMuPDF==1.17.0
\ No newline at end of file