prune extract mode
Showing 3 changed files with 10 additions and 211 deletions
| 1 | # PDF转图片脚本 | 1 | # PDF转图片脚本 |
| 2 | 2 | ||
| 3 | ## 2种转化方式 | 3 | ## 转化方式 |
| 4 | - 保存整个页面为png图片 | 4 | - 保存整个页面为png图片 |
| 5 | - 提取PDF页面中的图片对象 | ||
| 6 | - 图片对象数目为0(如电子账单),保存整个页面为png图片 | ||
| 7 | - 图片对象数目为1 | ||
| 8 | - 大图,保存图片对象 | ||
| 9 | - 小图(如电子账单盖章),保存整个页面为png图片 | ||
| 10 | - 图片对象数目大于1 | ||
| 11 | - 多整图,保存图片对象 | ||
| 12 | - 多碎图,根据宽高突变位置分组,拼接合并后保存 | ||
| 13 | - 其他特殊情况:保存整个页面为png图片 | ||
| 14 | |||
| 15 | ## 已知问题 | ||
| 16 | - 提取图片对象方式下,整图与碎图通过宽高阈值区分,无法满足所有PDF。个别PDF中,整图很小时会被当做碎图合并,碎图很大时会被当做整图不合并 | ||
| 17 | 5 | ||
| 18 | ## 用法 | 6 | ## 用法 |
| 19 | - python3.6+ | 7 | - python3.6+ |
| 20 | - `pip install -r requirements.txt` | 8 | - `pip install -r requirements.txt` |
| 21 | - `python pdf_to_img.py [-h] -i INPUT [-o OUTPUT] [-e]` | 9 | - `python pdf_to_img.py [-h] -i INPUT [-o OUTPUT]` |
| 22 | ``` | 10 | ``` |
| 23 | 可选参数: | 11 | 可选参数: |
| 24 | -h, --help 查看帮助信息并退出 | 12 | -h, --help 查看帮助信息并退出 |
| 25 | -i INPUT, --input INPUT PDF文件或目录路径,必要参数 | 13 | -i INPUT, --input INPUT PDF文件或目录路径,必要参数 |
| 26 | -o OUTPUT, --output OUTPUT 输出图片保存路径,非必要参数,缺省值为PDF文件路径 | 14 | -o OUTPUT, --output OUTPUT 输出图片保存路径,非必要参数,缺省值为PDF文件路径 |
| 27 | -e, --extract 默认采用整个页面保存png图片的方式,增加该选项选择提取图片方式转化图片 | ||
| 28 | ``` | 15 | ``` |
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
| ... | @@ -2,8 +2,6 @@ import os | ... | @@ -2,8 +2,6 @@ import os |
| 2 | import sys | 2 | import sys |
| 3 | import fitz | 3 | import fitz |
| 4 | import argparse | 4 | import argparse |
| 5 | from PIL import Image | ||
| 6 | from io import BytesIO | ||
| 7 | 5 | ||
| 8 | if sys.version_info[0] < 3: | 6 | if sys.version_info[0] < 3: |
| 9 | raise Exception("This program requires at least python3.6") | 7 | raise Exception("This program requires at least python3.6") |
| ... | @@ -11,7 +9,6 @@ if sys.version_info[0] < 3: | ... | @@ -11,7 +9,6 @@ if sys.version_info[0] < 3: |
| 11 | parser = argparse.ArgumentParser(description='PDF转图片') | 9 | parser = argparse.ArgumentParser(description='PDF转图片') |
| 12 | parser.add_argument('-i', '--input', help='PDF文件或目录路径,必要参数', required=True) | 10 | parser.add_argument('-i', '--input', help='PDF文件或目录路径,必要参数', required=True) |
| 13 | parser.add_argument('-o', '--output', help='输出图片保存路径,非必要参数,缺省值为PDF文件路径') | 11 | parser.add_argument('-o', '--output', help='输出图片保存路径,非必要参数,缺省值为PDF文件路径') |
| 14 | parser.add_argument('-e', '--extract', help='默认采用整个页面保存png图片的方式,增加该选项选择提取图片方式转化图片', action="store_true") | ||
| 15 | args = parser.parse_args() | 12 | args = parser.parse_args() |
| 16 | 13 | ||
| 17 | LOG_BASE = '[pdf to img]' | 14 | LOG_BASE = '[pdf to img]' |
| ... | @@ -20,16 +17,6 @@ LOG_BASE = '[pdf to img]' | ... | @@ -20,16 +17,6 @@ LOG_BASE = '[pdf to img]' |
| 20 | ZOOM_X = ZOOM_Y = 2.0 | 17 | ZOOM_X = ZOOM_Y = 2.0 |
| 21 | trans = fitz.Matrix(ZOOM_X, ZOOM_X).preRotate(0) # zoom factor 2 in each dimension | 18 | trans = fitz.Matrix(ZOOM_X, ZOOM_X).preRotate(0) # zoom factor 2 in each dimension |
| 22 | 19 | ||
| 23 | # 特殊filter处理 | ||
| 24 | ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} | ||
| 25 | |||
| 26 | # 宽高阈值组合 | ||
| 27 | WH_COUPLE_1 = (500, 500) | ||
| 28 | WH_COUPLE_2 = (700, 647) | ||
| 29 | WH_COUPLE_3 = (100, 100) | ||
| 30 | WH_COUPLE_4 = (100, 300) | ||
| 31 | WH_COUPLE_5 = (100, 200) | ||
| 32 | |||
| 33 | 20 | ||
| 34 | class PDFHandler: | 21 | class PDFHandler: |
| 35 | 22 | ||
| ... | @@ -46,194 +33,20 @@ class PDFHandler: | ... | @@ -46,194 +33,20 @@ class PDFHandler: |
| 46 | img_save_path = self.get_img_save_path(page.number) | 33 | img_save_path = self.get_img_save_path(page.number) |
| 47 | pm.writePNG(img_save_path) | 34 | pm.writePNG(img_save_path) |
| 48 | 35 | ||
| 49 | @staticmethod | 36 | def extract_image(self): |
| 50 | def getimage(pix): | ||
| 51 | # RGB | ||
| 52 | if pix.colorspace.n != 4: | ||
| 53 | return pix | ||
| 54 | # GRAY/CMYK | ||
| 55 | tpix = fitz.Pixmap(fitz.csRGB, pix) | ||
| 56 | return tpix | ||
| 57 | |||
| 58 | def recover_pix(self, doc, xref, smask, colorspace): | ||
| 59 | if smask != 0: | ||
| 60 | # we need to reconstruct the alpha channel with the smask | ||
| 61 | pix1 = fitz.Pixmap(doc, xref) | ||
| 62 | pix2 = fitz.Pixmap(doc, smask) # create pixmap of the /SMask entry | ||
| 63 | |||
| 64 | # sanity check | ||
| 65 | if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1): | ||
| 66 | pix2 = None | ||
| 67 | return self.getimage(pix1) | ||
| 68 | |||
| 69 | pix = fitz.Pixmap(pix1) # copy of pix1, alpha channel added | ||
| 70 | pix.setAlpha(pix2.samples) # treat pix2.samples as alpha value | ||
| 71 | pix1 = pix2 = None # free temp pixmaps | ||
| 72 | return self.getimage(pix) | ||
| 73 | elif colorspace in {'Separation', 'DeviceCMYK'}: | ||
| 74 | pix = fitz.Pixmap(doc, xref) | ||
| 75 | tpix = fitz.Pixmap(fitz.csRGB, pix) | ||
| 76 | return tpix | ||
| 77 | else: | ||
| 78 | return doc.extractImage(xref) | ||
| 79 | |||
| 80 | @staticmethod | ||
| 81 | def get_img_data(pix): | ||
| 82 | if type(pix) is dict: # we got a raw image | ||
| 83 | ext = pix["ext"] | ||
| 84 | img_data = pix["image"] | ||
| 85 | else: # we got a pixmap | ||
| 86 | ext = 'png' | ||
| 87 | img_data = pix.getPNGData() | ||
| 88 | return ext, img_data | ||
| 89 | |||
| 90 | def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0): | ||
| 91 | pix = self.recover_pix(pdf, xref, smask, colorspace) | ||
| 92 | ext, img_data = self.get_img_data(pix) | ||
| 93 | img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext) | ||
| 94 | with open(img_save_path, "wb") as f: | ||
| 95 | f.write(img_data) | ||
| 96 | self.xref_set.add(xref) | ||
| 97 | |||
| 98 | @staticmethod | ||
| 99 | def split_il(il): | ||
| 100 | broken_il = [] | ||
| 101 | start = 0 | ||
| 102 | length = len(il) | ||
| 103 | page_to_png = None | ||
| 104 | for i in range(length): | ||
| 105 | # 当图片对象含有特殊filter时,特殊处理:整个页面保存为png图片 | ||
| 106 | if il[i][-1] in ADOBE_FILTER_SET: | ||
| 107 | page_to_png = True | ||
| 108 | break | ||
| 109 | else: | ||
| 110 | for i in range(length): | ||
| 111 | # 当图片对象够大时,不作碎图合并处理,而是单纯提取 | ||
| 112 | if il[i][2] >= WH_COUPLE_2[0] and il[i][3] >= WH_COUPLE_2[1]: | ||
| 113 | break | ||
| 114 | if i == start: | ||
| 115 | if i == length - 1: | ||
| 116 | broken_il.append(il[start: length]) | ||
| 117 | continue | ||
| 118 | elif i == length - 1: | ||
| 119 | if il[i][2] == il[i - 1][2]: | ||
| 120 | broken_il.append(il[start: length]) | ||
| 121 | else: | ||
| 122 | broken_il.append(il[start: i]) | ||
| 123 | broken_il.append(il[i: length]) | ||
| 124 | continue | ||
| 125 | if il[i][2] != il[i - 1][2]: | ||
| 126 | broken_il.append(il[start: i]) | ||
| 127 | start = i | ||
| 128 | elif il[i][3] != il[i - 1][3]: | ||
| 129 | broken_il.append(il[start: i + 1]) | ||
| 130 | start = i + 1 | ||
| 131 | else: | ||
| 132 | # 碎图分组结果 | ||
| 133 | return broken_il | ||
| 134 | return page_to_png | ||
| 135 | |||
| 136 | def merge_il(self, pdf, pno, il): | ||
| 137 | # 尝试碎图合并前的分组 | ||
| 138 | il.sort(key=lambda x: x[0]) | ||
| 139 | broken_il = self.split_il(il) | ||
| 140 | print('broken_il: {0}'.format(broken_il)) | ||
| 141 | |||
| 142 | page_to_png = True | ||
| 143 | # 3.1 当图片对象够大时,不作碎图合并处理,而是单纯提取 | ||
| 144 | if broken_il is None: | ||
| 145 | page_to_png = False | ||
| 146 | for img_index, img in enumerate(il): | ||
| 147 | xref, smask, width, height, _, colorspace, _, _, adobe_filter = img | ||
| 148 | if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]: # 过滤小图(如二维码) | ||
| 149 | continue | ||
| 150 | elif xref not in self.xref_set: | ||
| 151 | self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index) | ||
| 152 | # 3.2 碎图按照分组合并 | ||
| 153 | elif isinstance(broken_il, list) and len(broken_il) <= 2: | ||
| 154 | for img_index, img_il in enumerate(broken_il): | ||
| 155 | # 3.2.1 仅一张碎图,过滤或直接提取 | ||
| 156 | if len(img_il) == 1: | ||
| 157 | xref, smask, width, height, _, colorspace, _, _, adobe_filter = img_il[0] | ||
| 158 | # 过滤小图(如二维码) | ||
| 159 | if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \ | ||
| 160 | (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]): | ||
| 161 | continue | ||
| 162 | elif xref not in self.xref_set: | ||
| 163 | self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index) | ||
| 164 | page_to_png = False | ||
| 165 | # 3.2.2 多张碎图,竖向拼接 | ||
| 166 | else: | ||
| 167 | height_sum = sum([img[3] for img in img_il]) | ||
| 168 | width = img_il[0][2] | ||
| 169 | # 过滤小图和不常规大图 | ||
| 170 | if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \ | ||
| 171 | (width > 1000 and height_sum > width * 3): | ||
| 172 | continue | ||
| 173 | im_list = [] | ||
| 174 | for img in img_il: | ||
| 175 | xref, smask, _, height, _, colorspace, _, _, adobe_filter = img | ||
| 176 | pix = self.recover_pix(pdf, xref, smask, colorspace) | ||
| 177 | ext, img_data = self.get_img_data(pix) | ||
| 178 | im = Image.open(BytesIO(img_data)) | ||
| 179 | im_list.append((height, im, ext)) | ||
| 180 | new_img = Image.new(im_list[0][1].mode, (width, height_sum)) | ||
| 181 | h_now = 0 | ||
| 182 | for h, m, _ in im_list: | ||
| 183 | new_img.paste(m, box=(0, h_now)) | ||
| 184 | h_now += h | ||
| 185 | img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) | ||
| 186 | new_img.save(img_save_path) | ||
| 187 | page_to_png = False | ||
| 188 | |||
| 189 | # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 | ||
| 190 | if page_to_png: | ||
| 191 | page = pdf.loadPage(pno) | ||
| 192 | self.page_to_png(page) | ||
| 193 | |||
| 194 | def extract_image(self, is_extract): | ||
| 195 | os.makedirs(self.img_dir_path, exist_ok=True) | 37 | os.makedirs(self.img_dir_path, exist_ok=True) |
| 196 | with fitz.Document(self.path) as pdf: | 38 | with fitz.Document(self.path) as pdf: |
| 197 | print('++++++++++' * 5) | 39 | print('++++++++++' * 5) |
| 198 | print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata)) | 40 | print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata)) |
| 199 | for pno in range(pdf.pageCount): | 41 | for pno in range(pdf.pageCount): |
| 200 | il = pdf.getPageImageList(pno) if is_extract else [] # 获取页面图片对象 | 42 | page = pdf.loadPage(pno) |
| 201 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | 43 | self.page_to_png(page) |
| 202 | print('---------- page: {0} ----------'.format(pno)) | 44 | print('{0} [end] [pdf_path={1}] [img_save_path={2}]'.format(LOG_BASE, self.path, self.img_dir_path)) |
| 203 | print('img_object_list: {0}'.format(il)) | ||
| 204 | |||
| 205 | # 单纯提取页面图片对象 | ||
| 206 | # for img_index, img in enumerate(il): | ||
| 207 | # pix = self.recover_pix(pdf, img[0], img[1], img[5]) | ||
| 208 | # ext, img_data = self.get_img_data(pix) | ||
| 209 | # img_save_path = self.get_img_save_path(pno, img_index, ext) | ||
| 210 | # with open(img_save_path, "wb") as f: | ||
| 211 | # f.write(img_data) | ||
| 212 | |||
| 213 | # 1.页面图片对象数目为0时,保存整个页面为png图片 | ||
| 214 | if len(il) == 0: | ||
| 215 | page = pdf.loadPage(pno) | ||
| 216 | self.page_to_png(page) | ||
| 217 | # 2.页面图片对象数目为1时: | ||
| 218 | # 小图(如电子账单的盖章):保存整个页面为png图片 | ||
| 219 | # 大图:提取图片对象 | ||
| 220 | elif len(il) == 1: | ||
| 221 | xref, smask, width, height, _, colorspace, _, _, _ = il[0] | ||
| 222 | # 小图 | ||
| 223 | if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]: | ||
| 224 | page = pdf.loadPage(pno) | ||
| 225 | self.page_to_png(page) | ||
| 226 | # 大图 | ||
| 227 | elif xref not in self.xref_set: | ||
| 228 | self.extract_single_image(pdf, xref, smask, colorspace, pno) | ||
| 229 | # 3.页面图片对象数目大于1时,特殊处理 | ||
| 230 | else: | ||
| 231 | self.merge_il(pdf, pno, il) | ||
| 232 | 45 | ||
| 233 | 46 | ||
| 234 | def extract_image(pdf_path, target_path, is_extract): | 47 | def extract_image(pdf_path, target_path): |
| 235 | pdf_handler = PDFHandler(pdf_path, target_path) | 48 | pdf_handler = PDFHandler(pdf_path, target_path) |
| 236 | pdf_handler.extract_image(is_extract) | 49 | pdf_handler.extract_image() |
| 237 | 50 | ||
| 238 | 51 | ||
| 239 | def main(): | 52 | def main(): |
| ... | @@ -253,7 +66,7 @@ def main(): | ... | @@ -253,7 +66,7 @@ def main(): |
| 253 | continue | 66 | continue |
| 254 | pdf_file_path = os.path.join(parent, pdf_file) | 67 | pdf_file_path = os.path.join(parent, pdf_file) |
| 255 | try: | 68 | try: |
| 256 | extract_image(pdf_file_path, target_path, args.extract) | 69 | extract_image(pdf_file_path, target_path) |
| 257 | except Exception as e: | 70 | except Exception as e: |
| 258 | print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path)) | 71 | print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path)) |
| 259 | failed_list.append(pdf_file_path) | 72 | failed_list.append(pdf_file_path) |
| ... | @@ -267,7 +80,7 @@ def main(): | ... | @@ -267,7 +80,7 @@ def main(): |
| 267 | # 图片保存目录 | 80 | # 图片保存目录 |
| 268 | target_path = os.path.realpath(args.output) if args.output else os.path.dirname(pdf_path) | 81 | target_path = os.path.realpath(args.output) if args.output else os.path.dirname(pdf_path) |
| 269 | try: | 82 | try: |
| 270 | extract_image(pdf_path, target_path, args.extract) | 83 | extract_image(pdf_path, target_path) |
| 271 | except Exception as e: | 84 | except Exception as e: |
| 272 | print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path)) | 85 | print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path)) |
| 273 | else: | 86 | else: | ... | ... |
-
Please register or sign in to post a comment