94794bd5 by 周伟奇

prune extract model

1 parent ff70b617
1 # PDF转图片脚本 1 # PDF转图片脚本
2 2
3 ## 2种转化方式 3 ## 转化方式
4 - 保存整个页面为png图片 4 - 保存整个页面为png图片
5 - 提取PDF页面中的图片对象
6 - 图片对象数目为0(如电子账单),保存整个页面为png图片
7 - 图片对象数目为1
8 - 大图,保存图片对象
9 - 小图(如电子账单盖章),保存整个页面为png图片
10 - 图片对象数目大于1
11 - 多整图,保存图片对象
12 - 多碎图,根据宽高突变位置分组,拼接合并后保存
13 - 其他特殊情况:保存整个页面为png图片
14
15 ## 已知问题
16 - 提取图片对象方式下,整图与碎图通过宽高阈值区分,无法满足所有PDF。个别PDF中,整图很小时会被当做碎图合并,碎图很大时会被当做整图不合并
17 5
18 ## 用法 6 ## 用法
19 - python3.6+ 7 - python3.6+
20 - `pip install -r requirements.txt` 8 - `pip install -r requirements.txt`
21 - `python pdf_to_img.py [-h] -i INPUT [-o OUTPUT] [-e]` 9 - `python pdf_to_img.py [-h] -i INPUT [-o OUTPUT]`
22 ``` 10 ```
23 可选参数: 11 可选参数:
24 -h, --help 查看帮助信息并退出 12 -h, --help 查看帮助信息并退出
25 -i INPUT, --input INPUT PDF文件或目录路径,必要参数 13 -i INPUT, --input INPUT PDF文件或目录路径,必要参数
26 -o OUTPUT, --output OUTPUT 输出图片保存路径,非必要参数,缺省值为PDF文件路径 14 -o OUTPUT, --output OUTPUT 输出图片保存路径,非必要参数,缺省值为PDF文件路径
27 -e, --extract 默认采用整个页面保存png图片的方式,增加该选项选择提取图片方式转化图片
28 ``` 15 ```
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -2,8 +2,6 @@ import os ...@@ -2,8 +2,6 @@ import os
2 import sys 2 import sys
3 import fitz 3 import fitz
4 import argparse 4 import argparse
5 from PIL import Image
6 from io import BytesIO
7 5
8 if sys.version_info[0] < 3: 6 if sys.version_info[0] < 3:
9 raise Exception("This program requires at least python3.6") 7 raise Exception("This program requires at least python3.6")
...@@ -11,7 +9,6 @@ if sys.version_info[0] < 3: ...@@ -11,7 +9,6 @@ if sys.version_info[0] < 3:
11 parser = argparse.ArgumentParser(description='PDF转图片') 9 parser = argparse.ArgumentParser(description='PDF转图片')
12 parser.add_argument('-i', '--input', help='PDF文件或目录路径,必要参数', required=True) 10 parser.add_argument('-i', '--input', help='PDF文件或目录路径,必要参数', required=True)
13 parser.add_argument('-o', '--output', help='输出图片保存路径,非必要参数,缺省值为PDF文件路径') 11 parser.add_argument('-o', '--output', help='输出图片保存路径,非必要参数,缺省值为PDF文件路径')
14 parser.add_argument('-e', '--extract', help='默认采用整个页面保存png图片的方式,增加该选项选择提取图片方式转化图片', action="store_true")
15 args = parser.parse_args() 12 args = parser.parse_args()
16 13
17 LOG_BASE = '[pdf to img]' 14 LOG_BASE = '[pdf to img]'
...@@ -20,16 +17,6 @@ LOG_BASE = '[pdf to img]' ...@@ -20,16 +17,6 @@ LOG_BASE = '[pdf to img]'
20 ZOOM_X = ZOOM_Y = 2.0 17 ZOOM_X = ZOOM_Y = 2.0
21 trans = fitz.Matrix(ZOOM_X, ZOOM_X).preRotate(0) # zoom factor 2 in each dimension 18 trans = fitz.Matrix(ZOOM_X, ZOOM_X).preRotate(0) # zoom factor 2 in each dimension
22 19
23 # 特殊filter处理
24 ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
25
26 # 宽高阈值组合
27 WH_COUPLE_1 = (500, 500)
28 WH_COUPLE_2 = (700, 647)
29 WH_COUPLE_3 = (100, 100)
30 WH_COUPLE_4 = (100, 300)
31 WH_COUPLE_5 = (100, 200)
32
33 20
34 class PDFHandler: 21 class PDFHandler:
35 22
...@@ -46,194 +33,20 @@ class PDFHandler: ...@@ -46,194 +33,20 @@ class PDFHandler:
46 img_save_path = self.get_img_save_path(page.number) 33 img_save_path = self.get_img_save_path(page.number)
47 pm.writePNG(img_save_path) 34 pm.writePNG(img_save_path)
48 35
49 @staticmethod 36 def extract_image(self):
50 def getimage(pix):
51 # RGB
52 if pix.colorspace.n != 4:
53 return pix
54 # GRAY/CMYK
55 tpix = fitz.Pixmap(fitz.csRGB, pix)
56 return tpix
57
58 def recover_pix(self, doc, xref, smask, colorspace):
59 if smask != 0:
60 # we need to reconstruct the alpha channel with the smask
61 pix1 = fitz.Pixmap(doc, xref)
62 pix2 = fitz.Pixmap(doc, smask) # create pixmap of the /SMask entry
63
64 # sanity check
65 if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
66 pix2 = None
67 return self.getimage(pix1)
68
69 pix = fitz.Pixmap(pix1) # copy of pix1, alpha channel added
70 pix.setAlpha(pix2.samples) # treat pix2.samples as alpha value
71 pix1 = pix2 = None # free temp pixmaps
72 return self.getimage(pix)
73 elif colorspace in {'Separation', 'DeviceCMYK'}:
74 pix = fitz.Pixmap(doc, xref)
75 tpix = fitz.Pixmap(fitz.csRGB, pix)
76 return tpix
77 else:
78 return doc.extractImage(xref)
79
80 @staticmethod
81 def get_img_data(pix):
82 if type(pix) is dict: # we got a raw image
83 ext = pix["ext"]
84 img_data = pix["image"]
85 else: # we got a pixmap
86 ext = 'png'
87 img_data = pix.getPNGData()
88 return ext, img_data
89
90 def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
91 pix = self.recover_pix(pdf, xref, smask, colorspace)
92 ext, img_data = self.get_img_data(pix)
93 img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
94 with open(img_save_path, "wb") as f:
95 f.write(img_data)
96 self.xref_set.add(xref)
97
98 @staticmethod
99 def split_il(il):
100 broken_il = []
101 start = 0
102 length = len(il)
103 page_to_png = None
104 for i in range(length):
105 # 当图片对象含有特殊filter时,特殊处理:整个页面保存为png图片
106 if il[i][-1] in ADOBE_FILTER_SET:
107 page_to_png = True
108 break
109 else:
110 for i in range(length):
111 # 当图片对象够大时,不作碎图合并处理,而是单纯提取
112 if il[i][2] >= WH_COUPLE_2[0] and il[i][3] >= WH_COUPLE_2[1]:
113 break
114 if i == start:
115 if i == length - 1:
116 broken_il.append(il[start: length])
117 continue
118 elif i == length - 1:
119 if il[i][2] == il[i - 1][2]:
120 broken_il.append(il[start: length])
121 else:
122 broken_il.append(il[start: i])
123 broken_il.append(il[i: length])
124 continue
125 if il[i][2] != il[i - 1][2]:
126 broken_il.append(il[start: i])
127 start = i
128 elif il[i][3] != il[i - 1][3]:
129 broken_il.append(il[start: i + 1])
130 start = i + 1
131 else:
132 # 碎图分组结果
133 return broken_il
134 return page_to_png
135
136 def merge_il(self, pdf, pno, il):
137 # 尝试碎图合并前的分组
138 il.sort(key=lambda x: x[0])
139 broken_il = self.split_il(il)
140 print('broken_il: {0}'.format(broken_il))
141
142 page_to_png = True
143 # 3.1 当图片对象够大时,不作碎图合并处理,而是单纯提取
144 if broken_il is None:
145 page_to_png = False
146 for img_index, img in enumerate(il):
147 xref, smask, width, height, _, colorspace, _, _, adobe_filter = img
148 if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]: # 过滤小图(如二维码)
149 continue
150 elif xref not in self.xref_set:
151 self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
152 # 3.2 碎图按照分组合并
153 elif isinstance(broken_il, list) and len(broken_il) <= 2:
154 for img_index, img_il in enumerate(broken_il):
155 # 3.2.1 仅一张碎图,过滤或直接提取
156 if len(img_il) == 1:
157 xref, smask, width, height, _, colorspace, _, _, adobe_filter = img_il[0]
158 # 过滤小图(如二维码)
159 if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \
160 (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]):
161 continue
162 elif xref not in self.xref_set:
163 self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
164 page_to_png = False
165 # 3.2.2 多张碎图,竖向拼接
166 else:
167 height_sum = sum([img[3] for img in img_il])
168 width = img_il[0][2]
169 # 过滤小图和不常规大图
170 if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \
171 (width > 1000 and height_sum > width * 3):
172 continue
173 im_list = []
174 for img in img_il:
175 xref, smask, _, height, _, colorspace, _, _, adobe_filter = img
176 pix = self.recover_pix(pdf, xref, smask, colorspace)
177 ext, img_data = self.get_img_data(pix)
178 im = Image.open(BytesIO(img_data))
179 im_list.append((height, im, ext))
180 new_img = Image.new(im_list[0][1].mode, (width, height_sum))
181 h_now = 0
182 for h, m, _ in im_list:
183 new_img.paste(m, box=(0, h_now))
184 h_now += h
185 img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
186 new_img.save(img_save_path)
187 page_to_png = False
188
189 # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
190 if page_to_png:
191 page = pdf.loadPage(pno)
192 self.page_to_png(page)
193
194 def extract_image(self, is_extract):
195 os.makedirs(self.img_dir_path, exist_ok=True) 37 os.makedirs(self.img_dir_path, exist_ok=True)
196 with fitz.Document(self.path) as pdf: 38 with fitz.Document(self.path) as pdf:
197 print('++++++++++' * 5) 39 print('++++++++++' * 5)
198 print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata)) 40 print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata))
199 for pno in range(pdf.pageCount): 41 for pno in range(pdf.pageCount):
200 il = pdf.getPageImageList(pno) if is_extract else [] # 获取页面图片对象
201 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
202 print('---------- page: {0} ----------'.format(pno))
203 print('img_object_list: {0}'.format(il))
204
205 # 单纯提取页面图片对象
206 # for img_index, img in enumerate(il):
207 # pix = self.recover_pix(pdf, img[0], img[1], img[5])
208 # ext, img_data = self.get_img_data(pix)
209 # img_save_path = self.get_img_save_path(pno, img_index, ext)
210 # with open(img_save_path, "wb") as f:
211 # f.write(img_data)
212
213 # 1.页面图片对象数目为0时,保存整个页面为png图片
214 if len(il) == 0:
215 page = pdf.loadPage(pno) 42 page = pdf.loadPage(pno)
216 self.page_to_png(page) 43 self.page_to_png(page)
217 # 2.页面图片对象数目为1时: 44 print('{0} [end] [pdf_path={1}] [img_save_path={2}]'.format(LOG_BASE, self.path, self.img_dir_path))
218 # 小图(如电子账单的盖章):保存整个页面为png图片
219 # 大图:提取图片对象
220 elif len(il) == 1:
221 xref, smask, width, height, _, colorspace, _, _, _ = il[0]
222 # 小图
223 if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
224 page = pdf.loadPage(pno)
225 self.page_to_png(page)
226 # 大图
227 elif xref not in self.xref_set:
228 self.extract_single_image(pdf, xref, smask, colorspace, pno)
229 # 3.页面图片对象数目大于1时,特殊处理
230 else:
231 self.merge_il(pdf, pno, il)
232 45
233 46
234 def extract_image(pdf_path, target_path, is_extract): 47 def extract_image(pdf_path, target_path):
235 pdf_handler = PDFHandler(pdf_path, target_path) 48 pdf_handler = PDFHandler(pdf_path, target_path)
236 pdf_handler.extract_image(is_extract) 49 pdf_handler.extract_image()
237 50
238 51
239 def main(): 52 def main():
...@@ -253,7 +66,7 @@ def main(): ...@@ -253,7 +66,7 @@ def main():
253 continue 66 continue
254 pdf_file_path = os.path.join(parent, pdf_file) 67 pdf_file_path = os.path.join(parent, pdf_file)
255 try: 68 try:
256 extract_image(pdf_file_path, target_path, args.extract) 69 extract_image(pdf_file_path, target_path)
257 except Exception as e: 70 except Exception as e:
258 print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path)) 71 print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path))
259 failed_list.append(pdf_file_path) 72 failed_list.append(pdf_file_path)
...@@ -267,7 +80,7 @@ def main(): ...@@ -267,7 +80,7 @@ def main():
267 # 图片保存目录 80 # 图片保存目录
268 target_path = os.path.realpath(args.output) if args.output else os.path.dirname(pdf_path) 81 target_path = os.path.realpath(args.output) if args.output else os.path.dirname(pdf_path)
269 try: 82 try:
270 extract_image(pdf_path, target_path, args.extract) 83 extract_image(pdf_path, target_path)
271 except Exception as e: 84 except Exception as e:
272 print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path)) 85 print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path))
273 else: 86 else:
......
1 Pillow==7.2.0
2 PyMuPDF==1.17.0 1 PyMuPDF==1.17.0
...\ No newline at end of file ...\ No newline at end of file
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!