f4fffb77 by Gruel

first commit

0 parents
1 .idea/
...\ No newline at end of file ...\ No newline at end of file
1 # PDF转图片脚本
2
## 主要处理逻辑
- 提取PDF页面中的图片对象
- 图片对象数目为0(如电子账单):保存整个页面为png图片
- 图片对象数目为1
  - 大图:保存图片对象
  - 小图(如电子账单盖章):保存整个页面为png图片
- 图片对象数目大于1
  - 多大图:保存图片对象
  - 多碎图:根据宽高突变位置分组,拼接合并后保存
  - 其他特殊情况:保存整个页面为png图片
13
## 用法
- 环境要求:Python 3.6+
- 安装依赖:`pip install -r requirements`
- 运行:`python pdf_to_img.py pdf_path [img_path]`
18
19 | 参数 | 是否必须 | 说明 | 缺省值 |
20 | ---- | ---- | ---- | ---- |
21 | pdf_path | 是 | PDF文件或目录路径 | - |
22 | img_path | 否 | 图片保存路径 | PDF文件路径 |
...\ No newline at end of file ...\ No newline at end of file
1 import os
2 import sys
3 import fitz
4 from PIL import Image
5 from io import BytesIO
6
# Start-up validation, executed at import time before anything else runs.
# Compare the full (major, minor) tuple: checking only the major version
# would let Python 3.0-3.5 through although the message demands 3.6.
if sys.version_info < (3, 6):
    raise Exception("This program requires at least python3.6")
if len(sys.argv) < 2:
    # No PDF path given: print the usage line and stop.
    print('用法:python pdf_to_img.py PDF文件或目录路径 [图片保存路径]')
    sys.exit(0)
if not os.path.exists(sys.argv[1]):
    # The given path does not exist: report it and exit with a failure
    # status so that calling scripts can detect the error.
    print('PDF文件或目录不存在: {0}'.format(sys.argv[1]))
    sys.exit(1)
15
# Prefix put in front of the status lines printed by this script.
LOG_BASE = '[pdf to img]'

# Parameters used when a whole page is rendered to a PNG image.
ZOOM_X = ZOOM_Y = 2.0
# Zoom factor 2 in each dimension, rotation 0.
trans = fitz.Matrix(ZOOM_X, ZOOM_Y).preRotate(0)

# Image filters that get special handling.
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}

# (width, height) threshold pairs.
WH_COUPLE_1 = (500, 500)
WH_COUPLE_2 = (700, 647)
WH_COUPLE_3 = (100, 100)
WH_COUPLE_4 = (100, 300)
WH_COUPLE_5 = (100, 200)
31
32
class PDFHandler:
    """Save the images of one PDF file into a per-document directory.

    For every page, the number and size of the embedded image objects decide
    whether the objects are saved one by one, stitched together vertically,
    or whether the whole page is rendered to a PNG instead. The size
    thresholds come from the module-level ``WH_COUPLE_*`` constants.
    """

    def __init__(self, path, target_path):
        """Store the PDF path and derive the output directory.

        :param path: path of the PDF file to process.
        :param target_path: directory under which a sub-directory named after
            the PDF file (without its extension) receives the images.
        """
        self.path = path
        self.img_dir_path = os.path.join(target_path, os.path.splitext(os.path.basename(path))[0])
        # xrefs already written by extract_single_image; used to skip repeats.
        self.xref_set = set()

    def get_img_save_path(self, pno, img_index=0, ext='png'):
        """Return the output path for image ``img_index`` of page ``pno``."""
        return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))

    def page_to_png(self, page):
        """Render the whole ``page`` with the module-level ``trans`` matrix and save it as PNG."""
        pm = page.getPixmap(matrix=trans, alpha=False)
        img_save_path = self.get_img_save_path(page.number)
        pm.writePNG(img_save_path)

    @staticmethod
    def getimage(pix):
        """Return ``pix`` as is, or an RGB copy when its colorspace has 4 components.

        A 4-component colorspace is presumably CMYK; every other pixmap is
        returned unchanged.
        """
        if pix.colorspace.n != 4:
            return pix
        return fitz.Pixmap(fitz.csRGB, pix)

    def recover_pix(self, doc, xref, smask, colorspace):
        """Load the image object ``xref`` of ``doc``.

        :param doc: the open document.
        :param xref: xref of the image object.
        :param smask: xref of the image's /SMask entry, 0 when there is none.
        :param colorspace: colorspace name taken from the page image list.
        :return: a ``fitz.Pixmap`` (smask, 'Separation' and 'DeviceCMYK'
            cases) or the result of ``doc.extractImage(xref)`` otherwise.
        """
        if smask != 0:
            # We need to reconstruct the alpha channel with the smask.
            pix1 = fitz.Pixmap(doc, xref)
            pix2 = fitz.Pixmap(doc, smask)  # pixmap of the /SMask entry

            # Sanity check: same rectangle, neither has alpha, mask has 1 component.
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                pix2 = None
                return self.getimage(pix1)

            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
            pix1 = pix2 = None  # free temp pixmaps
            return self.getimage(pix)
        if colorspace in {'Separation', 'DeviceCMYK'}:
            # These colorspaces are always converted to RGB.
            return fitz.Pixmap(fitz.csRGB, fitz.Pixmap(doc, xref))
        return doc.extractImage(xref)

    @staticmethod
    def get_img_data(pix):
        """Return ``(ext, img_data)`` for a value produced by ``recover_pix``.

        A dict is treated as a raw image and its "ext" and "image" entries are
        returned; anything else is treated as a pixmap and converted to PNG.
        """
        # isinstance also accepts dict subclasses, unlike an exact type check.
        if isinstance(pix, dict):  # we got a raw image
            return pix["ext"], pix["image"]
        # we got a pixmap
        return 'png', pix.getPNGData()

    def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
        """Write one image object to disk and record its xref as handled."""
        pix = self.recover_pix(pdf, xref, smask, colorspace)
        ext, img_data = self.get_img_data(pix)
        img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
        with open(img_save_path, "wb") as f:
            f.write(img_data)
        self.xref_set.add(xref)

    @staticmethod
    def split_il(il):
        """Group the image entries of a page for fragment merging.

        :param il: list of image entries; index 2 is the width, index 3 the
            height and the last element the filter name.
        :return: ``True`` when any entry uses a filter from
            ``ADOBE_FILTER_SET`` (the caller then renders the whole page),
            ``None`` when an entry is at least ``WH_COUPLE_2`` in size (the
            caller then extracts the images individually), otherwise the list
            of groups (each a slice of ``il``).
        """
        # An image with a special filter means: save the whole page as PNG.
        if any(img[-1] in ADOBE_FILTER_SET for img in il):
            return True

        broken_il = []
        start = 0
        length = len(il)
        for i in range(length):
            # A big enough image means: no fragment merging, plain extraction.
            if il[i][2] >= WH_COUPLE_2[0] and il[i][3] >= WH_COUPLE_2[1]:
                return None
            if i == start:
                # First entry of a group: only close the group if it is also the last entry.
                if i == length - 1:
                    broken_il.append(il[start: length])
                continue
            if i == length - 1:
                # Last entry: it joins the open group only if its width matches its predecessor.
                if il[i][2] == il[i - 1][2]:
                    broken_il.append(il[start: length])
                else:
                    broken_il.append(il[start: i])
                    broken_il.append(il[i: length])
                continue
            if il[i][2] != il[i - 1][2]:
                # Width changed: the group ends before this entry.
                broken_il.append(il[start: i])
                start = i
            elif il[i][3] != il[i - 1][3]:
                # Height changed: the group ends with this entry.
                broken_il.append(il[start: i + 1])
                start = i + 1
        # Grouping result.
        return broken_il

    def merge_il(self, pdf, pno, il):
        """Handle a page that has more than one image object.

        Depending on the result of ``split_il`` the images are extracted one
        by one, stitched vertically per group, or the whole page is rendered
        to PNG. ``il`` is sorted in place by its first element (the xref).
        """
        # Grouping attempt before merging fragments.
        il.sort(key=lambda x: x[0])
        broken_il = self.split_il(il)
        print('broken_il: {0}'.format(broken_il))

        page_to_png = True
        # 3.1 An image is big enough: no merging, extract the images individually.
        if broken_il is None:
            page_to_png = False
            for img_index, img in enumerate(il):
                xref, smask, width, height, _, colorspace, _, _, _ = img
                if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]:  # skip small images (e.g. QR codes)
                    continue
                if xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
        # 3.2 Merge the fragments group by group.
        elif isinstance(broken_il, list) and len(broken_il) <= 2:
            for img_index, img_il in enumerate(broken_il):
                # 3.2.1 Group with a single image: filter it out or extract it directly.
                if len(img_il) == 1:
                    xref, smask, width, height, _, colorspace, _, _, _ = img_il[0]
                    # Skip small images (e.g. QR codes).
                    if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \
                            (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]):
                        continue
                    if xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
                        # NOTE(review): the flag is reset only when an image was
                        # actually written here - confirm this nesting is intended.
                        page_to_png = False
                # 3.2.2 Group with several fragments: stitch them vertically.
                else:
                    height_sum = sum(img[3] for img in img_il)
                    width = img_il[0][2]
                    # Skip small results and unusually large ones.
                    if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \
                            (width > 1000 and height_sum > width * 3):
                        continue
                    im_list = []
                    for img in img_il:
                        xref, smask, _, height, _, colorspace, _, _, _ = img
                        pix = self.recover_pix(pdf, xref, smask, colorspace)
                        ext, img_data = self.get_img_data(pix)
                        im = Image.open(BytesIO(img_data))
                        im_list.append((height, im, ext))
                    # The canvas takes its mode from the first fragment.
                    new_img = Image.new(im_list[0][1].mode, (width, height_sum))
                    h_now = 0
                    for h, m, _ in im_list:
                        new_img.paste(m, box=(0, h_now))
                        h_now += h
                    # The file extension is taken from the first fragment too.
                    img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
                    new_img.save(img_save_path)
                    page_to_png = False

        # 3.3 More than two groups, everything filtered out, or a special
        # filter present: save the whole page as PNG.
        if page_to_png:
            page = pdf.loadPage(pno)
            self.page_to_png(page)

    def extract_image(self):
        """Process every page of the PDF and write the results to ``img_dir_path``."""
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            print('++++++++++' * 5)
            print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata))
            for pno in range(pdf.pageCount):
                il = pdf.getPageImageList(pno)  # image objects of the page
                # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
                print('---------- page: {0} ----------'.format(pno))
                print('img_object_list: {0}'.format(il))

                # 1. No image object on the page: save the whole page as PNG.
                if not il:
                    page = pdf.loadPage(pno)
                    self.page_to_png(page)
                # 2. Exactly one image object:
                #    small image (e.g. the stamp on an e-bill): save the whole page as PNG
                #    big image: extract the image object
                elif len(il) == 1:
                    xref, smask, width, height, _, colorspace, _, _, _ = il[0]
                    # Small image.
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
                    # Big image.
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. More than one image object: special handling.
                else:
                    self.merge_il(pdf, pno, il)
231
232
def extract_image(pdf_path, target_path):
    """Run a PDFHandler for ``pdf_path``, saving its images under ``target_path``."""
    PDFHandler(pdf_path, target_path).extract_image()
236
237
def main():
    """Command-line entry point.

    ``sys.argv[1]`` is a PDF file or a directory that is walked recursively;
    the optional ``sys.argv[2]`` is the directory that receives the images.
    Without it, images are saved next to each PDF file. A failure on one
    file is reported and does not stop the processing of the others.
    """
    pdf_path = os.path.realpath(sys.argv[1])
    # Explicit output directory, resolved once; None means "next to the PDF".
    explicit_target = os.path.realpath(sys.argv[2]) if len(sys.argv) > 2 else None

    # Directory: walk it and process every PDF file.
    if os.path.isdir(pdf_path):
        completed_count = 0
        failed_list = []
        for parent, dirnames, filenames in os.walk(pdf_path):
            # Image output directory.
            target_path = explicit_target if explicit_target is not None else parent
            for pdf_file in filenames:
                # Match the real extension, case-insensitively: a plain
                # endswith('pdf') would also accept names such as "xpdf".
                if not pdf_file.lower().endswith('.pdf'):
                    continue
                pdf_file_path = os.path.join(parent, pdf_file)
                try:
                    extract_image(pdf_file_path, target_path)
                except Exception as e:
                    print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path))
                    failed_list.append(pdf_file_path)
                else:
                    # Report the file that was just processed, not the root directory.
                    print('{0} [completed] [pdf_path={1}]'.format(LOG_BASE, pdf_file_path))
                    completed_count += 1
        print('{0} [all completed] [completed_count={1}] [failed_count={2}] [failed_pdf_path={3}]'.format(
            LOG_BASE, completed_count, len(failed_list), failed_list))
    # File: process this single PDF.
    else:
        # Image output directory.
        target_path = explicit_target if explicit_target is not None else os.path.dirname(pdf_path)
        try:
            extract_image(pdf_path, target_path)
        except Exception as e:
            print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path))
        else:
            print('{0} [completed] [pdf_path={1}]'.format(LOG_BASE, pdf_path))


if __name__ == "__main__":
    main()
1 Pillow==7.2.0
2 PyMuPDF==1.17.0
...\ No newline at end of file ...\ No newline at end of file
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!