first commit
0 parents
Showing
4 changed files
with
299 additions
and
0 deletions
.gitignore
0 → 100644
1 | .idea/ | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
README.md
0 → 100644
1 | # PDF转图片脚本 | ||
2 | |||
3 | ## 主要处理逻辑 | ||
4 | - 提取PDF页面中的图片对象 | ||
5 | - 图片对象数目为0(如电子账单),保存整个页面为png图片 | ||
6 | - 图片对象数目为1 | ||
7 | - 大图,保存图片对象 | ||
8 | - 小图(如电子账单盖章),保存整个页面为png图片 | ||
9 | - 图片对象数目大于1 | ||
10 | - 多大图,保存图片对象 | ||
11 | - 多碎图,根据宽高突变位置分组,拼接合并后保存 | ||
12 | - 其他特殊情况:保存整个页面为png图片 | ||
13 | |||
14 | ## 用法 | ||
15 | - python3.6+ | ||
16 | - `pip install -r requirements.txt` | ||
17 | - `python pdf_to_img.py pdf_path [img_path]` | ||
18 | |||
19 | | 参数 | 是否必须 | 说明 | 缺省值 | | ||
20 | | ---- | ---- | ---- | ---- | | ||
21 | | pdf_path | 是 | PDF文件或目录路径 | - | | ||
22 | | img_path | 否 | 图片保存路径 | PDF文件路径 | | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
pdf_to_img.py
0 → 100644
1 | import os | ||
2 | import sys | ||
3 | import fitz | ||
4 | from PIL import Image | ||
5 | from io import BytesIO | ||
6 | |||
# Start-up guards, executed at import time.
#
# The interpreter check compares the full (major, minor) version so that it
# matches the "python3.6" requirement stated in the error message; checking
# only the major version let Python 3.0-3.5 through.
if sys.version_info < (3, 6):
    raise Exception("This program requires at least python3.6")
# The first command-line argument (PDF file or directory) is mandatory.
# Exit with a non-zero status so that calling scripts can detect the failure.
if len(sys.argv) < 2:
    print('用法:python pdf_to_img.py PDF文件或目录路径 [图片保存路径]')
    sys.exit(1)
# Refuse to continue when the given PDF file/directory does not exist.
if not os.path.exists(sys.argv[1]):
    print('PDF文件或目录不存在: {0}'.format(sys.argv[1]))
    sys.exit(1)
15 | |||
# Prefix used by the status lines printed for each processed PDF.
LOG_BASE = '[pdf to img]'

# Parameters for rendering a whole page to PNG.
ZOOM_X = ZOOM_Y = 2.0
# Zoom factor 2 in each dimension, no rotation.  The vertical factor now
# really uses ZOOM_Y (it used ZOOM_X twice), so the two can be tuned apart.
trans = fitz.Matrix(ZOOM_X, ZOOM_Y).preRotate(0)

# Image filters that need special handling: a page containing an image with
# one of these filters is rendered as a whole instead of being extracted.
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}

# (width, height) threshold pairs, in pixels.
WH_COUPLE_1 = (500, 500)  # below both values an image counts as "small"
WH_COUPLE_2 = (700, 647)  # at/above both values an image is "large": no fragment merging
WH_COUPLE_3 = (100, 100)  # minimum size when plainly extracting images of a multi-image page
WH_COUPLE_4 = (100, 300)  # minimum size of a fragment group made of a single image
WH_COUPLE_5 = (100, 200)  # minimum size of a stitched fragment group
31 | |||
32 | |||
class PDFHandler:
    """Extract the images of one PDF file into a per-document directory.

    Output files are written to
    ``<target_path>/<PDF base name without extension>/page_<pno>_img_<index>.<ext>``.
    Each page is handled according to the number of image objects it holds:

    * none: the whole page is rendered to PNG;
    * one: a large image is extracted as-is, while a small one (for example
      the stamp on an electronic bill) causes the whole page to be rendered;
    * several: see :meth:`merge_il`.

    Image entries are the tuples returned by ``getPageImageList``.  This
    class unpacks them as exactly nine fields:
    ``(xref, smask, width, height, bpc, colorspace, alt_colorspace, name, filter)``.
    """

    def __init__(self, path, target_path):
        """Remember the PDF ``path`` and derive the image output directory.

        :param path: path of the PDF file to process.
        :param target_path: directory under which a sub-directory named
            after the PDF (without extension) receives the images.
        """
        self.path = path
        pdf_name = os.path.splitext(os.path.basename(path))[0]
        self.img_dir_path = os.path.join(target_path, pdf_name)
        # xrefs of images already written by extract_single_image; used to
        # avoid extracting the same image object twice.
        self.xref_set = set()

    def get_img_save_path(self, pno, img_index=0, ext='png'):
        """Return the output file path for image ``img_index`` of page ``pno``."""
        file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
        return os.path.join(self.img_dir_path, file_name)

    def page_to_png(self, page):
        """Render the whole ``page`` (module-level zoom matrix, no alpha) to PNG."""
        pm = page.getPixmap(matrix=trans, alpha=False)
        pm.writePNG(self.get_img_save_path(page.number))

    @staticmethod
    def getimage(pix):
        """Return ``pix`` in a form that can be written as PNG.

        Pixmaps whose colorspace has four components (CMYK) are converted to
        RGB; every other pixmap is returned unchanged.  A pixmap without a
        colorspace is also returned unchanged instead of raising
        ``AttributeError``.
        """
        colorspace = pix.colorspace
        if colorspace is None or colorspace.n != 4:
            return pix
        return fitz.Pixmap(fitz.csRGB, pix)

    def recover_pix(self, doc, xref, smask, colorspace):
        """Load image ``xref`` of ``doc`` in a directly savable form.

        :param doc: the open document.
        :param xref: xref of the image object.
        :param smask: xref of the image's /SMask entry, ``0`` when absent.
        :param colorspace: colorspace name of the image entry.
        :return: a pixmap, or - for plain images - the dict returned by
            ``doc.extractImage`` (read through its "ext" and "image" keys).
        """
        if smask != 0:
            # The alpha channel has to be rebuilt from the soft mask.
            pix1 = fitz.Pixmap(doc, xref)
            pix2 = fitz.Pixmap(doc, smask)  # pixmap of the /SMask entry

            # Sanity check: same rectangle, neither pixmap has alpha, and
            # the mask has a single component.  Otherwise ignore the mask.
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                return self.getimage(pix1)

            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat the mask samples as alpha values
            return self.getimage(pix)
        elif colorspace in {'Separation', 'DeviceCMYK'}:
            pix = fitz.Pixmap(doc, xref)
            return fitz.Pixmap(fitz.csRGB, pix)
        else:
            return doc.extractImage(xref)

    @staticmethod
    def get_img_data(pix):
        """Return ``(ext, data)`` for a result of :meth:`recover_pix`.

        A dict is a raw extracted image and provides its own extension and
        bytes; anything else is treated as a pixmap and encoded as PNG.
        """
        if isinstance(pix, dict):  # raw image
            return pix["ext"], pix["image"]
        return 'png', pix.getPNGData()  # pixmap

    def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
        """Write image ``xref`` to disk and record it as extracted."""
        pix = self.recover_pix(pdf, xref, smask, colorspace)
        ext, img_data = self.get_img_data(pix)
        img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
        with open(img_save_path, "wb") as f:
            f.write(img_data)
        self.xref_set.add(xref)

    @staticmethod
    def split_il(il):
        """Group the image entries of a page into runs of fragments.

        :param il: image entries of one page (callers pass them sorted by xref).
        :return: one of three values:

            * ``True`` - an entry uses a filter from ``ADOBE_FILTER_SET``;
              the caller should render the whole page;
            * ``None`` - an entry reaches both ``WH_COUPLE_2`` thresholds;
              the caller should extract images individually, no merging;
            * a list of groups (slices of ``il``), split where the width
              or the height changes between consecutive entries.
        """
        # An image with a special filter: the whole page gets rendered.
        if any(img[-1] in ADOBE_FILTER_SET for img in il):
            return True

        broken_il = []
        start = 0  # index of the first entry of the group being built
        last = len(il) - 1
        for i, img in enumerate(il):
            # A large enough image: give up grouping altogether.
            if img[2] >= WH_COUPLE_2[0] and img[3] >= WH_COUPLE_2[1]:
                return None
            if i == start:
                # First entry of a group: nothing to compare with yet.
                if i == last:
                    broken_il.append(il[start:])
                continue
            prev = il[i - 1]
            if i == last:
                # Last entry: close the open group, putting the entry in a
                # group of its own when its width differs.
                if img[2] == prev[2]:
                    broken_il.append(il[start:])
                else:
                    broken_il.append(il[start:i])
                    broken_il.append(il[i:])
                continue
            if img[2] != prev[2]:
                # Width changed: this entry starts a new group.
                broken_il.append(il[start:i])
                start = i
            elif img[3] != prev[3]:
                # Same width but height changed: this entry ends its group.
                broken_il.append(il[start:i + 1])
                start = i + 1
        # Grouping result.
        return broken_il

    def _stitch_tiles(self, pdf, pno, img_index, tiles):
        """Stack ``tiles`` top to bottom into one image and save it.

        The group is skipped when the stitched result would be smaller than
        ``WH_COUPLE_5`` or unusually large (width over 1000 and height more
        than three times the width).

        :return: ``True`` when an image was written, ``False`` when skipped.
        """
        total_height = sum(tile[3] for tile in tiles)
        width = tiles[0][2]
        if width < WH_COUPLE_5[0] or total_height < WH_COUPLE_5[1] or \
                (width > 1000 and total_height > width * 3):
            return False

        parts = []
        for xref, smask, _, height, _, colorspace, _, _, _ in tiles:
            pix = self.recover_pix(pdf, xref, smask, colorspace)
            ext, img_data = self.get_img_data(pix)
            parts.append((height, Image.open(BytesIO(img_data)), ext))

        # Mode and file extension are taken from the first tile.
        canvas = Image.new(parts[0][1].mode, (width, total_height))
        offset = 0
        for height, part, _ in parts:
            canvas.paste(part, box=(0, offset))
            offset += height
        canvas.save(self.get_img_save_path(pno, img_index, parts[0][2]))
        return True

    def merge_il(self, pdf, pno, il):
        """Handle a page that holds more than one image object.

        Depending on the result of :meth:`split_il`, images are extracted
        one by one, fragment groups are stitched vertically, or - when
        nothing was saved that way - the whole page is rendered to PNG.
        """
        # Work on a sorted copy (by xref) so the caller's list is untouched.
        il = sorted(il, key=lambda img: img[0])
        broken_il = self.split_il(il)
        print('broken_il: {0}'.format(broken_il))

        page_to_png = True
        # 3.1 A large image is present: plain extraction, no merging.
        if broken_il is None:
            page_to_png = False
            for img_index, img in enumerate(il):
                xref, smask, width, height, _, colorspace, _, _, _ = img
                if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]:
                    continue  # skip small images (e.g. QR codes)
                elif xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
        # 3.2 Merge fragments group by group (at most two groups).
        elif isinstance(broken_il, list) and len(broken_il) <= 2:
            for img_index, img_il in enumerate(broken_il):
                # 3.2.1 Group of one image: skip it or extract it directly.
                if len(img_il) == 1:
                    xref, smask, width, height, _, colorspace, _, _, _ = img_il[0]
                    # Skip small images (e.g. QR codes).
                    if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \
                            (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]):
                        continue
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
                        # NOTE(review): the original indentation of this
                        # assignment was not recoverable; it is assumed to
                        # belong to the extraction branch - confirm.
                        page_to_png = False
                # 3.2.2 Several fragments: stitch them vertically.
                elif self._stitch_tiles(pdf, pno, img_index, img_il):
                    page_to_png = False

        # 3.3 More than two groups, everything skipped, or a special filter:
        # fall back to rendering the whole page.
        if page_to_png:
            self.page_to_png(pdf.loadPage(pno))

    def extract_image(self):
        """Process every page of the PDF and write the resulting images."""
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            print('++++++++++' * 5)
            print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata))
            for pno in range(pdf.pageCount):
                il = pdf.getPageImageList(pno)  # image objects of the page
                print('---------- page: {0} ----------'.format(pno))
                print('img_object_list: {0}'.format(il))

                # 1. No image object: render the whole page.
                if not il:
                    self.page_to_png(pdf.loadPage(pno))
                # 2. Exactly one image object.
                elif len(il) == 1:
                    xref, smask, width, height, _, colorspace, _, _, _ = il[0]
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        # Small image (e.g. a stamp): render the whole page.
                        self.page_to_png(pdf.loadPage(pno))
                    elif xref not in self.xref_set:
                        # Large image not extracted yet: extract it.
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. Several image objects: special handling.
                else:
                    self.merge_il(pdf, pno, il)
232 | |||
def extract_image(pdf_path, target_path):
    """Extract the images of the PDF at ``pdf_path`` below ``target_path``.

    Thin wrapper: builds a ``PDFHandler`` for the file and runs its
    ``extract_image`` method.
    """
    PDFHandler(pdf_path, target_path).extract_image()
236 | |||
237 | |||
def main():
    """Command-line entry point.

    ``sys.argv[1]`` is a PDF file or a directory that is walked recursively;
    the optional ``sys.argv[2]`` is the directory receiving the images.
    Without it, images are stored next to each PDF.  A failure on one PDF
    is reported and does not stop the processing of the others.
    """
    pdf_path = os.path.realpath(sys.argv[1])
    # Explicit image directory, resolved once; None means "next to the PDF".
    explicit_target = os.path.realpath(sys.argv[2]) if len(sys.argv) > 2 else None

    # Directory: process every PDF file found below it.
    if os.path.isdir(pdf_path):
        completed_count = 0
        failed_list = []
        for parent, _dirnames, filenames in os.walk(pdf_path):
            target_path = explicit_target if explicit_target is not None else parent
            for pdf_file in filenames:
                # Case-insensitive match on the real ".pdf" extension.
                if not pdf_file.lower().endswith('.pdf'):
                    continue
                pdf_file_path = os.path.join(parent, pdf_file)
                try:
                    extract_image(pdf_file_path, target_path)
                except Exception as e:
                    print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path))
                    failed_list.append(pdf_file_path)
                else:
                    # Report the file that was processed, not the walked directory.
                    print('{0} [completed] [pdf_path={1}]'.format(LOG_BASE, pdf_file_path))
                    completed_count += 1
        print('{0} [all completed] [completed_count={1}] [failed_count={2}] [failed_pdf_path={3}]'.format(
            LOG_BASE, completed_count, len(failed_list), failed_list))
    # Single file.
    else:
        target_path = explicit_target if explicit_target is not None else os.path.dirname(pdf_path)
        try:
            extract_image(pdf_path, target_path)
        except Exception as e:
            print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path))
        else:
            print('{0} [completed] [pdf_path={1}]'.format(LOG_BASE, pdf_path))


if __name__ == "__main__":
    main()
requirements.txt
0 → 100644
-
Please register or sign in to post a comment