pdf_to_img.py
3.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import sys
import fitz
import argparse
# Fail fast on unsupported interpreters. Compare the (major, minor) pair so
# that Python 3.0-3.5 is rejected too, matching the 3.6 minimum stated in the
# error message (checking only the major version let those through).
if sys.version_info < (3, 6):
    raise Exception("This program requires at least python3.6")
# Command-line interface: -i/--input is mandatory, -o/--output is optional.
parser = argparse.ArgumentParser(description='PDF转图片')
parser.add_argument(
    '-i', '--input',
    required=True,
    help='PDF文件或目录路径,必要参数',
)
parser.add_argument(
    '-o', '--output',
    help='输出图片保存路径,非必要参数,缺省值为PDF文件路径',
)
# Parsed at module load; main() reads args.input and args.output.
args = parser.parse_args()
# Prefix put in front of every log line this script prints.
LOG_BASE = '[pdf to img]'

# Parameters used when a page is saved as a PNG image:
# zoom factor 2 in each dimension.
ZOOM_X = ZOOM_Y = 2.0

# Pass ZOOM_Y as the vertical factor (ZOOM_X was previously passed twice, so
# changing ZOOM_Y had no effect). The former `.preRotate(0)` call is omitted:
# a rotation by 0 degrees leaves the matrix unchanged, and the camelCase
# method name was deprecated by PyMuPDF in favour of `prerotate`.
trans = fitz.Matrix(ZOOM_X, ZOOM_Y)
class PDFHandler:
    """Render every page of a single PDF file to a PNG image.

    The images are written into a directory named after the PDF (its file
    name without the extension) located under ``target_path``.
    """

    def __init__(self, path, target_path):
        """Store the PDF location and derive the image output directory.

        Args:
            path: Path of the PDF file to convert.
            target_path: Directory under which the per-PDF image directory
                is placed.
        """
        self.path = path
        pdf_stem = os.path.splitext(os.path.basename(path))[0]
        self.img_dir_path = os.path.join(target_path, pdf_stem)
        # Not read by any method of this class; kept so the attribute stays
        # available to existing users of the object.
        self.xref_set = set()

    def get_img_save_path(self, pno, img_index=0, ext='png'):
        """Return the output file path for image ``img_index`` of page ``pno``.

        Args:
            pno: Page number used in the file name.
            img_index: Index of the image within the page (default 0).
            ext: File extension without the leading dot (default ``'png'``).
        """
        file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
        return os.path.join(self.img_dir_path, file_name)

    def page_to_png(self, page):
        """Render ``page`` with the module-level ``trans`` matrix and save it.

        Args:
            page: A PyMuPDF page object; its ``number`` attribute is used in
                the output file name.
        """
        # PyMuPDF renamed its camelCase API to snake_case. Prefer the new
        # names and fall back to the old ones so both generations work.
        render = getattr(page, 'get_pixmap', None) or page.getPixmap
        pm = render(matrix=trans, alpha=False)
        img_save_path = self.get_img_save_path(page.number)
        write = getattr(pm, 'save', None) or pm.writePNG
        write(img_save_path)

    def extract_image(self):
        """Create the output directory and save every page of the PDF as PNG.

        Prints a start line (including the document metadata) and an end
        line. Exceptions raised while opening or rendering propagate to the
        caller.
        """
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            print('++++++++++' * 5)
            print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata))
            # len(pdf) / pdf[pno] are spelled the same in old and new PyMuPDF
            # releases, unlike pageCount/page_count and loadPage/load_page.
            for pno in range(len(pdf)):
                self.page_to_png(pdf[pno])
            print('{0} [end] [pdf_path={1}] [img_save_path={2}]'.format(LOG_BASE, self.path, self.img_dir_path))
def extract_image(pdf_path, target_path):
    """Convert the PDF at ``pdf_path`` to PNG images under ``target_path``.

    Builds a PDFHandler for the file and runs its extract_image() method.
    """
    PDFHandler(pdf_path, target_path).extract_image()
def main():
    """Convert the PDF file, or every PDF below the directory, given by --input.

    Reads the module-level ``args``. When --output is omitted, images are
    saved next to each PDF. Failures are reported on stdout and do not stop
    the processing of the remaining files.
    """
    if not os.path.exists(args.input):
        print('PDF文件或目录不存在: {0}'.format(args.input))
        return

    pdf_path = os.path.realpath(args.input)
    # Resolve the optional output directory once instead of once per folder;
    # None means "save next to each PDF".
    output_root = os.path.realpath(args.output) if args.output else None

    # Directory: walk it and process every PDF file found.
    if os.path.isdir(pdf_path):
        completed_count = 0
        failed_list = []
        for parent, _dirnames, filenames in os.walk(pdf_path):
            # Directory the images are saved under.
            target_path = output_root if output_root is not None else parent
            for pdf_file in filenames:
                # Match the real '.pdf' extension in any letter case; the
                # previous suffix test accepted names such as 'notapdf' and
                # skipped '.Pdf'.
                if not pdf_file.lower().endswith('.pdf'):
                    continue
                pdf_file_path = os.path.join(parent, pdf_file)
                try:
                    extract_image(pdf_file_path, target_path)
                except Exception as e:
                    # Top-level boundary: record the failure and carry on.
                    print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path))
                    failed_list.append(pdf_file_path)
                else:
                    # Report the file just processed, not the walked directory.
                    print('{0} [completed] [pdf_path={1}]'.format(LOG_BASE, pdf_file_path))
                    completed_count += 1
        print('{0} [all completed] [completed_count={1}] [failed_count={2}] [failed_pdf_path={3}]'.format(
            LOG_BASE, completed_count, len(failed_list), failed_list))
    # File: process the single PDF.
    else:
        # Directory the images are saved under.
        target_path = output_root if output_root is not None else os.path.dirname(pdf_path)
        try:
            extract_image(pdf_path, target_path)
        except Exception as e:
            print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path))
        else:
            print('{0} [completed] [pdf_path={1}]'.format(LOG_BASE, pdf_path))
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()