f4fffb77 by Gruel

first commit

0 parents
.idea/
\ No newline at end of file
# PDF转图片脚本
## 主要处理逻辑
- 提取PDF页面中的图片对象
- 图片对象数目为0(如电子账单),保存整个页面为png图片
- 图片对象数目为1
  - 大图,保存图片对象
  - 小图(如电子账单盖章),保存整个页面为png图片
- 图片对象数目大于1
  - 多大图,保存图片对象
  - 多碎图,根据宽高突变位置分组,拼接合并后保存
- 其他特殊情况:保存整个页面为png图片
## 用法
- python3.6+
- `pip install -r requirements`
- `python pdf_to_img.py pdf_path [img_path]`

| 参数 | 是否必须 | 说明 | 缺省值 |
| ---- | ---- | ---- | ---- |
| pdf_path | 是 | PDF文件或目录路径 | - |
| img_path | 否 | 图片保存路径 | PDF文件路径 |
\ No newline at end of file
import os
import sys
import fitz
from PIL import Image
from io import BytesIO
# Refuse to run on interpreters older than the documented minimum (3.6).
# The full (major, minor) pair is compared so that 3.0-3.5 are rejected too.
if sys.version_info < (3, 6):
    raise Exception("This program requires at least python3.6")
# The PDF file/directory argument is mandatory: print the usage line and stop.
if len(sys.argv) < 2:
    print('用法:python pdf_to_img.py PDF文件或目录路径 [图片保存路径]')
    sys.exit(0)
# Stop early when the given PDF file or directory does not exist.
if not os.path.exists(sys.argv[1]):
    print('PDF文件或目录不存在: {0}'.format(sys.argv[1]))
    sys.exit(0)
# Prefix of the progress lines printed while processing PDF files.
LOG_BASE = '[pdf to img]'
# Parameters used when a whole page is rendered to a PNG image.
ZOOM_X = ZOOM_Y = 2.0
# Zoom factor 2 in each dimension, no rotation. The vertical factor is taken
# from ZOOM_Y so the two zoom constants can be tuned independently.
trans = fitz.Matrix(ZOOM_X, ZOOM_Y).preRotate(0)
# Image filters that get special handling: a page containing an image object
# with one of these filters is saved as a whole-page PNG instead.
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
# (width, height) threshold pairs.
# Single image on a page, or a lone fragment group: smaller in both
# dimensions counts as a small image.
WH_COUPLE_1 = (500, 500)
# An image at least this large in both dimensions disables fragment merging.
WH_COUPLE_2 = (700, 647)
# Plain extraction of several images: anything smaller in either dimension
# is skipped (e.g. QR codes).
WH_COUPLE_3 = (100, 100)
# Lone fragment group: anything smaller in either dimension is skipped.
WH_COUPLE_4 = (100, 300)
# Stitched fragment group: minimum width and minimum total height.
WH_COUPLE_5 = (100, 200)
class PDFHandler:
    """Save the images of one PDF file into a per-document directory.

    For every page, either the embedded image objects are extracted (and
    fragments stitched together when needed) or the whole page is rendered
    to a PNG file; see ``extract_image`` for the decision rules.
    """

    def __init__(self, path, target_path):
        """Remember the PDF path and derive the output directory.

        :param path: path of the PDF file to process.
        :param target_path: directory under which a sub-directory named after
            the PDF file (without extension) receives the images.
        """
        self.path = path
        self.img_dir_path = os.path.join(target_path, os.path.splitext(os.path.basename(path))[0])
        # xrefs of image objects already written by extract_single_image.
        self.xref_set = set()

    def get_img_save_path(self, pno, img_index=0, ext='png'):
        """Return the output file path for image ``img_index`` of page ``pno``."""
        return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))

    def page_to_png(self, page):
        """Render the whole page with the module-level matrix and save it as PNG."""
        pm = page.getPixmap(matrix=trans, alpha=False)
        img_save_path = self.get_img_save_path(page.number)
        pm.writePNG(img_save_path)

    @staticmethod
    def getimage(pix):
        """Return ``pix`` itself, or an RGB copy when its colorspace has 4 components.

        A pixmap without a colorspace is returned unchanged instead of
        failing on the ``.n`` attribute lookup.
        """
        colorspace = pix.colorspace
        if colorspace is None or colorspace.n != 4:
            return pix
        # 4-component colorspace (presumably CMYK): convert to RGB.
        return fitz.Pixmap(fitz.csRGB, pix)

    def recover_pix(self, doc, xref, smask, colorspace):
        """Load the image object ``xref`` in a form that can be written to disk.

        :param doc: the open document.
        :param xref: xref of the image object.
        :param smask: xref of the image's /SMask entry, 0 when there is none.
        :param colorspace: colorspace name from the page image list.
        :return: a pixmap, or the dict returned by ``doc.extractImage``.
        """
        if smask != 0:
            # We need to reconstruct the alpha channel with the smask.
            pix1 = fitz.Pixmap(doc, xref)
            pix2 = fitz.Pixmap(doc, smask)  # create pixmap of the /SMask entry
            # Sanity check: same rectangle, no alpha on either, 1-component mask.
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                pix2 = None
                return self.getimage(pix1)
            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
            pix1 = pix2 = None  # free temp pixmaps
            return self.getimage(pix)
        if colorspace in {'Separation', 'DeviceCMYK'}:
            pix = fitz.Pixmap(doc, xref)
            return fitz.Pixmap(fitz.csRGB, pix)
        return doc.extractImage(xref)

    @staticmethod
    def get_img_data(pix):
        """Return ``(ext, img_data)`` for a ``recover_pix`` result.

        A dict (raw image) supplies its own ``"ext"`` and ``"image"`` entries;
        anything else is treated as a pixmap and encoded as PNG.
        """
        if isinstance(pix, dict):  # we got a raw image
            return pix["ext"], pix["image"]
        # we got a pixmap
        return 'png', pix.getPNGData()

    def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
        """Write one image object to disk and record its xref in ``xref_set``."""
        pix = self.recover_pix(pdf, xref, smask, colorspace)
        ext, img_data = self.get_img_data(pix)
        img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
        with open(img_save_path, "wb") as f:
            f.write(img_data)
        self.xref_set.add(xref)

    @staticmethod
    def split_il(il):
        """Group the image objects of a page into runs of fragments.

        :param il: page image list; index 2 is the width, index 3 the height
            and the last item the filter name of each entry.
        :return: ``True`` when any entry uses a filter in ``ADOBE_FILTER_SET``
            (the whole page should be rendered), ``None`` when an entry is at
            least ``WH_COUPLE_2`` in both dimensions (plain extraction), else
            the list of groups.
        """
        # An image object with a special filter: render the whole page instead.
        if any(img[-1] in ADOBE_FILTER_SET for img in il):
            return True
        broken_il = []
        start = 0
        length = len(il)
        for i in range(length):
            # An image that is large enough: no fragment merging, plain extraction.
            if il[i][2] >= WH_COUPLE_2[0] and il[i][3] >= WH_COUPLE_2[1]:
                return None
            if i == start:
                # First entry of a group; close it only if it is also the last entry.
                if i == length - 1:
                    broken_il.append(il[start: length])
                continue
            if i == length - 1:
                # Last entry: it joins the open group only when the width matches.
                if il[i][2] == il[i - 1][2]:
                    broken_il.append(il[start: length])
                else:
                    broken_il.append(il[start: i])
                    broken_il.append(il[i: length])
                continue
            if il[i][2] != il[i - 1][2]:
                # Width change: this entry starts a new group.
                broken_il.append(il[start: i])
                start = i
            elif il[i][3] != il[i - 1][3]:
                # Height change: this entry closes the current group.
                broken_il.append(il[start: i + 1])
                start = i + 1
        # Result of the fragment grouping.
        return broken_il

    def merge_il(self, pdf, pno, il):
        """Handle a page that holds more than one image object.

        Depending on ``split_il`` the images are extracted one by one, their
        fragment groups are stitched vertically, or the whole page is rendered.
        ``il`` is sorted in place by xref.
        """
        # Group the fragments before trying to merge them.
        il.sort(key=lambda x: x[0])
        broken_il = self.split_il(il)
        print('broken_il: {0}'.format(broken_il))
        page_to_png = True
        # 3.1 An image is large enough: no fragment merging, plain extraction.
        if broken_il is None:
            page_to_png = False
            for img_index, img in enumerate(il):
                xref, smask, width, height, _, colorspace, _, _, _ = img
                if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]:  # skip small images (e.g. QR codes)
                    continue
                if xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
        # 3.2 Merge the fragments group by group.
        elif isinstance(broken_il, list) and len(broken_il) <= 2:
            for img_index, img_il in enumerate(broken_il):
                # 3.2.1 Group with a single image: skip it or extract it directly.
                if len(img_il) == 1:
                    xref, smask, width, height, _, colorspace, _, _, _ = img_il[0]
                    # Skip small images (e.g. QR codes).
                    if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \
                            (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]):
                        continue
                    if xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
                        page_to_png = False
                # 3.2.2 Several fragments: stitch them vertically.
                else:
                    height_sum = sum(img[3] for img in img_il)
                    width = img_il[0][2]
                    # Skip small results and unusually large ones.
                    if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \
                            (width > 1000 and height_sum > width * 3):
                        continue
                    im_list = []
                    for img in img_il:
                        xref, smask, _, height, _, colorspace, _, _, _ = img
                        pix = self.recover_pix(pdf, xref, smask, colorspace)
                        ext, img_data = self.get_img_data(pix)
                        im = Image.open(BytesIO(img_data))
                        im_list.append((height, im, ext))
                    # Mode and file extension are taken from the first fragment.
                    new_img = Image.new(im_list[0][1].mode, (width, height_sum))
                    h_now = 0
                    for h, m, _ in im_list:
                        new_img.paste(m, box=(0, h_now))
                        h_now += h
                    img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
                    new_img.save(img_save_path)
                    page_to_png = False
        # 3.3 More than 2 groups, everything skipped, or a special filter:
        # save the whole page as a PNG image.
        if page_to_png:
            page = pdf.loadPage(pno)
            self.page_to_png(page)

    def extract_image(self):
        """Process every page of the PDF and write the resulting images.

        Creates the output directory, then per page:
        no image object -> whole page as PNG; one image object -> whole page
        when it is small, otherwise the object itself; several image objects
        -> ``merge_il``.
        """
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            print('++++++++++' * 5)
            print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata))
            for pno in range(pdf.pageCount):
                # Image objects of the page, unpacked below as 9-item entries:
                # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter)
                il = pdf.getPageImageList(pno)
                print('---------- page: {0} ----------'.format(pno))
                print('img_object_list: {0}'.format(il))
                # 1. No image object on the page: save the whole page as PNG.
                if not il:
                    page = pdf.loadPage(pno)
                    self.page_to_png(page)
                # 2. Exactly one image object:
                #    small (e.g. the stamp on an e-bill): save the whole page as PNG
                #    large: extract the image object
                elif len(il) == 1:
                    xref, smask, width, height, _, colorspace, _, _, _ = il[0]
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. More than one image object: special handling.
                else:
                    self.merge_il(pdf, pno, il)
def extract_image(pdf_path, target_path):
    """Run a PDFHandler over ``pdf_path``, saving its images under ``target_path``."""
    PDFHandler(pdf_path, target_path).extract_image()
def main():
    """Command-line entry point: convert one PDF file or every PDF in a directory.

    ``sys.argv[1]`` is the PDF file or directory; the optional ``sys.argv[2]``
    is the directory that receives the images. Without it, images are saved
    next to each PDF file. Failures are printed, not raised.
    """
    pdf_path = os.path.realpath(sys.argv[1])
    # The optional output directory is the same for every file: resolve it once.
    explicit_target = os.path.realpath(sys.argv[2]) if len(sys.argv) > 2 else None
    # Directory: walk it and process every PDF file.
    if os.path.isdir(pdf_path):
        completed_count = 0
        failed_list = []
        for parent, _dirnames, filenames in os.walk(pdf_path):
            # Directory the images are saved to.
            target_path = explicit_target if explicit_target is not None else parent
            for pdf_file in filenames:
                # Require a real ".pdf" extension, in any letter case.
                if not pdf_file.lower().endswith('.pdf'):
                    continue
                pdf_file_path = os.path.join(parent, pdf_file)
                try:
                    extract_image(pdf_file_path, target_path)
                except Exception as e:
                    print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path))
                    failed_list.append(pdf_file_path)
                else:
                    # Report the file that was just processed, not the walked root.
                    print('{0} [completed] [pdf_path={1}]'.format(LOG_BASE, pdf_file_path))
                    completed_count += 1
        print('{0} [all completed] [completed_count={1}] [failed_count={2}] [failed_pdf_path={3}]'.format(
            LOG_BASE, completed_count, len(failed_list), failed_list))
    # File: process this single PDF.
    else:
        # Directory the images are saved to.
        target_path = explicit_target if explicit_target is not None else os.path.dirname(pdf_path)
        try:
            extract_image(pdf_path, target_path)
        except Exception as e:
            print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path))
        else:
            print('{0} [completed] [pdf_path={1}]'.format(LOG_BASE, pdf_path))


if __name__ == "__main__":
    main()
Pillow==7.2.0
PyMuPDF==1.17.0
\ No newline at end of file
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!