94794bd5 by 周伟奇

prune extract model

1 parent ff70b617
# PDF转图片脚本
## 2种转化方式
## 转化方式
- 保存整个页面为png图片
- 提取PDF页面中的图片对象
  - 图片对象数目为0(如电子账单),保存整个页面为png图片
  - 图片对象数目为1
    - 大图,保存图片对象
    - 小图(如电子账单盖章),保存整个页面为png图片
  - 图片对象数目大于1
    - 多整图,保存图片对象
    - 多碎图,根据宽高突变位置分组,拼接合并后保存
  - 其他特殊情况:保存整个页面为png图片
## 已知问题
- 提取图片对象方式下,整图与碎图通过宽高阈值区分,无法满足所有PDF。个别PDF中,整图很小时会被当做碎图合并,碎图很大时会被当做整图不合并
## 用法
- python3.6+
- `pip install -r requirements.txt`
- `python pdf_to_img.py [-h] -i INPUT [-o OUTPUT] [-e]`
- `python pdf_to_img.py [-h] -i INPUT [-o OUTPUT]`
```
可选参数:
-h, --help 查看帮助信息并退出
-i INPUT, --input INPUT PDF文件或目录路径,必要参数
-o OUTPUT, --output OUTPUT 输出图片保存路径,非必要参数,缺省值为PDF文件路径
-e, --extract 默认采用整个页面保存png图片的方式,增加该选项选择提取图片方式转化图片
```
\ No newline at end of file
......
......@@ -2,8 +2,6 @@ import os
import sys
import fitz
import argparse
from PIL import Image
from io import BytesIO
# Fail fast on interpreters older than the version named in the message.
# Compare (major, minor) rather than the major number alone, so that
# Python 3.0-3.5 is rejected as well.
if sys.version_info < (3, 6):
    raise Exception("This program requires at least python3.6")
......@@ -11,7 +9,6 @@ if sys.version_info[0] < 3:
# Command-line interface: -i is mandatory, -o and -e are optional.
# Arguments are parsed at import time and exposed through the global `args`.
parser = argparse.ArgumentParser(description='PDF转图片')
parser.add_argument(
    '-i', '--input',
    required=True,
    help='PDF文件或目录路径,必要参数',
)
parser.add_argument(
    '-o', '--output',
    help='输出图片保存路径,非必要参数,缺省值为PDF文件路径',
)
parser.add_argument(
    '-e', '--extract',
    action="store_true",
    help='默认采用整个页面保存png图片的方式,增加该选项选择提取图片方式转化图片',
)
args = parser.parse_args()

# Prefix placed at the start of the log lines printed by this script.
LOG_BASE = '[pdf to img]'
......@@ -20,16 +17,6 @@ LOG_BASE = '[pdf to img]'
# Zoom factors for the horizontal and vertical axes (2.0 in each dimension).
ZOOM_X = ZOOM_Y = 2.0
# Matrix built from both zoom factors with a 0-degree rotation. The second
# argument is ZOOM_Y (not ZOOM_X again) so the two constants can be tuned
# independently.
trans = fitz.Matrix(ZOOM_X, ZOOM_Y).preRotate(0)
# Filters that get special handling: split_il returns True as soon as one
# image entry's last field is in this set, and merge_il then saves the whole
# page as a PNG instead of extracting image objects.
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
# (width, height) threshold pairs, in the units reported by the page image list.
# WH_COUPLE_1: an image narrower AND shorter than this is treated as "small"
# (single-image pages and single-fragment groups).
WH_COUPLE_1 = (500, 500)
# WH_COUPLE_2: an image at least this wide AND this tall is "large enough";
# split_il then stops grouping so images are extracted individually.
WH_COUPLE_2 = (700, 647)
# WH_COUPLE_3: in the plain-extraction branch of merge_il, images narrower OR
# shorter than this are skipped (e.g. QR codes).
WH_COUPLE_3 = (100, 100)
# WH_COUPLE_4: a group holding a single fragment is skipped when it is
# narrower OR shorter than this.
WH_COUPLE_4 = (100, 300)
# WH_COUPLE_5: a multi-fragment group is skipped when its width OR its summed
# height is below this.
WH_COUPLE_5 = (100, 200)
class PDFHandler:
......@@ -46,194 +33,20 @@ class PDFHandler:
img_save_path = self.get_img_save_path(page.number)
pm.writePNG(img_save_path)
@staticmethod
def getimage(pix):
    """Return ``pix`` itself, or an RGB copy when its colorspace has 4 components.

    Args:
        pix: object exposing ``colorspace.n`` (presumably a ``fitz.Pixmap``;
            confirm against callers).

    Returns:
        ``pix`` unchanged when ``pix.colorspace.n`` is not 4; otherwise a new
        ``fitz.Pixmap`` created from ``fitz.csRGB`` and ``pix``.
    """
    if pix.colorspace.n == 4:
        # Four-component pixel data (presumably CMYK) is converted to RGB.
        return fitz.Pixmap(fitz.csRGB, pix)
    return pix
def recover_pix(self, doc, xref, smask, colorspace):
    """Load the image stored at ``xref`` in a form that can be written out.

    Args:
        doc: document object passed to ``fitz.Pixmap`` and providing
            ``extractImage``.
        xref: cross-reference number of the image.
        smask: cross-reference number of the image's soft mask, ``0`` if none.
        colorspace: colorspace name of the image.

    Returns:
        * ``smask != 0``: a pixmap, with the mask applied as alpha channel
          when the mask passes the sanity check, routed through
          ``self.getimage``.
        * ``colorspace`` is ``'Separation'`` or ``'DeviceCMYK'``: a pixmap
          converted with ``fitz.csRGB``.
        * otherwise: whatever ``doc.extractImage(xref)`` returns.
    """
    if smask == 0:
        if colorspace in {'Separation', 'DeviceCMYK'}:
            source = fitz.Pixmap(doc, xref)
            return fitz.Pixmap(fitz.csRGB, source)
        return doc.extractImage(xref)

    # A soft mask exists: rebuild the alpha channel from it.
    base = fitz.Pixmap(doc, xref)
    mask = fitz.Pixmap(doc, smask)  # pixmap of the /SMask entry

    # Sanity check: same rectangle, neither pixmap has alpha, mask has one component.
    mask_usable = (
        base.irect == mask.irect
        and base.alpha == mask.alpha == 0
        and mask.n == 1
    )
    if not mask_usable:
        del mask  # drop the temporary mask pixmap before converting
        return self.getimage(base)

    with_alpha = fitz.Pixmap(base)  # copy of the base pixmap that receives the alpha channel
    with_alpha.setAlpha(mask.samples)  # mask samples are used as alpha values
    del base, mask  # drop the temporary pixmaps before converting
    return self.getimage(with_alpha)
@staticmethod
def get_img_data(pix):
    """Return ``(extension, image_bytes)`` for a raw image dict or a pixmap.

    Args:
        pix: either a mapping with ``"ext"`` and ``"image"`` keys (a raw
            image), or an object providing ``getPNGData()`` (a pixmap).

    Returns:
        ``(pix["ext"], pix["image"])`` for a dict (including dict
        subclasses), otherwise ``('png', pix.getPNGData())``.
    """
    # isinstance rather than an exact type comparison, so dict subclasses
    # are also treated as raw images.
    if isinstance(pix, dict):
        return pix["ext"], pix["image"]
    return 'png', pix.getPNGData()
def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
    """Write one image object of page ``pno`` to disk and remember its xref.

    Args:
        pdf: document forwarded to ``self.recover_pix``.
        xref: cross-reference number of the image.
        smask: cross-reference number of the soft mask (``0`` if none).
        colorspace: colorspace name of the image.
        pno: page number, used to build the output path.
        img_index: index of the image on the page, used to build the output path.
    """
    recovered = self.recover_pix(pdf, xref, smask, colorspace)
    extension, payload = self.get_img_data(recovered)
    destination = self.get_img_save_path(pno, img_index=img_index, ext=extension)
    with open(destination, "wb") as out_file:
        out_file.write(payload)
    # Record the xref so callers can skip images that were already written.
    self.xref_set.add(xref)
@staticmethod
def split_il(il):
    """Group a page's image entries into runs of fragments to be merged.

    NOTE(review): indentation was lost in the source this was recovered
    from. The nesting below (both ``for ... else`` clauses and the placement
    of each ``continue``) is reconstructed from how ``merge_il`` consumes
    the result; confirm against the original file.

    Args:
        il: sequence of image entries. Index 2 is read as the width, index 3
            as the height, and the last element is tested against
            ``ADOBE_FILTER_SET``.

    Returns:
        * ``True`` if any entry's last element is in ``ADOBE_FILTER_SET``.
        * ``None`` if, while grouping, an entry is at least
          ``WH_COUPLE_2[0]`` wide and ``WH_COUPLE_2[1]`` tall.
        * otherwise a list of groups, each a slice of ``il``.
    """
    broken_il = []
    start = 0  # index of the first entry of the group currently being built
    length = len(il)
    page_to_png = None
    for i in range(length):
        # If any image object uses one of the special filters, give up on
        # extraction: the caller saves the whole page as a PNG.
        if il[i][-1] in ADOBE_FILTER_SET:
            page_to_png = True
            break
    else:
        # No special filter found: try to group the fragments.
        for i in range(length):
            # An image object that is large enough is not a fragment: stop
            # grouping so the images are simply extracted one by one.
            if il[i][2] >= WH_COUPLE_2[0] and il[i][3] >= WH_COUPLE_2[1]:
                break
            if i == start:
                # First entry of a group; if it is also the last entry
                # overall, it forms a group of its own.
                if i == length - 1:
                    broken_il.append(il[start: length])
                continue
            elif i == length - 1:
                # Last entry: same width as its predecessor -> it joins the
                # open group; otherwise the open group is closed and the
                # last entry becomes a separate group.
                if il[i][2] == il[i - 1][2]:
                    broken_il.append(il[start: length])
                else:
                    broken_il.append(il[start: i])
                    broken_il.append(il[i: length])
                continue
            if il[i][2] != il[i - 1][2]:
                # Width changed: close the group before this entry and start
                # a new group at this entry.
                broken_il.append(il[start: i])
                start = i
            elif il[i][3] != il[i - 1][3]:
                # Height changed: this entry closes the current group and the
                # next entry starts a new one.
                broken_il.append(il[start: i + 1])
                start = i + 1
        else:
            # Grouping finished without meeting a large image: return the groups.
            return broken_il
    return page_to_png
def merge_il(self, pdf, pno, il):
    """Handle a page that carries more than one image object.

    Depending on the result of ``self.split_il`` the images are extracted
    individually, stitched together group by group, or the whole page is
    saved as a PNG.

    NOTE(review): indentation was lost in the source this was recovered
    from; the nesting below is reconstructed and should be confirmed
    against the original file.

    Args:
        pdf: open document; passed on to the extraction helpers and used to
            load the page for the whole-page fallback.
        pno: page number.
        il: list of the page's image entries; it is sorted in place by the
            first element of each entry.
    """
    # Sort by the first field (xref) before grouping the fragments.
    il.sort(key=lambda x: x[0])
    broken_il = self.split_il(il)
    print('broken_il: {0}'.format(broken_il))
    # Stays True unless at least one branch below produced (or deliberately
    # skipped) output for this page.
    page_to_png = True
    # 3.1 An image object was large enough: no fragment merging, plain extraction.
    if broken_il is None:
        page_to_png = False
        for img_index, img in enumerate(il):
            xref, smask, width, height, _, colorspace, _, _, adobe_filter = img
            if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]:  # skip small images (e.g. QR codes)
                continue
            elif xref not in self.xref_set:
                self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
    # 3.2 Fragments are merged group by group (only when there are at most 2 groups).
    elif isinstance(broken_il, list) and len(broken_il) <= 2:
        for img_index, img_il in enumerate(broken_il):
            # 3.2.1 Group with a single fragment: skip it or extract it directly.
            if len(img_il) == 1:
                xref, smask, width, height, _, colorspace, _, _, adobe_filter = img_il[0]
                # Skip small images (e.g. QR codes).
                if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \
                        (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]):
                    continue
                elif xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
                    # NOTE(review): this assignment is assumed to belong to
                    # the elif branch; the lost indentation makes its
                    # placement ambiguous. Confirm.
                    page_to_png = False
            # 3.2.2 Group with several fragments: stack them vertically.
            else:
                height_sum = sum([img[3] for img in img_il])
                width = img_il[0][2]
                # Skip groups that are too small, and unusually large ones
                # (wider than 1000 with a summed height over 3x the width).
                if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \
                        (width > 1000 and height_sum > width * 3):
                    continue
                im_list = []
                for img in img_il:
                    xref, smask, _, height, _, colorspace, _, _, adobe_filter = img
                    pix = self.recover_pix(pdf, xref, smask, colorspace)
                    ext, img_data = self.get_img_data(pix)
                    im = Image.open(BytesIO(img_data))
                    im_list.append((height, im, ext))
                # The canvas takes the mode of the first fragment and the
                # summed height of the group.
                new_img = Image.new(im_list[0][1].mode, (width, height_sum))
                h_now = 0  # vertical offset at which the next fragment is pasted
                for h, m, _ in im_list:
                    new_img.paste(m, box=(0, h_now))
                    h_now += h
                # The output extension is taken from the first fragment.
                img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
                new_img.save(img_save_path)
                page_to_png = False
    # 3.3 More than 2 groups, everything filtered out, or a special filter
    # present: fall back to saving the whole page as a PNG.
    if page_to_png:
        page = pdf.loadPage(pno)
        self.page_to_png(page)
def extract_image(self, is_extract=False):
    """Convert every page of the PDF at ``self.path`` into image files.

    Both ``extract_image()`` and ``extract_image(is_extract)`` are accepted.

    Args:
        is_extract: when false (the default) the image list of every page
            is treated as empty, so each page is saved as a whole-page PNG.
            When true, the page's image objects are inspected and extracted
            where possible.
    """
    os.makedirs(self.img_dir_path, exist_ok=True)
    with fitz.Document(self.path) as pdf:
        print('++++++++++' * 5)
        print('{0} [start] [pdf_path={1}] [metadata={2}]'.format(LOG_BASE, self.path, pdf.metadata))
        for pno in range(pdf.pageCount):
            # Image objects of the page. Each entry is unpacked below as
            # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter).
            # NOTE(review): the original comment also listed a trailing
            # `invoker` field, but the code unpacks exactly nine values.
            il = pdf.getPageImageList(pno) if is_extract else []
            print('---------- page: {0} ----------'.format(pno))
            print('img_object_list: {0}'.format(il))

            if not il:
                # 1. No image objects: save the whole page as a PNG.
                page = pdf.loadPage(pno)
                self.page_to_png(page)
            elif len(il) == 1:
                # 2. Exactly one image object.
                xref, smask, width, height, _, colorspace, _, _, _ = il[0]
                if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                    # Small image (e.g. a stamp on an electronic bill):
                    # save the whole page as a PNG.
                    page = pdf.loadPage(pno)
                    self.page_to_png(page)
                elif xref not in self.xref_set:
                    # Large image not written before: extract the object.
                    self.extract_single_image(pdf, xref, smask, colorspace, pno)
            else:
                # 3. Several image objects: delegate to merge_il.
                self.merge_il(pdf, pno, il)
    print('{0} [end] [pdf_path={1}] [img_save_path={2}]'.format(LOG_BASE, self.path, self.img_dir_path))
def extract_image(pdf_path, target_path, is_extract=False):
    """Convert one PDF file into images through a ``PDFHandler``.

    Both ``extract_image(pdf_path, target_path)`` and
    ``extract_image(pdf_path, target_path, is_extract)`` are accepted.

    Args:
        pdf_path: path of the PDF file, passed to ``PDFHandler``.
        target_path: output location, passed to ``PDFHandler``.
        is_extract: forwarded to ``PDFHandler.extract_image``; defaults to
            ``False``.
    """
    pdf_handler = PDFHandler(pdf_path, target_path)
    pdf_handler.extract_image(is_extract)
def main():
......@@ -253,7 +66,7 @@ def main():
continue
pdf_file_path = os.path.join(parent, pdf_file)
try:
extract_image(pdf_file_path, target_path, args.extract)
extract_image(pdf_file_path, target_path)
except Exception as e:
print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_file_path))
failed_list.append(pdf_file_path)
......@@ -267,7 +80,7 @@ def main():
# 图片保存目录
target_path = os.path.realpath(args.output) if args.output else os.path.dirname(pdf_path)
try:
extract_image(pdf_path, target_path, args.extract)
extract_image(pdf_path, target_path)
except Exception as e:
print('{0} [failed] [err={1}] [pdf_path={2}]'.format(LOG_BASE, e, pdf_path))
else:
......
Pillow==7.2.0
PyMuPDF==1.17.0
\ No newline at end of file
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!