Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
pdf_to_img
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
94794bd5
authored
2020-08-06 15:21:27 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
prune extract model
1 parent
ff70b617
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
10 additions
and
211 deletions
README.md
pdf_to_img.py
requirements.txt
README.md
View file @
94794bd
# PDF转图片脚本
##
2种
转化方式
## 转化方式
-
保存整个页面为png图片
-
提取PDF页面中的图片对象
-
图片对象数目为0(如电子账单),保存整个页面为png图片
-
图片对象数目为1
-
大图,保存图片对象
-
小图(如电子账单盖章),保存整个页面为png图片
-
图片对象数目大于1
-
多整图,保存图片对象
-
多碎图,根据宽高突变位置分组,拼接合并后保存
-
其他特殊情况:保存整个页面为png图片
## 已知问题
-
提取图片对象方式下,整图与碎图通过宽高阈值区分,无法满足所有PDF。个别PDF中,整图很小时会被当做碎图合并,碎图很大时会被当做整图不合并
## 用法
-
python3.6+
-
`pip install -r requirements.txt`
-
`python pdf_to_img.py [-h] -i INPUT [-o OUTPUT]
[-e]
`
-
`python pdf_to_img.py [-h] -i INPUT [-o OUTPUT]`
```
可选参数:
-h, --help 查看帮助信息并退出
-i INPUT, --input INPUT PDF文件或目录路径,必要参数
-o OUTPUT, --output OUTPUT 输出图片保存路径,非必要参数,缺省值为PDF文件路径
-e, --extract 默认采用整个页面保存png图片的方式,增加该选项选择提取图片方式转化图片
```
\ No newline at end of file
...
...
pdf_to_img.py
View file @
94794bd
...
...
@@ -2,8 +2,6 @@ import os
import
sys
import
fitz
import
argparse
from
PIL
import
Image
from
io
import
BytesIO
if
sys
.
version_info
[
0
]
<
3
:
raise
Exception
(
"This program requires at least python3.6"
)
...
...
@@ -11,7 +9,6 @@ if sys.version_info[0] < 3:
parser
=
argparse
.
ArgumentParser
(
description
=
'PDF转图片'
)
parser
.
add_argument
(
'-i'
,
'--input'
,
help
=
'PDF文件或目录路径,必要参数'
,
required
=
True
)
parser
.
add_argument
(
'-o'
,
'--output'
,
help
=
'输出图片保存路径,非必要参数,缺省值为PDF文件路径'
)
parser
.
add_argument
(
'-e'
,
'--extract'
,
help
=
'默认采用整个页面保存png图片的方式,增加该选项选择提取图片方式转化图片'
,
action
=
"store_true"
)
args
=
parser
.
parse_args
()
LOG_BASE
=
'[pdf to img]'
...
...
@@ -20,16 +17,6 @@ LOG_BASE = '[pdf to img]'
ZOOM_X
=
ZOOM_Y
=
2.0
trans
=
fitz
.
Matrix
(
ZOOM_X
,
ZOOM_X
)
.
preRotate
(
0
)
# zoom factor 2 in each dimension
# 特殊filter处理
ADOBE_FILTER_SET
=
{
'FlateDecode'
,
'JPXDecode'
,
'JBIG2Decode'
}
# 宽高阈值组合
WH_COUPLE_1
=
(
500
,
500
)
WH_COUPLE_2
=
(
700
,
647
)
WH_COUPLE_3
=
(
100
,
100
)
WH_COUPLE_4
=
(
100
,
300
)
WH_COUPLE_5
=
(
100
,
200
)
class
PDFHandler
:
...
...
@@ -46,194 +33,20 @@ class PDFHandler:
img_save_path
=
self
.
get_img_save_path
(
page
.
number
)
pm
.
writePNG
(
img_save_path
)
@staticmethod
def
getimage
(
pix
):
# RGB
if
pix
.
colorspace
.
n
!=
4
:
return
pix
# GRAY/CMYK
tpix
=
fitz
.
Pixmap
(
fitz
.
csRGB
,
pix
)
return
tpix
def
recover_pix
(
self
,
doc
,
xref
,
smask
,
colorspace
):
if
smask
!=
0
:
# we need to reconstruct the alpha channel with the smask
pix1
=
fitz
.
Pixmap
(
doc
,
xref
)
pix2
=
fitz
.
Pixmap
(
doc
,
smask
)
# create pixmap of the /SMask entry
# sanity check
if
not
(
pix1
.
irect
==
pix2
.
irect
and
pix1
.
alpha
==
pix2
.
alpha
==
0
and
pix2
.
n
==
1
):
pix2
=
None
return
self
.
getimage
(
pix1
)
pix
=
fitz
.
Pixmap
(
pix1
)
# copy of pix1, alpha channel added
pix
.
setAlpha
(
pix2
.
samples
)
# treat pix2.samples as alpha value
pix1
=
pix2
=
None
# free temp pixmaps
return
self
.
getimage
(
pix
)
elif
colorspace
in
{
'Separation'
,
'DeviceCMYK'
}:
pix
=
fitz
.
Pixmap
(
doc
,
xref
)
tpix
=
fitz
.
Pixmap
(
fitz
.
csRGB
,
pix
)
return
tpix
else
:
return
doc
.
extractImage
(
xref
)
@staticmethod
def
get_img_data
(
pix
):
if
type
(
pix
)
is
dict
:
# we got a raw image
ext
=
pix
[
"ext"
]
img_data
=
pix
[
"image"
]
else
:
# we got a pixmap
ext
=
'png'
img_data
=
pix
.
getPNGData
()
return
ext
,
img_data
def
extract_single_image
(
self
,
pdf
,
xref
,
smask
,
colorspace
,
pno
,
img_index
=
0
):
pix
=
self
.
recover_pix
(
pdf
,
xref
,
smask
,
colorspace
)
ext
,
img_data
=
self
.
get_img_data
(
pix
)
img_save_path
=
self
.
get_img_save_path
(
pno
,
img_index
=
img_index
,
ext
=
ext
)
with
open
(
img_save_path
,
"wb"
)
as
f
:
f
.
write
(
img_data
)
self
.
xref_set
.
add
(
xref
)
@staticmethod
def
split_il
(
il
):
broken_il
=
[]
start
=
0
length
=
len
(
il
)
page_to_png
=
None
for
i
in
range
(
length
):
# 当图片对象含有特殊filter时,特殊处理:整个页面保存为png图片
if
il
[
i
][
-
1
]
in
ADOBE_FILTER_SET
:
page_to_png
=
True
break
else
:
for
i
in
range
(
length
):
# 当图片对象够大时,不作碎图合并处理,而是单纯提取
if
il
[
i
][
2
]
>=
WH_COUPLE_2
[
0
]
and
il
[
i
][
3
]
>=
WH_COUPLE_2
[
1
]:
break
if
i
==
start
:
if
i
==
length
-
1
:
broken_il
.
append
(
il
[
start
:
length
])
continue
elif
i
==
length
-
1
:
if
il
[
i
][
2
]
==
il
[
i
-
1
][
2
]:
broken_il
.
append
(
il
[
start
:
length
])
else
:
broken_il
.
append
(
il
[
start
:
i
])
broken_il
.
append
(
il
[
i
:
length
])
continue
if
il
[
i
][
2
]
!=
il
[
i
-
1
][
2
]:
broken_il
.
append
(
il
[
start
:
i
])
start
=
i
elif
il
[
i
][
3
]
!=
il
[
i
-
1
][
3
]:
broken_il
.
append
(
il
[
start
:
i
+
1
])
start
=
i
+
1
else
:
# 碎图分组结果
return
broken_il
return
page_to_png
def
merge_il
(
self
,
pdf
,
pno
,
il
):
# 尝试碎图合并前的分组
il
.
sort
(
key
=
lambda
x
:
x
[
0
])
broken_il
=
self
.
split_il
(
il
)
print
(
'broken_il: {0}'
.
format
(
broken_il
))
page_to_png
=
True
# 3.1 当图片对象够大时,不作碎图合并处理,而是单纯提取
if
broken_il
is
None
:
page_to_png
=
False
for
img_index
,
img
in
enumerate
(
il
):
xref
,
smask
,
width
,
height
,
_
,
colorspace
,
_
,
_
,
adobe_filter
=
img
if
width
<
WH_COUPLE_3
[
0
]
or
height
<
WH_COUPLE_3
[
1
]:
# 过滤小图(如二维码)
continue
elif
xref
not
in
self
.
xref_set
:
self
.
extract_single_image
(
pdf
,
xref
,
smask
,
colorspace
,
pno
,
img_index
)
# 3.2 碎图按照分组合并
elif
isinstance
(
broken_il
,
list
)
and
len
(
broken_il
)
<=
2
:
for
img_index
,
img_il
in
enumerate
(
broken_il
):
# 3.2.1 仅一张碎图,过滤或直接提取
if
len
(
img_il
)
==
1
:
xref
,
smask
,
width
,
height
,
_
,
colorspace
,
_
,
_
,
adobe_filter
=
img_il
[
0
]
# 过滤小图(如二维码)
if
width
<
WH_COUPLE_4
[
0
]
or
height
<
WH_COUPLE_4
[
1
]
or
\
(
width
<
WH_COUPLE_1
[
0
]
and
height
<
WH_COUPLE_1
[
1
]):
continue
elif
xref
not
in
self
.
xref_set
:
self
.
extract_single_image
(
pdf
,
xref
,
smask
,
colorspace
,
pno
,
img_index
)
page_to_png
=
False
# 3.2.2 多张碎图,竖向拼接
else
:
height_sum
=
sum
([
img
[
3
]
for
img
in
img_il
])
width
=
img_il
[
0
][
2
]
# 过滤小图和不常规大图
if
width
<
WH_COUPLE_5
[
0
]
or
height_sum
<
WH_COUPLE_5
[
1
]
or
\
(
width
>
1000
and
height_sum
>
width
*
3
):
continue
im_list
=
[]
for
img
in
img_il
:
xref
,
smask
,
_
,
height
,
_
,
colorspace
,
_
,
_
,
adobe_filter
=
img
pix
=
self
.
recover_pix
(
pdf
,
xref
,
smask
,
colorspace
)
ext
,
img_data
=
self
.
get_img_data
(
pix
)
im
=
Image
.
open
(
BytesIO
(
img_data
))
im_list
.
append
((
height
,
im
,
ext
))
new_img
=
Image
.
new
(
im_list
[
0
][
1
]
.
mode
,
(
width
,
height_sum
))
h_now
=
0
for
h
,
m
,
_
in
im_list
:
new_img
.
paste
(
m
,
box
=
(
0
,
h_now
))
h_now
+=
h
img_save_path
=
self
.
get_img_save_path
(
pno
,
img_index
,
im_list
[
0
][
2
])
new_img
.
save
(
img_save_path
)
page_to_png
=
False
# 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
if
page_to_png
:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
def
extract_image
(
self
,
is_extract
):
def
extract_image
(
self
):
os
.
makedirs
(
self
.
img_dir_path
,
exist_ok
=
True
)
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
print
(
'++++++++++'
*
5
)
print
(
'{0} [start] [pdf_path={1}] [metadata={2}]'
.
format
(
LOG_BASE
,
self
.
path
,
pdf
.
metadata
))
for
pno
in
range
(
pdf
.
pageCount
):
il
=
pdf
.
getPageImageList
(
pno
)
if
is_extract
else
[]
# 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
print
(
'---------- page: {0} ----------'
.
format
(
pno
))
print
(
'img_object_list: {0}'
.
format
(
il
))
# 单纯提取页面图片对象
# for img_index, img in enumerate(il):
# pix = self.recover_pix(pdf, img[0], img[1], img[5])
# ext, img_data = self.get_img_data(pix)
# img_save_path = self.get_img_save_path(pno, img_index, ext)
# with open(img_save_path, "wb") as f:
# f.write(img_data)
# 1.页面图片对象数目为0时,保存整个页面为png图片
if
len
(
il
)
==
0
:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
# 2.页面图片对象数目为1时:
# 小图(如电子账单的盖章):保存整个页面为png图片
# 大图:提取图片对象
elif
len
(
il
)
==
1
:
xref
,
smask
,
width
,
height
,
_
,
colorspace
,
_
,
_
,
_
=
il
[
0
]
# 小图
if
width
<
WH_COUPLE_1
[
0
]
and
height
<
WH_COUPLE_1
[
1
]:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
# 大图
elif
xref
not
in
self
.
xref_set
:
self
.
extract_single_image
(
pdf
,
xref
,
smask
,
colorspace
,
pno
)
# 3.页面图片对象数目大于1时,特殊处理
else
:
self
.
merge_il
(
pdf
,
pno
,
il
)
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
print
(
'{0} [end] [pdf_path={1}] [img_save_path={2}]'
.
format
(
LOG_BASE
,
self
.
path
,
self
.
img_dir_path
))
def
extract_image
(
pdf_path
,
target_path
,
is_extract
):
def
extract_image
(
pdf_path
,
target_path
):
pdf_handler
=
PDFHandler
(
pdf_path
,
target_path
)
pdf_handler
.
extract_image
(
is_extract
)
pdf_handler
.
extract_image
()
def
main
():
...
...
@@ -253,7 +66,7 @@ def main():
continue
pdf_file_path
=
os
.
path
.
join
(
parent
,
pdf_file
)
try
:
extract_image
(
pdf_file_path
,
target_path
,
args
.
extract
)
extract_image
(
pdf_file_path
,
target_path
)
except
Exception
as
e
:
print
(
'{0} [failed] [err={1}] [pdf_path={2}]'
.
format
(
LOG_BASE
,
e
,
pdf_file_path
))
failed_list
.
append
(
pdf_file_path
)
...
...
@@ -267,7 +80,7 @@ def main():
# 图片保存目录
target_path
=
os
.
path
.
realpath
(
args
.
output
)
if
args
.
output
else
os
.
path
.
dirname
(
pdf_path
)
try
:
extract_image
(
pdf_path
,
target_path
,
args
.
extract
)
extract_image
(
pdf_path
,
target_path
)
except
Exception
as
e
:
print
(
'{0} [failed] [err={1}] [pdf_path={2}]'
.
format
(
LOG_BASE
,
e
,
pdf_path
))
else
:
...
...
requirements.txt
View file @
94794bd
Pillow==7.2.0
PyMuPDF==1.17.0
\ No newline at end of file
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment