Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
d21adf2c
authored
2023-10-30 17:36:55 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
modify pdf_to_img
1 parent
b81daff4
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
20 additions
and
2 deletions
.gitignore
src/common/tools/pdf_to_img.py
.gitignore
View file @
d21adf2
...
...
@@ -32,3 +32,4 @@ data/*
test*
flow_test.py
pdf_test.py
\ No newline at end of file
...
...
src/common/tools/pdf_to_img.py
View file @
d21adf2
...
...
@@ -12,8 +12,10 @@ from unicodedata import normalize
# Zoom factors used when rendering a PDF page to a PNG image.
# Each level uses the same factor horizontally (X) and vertically (Y).
ZOOM_X_1 = ZOOM_Y_1 = 1.0
ZOOM_X_2 = ZOOM_Y_2 = 2.0
ZOOM_X_3 = ZOOM_Y_3 = 3.0
# One render matrix per zoom level. The second Matrix argument is the vertical
# scale, so it takes the matching ZOOM_Y_* constant (same value as ZOOM_X_*).
# NOTE(review): preRotate is the camelCase spelling used by older PyMuPDF
# releases -- confirm it is still available in the pinned version.
trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_Y_1).preRotate(0)  # zoom factor 1 in each dimension
trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_Y_2).preRotate(0)  # zoom factor 2 in each dimension
trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_Y_3).preRotate(0)  # zoom factor 3 in each dimension
# Image stream filters that need special handling: when an image object's
# filter name is in this set, the whole page is rendered to PNG.
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
...
...
@@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100)
WH_COUPLE_4 = (100, 300)
WH_COUPLE_5 = (100, 200)

# Width threshold for fragment images: an image object at least this wide
# makes the whole page get rendered to PNG.
TINY_IMG_MAX_WIDTH = 1800

# (width, height) thresholds for large images.
# An image reaching either WH_COUPLE_6 bound is treated as a large image.
WH_COUPLE_6 = (1800, 1400)
# Only large images below both WH_COUPLE_7 bounds are rendered at the highest
# zoom level, which keeps the output image from getting too large.
WH_COUPLE_7 = (2500, 3000)
class
PDFBuild
:
...
...
@@ -165,8 +173,10 @@ class PDFHandler:
except
Exception
as
e
:
pass
def
page_to_png
(
self
,
page
):
if
page
.
MediaBoxSize
.
x
>
1500
or
page
.
MediaBoxSize
.
y
>
1500
:
def
page_to_png
(
self
,
page
,
is_big_img
=
False
):
if
is_big_img
:
pm
=
page
.
getPixmap
(
matrix
=
trans_3
,
alpha
=
False
)
elif
page
.
MediaBoxSize
.
x
>
1500
or
page
.
MediaBoxSize
.
y
>
1500
:
pm
=
page
.
getPixmap
(
matrix
=
trans_1
,
alpha
=
False
)
else
:
pm
=
page
.
getPixmap
(
matrix
=
trans_2
,
alpha
=
False
)
...
...
@@ -247,6 +257,9 @@ class PDFHandler:
if
il
[
i
][
-
1
]
in
ADOBE_FILTER_SET
:
page_to_png
=
True
break
if
il
[
i
][
2
]
>=
TINY_IMG_MAX_WIDTH
:
page_to_png
=
True
break
else
:
for
i
in
range
(
length
):
# 当图片对象够大时,不作碎图合并处理,而是单纯提取
...
...
@@ -446,6 +459,10 @@ class PDFHandler:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
# 大图
elif
width
>=
WH_COUPLE_6
[
0
]
or
height
>=
WH_COUPLE_6
[
1
]:
is_big_img
=
(
width
<
WH_COUPLE_7
[
0
]
and
height
<
WH_COUPLE_7
[
1
])
# 防止图片过大
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
,
is_big_img
=
is_big_img
)
elif
xref
not
in
self
.
xref_set
:
self
.
extract_single_image
(
pdf
,
xref
,
smask
,
colorspace
,
pno
)
# 3.页面图片对象数目大于1时,特殊处理
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment