Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
88e5fc6b
authored
2023-11-16 14:23:45 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/pdftoimg' into 'master'
Feature/pdftoimg See merge request !23
2 parents
b81daff4
c996af2d
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
28 additions
and
7 deletions
.gitignore
src/apps/doc/management/commands/ocr_process.py
src/common/tools/pdf_to_img.py
.gitignore
View file @
88e5fc6
...
...
@@ -31,4 +31,5 @@ conf/*
data/*
test*
flow_test.py
\ No newline at end of file
flow_test.py
pdf_test.py
\ No newline at end of file
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
88e5fc6
...
...
@@ -1339,8 +1339,8 @@ class Command(BaseCommand, LoggerMixin):
pdf_handler
.
extract_image
(
max_img_count
)
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
))
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]
[is_new_modify={4}]
'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
,
pdf_handler
.
is_new_modify
))
except
Exception
as
e
:
self
.
online_log
.
warn
(
'{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
...
...
src/common/tools/pdf_to_img.py
View file @
88e5fc6
...
...
@@ -12,8 +12,10 @@ from unicodedata import normalize
# 页面保存为png图片参数
ZOOM_X_1
=
ZOOM_Y_1
=
1.0
ZOOM_X_2
=
ZOOM_Y_2
=
2.0
ZOOM_X_3
=
ZOOM_Y_3
=
3.0
trans_1
=
fitz
.
Matrix
(
ZOOM_X_1
,
ZOOM_X_1
)
.
preRotate
(
0
)
# zoom factor 1 in each dimension
trans_2
=
fitz
.
Matrix
(
ZOOM_X_2
,
ZOOM_X_2
)
.
preRotate
(
0
)
# zoom factor 2 in each dimension
trans_3
=
fitz
.
Matrix
(
ZOOM_X_3
,
ZOOM_X_3
)
.
preRotate
(
0
)
# zoom factor 3 in each dimension
# 特殊filter处理
ADOBE_FILTER_SET
=
{
'FlateDecode'
,
'JPXDecode'
,
'JBIG2Decode'
}
...
...
@@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100)
WH_COUPLE_4
=
(
100
,
300
)
WH_COUPLE_5
=
(
100
,
200
)
# 碎图宽度阈值
TINY_IMG_MAX_WIDTH
=
1400
# 大图宽高阈值
WH_COUPLE_6
=
(
1800
,
1400
)
WH_COUPLE_7
=
(
2500
,
3000
)
class
PDFBuild
:
...
...
@@ -55,6 +63,7 @@ class PDFHandler:
self
.
img_dir_path
=
img_dir_path
self
.
img_path_list
=
[]
self
.
img_count
=
0
self
.
is_new_modify
=
0
# 用于记录受新改动影响的PDF
self
.
xref_set
=
set
()
self
.
img_suffixs
=
{
'.jpeg'
,
'.jpg'
,
'.png'
,
'.webp'
,
'.bmp'
}
self
.
suffix
=
self
.
get_suffix
(
document_name
)
...
...
@@ -165,8 +174,10 @@ class PDFHandler:
except
Exception
as
e
:
pass
def
page_to_png
(
self
,
page
):
if
page
.
MediaBoxSize
.
x
>
1500
or
page
.
MediaBoxSize
.
y
>
1500
:
def
page_to_png
(
self
,
page
,
is_big_img
=
False
):
if
is_big_img
:
pm
=
page
.
getPixmap
(
matrix
=
trans_3
,
alpha
=
False
)
elif
page
.
MediaBoxSize
.
x
>
1500
or
page
.
MediaBoxSize
.
y
>
1500
:
pm
=
page
.
getPixmap
(
matrix
=
trans_1
,
alpha
=
False
)
else
:
pm
=
page
.
getPixmap
(
matrix
=
trans_2
,
alpha
=
False
)
...
...
@@ -236,8 +247,8 @@ class PDFHandler:
self
.
xref_set
.
add
(
xref
)
self
.
img_path_list
.
append
(
img_save_path
)
@staticmethod
def
split_il
(
il
):
#
@staticmethod
def
split_il
(
self
,
il
):
broken_il
=
[]
start
=
0
length
=
len
(
il
)
...
...
@@ -247,6 +258,10 @@ class PDFHandler:
if
il
[
i
][
-
1
]
in
ADOBE_FILTER_SET
:
page_to_png
=
True
break
if
il
[
i
][
2
]
>=
TINY_IMG_MAX_WIDTH
:
self
.
is_new_modify
=
1
page_to_png
=
True
break
else
:
for
i
in
range
(
length
):
# 当图片对象够大时,不作碎图合并处理,而是单纯提取
...
...
@@ -446,6 +461,11 @@ class PDFHandler:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
# 大图
elif
width
>=
WH_COUPLE_6
[
0
]
or
height
>=
WH_COUPLE_6
[
1
]:
self
.
is_new_modify
=
1
is_big_img
=
(
width
<
WH_COUPLE_7
[
0
]
and
height
<
WH_COUPLE_7
[
1
])
# 防止图片过大
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
,
is_big_img
=
is_big_img
)
elif
xref
not
in
self
.
xref_set
:
self
.
extract_single_image
(
pdf
,
xref
,
smask
,
colorspace
,
pno
)
# 3.页面图片对象数目大于1时,特殊处理
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment