Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
0ba01d9e
authored
2021-09-25 16:31:48 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
fix box
1 parent
4383b4d1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
32 additions
and
2 deletions
src/common/tools/pdf_to_img.py
src/common/tools/pdf_to_img.py
View file @
0ba01d9
import
os
import
shutil
import
fitz
import
math
from
PIL
import
Image
from
io
import
BytesIO
...
...
@@ -53,14 +54,33 @@ class PDFHandler:
width
=
self
.
page_text_list
[
pno
]
.
pop
(
'width'
)
height
=
self
.
page_text_list
[
pno
]
.
pop
(
'height'
)
src_text_list
=
self
.
page_text_list
[
pno
]
.
pop
(
'text'
)
rotation
=
self
.
page_text_list
[
pno
]
.
pop
(
'rotation'
)
width_scale
=
src_width
/
width
height_scale
=
src_height
/
height
sin
=
math
.
sin
(
math
.
pi
*
rotation
/
2
)
cos
=
math
.
cos
(
math
.
pi
*
rotation
/
2
)
min_x
=
min_y
=
0
for
x
,
y
in
((
0
,
height
),
(
width
,
0
),
(
width
,
height
)):
new_x
=
x
*
cos
-
y
*
sin
new_y
=
x
*
sin
+
y
*
cos
min_x
=
min
(
min_x
,
new_x
)
min_y
=
min
(
min_y
,
new_y
)
new_width
=
int
((
height
*
abs
(
sin
))
+
(
width
*
abs
(
cos
)))
new_height
=
int
((
height
*
abs
(
cos
))
+
(
width
*
abs
(
sin
)))
width_scale
=
src_width
/
new_width
height_scale
=
src_height
/
new_height
rebuild_text_list
=
[]
for
bbox
,
text
in
src_text_list
:
x0
,
y0
,
x1
,
y1
=
bbox
x0
,
y0
,
x1
,
y1
=
(
x0
*
cos
-
y0
*
sin
,
x0
*
sin
+
y0
*
cos
,
x1
*
cos
-
y1
*
sin
,
x1
*
sin
+
y1
*
cos
)
x_list
=
sorted
([
x0
-
min_x
,
x1
-
min_x
])
y_list
=
sorted
([
y0
-
min_y
,
y1
-
min_y
])
x0
,
y0
,
x1
,
y1
=
(
x_list
[
0
],
y_list
[
0
],
x_list
[
1
],
y_list
[
1
])
x0
=
x0
*
width_scale
y0
=
y0
*
height_scale
x1
=
x1
*
width_scale
...
...
@@ -240,6 +260,15 @@ class PDFHandler:
text_item_sum
=
0
for
pno
in
range
(
pdf
.
pageCount
):
page
=
pdf
.
loadPage
(
pno
)
if
page
.
rotation
is
None
:
rotation
=
0
elif
isinstance
(
page
.
rotation
,
int
):
divisor
,
remainder
=
divmod
(
page
.
rotation
,
90
)
if
remainder
!=
0
:
return
rotation
=
divmod
(
divisor
,
4
)[
1
]
else
:
return
textpage
=
page
.
getTextPage
()
text
=
textpage
.
extractDICT
()
text_list
=
[]
...
...
@@ -259,6 +288,7 @@ class PDFHandler:
{
'width'
:
text
.
get
(
'width'
),
'height'
:
text
.
get
(
'height'
),
'rotation'
:
rotation
,
'text'
:
text_list
}
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment