fix box
Showing 1 changed file with 32 additions and 2 deletions
1 | import os | 1 | import os |
2 | import shutil | 2 | import shutil |
3 | import fitz | 3 | import fitz |
4 | import math | ||
4 | from PIL import Image | 5 | from PIL import Image |
5 | from io import BytesIO | 6 | from io import BytesIO |
6 | 7 | ||
... | @@ -53,14 +54,33 @@ class PDFHandler: | ... | @@ -53,14 +54,33 @@ class PDFHandler: |
53 | width = self.page_text_list[pno].pop('width') | 54 | width = self.page_text_list[pno].pop('width') |
54 | height = self.page_text_list[pno].pop('height') | 55 | height = self.page_text_list[pno].pop('height') |
55 | src_text_list = self.page_text_list[pno].pop('text') | 56 | src_text_list = self.page_text_list[pno].pop('text') |
57 | rotation = self.page_text_list[pno].pop('rotation') | ||
56 | 58 | ||
57 | width_scale = src_width / width | 59 | sin = math.sin(math.pi * rotation / 2) |
58 | height_scale = src_height / height | 60 | cos = math.cos(math.pi * rotation / 2) |
61 | |||
62 | min_x = min_y = 0 | ||
63 | for x, y in ((0, height), (width, 0), (width, height)): | ||
64 | new_x = x * cos - y * sin | ||
65 | new_y = x * sin + y * cos | ||
66 | min_x = min(min_x, new_x) | ||
67 | min_y = min(min_y, new_y) | ||
68 | |||
69 | new_width = int((height * abs(sin)) + (width * abs(cos))) | ||
70 | new_height = int((height * abs(cos)) + (width * abs(sin))) | ||
71 | |||
72 | width_scale = src_width / new_width | ||
73 | height_scale = src_height / new_height | ||
59 | 74 | ||
60 | rebuild_text_list = [] | 75 | rebuild_text_list = [] |
61 | 76 | ||
62 | for bbox, text in src_text_list: | 77 | for bbox, text in src_text_list: |
63 | x0, y0, x1, y1 = bbox | 78 | x0, y0, x1, y1 = bbox |
79 | x0, y0, x1, y1 = (x0 * cos - y0 * sin, x0 * sin + y0 * cos, x1 * cos - y1 * sin, x1 * sin + y1 * cos) | ||
80 | x_list = sorted([x0 - min_x, x1 - min_x]) | ||
81 | y_list = sorted([y0 - min_y, y1 - min_y]) | ||
82 | |||
83 | x0, y0, x1, y1 = (x_list[0], y_list[0], x_list[1], y_list[1]) | ||
64 | x0 = x0 * width_scale | 84 | x0 = x0 * width_scale |
65 | y0 = y0 * height_scale | 85 | y0 = y0 * height_scale |
66 | x1 = x1 * width_scale | 86 | x1 = x1 * width_scale |
... | @@ -240,6 +260,15 @@ class PDFHandler: | ... | @@ -240,6 +260,15 @@ class PDFHandler: |
240 | text_item_sum = 0 | 260 | text_item_sum = 0 |
241 | for pno in range(pdf.pageCount): | 261 | for pno in range(pdf.pageCount): |
242 | page = pdf.loadPage(pno) | 262 | page = pdf.loadPage(pno) |
263 | if page.rotation is None: | ||
264 | rotation = 0 | ||
265 | elif isinstance(page.rotation, int): | ||
266 | divisor, remainder = divmod(page.rotation, 90) | ||
267 | if remainder != 0: | ||
268 | return | ||
269 | rotation = divmod(divisor, 4)[1] | ||
270 | else: | ||
271 | return | ||
243 | textpage = page.getTextPage() | 272 | textpage = page.getTextPage() |
244 | text = textpage.extractDICT() | 273 | text = textpage.extractDICT() |
245 | text_list = [] | 274 | text_list = [] |
... | @@ -259,6 +288,7 @@ class PDFHandler: | ... | @@ -259,6 +288,7 @@ class PDFHandler: |
259 | { | 288 | { |
260 | 'width': text.get('width'), | 289 | 'width': text.get('width'), |
261 | 'height': text.get('height'), | 290 | 'height': text.get('height'), |
291 | 'rotation': rotation, | ||
262 | 'text': text_list | 292 | 'text': text_list |
263 | } | 293 | } |
264 | ) | 294 | ) | ... | ... |
-
Please register or sign in to post a comment