fix box
Showing 1 changed file with 32 additions and 2 deletions
| 1 | import os | 1 | import os |
| 2 | import shutil | 2 | import shutil |
| 3 | import fitz | 3 | import fitz |
| 4 | import math | ||
| 4 | from PIL import Image | 5 | from PIL import Image |
| 5 | from io import BytesIO | 6 | from io import BytesIO |
| 6 | 7 | ||
| ... | @@ -53,14 +54,33 @@ class PDFHandler: | ... | @@ -53,14 +54,33 @@ class PDFHandler: |
| 53 | width = self.page_text_list[pno].pop('width') | 54 | width = self.page_text_list[pno].pop('width') |
| 54 | height = self.page_text_list[pno].pop('height') | 55 | height = self.page_text_list[pno].pop('height') |
| 55 | src_text_list = self.page_text_list[pno].pop('text') | 56 | src_text_list = self.page_text_list[pno].pop('text') |
| 57 | rotation = self.page_text_list[pno].pop('rotation') | ||
| 56 | 58 | ||
| 57 | width_scale = src_width / width | 59 | sin = math.sin(math.pi * rotation / 2) |
| 58 | height_scale = src_height / height | 60 | cos = math.cos(math.pi * rotation / 2) |
| 61 | |||
| 62 | min_x = min_y = 0 | ||
| 63 | for x, y in ((0, height), (width, 0), (width, height)): | ||
| 64 | new_x = x * cos - y * sin | ||
| 65 | new_y = x * sin + y * cos | ||
| 66 | min_x = min(min_x, new_x) | ||
| 67 | min_y = min(min_y, new_y) | ||
| 68 | |||
| 69 | new_width = int((height * abs(sin)) + (width * abs(cos))) | ||
| 70 | new_height = int((height * abs(cos)) + (width * abs(sin))) | ||
| 71 | |||
| 72 | width_scale = src_width / new_width | ||
| 73 | height_scale = src_height / new_height | ||
| 59 | 74 | ||
| 60 | rebuild_text_list = [] | 75 | rebuild_text_list = [] |
| 61 | 76 | ||
| 62 | for bbox, text in src_text_list: | 77 | for bbox, text in src_text_list: |
| 63 | x0, y0, x1, y1 = bbox | 78 | x0, y0, x1, y1 = bbox |
| 79 | x0, y0, x1, y1 = (x0 * cos - y0 * sin, x0 * sin + y0 * cos, x1 * cos - y1 * sin, x1 * sin + y1 * cos) | ||
| 80 | x_list = sorted([x0 - min_x, x1 - min_x]) | ||
| 81 | y_list = sorted([y0 - min_y, y1 - min_y]) | ||
| 82 | |||
| 83 | x0, y0, x1, y1 = (x_list[0], y_list[0], x_list[1], y_list[1]) | ||
| 64 | x0 = x0 * width_scale | 84 | x0 = x0 * width_scale |
| 65 | y0 = y0 * height_scale | 85 | y0 = y0 * height_scale |
| 66 | x1 = x1 * width_scale | 86 | x1 = x1 * width_scale |
| ... | @@ -240,6 +260,15 @@ class PDFHandler: | ... | @@ -240,6 +260,15 @@ class PDFHandler: |
| 240 | text_item_sum = 0 | 260 | text_item_sum = 0 |
| 241 | for pno in range(pdf.pageCount): | 261 | for pno in range(pdf.pageCount): |
| 242 | page = pdf.loadPage(pno) | 262 | page = pdf.loadPage(pno) |
| 263 | if page.rotation is None: | ||
| 264 | rotation = 0 | ||
| 265 | elif isinstance(page.rotation, int): | ||
| 266 | divisor, remainder = divmod(page.rotation, 90) | ||
| 267 | if remainder != 0: | ||
| 268 | return | ||
| 269 | rotation = divmod(divisor, 4)[1] | ||
| 270 | else: | ||
| 271 | return | ||
| 243 | textpage = page.getTextPage() | 272 | textpage = page.getTextPage() |
| 244 | text = textpage.extractDICT() | 273 | text = textpage.extractDICT() |
| 245 | text_list = [] | 274 | text_list = [] |
| ... | @@ -259,6 +288,7 @@ class PDFHandler: | ... | @@ -259,6 +288,7 @@ class PDFHandler: |
| 259 | { | 288 | { |
| 260 | 'width': text.get('width'), | 289 | 'width': text.get('width'), |
| 261 | 'height': text.get('height'), | 290 | 'height': text.get('height'), |
| 291 | 'rotation': rotation, | ||
| 262 | 'text': text_list | 292 | 'text': text_list |
| 263 | } | 293 | } |
| 264 | ) | 294 | ) | ... | ... |
-
Please register or sign in to post a comment