Merge branch 'feature/ebank' into feature/0918
Showing 1 changed file with 32 additions and 2 deletions
| 1 | import os | 1 | import os | 
| 2 | import shutil | 2 | import shutil | 
| 3 | import fitz | 3 | import fitz | 
| 4 | import math | ||
| 4 | from PIL import Image | 5 | from PIL import Image | 
| 5 | from io import BytesIO | 6 | from io import BytesIO | 
| 6 | 7 | ||
| ... | @@ -53,14 +54,33 @@ class PDFHandler: | ... | @@ -53,14 +54,33 @@ class PDFHandler: | 
| 53 | width = self.page_text_list[pno].pop('width') | 54 | width = self.page_text_list[pno].pop('width') | 
| 54 | height = self.page_text_list[pno].pop('height') | 55 | height = self.page_text_list[pno].pop('height') | 
| 55 | src_text_list = self.page_text_list[pno].pop('text') | 56 | src_text_list = self.page_text_list[pno].pop('text') | 
| 57 | rotation = self.page_text_list[pno].pop('rotation') | ||
| 56 | 58 | ||
| 57 | width_scale = src_width / width | 59 | sin = math.sin(math.pi * rotation / 2) | 
| 58 | height_scale = src_height / height | 60 | cos = math.cos(math.pi * rotation / 2) | 
| 61 | |||
| 62 | min_x = min_y = 0 | ||
| 63 | for x, y in ((0, height), (width, 0), (width, height)): | ||
| 64 | new_x = x * cos - y * sin | ||
| 65 | new_y = x * sin + y * cos | ||
| 66 | min_x = min(min_x, new_x) | ||
| 67 | min_y = min(min_y, new_y) | ||
| 68 | |||
| 69 | new_width = int((height * abs(sin)) + (width * abs(cos))) | ||
| 70 | new_height = int((height * abs(cos)) + (width * abs(sin))) | ||
| 71 | |||
| 72 | width_scale = src_width / new_width | ||
| 73 | height_scale = src_height / new_height | ||
| 59 | 74 | ||
| 60 | rebuild_text_list = [] | 75 | rebuild_text_list = [] | 
| 61 | 76 | ||
| 62 | for bbox, text in src_text_list: | 77 | for bbox, text in src_text_list: | 
| 63 | x0, y0, x1, y1 = bbox | 78 | x0, y0, x1, y1 = bbox | 
| 79 | x0, y0, x1, y1 = (x0 * cos - y0 * sin, x0 * sin + y0 * cos, x1 * cos - y1 * sin, x1 * sin + y1 * cos) | ||
| 80 | x_list = sorted([x0 - min_x, x1 - min_x]) | ||
| 81 | y_list = sorted([y0 - min_y, y1 - min_y]) | ||
| 82 | |||
| 83 | x0, y0, x1, y1 = (x_list[0], y_list[0], x_list[1], y_list[1]) | ||
| 64 | x0 = x0 * width_scale | 84 | x0 = x0 * width_scale | 
| 65 | y0 = y0 * height_scale | 85 | y0 = y0 * height_scale | 
| 66 | x1 = x1 * width_scale | 86 | x1 = x1 * width_scale | 
| ... | @@ -240,6 +260,15 @@ class PDFHandler: | ... | @@ -240,6 +260,15 @@ class PDFHandler: | 
| 240 | text_item_sum = 0 | 260 | text_item_sum = 0 | 
| 241 | for pno in range(pdf.pageCount): | 261 | for pno in range(pdf.pageCount): | 
| 242 | page = pdf.loadPage(pno) | 262 | page = pdf.loadPage(pno) | 
| 263 | if page.rotation is None: | ||
| 264 | rotation = 0 | ||
| 265 | elif isinstance(page.rotation, int): | ||
| 266 | divisor, remainder = divmod(page.rotation, 90) | ||
| 267 | if remainder != 0: | ||
| 268 | return | ||
| 269 | rotation = divmod(divisor, 4)[1] | ||
| 270 | else: | ||
| 271 | return | ||
| 243 | textpage = page.getTextPage() | 272 | textpage = page.getTextPage() | 
| 244 | text = textpage.extractDICT() | 273 | text = textpage.extractDICT() | 
| 245 | text_list = [] | 274 | text_list = [] | 
| ... | @@ -259,6 +288,7 @@ class PDFHandler: | ... | @@ -259,6 +288,7 @@ class PDFHandler: | 
| 259 | { | 288 | { | 
| 260 | 'width': text.get('width'), | 289 | 'width': text.get('width'), | 
| 261 | 'height': text.get('height'), | 290 | 'height': text.get('height'), | 
| 291 | 'rotation': rotation, | ||
| 262 | 'text': text_list | 292 | 'text': text_list | 
| 263 | } | 293 | } | 
| 264 | ) | 294 | ) | ... | ... | 
- 
Please register or sign in to post a comment