3783dcfc by 周伟奇

Merge branch 'feature/ebank' into feature/0918

2 parents bb0678cb 0ba01d9e
1 import os 1 import os
2 import shutil 2 import shutil
3 import fitz 3 import fitz
4 import math
4 from PIL import Image 5 from PIL import Image
5 from io import BytesIO 6 from io import BytesIO
6 7
...@@ -53,14 +54,33 @@ class PDFHandler: ...@@ -53,14 +54,33 @@ class PDFHandler:
53 width = self.page_text_list[pno].pop('width') 54 width = self.page_text_list[pno].pop('width')
54 height = self.page_text_list[pno].pop('height') 55 height = self.page_text_list[pno].pop('height')
55 src_text_list = self.page_text_list[pno].pop('text') 56 src_text_list = self.page_text_list[pno].pop('text')
57 rotation = self.page_text_list[pno].pop('rotation')
56 58
57 width_scale = src_width / width 59 sin = math.sin(math.pi * rotation / 2)
58 height_scale = src_height / height 60 cos = math.cos(math.pi * rotation / 2)
61
62 min_x = min_y = 0
63 for x, y in ((0, height), (width, 0), (width, height)):
64 new_x = x * cos - y * sin
65 new_y = x * sin + y * cos
66 min_x = min(min_x, new_x)
67 min_y = min(min_y, new_y)
68
69 new_width = int((height * abs(sin)) + (width * abs(cos)))
70 new_height = int((height * abs(cos)) + (width * abs(sin)))
71
72 width_scale = src_width / new_width
73 height_scale = src_height / new_height
59 74
60 rebuild_text_list = [] 75 rebuild_text_list = []
61 76
62 for bbox, text in src_text_list: 77 for bbox, text in src_text_list:
63 x0, y0, x1, y1 = bbox 78 x0, y0, x1, y1 = bbox
79 x0, y0, x1, y1 = (x0 * cos - y0 * sin, x0 * sin + y0 * cos, x1 * cos - y1 * sin, x1 * sin + y1 * cos)
80 x_list = sorted([x0 - min_x, x1 - min_x])
81 y_list = sorted([y0 - min_y, y1 - min_y])
82
83 x0, y0, x1, y1 = (x_list[0], y_list[0], x_list[1], y_list[1])
64 x0 = x0 * width_scale 84 x0 = x0 * width_scale
65 y0 = y0 * height_scale 85 y0 = y0 * height_scale
66 x1 = x1 * width_scale 86 x1 = x1 * width_scale
...@@ -240,6 +260,15 @@ class PDFHandler: ...@@ -240,6 +260,15 @@ class PDFHandler:
240 text_item_sum = 0 260 text_item_sum = 0
241 for pno in range(pdf.pageCount): 261 for pno in range(pdf.pageCount):
242 page = pdf.loadPage(pno) 262 page = pdf.loadPage(pno)
263 if page.rotation is None:
264 rotation = 0
265 elif isinstance(page.rotation, int):
266 divisor, remainder = divmod(page.rotation, 90)
267 if remainder != 0:
268 return
269 rotation = divmod(divisor, 4)[1]
270 else:
271 return
243 textpage = page.getTextPage() 272 textpage = page.getTextPage()
244 text = textpage.extractDICT() 273 text = textpage.extractDICT()
245 text_list = [] 274 text_list = []
...@@ -259,6 +288,7 @@ class PDFHandler: ...@@ -259,6 +288,7 @@ class PDFHandler:
259 { 288 {
260 'width': text.get('width'), 289 'width': text.get('width'),
261 'height': text.get('height'), 290 'height': text.get('height'),
291 'rotation': rotation,
262 'text': text_list 292 'text': text_list
263 } 293 }
264 ) 294 )
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!