3783dcfc by 周伟奇

Merge branch 'feature/ebank' into feature/0918

2 parents bb0678cb 0ba01d9e
import os
import shutil
import fitz
import math
from PIL import Image
from io import BytesIO
......@@ -53,14 +54,33 @@ class PDFHandler:
width = self.page_text_list[pno].pop('width')
height = self.page_text_list[pno].pop('height')
src_text_list = self.page_text_list[pno].pop('text')
rotation = self.page_text_list[pno].pop('rotation')
width_scale = src_width / width
height_scale = src_height / height
sin = math.sin(math.pi * rotation / 2)
cos = math.cos(math.pi * rotation / 2)
min_x = min_y = 0
for x, y in ((0, height), (width, 0), (width, height)):
new_x = x * cos - y * sin
new_y = x * sin + y * cos
min_x = min(min_x, new_x)
min_y = min(min_y, new_y)
new_width = int((height * abs(sin)) + (width * abs(cos)))
new_height = int((height * abs(cos)) + (width * abs(sin)))
width_scale = src_width / new_width
height_scale = src_height / new_height
rebuild_text_list = []
for bbox, text in src_text_list:
x0, y0, x1, y1 = bbox
x0, y0, x1, y1 = (x0 * cos - y0 * sin, x0 * sin + y0 * cos, x1 * cos - y1 * sin, x1 * sin + y1 * cos)
x_list = sorted([x0 - min_x, x1 - min_x])
y_list = sorted([y0 - min_y, y1 - min_y])
x0, y0, x1, y1 = (x_list[0], y_list[0], x_list[1], y_list[1])
x0 = x0 * width_scale
y0 = y0 * height_scale
x1 = x1 * width_scale
......@@ -240,6 +260,15 @@ class PDFHandler:
text_item_sum = 0
for pno in range(pdf.pageCount):
page = pdf.loadPage(pno)
if page.rotation is None:
rotation = 0
elif isinstance(page.rotation, int):
divisor, remainder = divmod(page.rotation, 90)
if remainder != 0:
return
rotation = divmod(divisor, 4)[1]
else:
return
textpage = page.getTextPage()
text = textpage.extractDICT()
text_list = []
......@@ -259,6 +288,7 @@ class PDFHandler:
{
'width': text.get('width'),
'height': text.get('height'),
'rotation': rotation,
'text': text_list
}
)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!