88e5fc6b by 周伟奇

Merge branch 'feature/pdftoimg' into 'master'

Feature/pdftoimg

See merge request !23
2 parents b81daff4 c996af2d
......@@ -31,4 +31,5 @@ conf/*
data/*
test*
flow_test.py
\ No newline at end of file
flow_test.py
pdf_test.py
\ No newline at end of file
......
......@@ -1339,8 +1339,8 @@ class Command(BaseCommand, LoggerMixin):
pdf_handler.extract_image(max_img_count)
end_time = time.time()
speed_time = int(end_time - start_time)
self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
self.log_base, task_str, times, speed_time))
self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
except Exception as e:
self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'.format(self.log_base, task_str, times,
......
......@@ -12,8 +12,10 @@ from unicodedata import normalize
# Parameters used when saving a page as a PNG image.
# X and Y zoom share one value per level (1x, 2x, 3x).
ZOOM_X_1 = ZOOM_Y_1 = 1.0
ZOOM_X_2 = ZOOM_Y_2 = 2.0
ZOOM_X_3 = ZOOM_Y_3 = 3.0
# Render matrices passed to page.getPixmap() by PDFHandler.page_to_png:
# trans_3 when the caller flags a big image, trans_1 when either MediaBox
# dimension exceeds 1500, trans_2 otherwise.
# NOTE(review): each matrix is built from the X zoom for both axes; this is
# equivalent only because ZOOM_X_n == ZOOM_Y_n above.
trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0)  # zoom factor 1 in each dimension
trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0)  # zoom factor 2 in each dimension
trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0)  # zoom factor 3 in each dimension
# Filters that get special handling: split_il switches to rendering the page
# as PNG when the last field of an image entry is in this set.
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
......@@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100)
WH_COUPLE_4 = (100, 300)
WH_COUPLE_5 = (100, 200)
# Width threshold for fragmented (tiny) images: split_il switches to rendering
# the page as PNG when an image entry's field at index 2 reaches this value.
# presumably index 2 is the image width — confirm against the image-list layout
TINY_IMG_MAX_WIDTH = 1400
# (width, height) thresholds for large images.
# WH_COUPLE_6: an image with width >= [0] or height >= [1] is treated as large
# and its page is rendered to PNG instead of being extracted directly.
WH_COUPLE_6 = (1800, 1400)
# WH_COUPLE_7: upper bound; the 3x zoom is requested only when width < [0] and
# height < [1], to keep the rendered image from becoming too large.
WH_COUPLE_7 = (2500, 3000)
class PDFBuild:
......@@ -55,6 +63,7 @@ class PDFHandler:
self.img_dir_path = img_dir_path
self.img_path_list = []
self.img_count = 0
self.is_new_modify = 0 # 用于记录受新改动影响的PDF
self.xref_set = set()
self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'}
self.suffix = self.get_suffix(document_name)
......@@ -165,8 +174,10 @@ class PDFHandler:
except Exception as e:
pass
def page_to_png(self, page):
if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
def page_to_png(self, page, is_big_img=False):
if is_big_img:
pm = page.getPixmap(matrix=trans_3, alpha=False)
elif page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
pm = page.getPixmap(matrix=trans_1, alpha=False)
else:
pm = page.getPixmap(matrix=trans_2, alpha=False)
......@@ -236,8 +247,8 @@ class PDFHandler:
self.xref_set.add(xref)
self.img_path_list.append(img_save_path)
@staticmethod
def split_il(il):
# @staticmethod
def split_il(self, il):
broken_il = []
start = 0
length = len(il)
......@@ -247,6 +258,10 @@ class PDFHandler:
if il[i][-1] in ADOBE_FILTER_SET:
page_to_png = True
break
if il[i][2] >= TINY_IMG_MAX_WIDTH:
self.is_new_modify = 1
page_to_png = True
break
else:
for i in range(length):
# 当图片对象够大时,不作碎图合并处理,而是单纯提取
......@@ -446,6 +461,11 @@ class PDFHandler:
page = pdf.loadPage(pno)
self.page_to_png(page)
# 大图
elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
self.is_new_modify = 1
is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]) # 防止图片过大
page = pdf.loadPage(pno)
self.page_to_png(page, is_big_img=is_big_img)
elif xref not in self.xref_set:
self.extract_single_image(pdf, xref, smask, colorspace, pno)
# 3.页面图片对象数目大于1时,特殊处理
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!