diff --git a/.gitignore b/.gitignore index f91d255..4233e35 100644 --- a/.gitignore +++ b/.gitignore @@ -31,4 +31,5 @@ conf/* data/* test* -flow_test.py \ No newline at end of file +flow_test.py +pdf_test.py \ No newline at end of file diff --git a/src/common/tools/pdf_to_img.py b/src/common/tools/pdf_to_img.py index e03b300..6f7c80b 100644 --- a/src/common/tools/pdf_to_img.py +++ b/src/common/tools/pdf_to_img.py @@ -12,8 +12,10 @@ from unicodedata import normalize # 页面保存为png图片参数 ZOOM_X_1 = ZOOM_Y_1 = 1.0 ZOOM_X_2 = ZOOM_Y_2 = 2.0 +ZOOM_X_3 = ZOOM_Y_3 = 3.0 trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension +trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension # 特殊filter处理 ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} @@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100) WH_COUPLE_4 = (100, 300) WH_COUPLE_5 = (100, 200) +# 碎图宽度阈值 +TINY_IMG_MAX_WIDTH = 1800 + +# 大图宽高阈值 +WH_COUPLE_6 = (1800, 1400) +WH_COUPLE_7 = (2500, 3000) class PDFBuild: @@ -165,8 +173,10 @@ class PDFHandler: except Exception as e: pass - def page_to_png(self, page): - if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: + def page_to_png(self, page, is_big_img=False): + if is_big_img: + pm = page.getPixmap(matrix=trans_3, alpha=False) + elif page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: pm = page.getPixmap(matrix=trans_1, alpha=False) else: pm = page.getPixmap(matrix=trans_2, alpha=False) @@ -247,6 +257,9 @@ class PDFHandler: if il[i][-1] in ADOBE_FILTER_SET: page_to_png = True break + if il[i][2] >= TINY_IMG_MAX_WIDTH: + page_to_png = True + break else: for i in range(length): # 当图片对象够大时,不作碎图合并处理,而是单纯提取 @@ -446,6 +459,10 @@ class PDFHandler: page = pdf.loadPage(pno) self.page_to_png(page) # 大图 + elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]: + is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]) # 防止图片过大 + page = pdf.loadPage(pno) + self.page_to_png(page, is_big_img=is_big_img) elif xref not in self.xref_set: self.extract_single_image(pdf, xref, smask, colorspace, pno) # 3.页面图片对象数目大于1时,特殊处理