modify pdf_to_img
Showing 2 changed files with 20 additions and 2 deletions
... | @@ -12,8 +12,10 @@ from unicodedata import normalize | ... | @@ -12,8 +12,10 @@ from unicodedata import normalize |
12 | # 页面保存为png图片参数 | 12 | # 页面保存为png图片参数 |
13 | ZOOM_X_1 = ZOOM_Y_1 = 1.0 | 13 | ZOOM_X_1 = ZOOM_Y_1 = 1.0 |
14 | ZOOM_X_2 = ZOOM_Y_2 = 2.0 | 14 | ZOOM_X_2 = ZOOM_Y_2 = 2.0 |
15 | ZOOM_X_3 = ZOOM_Y_3 = 3.0 | ||
15 | trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension | 16 | trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension |
16 | trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension | 17 | trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension |
18 | trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension | ||
17 | 19 | ||
18 | # 特殊filter处理 | 20 | # 特殊filter处理 |
19 | ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} | 21 | ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} |
... | @@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100) | ... | @@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100) |
25 | WH_COUPLE_4 = (100, 300) | 27 | WH_COUPLE_4 = (100, 300) |
26 | WH_COUPLE_5 = (100, 200) | 28 | WH_COUPLE_5 = (100, 200) |
27 | 29 | ||
30 | # 碎图宽度阈值 | ||
31 | TINY_IMG_MAX_WIDTH = 1800 | ||
32 | |||
33 | # 大图宽高阈值 | ||
34 | WH_COUPLE_6 = (1800, 1400) | ||
35 | WH_COUPLE_7 = (2500, 3000) | ||
28 | 36 | ||
29 | class PDFBuild: | 37 | class PDFBuild: |
30 | 38 | ||
... | @@ -165,8 +173,10 @@ class PDFHandler: | ... | @@ -165,8 +173,10 @@ class PDFHandler: |
165 | except Exception as e: | 173 | except Exception as e: |
166 | pass | 174 | pass |
167 | 175 | ||
168 | def page_to_png(self, page): | 176 | def page_to_png(self, page, is_big_img=False): |
169 | if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: | 177 | if is_big_img: |
178 | pm = page.getPixmap(matrix=trans_3, alpha=False) | ||
179 | elif page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: | ||
170 | pm = page.getPixmap(matrix=trans_1, alpha=False) | 180 | pm = page.getPixmap(matrix=trans_1, alpha=False) |
171 | else: | 181 | else: |
172 | pm = page.getPixmap(matrix=trans_2, alpha=False) | 182 | pm = page.getPixmap(matrix=trans_2, alpha=False) |
... | @@ -247,6 +257,9 @@ class PDFHandler: | ... | @@ -247,6 +257,9 @@ class PDFHandler: |
247 | if il[i][-1] in ADOBE_FILTER_SET: | 257 | if il[i][-1] in ADOBE_FILTER_SET: |
248 | page_to_png = True | 258 | page_to_png = True |
249 | break | 259 | break |
260 | if il[i][2] >= TINY_IMG_MAX_WIDTH: | ||
261 | page_to_png = True | ||
262 | break | ||
250 | else: | 263 | else: |
251 | for i in range(length): | 264 | for i in range(length): |
252 | # 当图片对象够大时,不作碎图合并处理,而是单纯提取 | 265 | # 当图片对象够大时,不作碎图合并处理,而是单纯提取 |
... | @@ -446,6 +459,10 @@ class PDFHandler: | ... | @@ -446,6 +459,10 @@ class PDFHandler: |
446 | page = pdf.loadPage(pno) | 459 | page = pdf.loadPage(pno) |
447 | self.page_to_png(page) | 460 | self.page_to_png(page) |
448 | # 大图 | 461 | # 大图 |
462 | elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]: | ||
463 | is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]) # 防止图片过大 | ||
464 | page = pdf.loadPage(pno) | ||
465 | self.page_to_png(page, is_big_img=is_big_img) | ||
449 | elif xref not in self.xref_set: | 466 | elif xref not in self.xref_set: |
450 | self.extract_single_image(pdf, xref, smask, colorspace, pno) | 467 | self.extract_single_image(pdf, xref, smask, colorspace, pno) |
451 | # 3.页面图片对象数目大于1时,特殊处理 | 468 | # 3.页面图片对象数目大于1时,特殊处理 | ... | ... |
-
Please register or sign in to post a comment