modify pdf_to_img
Showing 2 changed files with 20 additions and 2 deletions
| ... | @@ -12,8 +12,10 @@ from unicodedata import normalize | ... | @@ -12,8 +12,10 @@ from unicodedata import normalize |
| 12 | # 页面保存为png图片参数 | 12 | # 页面保存为png图片参数 |
| 13 | ZOOM_X_1 = ZOOM_Y_1 = 1.0 | 13 | ZOOM_X_1 = ZOOM_Y_1 = 1.0 |
| 14 | ZOOM_X_2 = ZOOM_Y_2 = 2.0 | 14 | ZOOM_X_2 = ZOOM_Y_2 = 2.0 |
| 15 | ZOOM_X_3 = ZOOM_Y_3 = 3.0 | ||
| 15 | trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension | 16 | trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension |
| 16 | trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension | 17 | trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension |
| 18 | trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension | ||
| 17 | 19 | ||
| 18 | # 特殊filter处理 | 20 | # 特殊filter处理 |
| 19 | ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} | 21 | ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} |
| ... | @@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100) | ... | @@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100) |
| 25 | WH_COUPLE_4 = (100, 300) | 27 | WH_COUPLE_4 = (100, 300) |
| 26 | WH_COUPLE_5 = (100, 200) | 28 | WH_COUPLE_5 = (100, 200) |
| 27 | 29 | ||
| 30 | # 碎图宽度阈值 | ||
| 31 | TINY_IMG_MAX_WIDTH = 1800 | ||
| 32 | |||
| 33 | # 大图宽高阈值 | ||
| 34 | WH_COUPLE_6 = (1800, 1400) | ||
| 35 | WH_COUPLE_7 = (2500, 3000) | ||
| 28 | 36 | ||
| 29 | class PDFBuild: | 37 | class PDFBuild: |
| 30 | 38 | ||
| ... | @@ -165,8 +173,10 @@ class PDFHandler: | ... | @@ -165,8 +173,10 @@ class PDFHandler: |
| 165 | except Exception as e: | 173 | except Exception as e: |
| 166 | pass | 174 | pass |
| 167 | 175 | ||
| 168 | def page_to_png(self, page): | 176 | def page_to_png(self, page, is_big_img=False): |
| 169 | if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: | 177 | if is_big_img: |
| 178 | pm = page.getPixmap(matrix=trans_3, alpha=False) | ||
| 179 | elif page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: | ||
| 170 | pm = page.getPixmap(matrix=trans_1, alpha=False) | 180 | pm = page.getPixmap(matrix=trans_1, alpha=False) |
| 171 | else: | 181 | else: |
| 172 | pm = page.getPixmap(matrix=trans_2, alpha=False) | 182 | pm = page.getPixmap(matrix=trans_2, alpha=False) |
| ... | @@ -247,6 +257,9 @@ class PDFHandler: | ... | @@ -247,6 +257,9 @@ class PDFHandler: |
| 247 | if il[i][-1] in ADOBE_FILTER_SET: | 257 | if il[i][-1] in ADOBE_FILTER_SET: |
| 248 | page_to_png = True | 258 | page_to_png = True |
| 249 | break | 259 | break |
| 260 | if il[i][2] >= TINY_IMG_MAX_WIDTH: | ||
| 261 | page_to_png = True | ||
| 262 | break | ||
| 250 | else: | 263 | else: |
| 251 | for i in range(length): | 264 | for i in range(length): |
| 252 | # 当图片对象够大时,不作碎图合并处理,而是单纯提取 | 265 | # 当图片对象够大时,不作碎图合并处理,而是单纯提取 |
| ... | @@ -446,6 +459,10 @@ class PDFHandler: | ... | @@ -446,6 +459,10 @@ class PDFHandler: |
| 446 | page = pdf.loadPage(pno) | 459 | page = pdf.loadPage(pno) |
| 447 | self.page_to_png(page) | 460 | self.page_to_png(page) |
| 448 | # 大图 | 461 | # 大图 |
| 462 | elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]: | ||
| 463 | is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]) # 防止图片过大 | ||
| 464 | page = pdf.loadPage(pno) | ||
| 465 | self.page_to_png(page, is_big_img=is_big_img) | ||
| 449 | elif xref not in self.xref_set: | 466 | elif xref not in self.xref_set: |
| 450 | self.extract_single_image(pdf, xref, smask, colorspace, pno) | 467 | self.extract_single_image(pdf, xref, smask, colorspace, pno) |
| 451 | # 3.页面图片对象数目大于1时,特殊处理 | 468 | # 3.页面图片对象数目大于1时,特殊处理 | ... | ... |
-
Please register or sign in to post a comment