d21adf2c by 周伟奇

modify pdf_to_img

1 parent b81daff4
...@@ -31,4 +31,5 @@ conf/* ...@@ -31,4 +31,5 @@ conf/*
31 data/* 31 data/*
32 32
33 test* 33 test*
34 flow_test.py
...\ No newline at end of file ...\ No newline at end of file
34 flow_test.py
35 pdf_test.py
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -12,8 +12,10 @@ from unicodedata import normalize ...@@ -12,8 +12,10 @@ from unicodedata import normalize
12 # 页面保存为png图片参数 12 # 页面保存为png图片参数
13 ZOOM_X_1 = ZOOM_Y_1 = 1.0 13 ZOOM_X_1 = ZOOM_Y_1 = 1.0
14 ZOOM_X_2 = ZOOM_Y_2 = 2.0 14 ZOOM_X_2 = ZOOM_Y_2 = 2.0
15 ZOOM_X_3 = ZOOM_Y_3 = 3.0
15 trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension 16 trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension
16 trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension 17 trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension
18 trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension
17 19
18 # 特殊filter处理 20 # 特殊filter处理
19 ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} 21 ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
...@@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100) ...@@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100)
25 WH_COUPLE_4 = (100, 300) 27 WH_COUPLE_4 = (100, 300)
26 WH_COUPLE_5 = (100, 200) 28 WH_COUPLE_5 = (100, 200)
27 29
30 # Width threshold for fragment (tiny) images
31 TINY_IMG_MAX_WIDTH = 1800
32
33 # Width/height thresholds for large images
34 WH_COUPLE_6 = (1800, 1400)
35 WH_COUPLE_7 = (2500, 3000)
28 36
29 class PDFBuild: 37 class PDFBuild:
30 38
...@@ -165,8 +173,10 @@ class PDFHandler: ...@@ -165,8 +173,10 @@ class PDFHandler:
165 except Exception as e: 173 except Exception as e:
166 pass 174 pass
167 175
168 def page_to_png(self, page): 176 def page_to_png(self, page, is_big_img=False):
169 if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: 177 if is_big_img:
178 pm = page.getPixmap(matrix=trans_3, alpha=False)
179 elif page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
170 pm = page.getPixmap(matrix=trans_1, alpha=False) 180 pm = page.getPixmap(matrix=trans_1, alpha=False)
171 else: 181 else:
172 pm = page.getPixmap(matrix=trans_2, alpha=False) 182 pm = page.getPixmap(matrix=trans_2, alpha=False)
...@@ -247,6 +257,9 @@ class PDFHandler: ...@@ -247,6 +257,9 @@ class PDFHandler:
247 if il[i][-1] in ADOBE_FILTER_SET: 257 if il[i][-1] in ADOBE_FILTER_SET:
248 page_to_png = True 258 page_to_png = True
249 break 259 break
260 if il[i][2] >= TINY_IMG_MAX_WIDTH:
261 page_to_png = True
262 break
250 else: 263 else:
251 for i in range(length): 264 for i in range(length):
252 # 当图片对象够大时,不作碎图合并处理,而是单纯提取 265 # 当图片对象够大时,不作碎图合并处理,而是单纯提取
...@@ -446,6 +459,10 @@ class PDFHandler: ...@@ -446,6 +459,10 @@ class PDFHandler:
446 page = pdf.loadPage(pno) 459 page = pdf.loadPage(pno)
447 self.page_to_png(page) 460 self.page_to_png(page)
448 # 大图 461 # 大图
462 elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
463 is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]) # 防止图片过大
464 page = pdf.loadPage(pno)
465 self.page_to_png(page, is_big_img=is_big_img)
449 elif xref not in self.xref_set: 466 elif xref not in self.xref_set:
450 self.extract_single_image(pdf, xref, smask, colorspace, pno) 467 self.extract_single_image(pdf, xref, smask, colorspace, pno)
451 # 3.页面图片对象数目大于1时,特殊处理 468 # 3.页面图片对象数目大于1时,特殊处理
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!