Merge branch 'feature/pdftoimg' into 'master'
Feature/pdftoimg. See merge request !23.
Showing 3 changed files with 28 additions and 7 deletions.
... | @@ -1339,8 +1339,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1339,8 +1339,8 @@ class Command(BaseCommand, LoggerMixin): |
1339 | pdf_handler.extract_image(max_img_count) | 1339 | pdf_handler.extract_image(max_img_count) |
1340 | end_time = time.time() | 1340 | end_time = time.time() |
1341 | speed_time = int(end_time - start_time) | 1341 | speed_time = int(end_time - start_time) |
1342 | self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( | 1342 | self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format( |
1343 | self.log_base, task_str, times, speed_time)) | 1343 | self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify)) |
1344 | except Exception as e: | 1344 | except Exception as e: |
1345 | self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' | 1345 | self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' |
1346 | '[error={3}]'.format(self.log_base, task_str, times, | 1346 | '[error={3}]'.format(self.log_base, task_str, times, | ... | ... |
... | @@ -12,8 +12,10 @@ from unicodedata import normalize | ... | @@ -12,8 +12,10 @@ from unicodedata import normalize |
12 | # 页面保存为png图片参数 | 12 | # 页面保存为png图片参数 |
13 | ZOOM_X_1 = ZOOM_Y_1 = 1.0 | 13 | ZOOM_X_1 = ZOOM_Y_1 = 1.0 |
14 | ZOOM_X_2 = ZOOM_Y_2 = 2.0 | 14 | ZOOM_X_2 = ZOOM_Y_2 = 2.0 |
15 | ZOOM_X_3 = ZOOM_Y_3 = 3.0 | ||
15 | trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension | 16 | trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension |
16 | trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension | 17 | trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension |
18 | trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension | ||
17 | 19 | ||
18 | # 特殊filter处理 | 20 | # 特殊filter处理 |
19 | ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} | 21 | ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} |
... | @@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100) | ... | @@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100) |
25 | WH_COUPLE_4 = (100, 300) | 27 | WH_COUPLE_4 = (100, 300) |
26 | WH_COUPLE_5 = (100, 200) | 28 | WH_COUPLE_5 = (100, 200) |
27 | 29 | ||
30 | # 碎图宽度阈值 | ||
31 | TINY_IMG_MAX_WIDTH = 1400 | ||
32 | |||
33 | # 大图宽高阈值 | ||
34 | WH_COUPLE_6 = (1800, 1400) | ||
35 | WH_COUPLE_7 = (2500, 3000) | ||
28 | 36 | ||
29 | class PDFBuild: | 37 | class PDFBuild: |
30 | 38 | ||
... | @@ -55,6 +63,7 @@ class PDFHandler: | ... | @@ -55,6 +63,7 @@ class PDFHandler: |
55 | self.img_dir_path = img_dir_path | 63 | self.img_dir_path = img_dir_path |
56 | self.img_path_list = [] | 64 | self.img_path_list = [] |
57 | self.img_count = 0 | 65 | self.img_count = 0 |
66 | self.is_new_modify = 0 # 用于记录受新改动影响的PDF | ||
58 | self.xref_set = set() | 67 | self.xref_set = set() |
59 | self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'} | 68 | self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'} |
60 | self.suffix = self.get_suffix(document_name) | 69 | self.suffix = self.get_suffix(document_name) |
... | @@ -165,8 +174,10 @@ class PDFHandler: | ... | @@ -165,8 +174,10 @@ class PDFHandler: |
165 | except Exception as e: | 174 | except Exception as e: |
166 | pass | 175 | pass |
167 | 176 | ||
168 | def page_to_png(self, page): | 177 | def page_to_png(self, page, is_big_img=False): |
169 | if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: | 178 | if is_big_img: |
179 | pm = page.getPixmap(matrix=trans_3, alpha=False) | ||
180 | elif page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: | ||
170 | pm = page.getPixmap(matrix=trans_1, alpha=False) | 181 | pm = page.getPixmap(matrix=trans_1, alpha=False) |
171 | else: | 182 | else: |
172 | pm = page.getPixmap(matrix=trans_2, alpha=False) | 183 | pm = page.getPixmap(matrix=trans_2, alpha=False) |
... | @@ -236,8 +247,8 @@ class PDFHandler: | ... | @@ -236,8 +247,8 @@ class PDFHandler: |
236 | self.xref_set.add(xref) | 247 | self.xref_set.add(xref) |
237 | self.img_path_list.append(img_save_path) | 248 | self.img_path_list.append(img_save_path) |
238 | 249 | ||
239 | @staticmethod | 250 | # @staticmethod |
240 | def split_il(il): | 251 | def split_il(self, il): |
241 | broken_il = [] | 252 | broken_il = [] |
242 | start = 0 | 253 | start = 0 |
243 | length = len(il) | 254 | length = len(il) |
... | @@ -247,6 +258,10 @@ class PDFHandler: | ... | @@ -247,6 +258,10 @@ class PDFHandler: |
247 | if il[i][-1] in ADOBE_FILTER_SET: | 258 | if il[i][-1] in ADOBE_FILTER_SET: |
248 | page_to_png = True | 259 | page_to_png = True |
249 | break | 260 | break |
261 | if il[i][2] >= TINY_IMG_MAX_WIDTH: | ||
262 | self.is_new_modify = 1 | ||
263 | page_to_png = True | ||
264 | break | ||
250 | else: | 265 | else: |
251 | for i in range(length): | 266 | for i in range(length): |
252 | # 当图片对象够大时,不作碎图合并处理,而是单纯提取 | 267 | # 当图片对象够大时,不作碎图合并处理,而是单纯提取 |
... | @@ -446,6 +461,11 @@ class PDFHandler: | ... | @@ -446,6 +461,11 @@ class PDFHandler: |
446 | page = pdf.loadPage(pno) | 461 | page = pdf.loadPage(pno) |
447 | self.page_to_png(page) | 462 | self.page_to_png(page) |
448 | # 大图 | 463 | # 大图 |
464 | elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]: | ||
465 | self.is_new_modify = 1 | ||
466 | is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]) # 防止图片过大 | ||
467 | page = pdf.loadPage(pno) | ||
468 | self.page_to_png(page, is_big_img=is_big_img) | ||
449 | elif xref not in self.xref_set: | 469 | elif xref not in self.xref_set: |
450 | self.extract_single_image(pdf, xref, smask, colorspace, pno) | 470 | self.extract_single_image(pdf, xref, smask, colorspace, pno) |
451 | # 3.页面图片对象数目大于1时,特殊处理 | 471 | # 3.页面图片对象数目大于1时,特殊处理 | ... | ... |
-
Please register or sign in to post a comment