img process
Showing 2 changed files with 47 additions and 27 deletions
| ... | @@ -585,7 +585,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -585,7 +585,7 @@ class Command(BaseCommand, LoggerMixin): |
| 585 | img_save_path = os.path.join(doc_data_path, 'img') | 585 | img_save_path = os.path.join(doc_data_path, 'img') |
| 586 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 586 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
| 587 | 587 | ||
| 588 | pdf_handler = PDFHandler(pdf_path, img_save_path) | 588 | pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name) |
| 589 | max_count_obj = Configs.objects.filter(id=2).first() | 589 | max_count_obj = Configs.objects.filter(id=2).first() |
| 590 | try: | 590 | try: |
| 591 | max_img_count = int(max_count_obj.value) | 591 | max_img_count = int(max_count_obj.value) | ... | ... |
| 1 | import os | 1 | import os |
| 2 | import shutil | ||
| 2 | import fitz | 3 | import fitz |
| 3 | from PIL import Image | 4 | from PIL import Image |
| 4 | from io import BytesIO | 5 | from io import BytesIO |
| ... | @@ -22,12 +23,25 @@ WH_COUPLE_5 = (100, 200) | ... | @@ -22,12 +23,25 @@ WH_COUPLE_5 = (100, 200) |
| 22 | 23 | ||
| 23 | class PDFHandler: | 24 | class PDFHandler: |
| 24 | 25 | ||
| 25 | def __init__(self, path, img_dir_path): | 26 | def __init__(self, path, img_dir_path, document_name=None): |
| 26 | self.path = path | 27 | self.path = path |
| 27 | self.img_dir_path = img_dir_path | 28 | self.img_dir_path = img_dir_path |
| 28 | self.img_path_list = [] | 29 | self.img_path_list = [] |
| 29 | self.img_count = 0 | 30 | self.img_count = 0 |
| 30 | self.xref_set = set() | 31 | self.xref_set = set() |
| 32 | self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'} | ||
| 33 | self.suffix = self.get_suffix(document_name) | ||
| 34 | |||
| 35 | def get_suffix(self, file_name): | ||
| 36 | if file_name is None: | ||
| 37 | return None | ||
| 38 | try: | ||
| 39 | _, src_suffix = os.path.splitext(file_name) | ||
| 40 | lower_suffix = src_suffix.lower() | ||
| 41 | if lower_suffix in self.img_suffixs: | ||
| 42 | return lower_suffix | ||
| 43 | except Exception as e: | ||
| 44 | return | ||
| 31 | 45 | ||
| 32 | def get_img_save_path(self, pno, img_index=0, ext='png'): | 46 | def get_img_save_path(self, pno, img_index=0, ext='png'): |
| 33 | return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)) | 47 | return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)) |
| ... | @@ -197,31 +211,37 @@ class PDFHandler: | ... | @@ -197,31 +211,37 @@ class PDFHandler: |
| 197 | self.img_path_list = [] | 211 | self.img_path_list = [] |
| 198 | self.xref_set = set() | 212 | self.xref_set = set() |
| 199 | os.makedirs(self.img_dir_path, exist_ok=True) | 213 | os.makedirs(self.img_dir_path, exist_ok=True) |
| 200 | with fitz.Document(self.path) as pdf: | 214 | |
| 201 | if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: | 215 | if self.suffix in self.img_suffixs: |
| 202 | self.img_count = pdf.pageCount | 216 | img_save_path = self.get_img_save_path(0, ext=self.suffix[1:]) |
| 203 | return | 217 | shutil.copy(self.path, img_save_path) |
| 204 | for pno in range(pdf.pageCount): | 218 | self.img_path_list.append(img_save_path) |
| 205 | il = pdf.getPageImageList(pno) # 获取页面图片对象 | 219 | else: |
| 206 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | 220 | with fitz.Document(self.path) as pdf: |
| 207 | 221 | if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: | |
| 208 | # 1.页面图片对象数目为0时,保存整个页面为png图片 | 222 | self.img_count = pdf.pageCount |
| 209 | if len(il) == 0: | 223 | return |
| 210 | page = pdf.loadPage(pno) | 224 | for pno in range(pdf.pageCount): |
| 211 | self.page_to_png(page) | 225 | il = pdf.getPageImageList(pno) # 获取页面图片对象 |
| 212 | # 2.页面图片对象数目为1时: | 226 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) |
| 213 | # 小图(如电子账单的盖章):保存整个页面为png图片 | 227 | |
| 214 | # 大图:提取图片对象 | 228 | # 1.页面图片对象数目为0时,保存整个页面为png图片 |
| 215 | elif len(il) == 1: | 229 | if len(il) == 0: |
| 216 | xref, smask, width, height, _, colorspace, _, _, _ = il[0] | ||
| 217 | # 小图 | ||
| 218 | if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]: | ||
| 219 | page = pdf.loadPage(pno) | 230 | page = pdf.loadPage(pno) |
| 220 | self.page_to_png(page) | 231 | self.page_to_png(page) |
| 221 | # 大图 | 232 | # 2.页面图片对象数目为1时: |
| 222 | elif xref not in self.xref_set: | 233 | # 小图(如电子账单的盖章):保存整个页面为png图片 |
| 223 | self.extract_single_image(pdf, xref, smask, colorspace, pno) | 234 | # 大图:提取图片对象 |
| 224 | # 3.页面图片对象数目大于1时,特殊处理 | 235 | elif len(il) == 1: |
| 225 | else: | 236 | xref, smask, width, height, _, colorspace, _, _, _ = il[0] |
| 226 | self.merge_il(pdf, pno, il) | 237 | # 小图 |
| 238 | if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]: | ||
| 239 | page = pdf.loadPage(pno) | ||
| 240 | self.page_to_png(page) | ||
| 241 | # 大图 | ||
| 242 | elif xref not in self.xref_set: | ||
| 243 | self.extract_single_image(pdf, xref, smask, colorspace, pno) | ||
| 244 | # 3.页面图片对象数目大于1时,特殊处理 | ||
| 245 | else: | ||
| 246 | self.merge_il(pdf, pno, il) | ||
| 227 | self.img_count = len(self.img_path_list) | 247 | self.img_count = len(self.img_path_list) | ... | ... |
-
Please register or sign in to post a comment