update pdf to img
Showing
6 changed files
with
223 additions
and
682 deletions
| ... | @@ -5,16 +5,12 @@ import signal | ... | @@ -5,16 +5,12 @@ import signal |
| 5 | import base64 | 5 | import base64 |
| 6 | import asyncio | 6 | import asyncio |
| 7 | import aiohttp | 7 | import aiohttp |
| 8 | import locale | ||
| 9 | from PIL import Image | ||
| 10 | from io import BytesIO | ||
| 11 | from openpyxl import Workbook | 8 | from openpyxl import Workbook |
| 12 | from openpyxl.styles import numbers | ||
| 13 | from openpyxl.utils import get_column_letter | ||
| 14 | |||
| 15 | from django.core.management import BaseCommand | 9 | from django.core.management import BaseCommand |
| 10 | |||
| 16 | from common.mixins import LoggerMixin | 11 | from common.mixins import LoggerMixin |
| 17 | from common.tools.file_tools import write_zip_file | 12 | from common.tools.file_tools import write_zip_file |
| 13 | from common.tools.pdf_to_img import PDFHandler | ||
| 18 | from apps.doc.models import DocStatus, HILDoc, AFCDoc | 14 | from apps.doc.models import DocStatus, HILDoc, AFCDoc |
| 19 | from apps.doc import consts | 15 | from apps.doc import consts |
| 20 | from settings import conf | 16 | from settings import conf |
| ... | @@ -123,126 +119,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -123,126 +119,6 @@ class Command(BaseCommand, LoggerMixin): |
| 123 | img_name = os.path.basename(img_path) | 119 | img_name = os.path.basename(img_path) |
| 124 | self.append_sheet(wb, sheets_list, img_name) | 120 | self.append_sheet(wb, sheets_list, img_name) |
| 125 | 121 | ||
| 126 | def proof(self, ws): | ||
| 127 | # 找到金额、余额列 | ||
| 128 | amount_col = overage_col = None | ||
| 129 | for i in ws[1]: | ||
| 130 | if i.value in consts.AMOUNT_COL_TITLE_SET: | ||
| 131 | amount_col = i.column | ||
| 132 | amount_col_letter = get_column_letter(amount_col) | ||
| 133 | elif i.value in consts.OVERAGE_COL_TITLE_SET: | ||
| 134 | overage_col = i.column | ||
| 135 | overage_col_letter = get_column_letter(overage_col) | ||
| 136 | if amount_col is None or overage_col is None: | ||
| 137 | return | ||
| 138 | # 文本转数值 | ||
| 139 | for col_tuple in ws.iter_cols(min_row=2, min_col=amount_col, max_col=overage_col): | ||
| 140 | for c in col_tuple: | ||
| 141 | try: | ||
| 142 | c.value = locale.atof(c.value) | ||
| 143 | c.number_format = numbers.FORMAT_NUMBER_00 | ||
| 144 | except Exception: | ||
| 145 | continue | ||
| 146 | # 增加核对结果列 | ||
| 147 | proof_col_letter = get_column_letter(ws.max_column + 1) | ||
| 148 | for c in ws[proof_col_letter]: | ||
| 149 | if c.row == 1: | ||
| 150 | c.value = consts.PROOF_COL_TITLE | ||
| 151 | elif c.row == 2: | ||
| 152 | continue | ||
| 153 | else: | ||
| 154 | c.value = '=IF({3}{0}=SUM({2}{0},{3}{1}), "{4}", "{5}")'.format( | ||
| 155 | c.row, c.row - 1, amount_col_letter, overage_col_letter, *consts.PROOF_RES) | ||
| 156 | |||
| 157 | def wb_process(self, wb, excel_path): | ||
| 158 | locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8') | ||
| 159 | for ws in wb.worksheets: | ||
| 160 | if ws.title == 'Sheet': | ||
| 161 | ws.title = consts.META_SHEET_TITLE | ||
| 162 | else: | ||
| 163 | self.proof(ws) | ||
| 164 | wb.save(excel_path) # TODO no sheet (res always []) | ||
| 165 | |||
| 166 | @staticmethod | ||
| 167 | def getimage(pix): | ||
| 168 | if pix.colorspace.n != 4: | ||
| 169 | return pix | ||
| 170 | tpix = fitz.Pixmap(fitz.csRGB, pix) | ||
| 171 | return tpix | ||
| 172 | |||
| 173 | def recoverpix(self, doc, item): | ||
| 174 | x = item[0] # xref of PDF image | ||
| 175 | s = item[1] # xref of its /SMask | ||
| 176 | is_rgb = True if item[5] == 'DeviceRGB' else False | ||
| 177 | |||
| 178 | # RGB | ||
| 179 | if is_rgb: | ||
| 180 | if s == 0: | ||
| 181 | return doc.extractImage(x) | ||
| 182 | # we need to reconstruct the alpha channel with the smask | ||
| 183 | pix1 = fitz.Pixmap(doc, x) | ||
| 184 | pix2 = fitz.Pixmap(doc, s) # create pixmap of the /SMask entry | ||
| 185 | |||
| 186 | # sanity check | ||
| 187 | if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1): | ||
| 188 | pix2 = None | ||
| 189 | return self.getimage(pix1) | ||
| 190 | |||
| 191 | pix = fitz.Pixmap(pix1) # copy of pix1, alpha channel added | ||
| 192 | pix.setAlpha(pix2.samples) # treat pix2.samples as alpha value | ||
| 193 | pix1 = pix2 = None # free temp pixmaps | ||
| 194 | return self.getimage(pix) | ||
| 195 | |||
| 196 | # CMYK | ||
| 197 | pix1 = fitz.Pixmap(doc, x) | ||
| 198 | pix = fitz.Pixmap(pix1) # copy of pix1, alpha channel added | ||
| 199 | |||
| 200 | if s != 0: | ||
| 201 | pix2 = fitz.Pixmap(doc, s) # create pixmap of the /SMask entry | ||
| 202 | |||
| 203 | # sanity check | ||
| 204 | if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1): | ||
| 205 | pix2 = None | ||
| 206 | return self.getimage(pix1) | ||
| 207 | |||
| 208 | pix.setAlpha(pix2.samples) # treat pix2.samples as alpha value | ||
| 209 | |||
| 210 | pix1 = pix2 = None # free temp pixmaps | ||
| 211 | |||
| 212 | pix = fitz.Pixmap(fitz.csRGB, pix) # GRAY/CMYK to RGB | ||
| 213 | return self.getimage(pix) | ||
| 214 | |||
| 215 | @staticmethod | ||
| 216 | def get_img_data(pix): | ||
| 217 | if type(pix) is dict: # we got a raw image | ||
| 218 | ext = pix["ext"] | ||
| 219 | img_data = pix["image"] | ||
| 220 | else: # we got a pixmap | ||
| 221 | ext = 'png' | ||
| 222 | img_data = pix.getPNGData() | ||
| 223 | return ext, img_data | ||
| 224 | |||
| 225 | @staticmethod | ||
| 226 | def split_il(il): | ||
| 227 | img_il_list = [] | ||
| 228 | start = 0 | ||
| 229 | length = len(il) | ||
| 230 | for i in range(length): | ||
| 231 | if i == start: | ||
| 232 | if i == length - 1: | ||
| 233 | img_il_list.append(il[start: length]) | ||
| 234 | continue | ||
| 235 | elif i == length - 1: | ||
| 236 | img_il_list.append(il[start: length]) | ||
| 237 | continue | ||
| 238 | if il[i][2] != il[i - 1][2]: | ||
| 239 | img_il_list.append(il[start: i]) | ||
| 240 | start = i | ||
| 241 | elif il[i][3] != il[i - 1][3]: | ||
| 242 | img_il_list.append(il[start: i + 1]) | ||
| 243 | start = i + 1 | ||
| 244 | return img_il_list | ||
| 245 | |||
| 246 | # TODO 细化文件状态,不同异常状态采取不同的处理 | 122 | # TODO 细化文件状态,不同异常状态采取不同的处理 |
| 247 | # TODO 调用接口重试 | 123 | # TODO 调用接口重试 |
| 248 | def handle(self, *args, **kwargs): | 124 | def handle(self, *args, **kwargs): |
| ... | @@ -252,98 +128,33 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -252,98 +128,33 @@ class Command(BaseCommand, LoggerMixin): |
| 252 | while self.switch: | 128 | while self.switch: |
| 253 | # 1. 从队列获取文件信息 | 129 | # 1. 从队列获取文件信息 |
| 254 | doc, business_type = self.get_doc_info() | 130 | doc, business_type = self.get_doc_info() |
| 255 | |||
| 256 | try: | 131 | try: |
| 257 | # 2. 从EDMS获取PDF文件 | 132 | # 2. 从EDMS获取PDF文件 |
| 258 | doc_data_path, excel_path, pdf_path = self.pdf_download(doc, business_type) | 133 | doc_data_path, excel_path, pdf_path = self.pdf_download(doc, business_type) |
| 259 | |||
| 260 | # 队列为空时的处理 | 134 | # 队列为空时的处理 |
| 261 | if pdf_path is None: | 135 | if pdf_path is None: |
| 262 | time.sleep(sleep_second) | 136 | time.sleep(sleep_second) |
| 263 | sleep_second = min(max_sleep_second, sleep_second+5) | 137 | sleep_second = min(max_sleep_second, sleep_second+5) |
| 264 | continue | 138 | continue |
| 265 | |||
| 266 | sleep_second = int(conf.SLEEP_SECOND) | 139 | sleep_second = int(conf.SLEEP_SECOND) |
| 267 | |||
| 268 | # 3.PDF文件提取图片 | 140 | # 3.PDF文件提取图片 |
| 269 | img_save_path = os.path.join(doc_data_path, 'img') | 141 | img_save_path = os.path.join(doc_data_path, 'img') |
| 270 | os.makedirs(img_save_path, exist_ok=True) | 142 | self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format( |
| 271 | img_path_list = [] | 143 | self.log_base, business_type, doc.id)) |
| 272 | with fitz.Document(pdf_path) as pdf: | 144 | pdf_handler = PDFHandler(pdf_path, img_save_path) |
| 273 | self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format( | 145 | pdf_handler.extract_image() |
| 274 | self.log_base, pdf_path, pdf.metadata)) | 146 | self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format( |
| 275 | # xref_list = [] # TODO 图片去重 特殊pdf:如电子发票 | 147 | self.log_base, business_type, doc.id)) |
| 276 | for pno in range(pdf.pageCount): | ||
| 277 | il = pdf.getPageImageList(pno) | ||
| 278 | il.sort(key=lambda x: x[0]) | ||
| 279 | img_il_list = self.split_il(il) | ||
| 280 | del il | ||
| 281 | |||
| 282 | if len(img_il_list) > 3: # 单页无规律小图过多时,使用页面转图片 | ||
| 283 | page = pdf.loadPage(pno) | ||
| 284 | pm = page.getPixmap(matrix=self.trans, alpha=False) | ||
| 285 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | ||
| 286 | pm.writePNG(save_path) | ||
| 287 | img_path_list.append(save_path) | ||
| 288 | self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format( | ||
| 289 | self.log_base, pdf_path, page.number)) | ||
| 290 | else: # 提取图片 | ||
| 291 | for img_index, img_il in enumerate(img_il_list): | ||
| 292 | if len(img_il) == 1: # 当只有一张图片时, 简化处理 | ||
| 293 | pix = self.recoverpix(pdf, img_il[0]) | ||
| 294 | ext, img_data = self.get_img_data(pix) | ||
| 295 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
| 296 | pno, img_index, ext)) | ||
| 297 | with open(save_path, "wb") as f: | ||
| 298 | f.write(img_data) | ||
| 299 | img_path_list.append(save_path) | ||
| 300 | self.cronjob_log.info( | ||
| 301 | '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format( | ||
| 302 | self.log_base, pdf_path, pno, img_index)) | ||
| 303 | else: # 多张图片,竖向拼接 | ||
| 304 | height_sum = 0 | ||
| 305 | im_list = [] | ||
| 306 | width = img_il[0][2] | ||
| 307 | for img in img_il: | ||
| 308 | # xref = img[0] | ||
| 309 | # if xref in xref_list: | ||
| 310 | # continue | ||
| 311 | height = img[3] | ||
| 312 | pix = self.recoverpix(pdf, img) | ||
| 313 | ext, img_data = self.get_img_data(pix) | ||
| 314 | |||
| 315 | # xref_list.append(xref) | ||
| 316 | |||
| 317 | im = Image.open(BytesIO(img_data)) | ||
| 318 | im_list.append((height, im, ext)) | ||
| 319 | height_sum += height | ||
| 320 | |||
| 321 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
| 322 | pno, img_index, im_list[0][2])) | ||
| 323 | res = Image.new(im_list[0][1].mode, (width, height_sum)) | ||
| 324 | h_now = 0 | ||
| 325 | for h, m, _ in im_list: | ||
| 326 | res.paste(m, box=(0, h_now)) | ||
| 327 | h_now += h | ||
| 328 | res.save(save_path) | ||
| 329 | img_path_list.append(save_path) | ||
| 330 | self.cronjob_log.info( | ||
| 331 | '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format( | ||
| 332 | self.log_base, pdf_path, pno, img_index)) | ||
| 333 | self.cronjob_log.info('{0} [pdf to img success] [business_type={1}] [doc_id={2}]'.format( | ||
| 334 | self.log_base, business_type, doc.id)) | ||
| 335 | |||
| 336 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | 148 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) |
| 337 | 149 | ||
| 338 | # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 | 150 | # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 |
| 339 | wb = Workbook() | 151 | wb = Workbook() |
| 340 | loop = asyncio.get_event_loop() | 152 | loop = asyncio.get_event_loop() |
| 341 | tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list] | 153 | tasks = [self.img_ocr_excel(wb, img_path) for img_path in pdf_handler.img_path_list] |
| 342 | loop.run_until_complete(asyncio.wait(tasks)) | 154 | loop.run_until_complete(asyncio.wait(tasks)) |
| 343 | # loop.close() | 155 | # loop.close() |
| 344 | 156 | ||
| 345 | # 整合excel文件 | 157 | # 整合excel文件 |
| 346 | # self.wb_process(wb, excel_path) | ||
| 347 | wb.save(excel_path) | 158 | wb.save(excel_path) |
| 348 | except Exception as e: | 159 | except Exception as e: |
| 349 | doc.status = DocStatus.PROCESS_FAILED.value | 160 | doc.status = DocStatus.PROCESS_FAILED.value | ... | ... |
| 1 | import os | ||
| 2 | import fitz | ||
| 3 | import signal | ||
| 4 | from PIL import Image | ||
| 5 | from io import BytesIO | ||
| 6 | |||
| 7 | from django.core.management import BaseCommand | ||
| 8 | from common.mixins import LoggerMixin | ||
| 9 | |||
| 10 | |||
| 11 | class Command(BaseCommand, LoggerMixin): | ||
| 12 | |||
| 13 | def __init__(self): | ||
| 14 | super().__init__() | ||
| 15 | self.log_base = '[pdf to img]' | ||
| 16 | # 处理文件开关 | ||
| 17 | self.switch = True | ||
| 18 | # pdf页面转图片 | ||
| 19 | self.zoom_x = 2.0 | ||
| 20 | self.zoom_y = 2.0 | ||
| 21 | self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0) # zoom factor 2 in each dimension | ||
| 22 | # 优雅退出信号:15 | ||
| 23 | signal.signal(signal.SIGTERM, self.signal_handler) | ||
| 24 | |||
| 25 | def signal_handler(self, sig, frame): | ||
| 26 | self.switch = False # 停止处理文件 | ||
| 27 | |||
| 28 | @staticmethod | ||
| 29 | def getimage(pix): | ||
| 30 | if pix.colorspace.n != 4: | ||
| 31 | return pix | ||
| 32 | tpix = fitz.Pixmap(fitz.csRGB, pix) | ||
| 33 | return tpix | ||
| 34 | |||
| 35 | def recoverpix(self, doc, item): | ||
| 36 | x = item[0] # xref of PDF image | ||
| 37 | s = item[1] # xref of its /SMask | ||
| 38 | is_rgb = True if item[5] == 'DeviceRGB' else False | ||
| 39 | |||
| 40 | # RGB | ||
| 41 | if is_rgb: | ||
| 42 | if s == 0: | ||
| 43 | return doc.extractImage(x) | ||
| 44 | # we need to reconstruct the alpha channel with the smask | ||
| 45 | pix1 = fitz.Pixmap(doc, x) | ||
| 46 | pix2 = fitz.Pixmap(doc, s) # create pixmap of the /SMask entry | ||
| 47 | |||
| 48 | # sanity check | ||
| 49 | if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1): | ||
| 50 | pix2 = None | ||
| 51 | return self.getimage(pix1) | ||
| 52 | |||
| 53 | pix = fitz.Pixmap(pix1) # copy of pix1, alpha channel added | ||
| 54 | pix.setAlpha(pix2.samples) # treat pix2.samples as alpha value | ||
| 55 | pix1 = pix2 = None # free temp pixmaps | ||
| 56 | return self.getimage(pix) | ||
| 57 | |||
| 58 | # GRAY/CMYK | ||
| 59 | pix1 = fitz.Pixmap(doc, x) | ||
| 60 | pix = fitz.Pixmap(pix1) # copy of pix1, alpha channel added | ||
| 61 | |||
| 62 | if s != 0: | ||
| 63 | pix2 = fitz.Pixmap(doc, s) # create pixmap of the /SMask entry | ||
| 64 | |||
| 65 | # sanity check | ||
| 66 | if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1): | ||
| 67 | pix2 = None | ||
| 68 | return self.getimage(pix1) | ||
| 69 | |||
| 70 | pix.setAlpha(pix2.samples) # treat pix2.samples as alpha value | ||
| 71 | |||
| 72 | pix1 = pix2 = None # free temp pixmaps | ||
| 73 | |||
| 74 | pix = fitz.Pixmap(fitz.csRGB, pix) # GRAY/CMYK to RGB | ||
| 75 | return self.getimage(pix) | ||
| 76 | |||
| 77 | @staticmethod | ||
| 78 | def get_img_data(pix): | ||
| 79 | if type(pix) is dict: # we got a raw image | ||
| 80 | ext = pix["ext"] | ||
| 81 | img_data = pix["image"] | ||
| 82 | else: # we got a pixmap | ||
| 83 | ext = 'png' | ||
| 84 | img_data = pix.getPNGData() | ||
| 85 | return ext, img_data | ||
| 86 | |||
| 87 | @staticmethod | ||
| 88 | def split_il(il): | ||
| 89 | small_img_il_list = [] | ||
| 90 | big_img_il_list = [] | ||
| 91 | start = 0 | ||
| 92 | index = 0 | ||
| 93 | length = len(il) | ||
| 94 | for i in range(length): | ||
| 95 | if il[i][2] >= 700 and il[i][3] >= 647: | ||
| 96 | if start < i: | ||
| 97 | small_img_il_list.append((il[start: i], index)) | ||
| 98 | index += 1 | ||
| 99 | else: | ||
| 100 | start += 1 | ||
| 101 | big_img_il_list.append((il[i], index)) | ||
| 102 | index += 1 | ||
| 103 | continue | ||
| 104 | if i == start: | ||
| 105 | if i == length - 1: | ||
| 106 | small_img_il_list.append((il[start: length], index)) | ||
| 107 | continue | ||
| 108 | elif i == length - 1: | ||
| 109 | if il[i][2] == il[i - 1][2]: | ||
| 110 | small_img_il_list.append((il[start: length], index)) | ||
| 111 | else: | ||
| 112 | small_img_il_list.append((il[start: i], index)) | ||
| 113 | small_img_il_list.append((il[i: length], index+1)) | ||
| 114 | continue | ||
| 115 | if il[i][2] != il[i - 1][2]: | ||
| 116 | small_img_il_list.append((il[start: i], index)) | ||
| 117 | index += 1 | ||
| 118 | start = i | ||
| 119 | elif il[i][3] != il[i - 1][3] and il[i][2] < 1200: | ||
| 120 | small_img_il_list.append((il[start: i + 1], index)) | ||
| 121 | index += 1 | ||
| 122 | start = i + 1 | ||
| 123 | return small_img_il_list, big_img_il_list | ||
| 124 | |||
| 125 | def handle(self, *args, **kwargs): | ||
| 126 | pdf_dir = '/Users/clay/Desktop/问题PDF' | ||
| 127 | img_dir = '/Users/clay/Desktop/问题PDF' | ||
| 128 | for d in os.listdir(pdf_dir): | ||
| 129 | # if d in ['.DS_Store', 'CH-B008003736.pdf', 'CH-B006317088.pdf', 'CH-B008487476.pdf', 'CH-B006337608.pdf', | ||
| 130 | # 'CH-B006391612.pdf', 'CH-B006536124.pdf', 'CH-B006526652.pdf', 'CH-B009003592.pdf']: | ||
| 131 | # continue | ||
| 132 | # if d != 'CH-B006393152.PDF': | ||
| 133 | # if d != 'CH-B006526652.pdf': | ||
| 134 | if d != 'CH-B008487944.pdf': | ||
| 135 | continue | ||
| 136 | pdf_path = os.path.join(pdf_dir, d) | ||
| 137 | if os.path.isfile(pdf_path): | ||
| 138 | img_save_path = os.path.join(img_dir, d[:-4]) | ||
| 139 | # if os.path.exists(img_save_path): | ||
| 140 | # continue | ||
| 141 | os.makedirs(img_save_path, exist_ok=True) | ||
| 142 | with fitz.Document(pdf_path) as pdf: | ||
| 143 | self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format( | ||
| 144 | self.log_base, pdf_path, pdf.metadata)) | ||
| 145 | xref_set = set() | ||
| 146 | for pno in range(pdf.pageCount): | ||
| 147 | print('---------------------------------------') | ||
| 148 | il = pdf.getPageImageList(pno) | ||
| 149 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | ||
| 150 | print(il) | ||
| 151 | |||
| 152 | # for img_index, img in enumerate(il): | ||
| 153 | # pix = self.recoverpix(pdf, img) | ||
| 154 | # ext, img_data = self.get_img_data(pix) | ||
| 155 | # save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
| 156 | # pno, img_index, ext)) | ||
| 157 | # with open(save_path, "wb") as f: | ||
| 158 | # f.write(img_data) | ||
| 159 | |||
| 160 | if len(il) == 0: | ||
| 161 | page = pdf.loadPage(pno) | ||
| 162 | pm = page.getPixmap(matrix=self.trans, alpha=False) | ||
| 163 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | ||
| 164 | pm.writePNG(save_path) | ||
| 165 | elif len(il) == 1: | ||
| 166 | width = il[0][2] | ||
| 167 | height = il[0][3] | ||
| 168 | colorspace = il[0][5] | ||
| 169 | adobe_filter = il[0][-1] | ||
| 170 | if colorspace == '' or adobe_filter in ['', '']: | ||
| 171 | continue | ||
| 172 | # 小图 | ||
| 173 | if width < 500 and height < 500: | ||
| 174 | page = pdf.loadPage(pno) | ||
| 175 | pm = page.getPixmap(matrix=self.trans, alpha=False) | ||
| 176 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | ||
| 177 | pm.writePNG(save_path) | ||
| 178 | # 大图 | ||
| 179 | elif il[0][0] not in xref_set: | ||
| 180 | pix = self.recoverpix(pdf, il[0]) | ||
| 181 | ext, img_data = self.get_img_data(pix) | ||
| 182 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.{1}'.format(pno, ext)) | ||
| 183 | with open(save_path, "wb") as f: | ||
| 184 | f.write(img_data) | ||
| 185 | xref_set.add(il[0][0]) | ||
| 186 | else: | ||
| 187 | il.sort(key=lambda x: x[0]) | ||
| 188 | small_img_il_list, big_img_il_list = self.split_il(il) | ||
| 189 | print(small_img_il_list) | ||
| 190 | print(big_img_il_list) | ||
| 191 | print('+++++++++++++++++++++++++++++++++++') | ||
| 192 | |||
| 193 | if len(small_img_il_list) > 2: # 单页无规律小图过多时,使用页面转图片 | ||
| 194 | page = pdf.loadPage(pno) | ||
| 195 | pm = page.getPixmap(matrix=self.trans, alpha=False) | ||
| 196 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | ||
| 197 | pm.writePNG(save_path) | ||
| 198 | else: # 提取图片 | ||
| 199 | for img_il, img_index in big_img_il_list: | ||
| 200 | if img_il[0] in xref_set: | ||
| 201 | continue | ||
| 202 | pix = self.recoverpix(pdf, img_il) | ||
| 203 | ext, img_data = self.get_img_data(pix) | ||
| 204 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
| 205 | pno, img_index, ext)) | ||
| 206 | with open(save_path, "wb") as f: | ||
| 207 | f.write(img_data) | ||
| 208 | xref_set.add(img_il[0]) | ||
| 209 | |||
| 210 | for img_il, img_index in small_img_il_list: | ||
| 211 | # 小图 | ||
| 212 | if len(img_il) == 1 and img_il[0][2] < 500 and img_il[0][3] < 500: | ||
| 213 | page = pdf.loadPage(pno) | ||
| 214 | pm = page.getPixmap(matrix=self.trans, alpha=False) | ||
| 215 | save_path = os.path.join(img_save_path, | ||
| 216 | 'page_{0}_img_0.png'.format(page.number)) | ||
| 217 | pm.writePNG(save_path) | ||
| 218 | elif len(img_il) == 1 and img_il[0][0] not in xref_set: # 当只有一张图片时, 简化处理 | ||
| 219 | pix = self.recoverpix(pdf, img_il[0]) | ||
| 220 | ext, img_data = self.get_img_data(pix) | ||
| 221 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
| 222 | pno, img_index, ext)) | ||
| 223 | with open(save_path, "wb") as f: | ||
| 224 | f.write(img_data) | ||
| 225 | xref_set.add(img_il[0][0]) | ||
| 226 | else: # 多张图片,竖向拼接 | ||
| 227 | height_sum = 0 | ||
| 228 | im_list = [] | ||
| 229 | width = img_il[0][2] | ||
| 230 | for img in img_il: | ||
| 231 | # xref = img[0] | ||
| 232 | # if xref in xref_list: | ||
| 233 | # continue | ||
| 234 | height = img[3] | ||
| 235 | pix = self.recoverpix(pdf, img) | ||
| 236 | ext, img_data = self.get_img_data(pix) | ||
| 237 | |||
| 238 | # xref_list.append(xref) | ||
| 239 | |||
| 240 | im = Image.open(BytesIO(img_data)) | ||
| 241 | im_list.append((height, im, ext)) | ||
| 242 | height_sum += height | ||
| 243 | |||
| 244 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
| 245 | pno, img_index, im_list[0][2])) | ||
| 246 | res = Image.new(im_list[0][1].mode, (width, height_sum)) | ||
| 247 | h_now = 0 | ||
| 248 | for h, m, _ in im_list: | ||
| 249 | res.paste(m, box=(0, h_now)) | ||
| 250 | h_now += h | ||
| 251 | res.save(save_path) |
| ... | @@ -28,7 +28,8 @@ class DocHandler: | ... | @@ -28,7 +28,8 @@ class DocHandler: |
| 28 | def get_doc_class(business_type): | 28 | def get_doc_class(business_type): |
| 29 | return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX) | 29 | return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX) |
| 30 | 30 | ||
| 31 | def fix_scheme(self, scheme): | 31 | @staticmethod |
| 32 | def fix_scheme(scheme): | ||
| 32 | if scheme in consts.DOC_SCHEME_LIST: | 33 | if scheme in consts.DOC_SCHEME_LIST: |
| 33 | return scheme | 34 | return scheme |
| 34 | elif scheme.upper() in consts.DOC_SCHEME_LIST: | 35 | elif scheme.upper() in consts.DOC_SCHEME_LIST: |
| ... | @@ -36,7 +37,8 @@ class DocHandler: | ... | @@ -36,7 +37,8 @@ class DocHandler: |
| 36 | else: | 37 | else: |
| 37 | return consts.DOC_SCHEME_LIST[0] | 38 | return consts.DOC_SCHEME_LIST[0] |
| 38 | 39 | ||
| 39 | def fix_data_source(self, data_source): | 40 | @staticmethod |
| 41 | def fix_data_source(data_source): | ||
| 40 | if data_source in consts.DATA_SOURCE_LIST: | 42 | if data_source in consts.DATA_SOURCE_LIST: |
| 41 | return data_source | 43 | return data_source |
| 42 | elif data_source.upper() in consts.DATA_SOURCE_LIST: | 44 | elif data_source.upper() in consts.DATA_SOURCE_LIST: | ... | ... |
src/common/tools/pdf_to_img.py
0 → 100644
| 1 | import os | ||
| 2 | import fitz | ||
| 3 | from PIL import Image | ||
| 4 | from io import BytesIO | ||
| 5 | |||
# Zoom factors used when a whole page is rendered to a PNG image.
ZOOM_X = ZOOM_Y = 2.0
# Fixed: the matrix previously received ZOOM_X for both axes, leaving ZOOM_Y unused.
trans = fitz.Matrix(ZOOM_X, ZOOM_Y).preRotate(0)  # zoom factor 2 in each dimension

# Image stream filters that get special handling: when any image on a page
# uses one of them, the whole page is saved as a PNG instead.
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}

# (width, height) threshold pairs compared against image dimensions.
WH_COUPLE_1 = (500, 500)
WH_COUPLE_2 = (700, 647)
WH_COUPLE_3 = (100, 100)
WH_COUPLE_4 = (100, 300)
WH_COUPLE_5 = (100, 200)
| 19 | |||
| 20 | |||
class PDFHandler:
    """Extract the images of one PDF file into a directory.

    Call ``extract_image()`` once; afterwards ``img_path_list`` holds the paths
    of every file that was written, in the order they were produced.

    Image-list entries handled here are the tuples returned by
    ``Document.getPageImageList``:
    ``(xref, smask, width, height, bpc, colorspace, alt. colorspace, name, filter[, ...])``.
    Fields are always read by position so that entries carrying extra trailing
    fields are handled the same way as 9-field entries.
    """

    def __init__(self, path, img_dir_path):
        """
        :param path: path of the PDF file to read
        :param img_dir_path: directory the extracted images are written to
        """
        self.path = path
        self.img_dir_path = img_dir_path
        self.img_path_list = []  # paths of every image written so far
        self.xref_set = set()  # xrefs already written by extract_single_image

    def get_img_save_path(self, pno, img_index=0, ext='png'):
        """Return ``<img_dir_path>/page_<pno>_img_<img_index>.<ext>``."""
        return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))

    def page_to_png(self, page):
        """Render the whole page with the module-level ``trans`` matrix, save it as PNG and record the path."""
        pm = page.getPixmap(matrix=trans, alpha=False)
        img_save_path = self.get_img_save_path(page.number)
        pm.writePNG(img_save_path)
        self.img_path_list.append(img_save_path)

    @staticmethod
    def _img_fields(img):
        """Return ``(xref, smask, width, height, colorspace)`` of an image-list entry."""
        xref, smask, width, height = img[:4]
        return xref, smask, width, height, img[5]

    @staticmethod
    def getimage(pix):
        """Return ``pix`` itself, or an RGB copy when its colorspace has 4 components."""
        # Anything that is not 4-component is passed through untouched.
        if pix.colorspace.n != 4:
            return pix
        # 4 components: convert to RGB.
        return fitz.Pixmap(fitz.csRGB, pix)

    def recover_pix(self, doc, xref, smask, colorspace):
        """Load one image object from ``doc``.

        :param doc: the open fitz document
        :param xref: xref of the image
        :param smask: xref of the image's /SMask, 0 when there is none
        :param colorspace: colorspace name taken from the image-list entry
        :return: a ``fitz.Pixmap``, or the dict returned by ``doc.extractImage``
        """
        if smask != 0:
            # The alpha channel has to be rebuilt from the /SMask entry.
            pix1 = fitz.Pixmap(doc, xref)
            pix2 = fitz.Pixmap(doc, smask)

            # Sanity check: same rectangle, neither has alpha, mask is single-channel.
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                return self.getimage(pix1)

            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat the mask samples as alpha values
            return self.getimage(pix)
        if colorspace in {'Separation', 'DeviceCMYK'}:
            return fitz.Pixmap(fitz.csRGB, fitz.Pixmap(doc, xref))
        return doc.extractImage(xref)

    @staticmethod
    def get_img_data(pix):
        """Return ``(ext, img_data)`` for either a raw-image dict or a pixmap."""
        if isinstance(pix, dict):  # raw image as returned by extractImage
            return pix["ext"], pix["image"]
        return 'png', pix.getPNGData()  # pixmap

    def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
        """Write one image object to disk and record its xref and path."""
        pix = self.recover_pix(pdf, xref, smask, colorspace)
        ext, img_data = self.get_img_data(pix)
        img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
        with open(img_save_path, "wb") as f:
            f.write(img_data)
        self.xref_set.add(xref)
        self.img_path_list.append(img_save_path)

    @staticmethod
    def split_il(il):
        """Group the image fragments of one page.

        :param il: image-list entries of the page
        :return: ``True`` when any entry uses a filter in ``ADOBE_FILTER_SET``;
                 ``None`` when an entry is at least ``WH_COUPLE_2`` in both
                 dimensions; otherwise a list of groups (each a slice of ``il``)
        """
        # The filter is field 8. Reading it as "last field" picks the wrong
        # value as soon as an entry carries an extra trailing field.
        if any(img[8] in ADOBE_FILTER_SET for img in il):
            return True

        broken_il = []
        start = 0
        last = len(il) - 1
        for i, img in enumerate(il):
            # A large enough image: no fragment grouping at all.
            if img[2] >= WH_COUPLE_2[0] and img[3] >= WH_COUPLE_2[1]:
                return None
            if i == start:
                if i == last:
                    broken_il.append(il[start:])
                continue
            if i == last:
                if img[2] == il[i - 1][2]:
                    broken_il.append(il[start:])
                else:
                    broken_il.append(il[start:i])
                    broken_il.append(il[i:])
                continue
            if img[2] != il[i - 1][2]:
                # Width changed: the group ends before this entry.
                broken_il.append(il[start:i])
                start = i
            elif img[3] != il[i - 1][3]:
                # Same width, height changed: the group ends with this entry.
                broken_il.append(il[start:i + 1])
                start = i + 1
        return broken_il

    def _merge_fragments(self, pdf, pno, img_index, img_il):
        """Stitch one group of fragments top to bottom and save it; return True when a file was written."""
        height_sum = sum(img[3] for img in img_il)
        width = img_il[0][2]
        # Skip groups that are too small, and very wide groups that are unusually tall.
        if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \
                (width > 1000 and height_sum > width * 3):
            return False

        im_list = []
        for img in img_il:
            xref, smask, _, height, colorspace = self._img_fields(img)
            pix = self.recover_pix(pdf, xref, smask, colorspace)
            ext, img_data = self.get_img_data(pix)
            im_list.append((height, Image.open(BytesIO(img_data)), ext))

        # Mode and extension of the result are taken from the first fragment.
        new_img = Image.new(im_list[0][1].mode, (width, height_sum))
        h_now = 0
        for h, m, _ in im_list:
            new_img.paste(m, box=(0, h_now))
            h_now += h
        img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
        new_img.save(img_save_path)
        self.img_path_list.append(img_save_path)
        return True

    def merge_il(self, pdf, pno, il):
        """Handle a page that has more than one image object.

        Sorts ``il`` in place by xref, groups it with ``split_il`` and then
        extracts, stitches, or falls back to rendering the whole page.
        """
        il.sort(key=lambda x: x[0])
        broken_il = self.split_il(il)

        page_to_png = True
        # 3.1 A large image is present: no merging, plain extraction.
        if broken_il is None:
            page_to_png = False
            for img_index, img in enumerate(il):
                xref, smask, width, height, colorspace = self._img_fields(img)
                if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]:  # skip small images (e.g. QR codes)
                    continue
                if xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
        # 3.2 At most two fragment groups: handle each group.
        elif isinstance(broken_il, list) and len(broken_il) <= 2:
            for img_index, img_il in enumerate(broken_il):
                # 3.2.2 Several fragments: stitch them vertically.
                if len(img_il) != 1:
                    if self._merge_fragments(pdf, pno, img_index, img_il):
                        page_to_png = False
                    continue
                # 3.2.1 A single fragment: filter it out or extract it directly.
                xref, smask, width, height, colorspace = self._img_fields(img_il[0])
                if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \
                        (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]):
                    continue  # skip small images (e.g. QR codes)
                if xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
                    # NOTE(review): the flag is cleared only when a file was actually
                    # written, so an already-seen xref still falls back to page
                    # rendering -- confirm this matches the intended nesting.
                    page_to_png = False

        # 3.3 More than two groups, everything filtered out, or a special filter:
        #     save the whole page as PNG.
        if page_to_png:
            self.page_to_png(pdf.loadPage(pno))

    def extract_image(self):
        """Create the output directory and extract images from every page of the PDF."""
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            for pno in range(pdf.pageCount):
                il = pdf.getPageImageList(pno)  # image objects of the page

                # 1. No image object: save the whole page as PNG.
                if not il:
                    self.page_to_png(pdf.loadPage(pno))
                # 2. Exactly one image object.
                elif len(il) == 1:
                    xref, smask, width, height, colorspace = self._img_fields(il[0])
                    # Small image (e.g. the stamp on an e-statement): save the whole page.
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        self.page_to_png(pdf.loadPage(pno))
                    # Large image: extract the image object itself.
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. Several image objects: grouped handling.
                else:
                    self.merge_il(pdf, pno, il)
src/common/tools/pdf_tools.py
deleted
100644 → 0
| 1 | import fitz | ||
| 2 | import os | ||
| 3 | from PIL import Image | ||
| 4 | from io import BytesIO | ||
| 5 | |||
| 6 | |||
class PdfHandler:
    """Render the pages of one PDF file, or extract the images embedded in it.

    All methods use the PyMuPDF (``fitz``) method names this module was
    written against (``pageCount``, ``getPixmap``, ``getPageImageList`` ...).
    """

    def __init__(self, pdf_path):
        """Remember the PDF path and derive its base name without extension."""
        self.pdf_path = pdf_path
        self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
        """Render every page to ``<save_dir_path>/<pdf_name>_<page no>.png``.

        ``zoom_x`` / ``zoom_y`` are the scale factors applied in each
        dimension when rasterising the page.
        """
        trans = fitz.Matrix(zoom_x, zoom_y).preRotate(0)
        with fitz.Document(self.pdf_path) as pdf:
            for page in pdf:
                pm = page.getPixmap(matrix=trans, alpha=False)
                save_path = os.path.join(
                    save_dir_path, '{0}_{1}.png'.format(self.pdf_name, page.number))
                pm.writeImage(save_path)

    def page_to_svg_img(self, save_dir_path):
        """Write every page to ``<save_dir_path>/<pdf_name>_<page no>.svg``."""
        with fitz.Document(self.pdf_path) as pdf:
            for page in pdf:
                svg = page.getSVGimage(matrix=fitz.Identity)  # UTF-8 string svg
                save_path = os.path.join(
                    save_dir_path, '{0}_{1}.svg'.format(self.pdf_name, page.number))
                # Be explicit about the encoding so the output does not
                # depend on the platform's default locale.
                with open(save_path, 'w', encoding='utf-8') as f:
                    f.write(svg)

    @staticmethod
    def getimage(pix):
        """Return ``pix`` unchanged unless its colorspace has 4 components,
        in which case return an RGB conversion of it."""
        if pix.colorspace.n != 4:
            return pix
        return fitz.Pixmap(fitz.csRGB, pix)

    @staticmethod
    def _smask_is_usable(pix, mask):
        """Sanity check: the mask covers the same rectangle as ``pix``,
        neither pixmap has alpha, and the mask has a single component."""
        return pix.irect == mask.irect and pix.alpha == mask.alpha == 0 and mask.n == 1

    def recoverpix(self, doc, item):
        """Return the image described by ``item``, an entry of the page image list.

        ``item[0]`` is the image xref, ``item[1]`` the xref of its /SMask
        (0 when there is none) and ``item[5]`` the colorspace name.

        Returns either the dict produced by ``doc.extractImage`` (DeviceRGB
        image without a mask) or a ``fitz.Pixmap``.
        """
        xref = item[0]   # xref of PDF image
        smask = item[1]  # xref of its /SMask

        if item[5] == 'DeviceRGB':
            if smask == 0:
                return doc.extractImage(xref)
            # We need to reconstruct the alpha channel with the smask.
            pix1 = fitz.Pixmap(doc, xref)
            pix2 = fitz.Pixmap(doc, smask)  # pixmap of the /SMask entry
            if not self._smask_is_usable(pix1, pix2):
                return self.getimage(pix1)
            pix = fitz.Pixmap(pix1)     # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
            return self.getimage(pix)

        # GRAY/CMYK
        pix1 = fitz.Pixmap(doc, xref)
        pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
        if smask != 0:
            pix2 = fitz.Pixmap(doc, smask)  # pixmap of the /SMask entry
            if not self._smask_is_usable(pix1, pix2):
                return self.getimage(pix1)
            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value

        pix = fitz.Pixmap(fitz.csRGB, pix)  # GRAY/CMYK to RGB
        return self.getimage(pix)

    def _image_bytes(self, doc, item):
        """Return ``(extension, raw bytes)`` for one page-image-list entry."""
        pix = self.recoverpix(doc, item)
        if isinstance(pix, dict):  # we got a raw image
            return pix["ext"], pix["image"]
        return 'png', pix.getPNGData()  # we got a pixmap

    def extract_images(self, save_dir_path):
        """Save every distinct embedded image as ``img-<xref>.<ext>``.

        An image referenced from several pages (same xref) is written once.
        """
        seen_xrefs = set()
        with fitz.Document(self.pdf_path) as pdf:
            for pno in range(pdf.pageCount):
                for img in pdf.getPageImageList(pno):
                    xref = img[0]
                    if xref in seen_xrefs:
                        continue
                    ext, imgdata = self._image_bytes(pdf, img)
                    imgfile = os.path.join(save_dir_path, "img-%i.%s" % (xref, ext))
                    # The context manager closes the file even if the write fails.
                    with open(imgfile, "wb") as fout:
                        fout.write(imgdata)
                    seen_xrefs.add(xref)

    @staticmethod
    def split_il(il):
        """Split a page image list into runs of consecutive fragments.

        Entries are compared with their predecessor on width (index 2) and
        height (index 3):

        * a different width starts a new group at the current entry;
        * same width but a different height closes the group *after* the
          current entry, and the next entry starts a fresh group without
          being compared to it.

        Every entry, including the last one, goes through the width check,
        so a trailing image of a different width gets its own group and is
        never merged into a canvas sized for the previous width.

        Returns a list of non-empty slices of ``il``; ``[]`` for empty input.
        """
        groups = []
        start = 0
        for i in range(1, len(il)):
            if i == start:
                # First entry of a fresh group: nothing to compare it with.
                continue
            if il[i][2] != il[i - 1][2]:
                groups.append(il[start: i])
                start = i
            elif il[i][3] != il[i - 1][3]:
                groups.append(il[start: i + 1])
                start = i + 1
        if start < len(il):
            groups.append(il[start:])
        return groups

    def extract_images_pro(self, save_dir_path):
        """Extract images page by page, stitching fragment runs vertically.

        For every page the image list is sorted by xref and grouped with
        ``split_il``. A group of one image is saved as is; a larger group is
        pasted top to bottom onto a new canvas that uses the first image's
        mode and width. Files are named ``page_<pno>_img_<group no>.<ext>``
        with the extension of the group's first image.
        """
        with fitz.Document(self.pdf_path) as pdf:
            for pno in range(pdf.pageCount):
                # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
                il = pdf.getPageImageList(pno)
                il.sort(key=lambda x: x[0])

                for img_count, img_il in enumerate(self.split_il(il)):
                    height_sum = 0
                    im_list = []
                    for img in img_il:
                        width = img[2]
                        height = img[3]
                        ext, img_data = self._image_bytes(pdf, img)
                        im = Image.open(BytesIO(img_data))
                        im_list.append((width, height, im, ext))
                        height_sum += height

                    save_path = os.path.join(save_dir_path, 'page_{0}_img_{1}.{2}'.format(
                        pno, img_count, im_list[0][3]))
                    if len(im_list) == 1:
                        # Only one image: save it directly.
                        im_list[0][2].save(save_path)
                    else:
                        # Several images: concatenate them vertically.
                        res = Image.new(im_list[0][2].mode, (im_list[0][0], height_sum))
                        h_now = 0
                        for _, h, m, _ in im_list:
                            res.paste(m, box=(0, h_now))
                            h_now += h
                        res.save(save_path)
| 205 | |||
| 206 | |||
if __name__ == '__main__':
    # Ad-hoc driver: run the stitched image extraction over every PDF in a
    # local sample directory, writing each PDF's images to its own folder.
    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
    pdf_list = os.listdir(dir_path)
    for path in pdf_list:
        if path == '.DS_Store':
            continue
        pdf_handler = PdfHandler(os.path.join(dir_path, path))
        save_path = os.path.join(
            '/Users/clay/Desktop/biz/pdf_test/', 'test', os.path.splitext(os.path.basename(path))[0])
        # makedirs + exist_ok: create the missing 'test' parent as well and
        # do not fail when the script is run a second time.
        os.makedirs(save_path, exist_ok=True)
        pdf_handler.extract_images_pro(save_path)
-
Please register or sign in to post a comment