b2945296 by 周伟奇

update pdf to img

1 parent b6896a10
...@@ -5,16 +5,12 @@ import signal ...@@ -5,16 +5,12 @@ import signal
5 import base64 5 import base64
6 import asyncio 6 import asyncio
7 import aiohttp 7 import aiohttp
8 import locale
9 from PIL import Image
10 from io import BytesIO
11 from openpyxl import Workbook 8 from openpyxl import Workbook
12 from openpyxl.styles import numbers
13 from openpyxl.utils import get_column_letter
14
15 from django.core.management import BaseCommand 9 from django.core.management import BaseCommand
10
16 from common.mixins import LoggerMixin 11 from common.mixins import LoggerMixin
17 from common.tools.file_tools import write_zip_file 12 from common.tools.file_tools import write_zip_file
13 from common.tools.pdf_to_img import PDFHandler
18 from apps.doc.models import DocStatus, HILDoc, AFCDoc 14 from apps.doc.models import DocStatus, HILDoc, AFCDoc
19 from apps.doc import consts 15 from apps.doc import consts
20 from settings import conf 16 from settings import conf
...@@ -123,126 +119,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -123,126 +119,6 @@ class Command(BaseCommand, LoggerMixin):
123 img_name = os.path.basename(img_path) 119 img_name = os.path.basename(img_path)
124 self.append_sheet(wb, sheets_list, img_name) 120 self.append_sheet(wb, sheets_list, img_name)
125 121
def proof(self, ws):
    """Add a balance-check column to one worksheet.

    Row 1 is scanned for the amount and balance ("overage") column titles
    listed in ``consts``. When either one is missing the sheet is left
    untouched. Otherwise text cells in the column range spanned by the two
    columns are converted to numbers where possible, and a new right-most
    column receives a formula that checks
    ``balance[row] == amount[row] + balance[row - 1]``.

    :param ws: worksheet whose first row holds the column titles.
    """
    # Locate the amount and balance columns.
    amount_col = overage_col = None
    for cell in ws[1]:
        if cell.value in consts.AMOUNT_COL_TITLE_SET:
            amount_col = cell.column
            amount_col_letter = get_column_letter(amount_col)
        elif cell.value in consts.OVERAGE_COL_TITLE_SET:
            overage_col = cell.column
            overage_col_letter = get_column_letter(overage_col)
    if amount_col is None or overage_col is None:
        return

    # Text -> number. The bounds are ordered so the conversion also runs when
    # the balance column sits to the left of the amount column (an inverted
    # min/max range would iterate over nothing).
    first_col, last_col = sorted((amount_col, overage_col))
    for col_tuple in ws.iter_cols(min_row=2, min_col=first_col, max_col=last_col):
        for c in col_tuple:
            try:
                c.value = locale.atof(c.value)
                c.number_format = numbers.FORMAT_NUMBER_00
            except Exception:
                # Best effort: cells that are not numeric text stay unchanged.
                continue

    # Append the check-result column.
    proof_col_letter = get_column_letter(ws.max_column + 1)
    for c in ws[proof_col_letter]:
        if c.row == 1:
            c.value = consts.PROOF_COL_TITLE
        elif c.row == 2:
            # First data row has no previous balance to compare against.
            continue
        else:
            c.value = '=IF({3}{0}=SUM({2}{0},{3}{1}), "{4}", "{5}")'.format(
                c.row, c.row - 1, amount_col_letter, overage_col_letter, *consts.PROOF_RES)
156
def wb_process(self, wb, excel_path):
    """Post-process every sheet of ``wb`` and save the workbook.

    The sheet titled ``'Sheet'`` is renamed to ``consts.META_SHEET_TITLE``;
    every other sheet is passed to ``self.proof``.

    :param wb: workbook to process.
    :param excel_path: destination path handed to ``wb.save``.
    """
    # Numeric locale is set before the sheets are handed to self.proof.
    locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')
    for sheet in wb.worksheets:
        if sheet.title != 'Sheet':
            self.proof(sheet)
            continue
        sheet.title = consts.META_SHEET_TITLE
    wb.save(excel_path)  # TODO no sheet (res always [])
165
166 @staticmethod
167 def getimage(pix):
168 if pix.colorspace.n != 4:
169 return pix
170 tpix = fitz.Pixmap(fitz.csRGB, pix)
171 return tpix
172
def recoverpix(self, doc, item):
    """Rebuild the image described by one page-image-list entry.

    :param doc: open fitz document that owns the image.
    :param item: image-list entry; index 0 is the image xref, index 1 the
        xref of its /SMask and index 5 the colorspace name.
    :return: the raw result of ``doc.extractImage`` for a DeviceRGB image
        without a mask, otherwise a pixmap passed through ``self.getimage``.
    """
    x = item[0]  # xref of PDF image
    s = item[1]  # xref of its /SMask
    is_rgb = True if item[5] == 'DeviceRGB' else False

    # RGB
    if is_rgb:
        if s == 0:
            # No soft mask: hand back the embedded image untouched.
            return doc.extractImage(x)
        # we need to reconstruct the alpha channel with the smask
        pix1 = fitz.Pixmap(doc, x)
        pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

        # sanity check: mask must cover the same rectangle, neither pixmap
        # may already carry alpha, and the mask must be single-component
        if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
            pix2 = None
            return self.getimage(pix1)

        pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
        pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
        pix1 = pix2 = None  # free temp pixmaps
        return self.getimage(pix)

    # Any colorspace other than DeviceRGB (e.g. CMYK)
    pix1 = fitz.Pixmap(doc, x)
    pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added

    if s != 0:
        pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

        # sanity check (same conditions as in the RGB branch)
        if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
            pix2 = None
            return self.getimage(pix1)

        pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value

    pix1 = pix2 = None  # free temp pixmaps

    pix = fitz.Pixmap(fitz.csRGB, pix)  # GRAY/CMYK to RGB
    return self.getimage(pix)
214
215 @staticmethod
216 def get_img_data(pix):
217 if type(pix) is dict: # we got a raw image
218 ext = pix["ext"]
219 img_data = pix["image"]
220 else: # we got a pixmap
221 ext = 'png'
222 img_data = pix.getPNGData()
223 return ext, img_data
224
225 @staticmethod
226 def split_il(il):
227 img_il_list = []
228 start = 0
229 length = len(il)
230 for i in range(length):
231 if i == start:
232 if i == length - 1:
233 img_il_list.append(il[start: length])
234 continue
235 elif i == length - 1:
236 img_il_list.append(il[start: length])
237 continue
238 if il[i][2] != il[i - 1][2]:
239 img_il_list.append(il[start: i])
240 start = i
241 elif il[i][3] != il[i - 1][3]:
242 img_il_list.append(il[start: i + 1])
243 start = i + 1
244 return img_il_list
245
246 # TODO 细化文件状态,不同异常状态采取不同的处理 122 # TODO 细化文件状态,不同异常状态采取不同的处理
247 # TODO 调用接口重试 123 # TODO 调用接口重试
248 def handle(self, *args, **kwargs): 124 def handle(self, *args, **kwargs):
...@@ -252,98 +128,33 @@ class Command(BaseCommand, LoggerMixin): ...@@ -252,98 +128,33 @@ class Command(BaseCommand, LoggerMixin):
252 while self.switch: 128 while self.switch:
253 # 1. 从队列获取文件信息 129 # 1. 从队列获取文件信息
254 doc, business_type = self.get_doc_info() 130 doc, business_type = self.get_doc_info()
255
256 try: 131 try:
257 # 2. 从EDMS获取PDF文件 132 # 2. 从EDMS获取PDF文件
258 doc_data_path, excel_path, pdf_path = self.pdf_download(doc, business_type) 133 doc_data_path, excel_path, pdf_path = self.pdf_download(doc, business_type)
259
260 # 队列为空时的处理 134 # 队列为空时的处理
261 if pdf_path is None: 135 if pdf_path is None:
262 time.sleep(sleep_second) 136 time.sleep(sleep_second)
263 sleep_second = min(max_sleep_second, sleep_second+5) 137 sleep_second = min(max_sleep_second, sleep_second+5)
264 continue 138 continue
265
266 sleep_second = int(conf.SLEEP_SECOND) 139 sleep_second = int(conf.SLEEP_SECOND)
267
268 # 3.PDF文件提取图片 140 # 3.PDF文件提取图片
269 img_save_path = os.path.join(doc_data_path, 'img') 141 img_save_path = os.path.join(doc_data_path, 'img')
270 os.makedirs(img_save_path, exist_ok=True) 142 self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format(
271 img_path_list = [] 143 self.log_base, business_type, doc.id))
272 with fitz.Document(pdf_path) as pdf: 144 pdf_handler = PDFHandler(pdf_path, img_save_path)
273 self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format( 145 pdf_handler.extract_image()
274 self.log_base, pdf_path, pdf.metadata)) 146 self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format(
275 # xref_list = [] # TODO 图片去重 特殊pdf:如电子发票
276 for pno in range(pdf.pageCount):
277 il = pdf.getPageImageList(pno)
278 il.sort(key=lambda x: x[0])
279 img_il_list = self.split_il(il)
280 del il
281
282 if len(img_il_list) > 3: # 单页无规律小图过多时,使用页面转图片
283 page = pdf.loadPage(pno)
284 pm = page.getPixmap(matrix=self.trans, alpha=False)
285 save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
286 pm.writePNG(save_path)
287 img_path_list.append(save_path)
288 self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format(
289 self.log_base, pdf_path, page.number))
290 else: # 提取图片
291 for img_index, img_il in enumerate(img_il_list):
292 if len(img_il) == 1: # 当只有一张图片时, 简化处理
293 pix = self.recoverpix(pdf, img_il[0])
294 ext, img_data = self.get_img_data(pix)
295 save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
296 pno, img_index, ext))
297 with open(save_path, "wb") as f:
298 f.write(img_data)
299 img_path_list.append(save_path)
300 self.cronjob_log.info(
301 '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
302 self.log_base, pdf_path, pno, img_index))
303 else: # 多张图片,竖向拼接
304 height_sum = 0
305 im_list = []
306 width = img_il[0][2]
307 for img in img_il:
308 # xref = img[0]
309 # if xref in xref_list:
310 # continue
311 height = img[3]
312 pix = self.recoverpix(pdf, img)
313 ext, img_data = self.get_img_data(pix)
314
315 # xref_list.append(xref)
316
317 im = Image.open(BytesIO(img_data))
318 im_list.append((height, im, ext))
319 height_sum += height
320
321 save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
322 pno, img_index, im_list[0][2]))
323 res = Image.new(im_list[0][1].mode, (width, height_sum))
324 h_now = 0
325 for h, m, _ in im_list:
326 res.paste(m, box=(0, h_now))
327 h_now += h
328 res.save(save_path)
329 img_path_list.append(save_path)
330 self.cronjob_log.info(
331 '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
332 self.log_base, pdf_path, pno, img_index))
333 self.cronjob_log.info('{0} [pdf to img success] [business_type={1}] [doc_id={2}]'.format(
334 self.log_base, business_type, doc.id)) 147 self.log_base, business_type, doc.id))
335
336 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) 148 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
337 149
338 # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 150 # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件
339 wb = Workbook() 151 wb = Workbook()
340 loop = asyncio.get_event_loop() 152 loop = asyncio.get_event_loop()
341 tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list] 153 tasks = [self.img_ocr_excel(wb, img_path) for img_path in pdf_handler.img_path_list]
342 loop.run_until_complete(asyncio.wait(tasks)) 154 loop.run_until_complete(asyncio.wait(tasks))
343 # loop.close() 155 # loop.close()
344 156
345 # 整合excel文件 157 # 整合excel文件
346 # self.wb_process(wb, excel_path)
347 wb.save(excel_path) 158 wb.save(excel_path)
348 except Exception as e: 159 except Exception as e:
349 doc.status = DocStatus.PROCESS_FAILED.value 160 doc.status = DocStatus.PROCESS_FAILED.value
......
1 import os
2 import fitz
3 import signal
4 from PIL import Image
5 from io import BytesIO
6
7 from django.core.management import BaseCommand
8 from common.mixins import LoggerMixin
9
10
11 class Command(BaseCommand, LoggerMixin):
12
def __init__(self):
    """Set up logging prefix, run switch, page-render matrix and SIGTERM hook."""
    super().__init__()
    self.log_base = '[pdf to img]'
    # File-processing switch
    self.switch = True
    # Zoom used when rendering a PDF page to an image
    self.zoom_x = self.zoom_y = 2.0
    self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0)  # zoom factor 2 in each dimension
    # Graceful-exit signal: 15
    signal.signal(signal.SIGTERM, self.signal_handler)
24
def signal_handler(self, sig, frame):
    """Signal callback: clear ``self.switch``.

    :param sig: signal number (unused).
    :param frame: current stack frame (unused).
    """
    self.switch = False  # stop processing files
27
28 @staticmethod
29 def getimage(pix):
30 if pix.colorspace.n != 4:
31 return pix
32 tpix = fitz.Pixmap(fitz.csRGB, pix)
33 return tpix
34
def recoverpix(self, doc, item):
    """Rebuild the image described by one page-image-list entry.

    :param doc: open fitz document that owns the image.
    :param item: image-list entry; index 0 is the image xref, index 1 the
        xref of its /SMask and index 5 the colorspace name.
    :return: the raw result of ``doc.extractImage`` for a DeviceRGB image
        without a mask, otherwise a pixmap passed through ``self.getimage``.
    """
    x = item[0]  # xref of PDF image
    s = item[1]  # xref of its /SMask
    is_rgb = True if item[5] == 'DeviceRGB' else False

    # RGB
    if is_rgb:
        if s == 0:
            # No soft mask: hand back the embedded image untouched.
            return doc.extractImage(x)
        # we need to reconstruct the alpha channel with the smask
        pix1 = fitz.Pixmap(doc, x)
        pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

        # sanity check: mask must cover the same rectangle, neither pixmap
        # may already carry alpha, and the mask must be single-component
        if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
            pix2 = None
            return self.getimage(pix1)

        pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
        pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
        pix1 = pix2 = None  # free temp pixmaps
        return self.getimage(pix)

    # GRAY/CMYK (every colorspace name other than DeviceRGB lands here)
    pix1 = fitz.Pixmap(doc, x)
    pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added

    if s != 0:
        pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

        # sanity check (same conditions as in the RGB branch)
        if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
            pix2 = None
            return self.getimage(pix1)

        pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value

    pix1 = pix2 = None  # free temp pixmaps

    pix = fitz.Pixmap(fitz.csRGB, pix)  # GRAY/CMYK to RGB
    return self.getimage(pix)
76
77 @staticmethod
78 def get_img_data(pix):
79 if type(pix) is dict: # we got a raw image
80 ext = pix["ext"]
81 img_data = pix["image"]
82 else: # we got a pixmap
83 ext = 'png'
84 img_data = pix.getPNGData()
85 return ext, img_data
86
87 @staticmethod
88 def split_il(il):
89 small_img_il_list = []
90 big_img_il_list = []
91 start = 0
92 index = 0
93 length = len(il)
94 for i in range(length):
95 if il[i][2] >= 700 and il[i][3] >= 647:
96 if start < i:
97 small_img_il_list.append((il[start: i], index))
98 index += 1
99 else:
100 start += 1
101 big_img_il_list.append((il[i], index))
102 index += 1
103 continue
104 if i == start:
105 if i == length - 1:
106 small_img_il_list.append((il[start: length], index))
107 continue
108 elif i == length - 1:
109 if il[i][2] == il[i - 1][2]:
110 small_img_il_list.append((il[start: length], index))
111 else:
112 small_img_il_list.append((il[start: i], index))
113 small_img_il_list.append((il[i: length], index+1))
114 continue
115 if il[i][2] != il[i - 1][2]:
116 small_img_il_list.append((il[start: i], index))
117 index += 1
118 start = i
119 elif il[i][3] != il[i - 1][3] and il[i][2] < 1200:
120 small_img_il_list.append((il[start: i + 1], index))
121 index += 1
122 start = i + 1
123 return small_img_il_list, big_img_il_list
124
def handle(self, *args, **kwargs):
    """Debug entry point: extract images from one hard-coded problem PDF.

    Walks ``pdf_dir``, skips everything except a single hard-coded file name,
    and writes the resulting images into a folder named after the PDF.
    Per page: no image objects -> render the page; exactly one -> render the
    page for a small image or extract a large one; several -> group them via
    ``self.split_il`` and extract / stitch, or render the page when there are
    more than two small groups.
    """
    # NOTE(review): developer-local paths and file name; this only works on
    # the author's machine.
    pdf_dir = '/Users/clay/Desktop/问题PDF'
    img_dir = '/Users/clay/Desktop/问题PDF'
    for d in os.listdir(pdf_dir):
        # Only the single PDF under investigation is processed.
        if d != 'CH-B008487944.pdf':
            continue
        pdf_path = os.path.join(pdf_dir, d)
        if os.path.isfile(pdf_path):
            # Output folder = file name without its 4-character extension.
            img_save_path = os.path.join(img_dir, d[:-4])
            os.makedirs(img_save_path, exist_ok=True)
            with fitz.Document(pdf_path) as pdf:
                self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format(
                    self.log_base, pdf_path, pdf.metadata))
                # xrefs already written, used to skip repeated image objects
                xref_set = set()
                for pno in range(pdf.pageCount):
                    print('---------------------------------------')
                    il = pdf.getPageImageList(pno)
                    # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
                    print(il)

                    if len(il) == 0:
                        # No image objects: render the whole page.
                        page = pdf.loadPage(pno)
                        pm = page.getPixmap(matrix=self.trans, alpha=False)
                        save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                        pm.writePNG(save_path)
                    elif len(il) == 1:
                        width = il[0][2]
                        height = il[0][3]
                        colorspace = il[0][5]
                        adobe_filter = il[0][-1]
                        # NOTE(review): both list entries are empty strings,
                        # possibly a leftover; confirm the intended filter names.
                        if colorspace == '' or adobe_filter in ['', '']:
                            continue
                        # Small image: render the whole page instead
                        if width < 500 and height < 500:
                            page = pdf.loadPage(pno)
                            pm = page.getPixmap(matrix=self.trans, alpha=False)
                            save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                            pm.writePNG(save_path)
                        # Large image: extract it unless already written
                        elif il[0][0] not in xref_set:
                            pix = self.recoverpix(pdf, il[0])
                            ext, img_data = self.get_img_data(pix)
                            save_path = os.path.join(img_save_path, 'page_{0}_img_0.{1}'.format(pno, ext))
                            with open(save_path, "wb") as f:
                                f.write(img_data)
                            xref_set.add(il[0][0])
                    else:
                        il.sort(key=lambda x: x[0])
                        small_img_il_list, big_img_il_list = self.split_il(il)
                        print(small_img_il_list)
                        print(big_img_il_list)
                        print('+++++++++++++++++++++++++++++++++++')

                        if len(small_img_il_list) > 2:  # too many irregular small images on one page: render the page
                            page = pdf.loadPage(pno)
                            pm = page.getPixmap(matrix=self.trans, alpha=False)
                            save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                            pm.writePNG(save_path)
                        else:  # extract images
                            for img_il, img_index in big_img_il_list:
                                if img_il[0] in xref_set:
                                    continue
                                pix = self.recoverpix(pdf, img_il)
                                ext, img_data = self.get_img_data(pix)
                                save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
                                    pno, img_index, ext))
                                with open(save_path, "wb") as f:
                                    f.write(img_data)
                                xref_set.add(img_il[0])

                            for img_il, img_index in small_img_il_list:
                                # Single small image: render the whole page
                                if len(img_il) == 1 and img_il[0][2] < 500 and img_il[0][3] < 500:
                                    page = pdf.loadPage(pno)
                                    pm = page.getPixmap(matrix=self.trans, alpha=False)
                                    save_path = os.path.join(img_save_path,
                                                             'page_{0}_img_0.png'.format(page.number))
                                    pm.writePNG(save_path)
                                elif len(img_il) == 1 and img_il[0][0] not in xref_set:  # single image: extract directly
                                    pix = self.recoverpix(pdf, img_il[0])
                                    ext, img_data = self.get_img_data(pix)
                                    save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
                                        pno, img_index, ext))
                                    with open(save_path, "wb") as f:
                                        f.write(img_data)
                                    xref_set.add(img_il[0][0])
                                else:  # several images: stitch them vertically
                                    height_sum = 0
                                    im_list = []
                                    width = img_il[0][2]
                                    for img in img_il:
                                        height = img[3]
                                        pix = self.recoverpix(pdf, img)
                                        ext, img_data = self.get_img_data(pix)

                                        im = Image.open(BytesIO(img_data))
                                        im_list.append((height, im, ext))
                                        height_sum += height

                                    # Output takes the extension and mode of the first fragment.
                                    save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
                                        pno, img_index, im_list[0][2]))
                                    res = Image.new(im_list[0][1].mode, (width, height_sum))
                                    h_now = 0
                                    for h, m, _ in im_list:
                                        res.paste(m, box=(0, h_now))
                                        h_now += h
                                    res.save(save_path)
...@@ -28,7 +28,8 @@ class DocHandler: ...@@ -28,7 +28,8 @@ class DocHandler:
28 def get_doc_class(business_type): 28 def get_doc_class(business_type):
29 return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX) 29 return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX)
30 30
31 def fix_scheme(self, scheme): 31 @staticmethod
32 def fix_scheme(scheme):
32 if scheme in consts.DOC_SCHEME_LIST: 33 if scheme in consts.DOC_SCHEME_LIST:
33 return scheme 34 return scheme
34 elif scheme.upper() in consts.DOC_SCHEME_LIST: 35 elif scheme.upper() in consts.DOC_SCHEME_LIST:
...@@ -36,7 +37,8 @@ class DocHandler: ...@@ -36,7 +37,8 @@ class DocHandler:
36 else: 37 else:
37 return consts.DOC_SCHEME_LIST[0] 38 return consts.DOC_SCHEME_LIST[0]
38 39
39 def fix_data_source(self, data_source): 40 @staticmethod
41 def fix_data_source(data_source):
40 if data_source in consts.DATA_SOURCE_LIST: 42 if data_source in consts.DATA_SOURCE_LIST:
41 return data_source 43 return data_source
42 elif data_source.upper() in consts.DATA_SOURCE_LIST: 44 elif data_source.upper() in consts.DATA_SOURCE_LIST:
......
1 import os
2 import fitz
3 from PIL import Image
4 from io import BytesIO
5
# Zoom factors used when a whole page is rendered to PNG
ZOOM_X = ZOOM_Y = 2.0
# Horizontal and vertical zoom are passed separately (ZOOM_Y was previously
# ignored because ZOOM_X was given twice).
trans = fitz.Matrix(ZOOM_X, ZOOM_Y).preRotate(0)  # zoom factor 2 in each dimension

# Image filters that get special handling (whole page rendered instead)
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}

# (width, height) threshold pairs
WH_COUPLE_1 = (500, 500)
WH_COUPLE_2 = (700, 647)
WH_COUPLE_3 = (100, 100)
WH_COUPLE_4 = (100, 300)
WH_COUPLE_5 = (100, 200)
19
20
21 class PDFHandler:
22
23 def __init__(self, path, img_dir_path):
24 self.path = path
25 self.img_dir_path = img_dir_path
26 self.img_path_list = []
27 self.xref_set = set()
28
def get_img_save_path(self, pno, img_index=0, ext='png'):
    """Build the output path ``<img_dir_path>/page_<pno>_img_<img_index>.<ext>``.

    :param pno: page number.
    :param img_index: index of the image within the page.
    :param ext: file extension without the dot.
    """
    file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
    return os.path.join(self.img_dir_path, file_name)
31
def page_to_png(self, page):
    """Render ``page`` with the module-level ``trans`` matrix and save it as PNG.

    The written path is appended to ``self.img_path_list``.

    :param page: fitz page to render.
    """
    target = self.get_img_save_path(page.number)
    page.getPixmap(matrix=trans, alpha=False).writePNG(target)
    self.img_path_list.append(target)
37
38 @staticmethod
39 def getimage(pix):
40 # RGB
41 if pix.colorspace.n != 4:
42 return pix
43 # GRAY/CMYK
44 tpix = fitz.Pixmap(fitz.csRGB, pix)
45 return tpix
46
def recover_pix(self, doc, xref, smask, colorspace):
    """Return the image stored at ``xref`` in a form ``get_img_data`` accepts.

    :param doc: open fitz document that owns the image.
    :param xref: xref of the image object.
    :param smask: xref of the image's /SMask; a non-zero value triggers
        alpha-channel reconstruction.
    :param colorspace: colorspace name from the page image list.
    :return: a pixmap (mask or Separation/DeviceCMYK case) or the raw result
        of ``doc.extractImage``.
    """
    if smask != 0:
        # we need to reconstruct the alpha channel with the smask
        pix1 = fitz.Pixmap(doc, xref)
        pix2 = fitz.Pixmap(doc, smask)  # create pixmap of the /SMask entry

        # sanity check: mask must cover the same rectangle, neither pixmap
        # may already carry alpha, and the mask must be single-component
        if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
            pix2 = None
            return self.getimage(pix1)

        pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
        pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
        pix1 = pix2 = None  # free temp pixmaps
        return self.getimage(pix)
    elif colorspace in {'Separation', 'DeviceCMYK'}:
        # These colorspaces are always converted to RGB.
        pix = fitz.Pixmap(doc, xref)
        tpix = fitz.Pixmap(fitz.csRGB, pix)
        return tpix
    else:
        # No mask, other colorspace: return the embedded image untouched.
        return doc.extractImage(xref)
68
69 @staticmethod
70 def get_img_data(pix):
71 if type(pix) is dict: # we got a raw image
72 ext = pix["ext"]
73 img_data = pix["image"]
74 else: # we got a pixmap
75 ext = 'png'
76 img_data = pix.getPNGData()
77 return ext, img_data
78
def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
    """Write one image object to disk and record it.

    After the file is written, ``xref`` is added to ``self.xref_set`` and the
    path is appended to ``self.img_path_list``.

    :param pdf: open fitz document.
    :param xref: xref of the image object.
    :param smask: xref of its /SMask.
    :param colorspace: colorspace name of the image.
    :param pno: page number, used in the file name.
    :param img_index: index of the image within the page, used in the file name.
    """
    ext, img_data = self.get_img_data(self.recover_pix(pdf, xref, smask, colorspace))
    target = self.get_img_save_path(pno, img_index=img_index, ext=ext)
    with open(target, "wb") as out:
        out.write(img_data)
    self.xref_set.add(xref)
    self.img_path_list.append(target)
87
@staticmethod
def split_il(il):
    """Decide how the image objects of one page should be handled.

    :param il: page image list; each entry has width at index 2, height at
        index 3 and the filter name at index 8.
    :return: one of three values that ``merge_il`` tells apart:

        * ``True``  - some entry uses a filter in ``ADOBE_FILTER_SET``; the
          whole page should be rendered to PNG;
        * ``None``  - some entry is at least ``WH_COUPLE_2`` in size; the
          images should simply be extracted one by one;
        * ``list``  - consecutive runs of fragments (slices of ``il``) to be
          stitched together.
    """
    # The filter is read by position rather than as the last element, so an
    # entry that carries a trailing invoker field (see the field list in
    # extract_image) is still classified correctly.
    if any(img[8] in ADOBE_FILTER_SET for img in il):
        return True

    # A sufficiently large image means no fragment merging at all.
    if any(img[2] >= WH_COUPLE_2[0] and img[3] >= WH_COUPLE_2[1] for img in il):
        return None

    broken_il = []
    start = 0
    last = len(il) - 1
    for i in range(len(il)):
        if i == start:
            if i == last:
                broken_il.append(il[start:])
            continue
        if i == last:
            if il[i][2] == il[i - 1][2]:
                broken_il.append(il[start:])
            else:
                # Width changed on the final entry: it forms its own run.
                broken_il.append(il[start: i])
                broken_il.append(il[i:])
            continue
        if il[i][2] != il[i - 1][2]:
            # Width change: close the run before this entry.
            broken_il.append(il[start: i])
            start = i
        elif il[i][3] != il[i - 1][3]:
            # Height change: close the run including this entry.
            broken_il.append(il[start: i + 1])
            start = i + 1
    return broken_il
125
def merge_il(self, pdf, pno, il):
    """Handle a page that contains more than one image object.

    Depending on what ``split_il`` returns, the images are extracted one by
    one, stitched group-wise into taller images, or the whole page is
    rendered to PNG as a fallback.

    Image-list fields are read by index (xref 0, smask 1, width 2, height 3,
    colorspace 5) instead of unpacking a fixed number of names, so entries
    that carry an extra trailing field do not raise ``ValueError``.

    :param pdf: open fitz document.
    :param pno: page number.
    :param il: page image list; sorted in place by xref.
    """
    # Group the fragments before trying to merge them.
    il.sort(key=lambda x: x[0])
    broken_il = self.split_il(il)

    page_to_png = True
    # 3.1 A large image is present: no merging, plain extraction.
    if broken_il is None:
        page_to_png = False
        for img_index, img in enumerate(il):
            xref, smask, width, height = img[:4]
            colorspace = img[5]
            if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]:  # skip small images (e.g. QR codes)
                continue
            elif xref not in self.xref_set:
                self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
    # 3.2 Merge fragments group by group.
    elif isinstance(broken_il, list) and len(broken_il) <= 2:
        for img_index, img_il in enumerate(broken_il):
            # 3.2.1 Single fragment: filter it out or extract it directly.
            if len(img_il) == 1:
                xref, smask, width, height = img_il[0][:4]
                colorspace = img_il[0][5]
                # skip small images (e.g. QR codes)
                if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \
                        (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]):
                    continue
                elif xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
                    # NOTE(review): flag reset kept inside this branch;
                    # confirm against the intended handling of repeated xrefs.
                    page_to_png = False
            # 3.2.2 Several fragments: stitch them vertically.
            else:
                height_sum = sum(img[3] for img in img_il)
                width = img_il[0][2]
                # skip small results and unusually tall ones
                if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \
                        (width > 1000 and height_sum > width * 3):
                    continue
                im_list = []
                for img in img_il:
                    xref, smask, height, colorspace = img[0], img[1], img[3], img[5]
                    pix = self.recover_pix(pdf, xref, smask, colorspace)
                    ext, img_data = self.get_img_data(pix)
                    im = Image.open(BytesIO(img_data))
                    im_list.append((height, im, ext))
                # The stitched image takes mode and extension of the first fragment.
                new_img = Image.new(im_list[0][1].mode, (width, height_sum))
                h_now = 0
                for h, m, _ in im_list:
                    new_img.paste(m, box=(0, h_now))
                    h_now += h
                img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
                new_img.save(img_save_path)
                page_to_png = False
                self.img_path_list.append(img_save_path)

    # 3.3 More than two groups, everything filtered out in 3.2, or a special
    # filter present: render the whole page to PNG.
    if page_to_png:
        page = pdf.loadPage(pno)
        self.page_to_png(page)
183
def extract_image(self):
    """Turn every page of ``self.path`` into one or more image files.

    Output goes to ``self.img_dir_path`` (created if needed); the written
    paths are collected in ``self.img_path_list``. Per page:

    1. no image objects: the whole page is rendered to PNG;
    2. exactly one image object: a small one (e.g. a stamp on an electronic
       statement) causes the page to be rendered, a large one is extracted
       unless its xref was already extracted;
    3. several image objects: delegated to ``merge_il``.
    """
    os.makedirs(self.img_dir_path, exist_ok=True)
    with fitz.Document(self.path) as pdf:
        for pno in range(pdf.pageCount):
            il = pdf.getPageImageList(pno)  # image objects of this page
            # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)

            if not il:
                self.page_to_png(pdf.loadPage(pno))
            elif len(il) == 1:
                # Fields are read by index so an entry with a trailing
                # invoker field does not break the unpacking.
                xref, smask, width, height = il[0][:4]
                colorspace = il[0][5]
                if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                    # small image
                    self.page_to_png(pdf.loadPage(pno))
                elif xref not in self.xref_set:
                    # large image
                    self.extract_single_image(pdf, xref, smask, colorspace, pno)
            else:
                self.merge_il(pdf, pno, il)
1 import fitz
2 import os
3 from PIL import Image
4 from io import BytesIO
5
6
7 class PdfHandler:
8
9 def __init__(self, pdf_path):
10 self.pdf_path = pdf_path
11 self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
12
def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
    """Render every page to ``<save_dir_path>/<pdf_name>_<page number>.png``.

    :param save_dir_path: existing folder that receives the images.
    :param zoom_x: horizontal zoom factor.
    :param zoom_y: vertical zoom factor.
    """
    matrix = fitz.Matrix(zoom_x, zoom_y).preRotate(0)
    with fitz.Document(self.pdf_path) as pdf:
        for page in pdf:
            file_name = '{0}_{1}.png'.format(self.pdf_name, page.number)
            pixmap = page.getPixmap(matrix=matrix, alpha=False)
            pixmap.writeImage(os.path.join(save_dir_path, file_name))
29
def page_to_svg_img(self, save_dir_path):
    """Write every page as ``<save_dir_path>/<pdf_name>_<page number>.svg``.

    :param save_dir_path: existing folder that receives the SVG files.
    """
    with fitz.Document(self.pdf_path) as pdf:
        for page in pdf:
            svg = page.getSVGimage(matrix=fitz.Identity)  # UTF-8 string svg
            save_path = os.path.join(save_dir_path, '{0}_{1}.svg'.format(self.pdf_name, page.number))
            # Encoding is pinned so the output does not depend on the
            # platform's default text encoding.
            with open(save_path, 'w', encoding='utf-8') as f:
                f.write(svg)
37
38 @staticmethod
39 def getimage(pix):
40 if pix.colorspace.n != 4:
41 return pix
42 tpix = fitz.Pixmap(fitz.csRGB, pix)
43 return tpix
44
def recoverpix(self, doc, item):
    """Rebuild the image described by one page-image-list entry.

    :param doc: open fitz document that owns the image.
    :param item: image-list entry; index 0 is the image xref, index 1 the
        xref of its /SMask and index 5 the colorspace name.
    :return: the raw result of ``doc.extractImage`` for a DeviceRGB image
        without a mask, otherwise a pixmap passed through ``self.getimage``.
    """
    x = item[0]  # xref of PDF image
    s = item[1]  # xref of its /SMask
    is_rgb = True if item[5] == 'DeviceRGB' else False

    # RGB
    if is_rgb:
        if s == 0:
            # No soft mask: hand back the embedded image untouched.
            return doc.extractImage(x)
        # we need to reconstruct the alpha channel with the smask
        pix1 = fitz.Pixmap(doc, x)
        pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

        # sanity check: mask must cover the same rectangle, neither pixmap
        # may already carry alpha, and the mask must be single-component
        if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
            pix2 = None
            return self.getimage(pix1)

        pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
        pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
        pix1 = pix2 = None  # free temp pixmaps
        return self.getimage(pix)

    # GRAY/CMYK (every colorspace name other than DeviceRGB lands here)
    pix1 = fitz.Pixmap(doc, x)
    pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added

    if s != 0:
        pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

        # sanity check (same conditions as in the RGB branch)
        if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
            pix2 = None
            return self.getimage(pix1)

        pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value

    pix1 = pix2 = None  # free temp pixmaps

    pix = fitz.Pixmap(fitz.csRGB, pix)  # GRAY/CMYK to RGB
    return self.getimage(pix)
86
def extract_images(self, save_dir_path):
    """Dump every distinct image object of the PDF into ``save_dir_path``.

    Files are named ``img-<xref>.<ext>``; an xref that appears on several
    pages is written once. Size-based filtering (minimum side length,
    minimum byte size, minimum image/pixmap ratio) is not applied.

    :param save_dir_path: existing folder that receives the images.
    """
    seen_xrefs = set()  # xrefs already written
    with fitz.Document(self.pdf_path) as pdf:
        for pno in range(pdf.pageCount):
            for img in pdf.getPageImageList(pno):
                print(img)
                xref = img[0]
                if xref in seen_xrefs:
                    continue
                width = img[2]
                height = img[3]
                print(xref, width, height)
                pix = self.recoverpix(pdf, img)
                if isinstance(pix, dict):  # we got a raw image
                    imgdata = pix["image"]
                    imgfile = os.path.join(save_dir_path, "img-%i.%s" % (xref, pix["ext"]))
                else:  # we got a pixmap
                    imgfile = os.path.join(save_dir_path, "img-%i.png" % xref)
                    imgdata = pix.getPNGData()

                # Context manager closes the file even if the write fails.
                with open(imgfile, "wb") as fout:
                    fout.write(imgdata)
                seen_xrefs.add(xref)
128
129 @staticmethod
130 def split_il(il):
131 img_il_list = []
132 start = 0
133 length = len(il)
134 for i in range(length):
135 if i == start:
136 if i == length - 1:
137 img_il_list.append(il[start: length])
138 continue
139 elif i == length - 1:
140 img_il_list.append(il[start: length])
141 continue
142 if il[i][2] != il[i - 1][2]:
143 img_il_list.append(il[start: i])
144 start = i
145 elif il[i][3] != il[i - 1][3]:
146 img_il_list.append(il[start: i + 1])
147 start = i + 1
148 return img_il_list
149
def extract_images_pro(self, save_dir_path):
    """Extract page images, stitching consecutive fragments vertically.

    For every page the image list is sorted by xref and grouped with
    ``self.split_il``; each group is saved as
    ``page_<pno>_img_<group index>.<ext>`` in ``save_dir_path``. Progress and
    intermediate data are printed to stdout.

    :param save_dir_path: existing folder that receives the images.
    """
    with fitz.Document(self.pdf_path) as pdf:
        print('----------------------------')
        print(self.pdf_name)
        print(pdf.metadata)
        for pno in range(pdf.pageCount):
            print('========================')
            il = pdf.getPageImageList(pno)
            il.sort(key=lambda x: x[0])
            # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)

            img_il_list = self.split_il(il)
            il = None
            print(img_il_list)
            print(len(img_il_list))

            for img_count, img_il in enumerate(img_il_list):
                print(img_il)
                height_sum = 0
                im_list = []
                for img in img_il:
                    width = img[2]
                    height = img[3]
                    pix = self.recoverpix(pdf, img)
                    if type(pix) is dict:  # we got a raw image
                        ext = pix["ext"]
                        img_data = pix["image"]
                    else:  # we got a pixmap
                        ext = 'png'
                        img_data = pix.getPNGData()

                    im = Image.open(BytesIO(img_data))
                    im_list.append((width, height, im, ext))
                    height_sum += height

                print(im_list)
                # The output file takes the extension of the group's first image.
                save_path = os.path.join(save_dir_path, 'page_{0}_img_{1}.{2}'.format(
                    pno, img_count, im_list[0][3]))
                # Only one image in the group: save it directly
                if len(im_list) == 1:
                    im_list[0][2].save(save_path)
                # Several images: stitch them vertically
                else:
                    # Canvas uses mode and width of the first image.
                    res = Image.new(im_list[0][2].mode, (im_list[0][0], height_sum))
                    h_now = 0
                    for _, h, m, _ in im_list:
                        res.paste(m, box=(0, h_now))
                        h_now += h
                    res.save(save_path)
205
206
if __name__ == '__main__':
    # Ad-hoc driver: run extract_images_pro over every PDF in a local folder.
    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
    pdf_list = os.listdir(dir_path)
    for path in pdf_list:
        if path == '.DS_Store':
            continue
        pdf_handler = PdfHandler(os.path.join(dir_path, path))
        save_path = os.path.join('/Users/clay/Desktop/biz/pdf_test/', 'test', os.path.splitext(os.path.basename(path))[0])
        # makedirs with exist_ok so a re-run, or a missing parent "test"
        # folder, does not abort the loop.
        os.makedirs(save_path, exist_ok=True)
        pdf_handler.extract_images_pro(save_path)

    # Other entry points that were tried manually:
    # pdf_handler.page_to_pix_img('/Users/clay/Desktop/biz/pdf_test/', 3.0, 3.0)
    # pdf_handler.page_to_svg_img('/Users/clay/Desktop/biz/pdf_test/')
    # pdf_handler.extract_images_pro('/Users/clay/Desktop/biz/pdf_test/test')
1 # 录题系统开发规范 1 # 宝马OCR系统开发规范
2 2
3 3
4 ## 代码规范 4 ## 代码规范
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!