b2945296 by 周伟奇

update pdf to img

1 parent b6896a10
...@@ -28,7 +28,8 @@ class DocHandler: ...@@ -28,7 +28,8 @@ class DocHandler:
28 def get_doc_class(business_type): 28 def get_doc_class(business_type):
29 return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX) 29 return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX)
30 30
31 def fix_scheme(self, scheme): 31 @staticmethod
32 def fix_scheme(scheme):
32 if scheme in consts.DOC_SCHEME_LIST: 33 if scheme in consts.DOC_SCHEME_LIST:
33 return scheme 34 return scheme
34 elif scheme.upper() in consts.DOC_SCHEME_LIST: 35 elif scheme.upper() in consts.DOC_SCHEME_LIST:
...@@ -36,7 +37,8 @@ class DocHandler: ...@@ -36,7 +37,8 @@ class DocHandler:
36 else: 37 else:
37 return consts.DOC_SCHEME_LIST[0] 38 return consts.DOC_SCHEME_LIST[0]
38 39
39 def fix_data_source(self, data_source): 40 @staticmethod
41 def fix_data_source(data_source):
40 if data_source in consts.DATA_SOURCE_LIST: 42 if data_source in consts.DATA_SOURCE_LIST:
41 return data_source 43 return data_source
42 elif data_source.upper() in consts.DATA_SOURCE_LIST: 44 elif data_source.upper() in consts.DATA_SOURCE_LIST:
......
1 import os
2 import fitz
3 from PIL import Image
4 from io import BytesIO
5
# Parameters used when a whole page is rendered to a PNG image.
ZOOM_X = ZOOM_Y = 2.0
# Zoom factor 2 in each dimension; the rotation of 0 degrees leaves the orientation unchanged.
trans = fitz.Matrix(ZOOM_X, ZOOM_Y).preRotate(0)

# Image filters that need special handling: a page containing an image object
# with one of these filters is saved as a whole-page PNG instead.
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}

# (width, height) threshold pairs, in pixels, used by the extraction heuristics.
WH_COUPLE_1 = (500, 500)
WH_COUPLE_2 = (700, 647)
WH_COUPLE_3 = (100, 100)
WH_COUPLE_4 = (100, 300)
WH_COUPLE_5 = (100, 200)
19
20
class PDFHandler:
    """Turn every page of a PDF into one or more image files.

    Embedded image objects are extracted directly when they look like the
    page content; fragmented images are stitched vertically; otherwise the
    whole page is rendered to a PNG. Paths of all written files are
    collected in ``img_path_list`` in the order they were produced.
    """

    def __init__(self, path, img_dir_path):
        """Remember the PDF path and the directory that receives the images.

        :param path: path of the PDF file to process.
        :param img_dir_path: directory the image files are written to.
        """
        self.path = path
        self.img_dir_path = img_dir_path
        self.img_path_list = []  # paths of the files written so far
        self.xref_set = set()  # xrefs already extracted by extract_single_image

    def get_img_save_path(self, pno, img_index=0, ext='png'):
        """Return the output path for image ``img_index`` of page ``pno``."""
        return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))

    def page_to_png(self, page):
        """Render ``page`` with the module-level ``trans`` matrix and save it as PNG."""
        pm = page.getPixmap(matrix=trans, alpha=False)
        img_save_path = self.get_img_save_path(page.number)
        pm.writePNG(img_save_path)
        self.img_path_list.append(img_save_path)

    @staticmethod
    def getimage(pix):
        """Return ``pix`` unchanged unless its colorspace has 4 components.

        A 4-component pixmap is converted to RGB.
        """
        if pix.colorspace.n != 4:
            return pix
        # 4 components: convert to RGB
        tpix = fitz.Pixmap(fitz.csRGB, pix)
        return tpix

    def recover_pix(self, doc, xref, smask, colorspace):
        """Load the image object ``xref`` from ``doc``.

        :param doc: the opened document.
        :param xref: xref of the image object.
        :param smask: xref of its /SMask entry, 0 when there is none.
        :param colorspace: colorspace name reported for the image.
        :return: a pixmap, or the dict returned by ``doc.extractImage`` when
            the image has no soft mask and needs no colorspace conversion.
        """
        if smask != 0:
            # we need to reconstruct the alpha channel with the smask
            pix1 = fitz.Pixmap(doc, xref)
            pix2 = fitz.Pixmap(doc, smask)  # create pixmap of the /SMask entry

            # sanity check: the mask must match the image and carry one component
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                pix2 = None
                return self.getimage(pix1)

            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
            pix1 = pix2 = None  # free temp pixmaps
            return self.getimage(pix)
        elif colorspace in {'Separation', 'DeviceCMYK'}:
            pix = fitz.Pixmap(doc, xref)
            tpix = fitz.Pixmap(fitz.csRGB, pix)
            return tpix
        else:
            return doc.extractImage(xref)

    @staticmethod
    def get_img_data(pix):
        """Return ``(ext, img_data)`` for a value produced by ``recover_pix``.

        A dict (raw image) supplies its own ``"ext"`` and ``"image"``; a
        pixmap is encoded as PNG.
        """
        if isinstance(pix, dict):  # we got a raw image
            ext = pix["ext"]
            img_data = pix["image"]
        else:  # we got a pixmap
            ext = 'png'
            img_data = pix.getPNGData()
        return ext, img_data

    def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
        """Write the image object ``xref`` to disk and record its xref and path."""
        pix = self.recover_pix(pdf, xref, smask, colorspace)
        ext, img_data = self.get_img_data(pix)
        img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
        with open(img_save_path, "wb") as f:
            f.write(img_data)
        self.xref_set.add(xref)
        self.img_path_list.append(img_save_path)

    @staticmethod
    def split_il(il):
        """Group the image list of a page into runs of fragments.

        Index 2 and 3 of every entry are read as width and height, the last
        index as the filter name.

        :param il: image list of one page.
        :return: ``True`` when any image uses a filter from
            ``ADOBE_FILTER_SET`` (the whole page should be saved as PNG);
            ``None`` when any image reaches the ``WH_COUPLE_2`` size (images
            should simply be extracted); otherwise the list of groups.
        """
        # An image with a special filter: the whole page is saved as PNG.
        if any(img[-1] in ADOBE_FILTER_SET for img in il):
            return True

        broken_il = []
        start = 0
        last = len(il) - 1
        for i, img in enumerate(il):
            width, height = img[2], img[3]
            # An image that is large enough: no fragment merging, plain extraction.
            if width >= WH_COUPLE_2[0] and height >= WH_COUPLE_2[1]:
                return None
            if i == start:
                # first image of a group; only close the group if it is also the last image
                if i == last:
                    broken_il.append(il[start:])
                continue
            prev_width, prev_height = il[i - 1][2], il[i - 1][3]
            if i == last:
                if width == prev_width:
                    broken_il.append(il[start:])
                else:
                    broken_il.append(il[start:i])
                    broken_il.append(il[i:])
                continue
            if width != prev_width:
                # width changed: the current image starts a new group
                broken_il.append(il[start:i])
                start = i
            elif height != prev_height:
                # same width, different height: the current image closes its group
                broken_il.append(il[start:i + 1])
                start = i + 1
        # result of the fragment grouping
        return broken_il

    def merge_il(self, pdf, pno, il):
        """Handle a page that holds more than one image object.

        ``il`` is sorted in place by xref before grouping.
        """
        # grouping that precedes the attempt to merge fragments
        il.sort(key=lambda x: x[0])
        broken_il = self.split_il(il)

        page_to_png = True
        # 3.1 An image is large enough: no fragment merging, plain extraction.
        if broken_il is None:
            page_to_png = False
            for img_index, img in enumerate(il):
                xref, smask, width, height, _, colorspace, _, _, _ = img
                if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]:  # drop small images (e.g. QR codes)
                    continue
                elif xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
        # 3.2 Merge the fragments group by group.
        elif isinstance(broken_il, list) and len(broken_il) <= 2:
            for img_index, img_il in enumerate(broken_il):
                # 3.2.1 A single fragment: filter it out or extract it directly.
                if len(img_il) == 1:
                    xref, smask, width, height, _, colorspace, _, _, _ = img_il[0]
                    # drop small images (e.g. QR codes)
                    if width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1] or \
                            (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]):
                        continue
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
                        page_to_png = False
                # 3.2.2 Several fragments: stitch them vertically.
                else:
                    height_sum = sum(img[3] for img in img_il)
                    width = img_il[0][2]
                    # drop small images and unusually large ones
                    if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \
                            (width > 1000 and height_sum > width * 3):
                        continue
                    im_list = []
                    for img in img_il:
                        xref, smask, _, height, _, colorspace, _, _, _ = img
                        pix = self.recover_pix(pdf, xref, smask, colorspace)
                        ext, img_data = self.get_img_data(pix)
                        im = Image.open(BytesIO(img_data))
                        im_list.append((height, im, ext))
                    # the canvas takes mode and extension from the first fragment
                    new_img = Image.new(im_list[0][1].mode, (width, height_sum))
                    h_now = 0
                    for h, m, _ in im_list:
                        new_img.paste(m, box=(0, h_now))
                        h_now += h
                    img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
                    new_img.save(img_save_path)
                    page_to_png = False
                    self.img_path_list.append(img_save_path)

        # 3.3 More than 2 groups, everything filtered out, or a special filter:
        #     save the whole page as PNG.
        if page_to_png:
            page = pdf.loadPage(pno)
            self.page_to_png(page)

    def extract_image(self):
        """Process every page of the PDF and write the resulting images."""
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            for pno in range(pdf.pageCount):
                il = pdf.getPageImageList(pno)  # image objects of the page
                # entries are unpacked below as
                # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter)

                # 1. No image object: save the whole page as PNG.
                if not il:
                    page = pdf.loadPage(pno)
                    self.page_to_png(page)
                # 2. Exactly one image object:
                #    small image (e.g. the stamp of an e-statement): save the whole page as PNG
                #    large image: extract the image object
                elif len(il) == 1:
                    xref, smask, width, height, _, colorspace, _, _, _ = il[0]
                    # small image
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
                    # large image
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. More than one image object: special handling.
                else:
                    self.merge_il(pdf, pno, il)
1 import fitz
2 import os
3 from PIL import Image
4 from io import BytesIO
5
6
class PdfHandler:
    """Experimental helpers for turning a PDF into images.

    Offers whole-page rendering (PNG or SVG) and two flavours of embedded
    image extraction: one file per image object, or fragments stitched
    vertically per group.
    """

    def __init__(self, pdf_path):
        """Store the PDF path and derive ``pdf_name`` (file name without extension)."""
        self.pdf_path = pdf_path
        self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
        """Render every page with the given zoom factors and save it as PNG.

        :param save_dir_path: existing directory that receives the files.
        :param zoom_x: horizontal zoom factor.
        :param zoom_y: vertical zoom factor.
        """
        trans = fitz.Matrix(zoom_x, zoom_y).preRotate(0)
        with fitz.Document(self.pdf_path) as pdf:
            for page in pdf:
                pm = page.getPixmap(matrix=trans, alpha=False)
                save_path = os.path.join(save_dir_path, '{0}_{1}.png'.format(self.pdf_name, page.number))
                pm.writeImage(save_path)

    def page_to_svg_img(self, save_dir_path):
        """Save every page as an SVG file in ``save_dir_path``."""
        with fitz.Document(self.pdf_path) as pdf:
            for page in pdf:
                svg = page.getSVGimage(matrix=fitz.Identity)  # UTF-8 string svg
                save_path = os.path.join(save_dir_path, '{0}_{1}.svg'.format(self.pdf_name, page.number))
                # write explicitly as UTF-8 instead of the platform default encoding
                with open(save_path, 'w', encoding='utf-8') as f:
                    f.write(svg)

    @staticmethod
    def getimage(pix):
        """Return ``pix`` unchanged unless its colorspace has 4 components, then convert to RGB."""
        if pix.colorspace.n != 4:
            return pix
        tpix = fitz.Pixmap(fitz.csRGB, pix)
        return tpix

    def recoverpix(self, doc, item):
        """Load the image described by ``item`` from ``doc``.

        :param doc: the opened document.
        :param item: entry of a page image list; index 0 is the image xref,
            index 1 the xref of its /SMask, index 5 the colorspace name.
        :return: the dict from ``doc.extractImage`` for a DeviceRGB image
            without soft mask, otherwise a pixmap.
        """
        x = item[0]  # xref of PDF image
        s = item[1]  # xref of its /SMask
        is_rgb = item[5] == 'DeviceRGB'

        # RGB
        if is_rgb:
            if s == 0:
                return doc.extractImage(x)
            # we need to reconstruct the alpha channel with the smask
            pix1 = fitz.Pixmap(doc, x)
            pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

            # sanity check
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                pix2 = None
                return self.getimage(pix1)

            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value
            pix1 = pix2 = None  # free temp pixmaps
            return self.getimage(pix)

        # every other colorspace (e.g. GRAY/CMYK)
        pix1 = fitz.Pixmap(doc, x)
        pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added

        if s != 0:
            pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

            # sanity check
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                pix2 = None
                return self.getimage(pix1)

            pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha value

        pix1 = pix2 = None  # free temp pixmaps

        pix = fitz.Pixmap(fitz.csRGB, pix)  # convert to RGB
        return self.getimage(pix)

    def extract_images(self, save_dir_path):
        """Write every distinct image object of the PDF to ``save_dir_path``.

        Files are named ``img-<xref>.<ext>``; an xref that occurs on several
        pages is written once.
        """
        seen_xrefs = set()
        with fitz.Document(self.pdf_path) as pdf:
            for pno in range(pdf.pageCount):
                il = pdf.getPageImageList(pno)
                for img in il:
                    print(img)
                    xref = img[0]
                    if xref in seen_xrefs:
                        continue
                    width = img[2]
                    height = img[3]
                    print(xref, width, height)
                    pix = self.recoverpix(pdf, img)
                    if isinstance(pix, dict):  # we got a raw image
                        ext = pix["ext"]
                        imgdata = pix["image"]
                        imgfile = os.path.join(save_dir_path, "img-%i.%s" % (xref, ext))
                    else:  # we got a pixmap
                        imgfile = os.path.join(save_dir_path, "img-%i.png" % xref)
                        imgdata = pix.getPNGData()

                    # the context manager closes the file even if the write fails
                    with open(imgfile, "wb") as fout:
                        fout.write(imgdata)
                    seen_xrefs.add(xref)

    @staticmethod
    def split_il(il):
        """Group consecutive image-list entries into runs of fragments.

        Index 2 and 3 of every entry are read as width and height. A group
        ends when the width changes (the current entry starts the next
        group) or when the height changes (the current entry closes its
        group).

        :param il: image list of one page.
        :return: list of groups, each a non-empty slice of ``il``.
        """
        img_il_list = []
        start = 0
        last = len(il) - 1
        for i, img in enumerate(il):
            if i == start:
                if i == last:
                    img_il_list.append(il[start:])
                continue
            if i == last:
                # A last entry whose width differs must not be merged into
                # the previous group; it forms a group of its own.
                if img[2] == il[i - 1][2]:
                    img_il_list.append(il[start:])
                else:
                    img_il_list.append(il[start:i])
                    img_il_list.append(il[i:])
                continue
            if img[2] != il[i - 1][2]:
                img_il_list.append(il[start:i])
                start = i
            elif img[3] != il[i - 1][3]:
                img_il_list.append(il[start:i + 1])
                start = i + 1
        return img_il_list

    def extract_images_pro(self, save_dir_path):
        """Extract the images of every page, stitching fragment groups vertically.

        Each group produced by ``split_il`` becomes one file named
        ``page_<pno>_img_<group index>.<ext>`` in ``save_dir_path``.
        """
        with fitz.Document(self.pdf_path) as pdf:
            print('----------------------------')
            print(self.pdf_name)
            print(pdf.metadata)
            for pno in range(pdf.pageCount):
                print('========================')
                il = pdf.getPageImageList(pno)
                il.sort(key=lambda x: x[0])
                # entries start with (xref, smask, width, height, bpc, colorspace, ...)

                img_il_list = self.split_il(il)
                print(img_il_list)
                print(len(img_il_list))

                for img_count, img_il in enumerate(img_il_list):
                    print(img_il)
                    height_sum = 0
                    im_list = []
                    for img in img_il:
                        width = img[2]
                        height = img[3]
                        pix = self.recoverpix(pdf, img)
                        if isinstance(pix, dict):  # we got a raw image
                            ext = pix["ext"]
                            img_data = pix["image"]
                        else:  # we got a pixmap
                            ext = 'png'
                            img_data = pix.getPNGData()

                        im = Image.open(BytesIO(img_data))
                        im_list.append((width, height, im, ext))
                        height_sum += height

                    print(im_list)
                    # the extension of the first fragment names the output file
                    save_path = os.path.join(save_dir_path, 'page_{0}_img_{1}.{2}'.format(
                        pno, img_count, im_list[0][3]))
                    # a single image: save it as is
                    if len(im_list) == 1:
                        im_list[0][2].save(save_path)
                    # several images: stitch them vertically
                    else:
                        res = Image.new(im_list[0][2].mode, (im_list[0][0], height_sum))
                        h_now = 0
                        for _, h, m, _ in im_list:
                            res.paste(m, box=(0, h_now))
                            h_now += h
                        res.save(save_path)
205
206
if __name__ == '__main__':
    # Ad-hoc driver: run extract_images_pro on every PDF of a local test
    # directory, writing the images of each PDF into its own sub-directory.
    # PdfHandler.page_to_pix_img(save_dir, 3.0, 3.0) and
    # PdfHandler.page_to_svg_img(save_dir) can be called the same way for
    # manual experiments.
    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
    for path in os.listdir(dir_path):
        if path == '.DS_Store':
            continue
        pdf_handler = PdfHandler(os.path.join(dir_path, path))
        save_path = os.path.join('/Users/clay/Desktop/biz/pdf_test/', 'test', os.path.splitext(os.path.basename(path))[0])
        # makedirs with exist_ok: a re-run or a missing 'test' parent must not abort the batch
        os.makedirs(save_path, exist_ok=True)
        pdf_handler.extract_images_pro(save_path)
1 # 录题系统开发规范 1 # 宝马OCR系统开发规范
2 2
3 3
4 ## 代码规范 4 ## 代码规范
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!