update pdf to img
Showing
6 changed files
with
214 additions
and
233 deletions
This diff is collapsed.
Click to expand it.
This diff is collapsed.
Click to expand it.
... | @@ -28,7 +28,8 @@ class DocHandler: | ... | @@ -28,7 +28,8 @@ class DocHandler: |
28 | def get_doc_class(business_type): | 28 | def get_doc_class(business_type): |
29 | return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX) | 29 | return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX) |
30 | 30 | ||
31 | def fix_scheme(self, scheme): | 31 | @staticmethod |
32 | def fix_scheme(scheme): | ||
32 | if scheme in consts.DOC_SCHEME_LIST: | 33 | if scheme in consts.DOC_SCHEME_LIST: |
33 | return scheme | 34 | return scheme |
34 | elif scheme.upper() in consts.DOC_SCHEME_LIST: | 35 | elif scheme.upper() in consts.DOC_SCHEME_LIST: |
... | @@ -36,7 +37,8 @@ class DocHandler: | ... | @@ -36,7 +37,8 @@ class DocHandler: |
36 | else: | 37 | else: |
37 | return consts.DOC_SCHEME_LIST[0] | 38 | return consts.DOC_SCHEME_LIST[0] |
38 | 39 | ||
39 | def fix_data_source(self, data_source): | 40 | @staticmethod |
41 | def fix_data_source(data_source): | ||
40 | if data_source in consts.DATA_SOURCE_LIST: | 42 | if data_source in consts.DATA_SOURCE_LIST: |
41 | return data_source | 43 | return data_source |
42 | elif data_source.upper() in consts.DATA_SOURCE_LIST: | 44 | elif data_source.upper() in consts.DATA_SOURCE_LIST: | ... | ... |
src/common/tools/pdf_to_img.py
0 → 100644
1 | import os | ||
2 | import fitz | ||
3 | from PIL import Image | ||
4 | from io import BytesIO | ||
5 | |||
# Parameters used when a whole page is rendered to a PNG image.
ZOOM_X = ZOOM_Y = 2.0
# Zoom factor 2 in each dimension, rotation 0 (i.e. no rotation).
trans = fitz.Matrix(ZOOM_X, ZOOM_Y).preRotate(0)

# Image filters that get special handling: a page containing an image object
# with one of these filters is rendered as a whole page instead of having its
# image objects extracted.
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}

# (width, height) threshold pairs, in image pixels.
WH_COUPLE_1 = (500, 500)  # below both: a lone image counts as "small"
WH_COUPLE_2 = (700, 647)  # at/above both: image is large enough to extract as-is
WH_COUPLE_3 = (100, 100)  # below either: skipped during plain extraction
WH_COUPLE_4 = (100, 300)  # below either: a single-fragment group is skipped
WH_COUPLE_5 = (100, 200)  # below either (width / summed height): merged group is skipped
19 | |||
20 | |||
class PDFHandler:
    """Turn every page of a PDF into one or more image files.

    Depending on the image objects found on a page, the page is either
    rendered as a whole PNG, or its embedded images are extracted (and, for
    images stored as a few stacked fragments, stitched back together).
    The paths of all files written are collected in ``img_path_list``.
    """

    def __init__(self, path, img_dir_path):
        """Remember the source PDF ``path`` and the output directory."""
        self.path = path
        self.img_dir_path = img_dir_path
        self.img_path_list = []  # every image file written so far, in order
        self.xref_set = set()  # xrefs already written by extract_single_image

    def get_img_save_path(self, pno, img_index=0, ext='png'):
        """Return ``<img_dir_path>/page_<pno>_img_<img_index>.<ext>``."""
        return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))

    def page_to_png(self, page):
        """Render the whole ``page`` with the module-level ``trans`` matrix and save it as PNG."""
        pm = page.getPixmap(matrix=trans, alpha=False)
        img_save_path = self.get_img_save_path(page.number)
        pm.writePNG(img_save_path)
        self.img_path_list.append(img_save_path)

    @staticmethod
    def getimage(pix):
        """Return ``pix`` unchanged unless its colorspace has 4 components.

        A 4-component pixmap (presumably CMYK) is converted to RGB so it can
        be written as PNG.
        """
        colorspace = pix.colorspace
        # NOTE(review): PyMuPDF documents that ``colorspace`` may be None for
        # image masks; such a pixmap is returned as-is instead of raising
        # AttributeError - confirm against the installed version.
        if colorspace is None or colorspace.n != 4:
            return pix
        return fitz.Pixmap(fitz.csRGB, pix)

    def recover_pix(self, doc, xref, smask, colorspace):
        """Load the image object ``xref`` from ``doc``.

        Returns a ``fitz.Pixmap`` when the image has a soft mask (``smask``
        != 0) or when ``colorspace`` is 'Separation' / 'DeviceCMYK';
        otherwise returns the raw-image dict produced by ``doc.extractImage``.
        """
        if smask != 0:
            # The alpha channel has to be reconstructed from the /SMask entry.
            pix1 = fitz.Pixmap(doc, xref)
            pix2 = fitz.Pixmap(doc, smask)

            # Sanity check: same rectangle, neither pixmap has alpha, and the
            # mask is single-component. Otherwise fall back to the bare image.
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                return self.getimage(pix1)

            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat the mask samples as alpha values
            return self.getimage(pix)
        if colorspace in {'Separation', 'DeviceCMYK'}:
            return fitz.Pixmap(fitz.csRGB, fitz.Pixmap(doc, xref))
        return doc.extractImage(xref)

    @staticmethod
    def get_img_data(pix):
        """Return ``(ext, img_data)`` for a raw-image dict or a pixmap.

        A dict supplies its own "ext" and "image" entries; anything else is
        treated as a pixmap and encoded as PNG.
        """
        if isinstance(pix, dict):  # raw image as returned by extractImage
            return pix["ext"], pix["image"]
        return 'png', pix.getPNGData()

    def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
        """Write image object ``xref`` to disk and record its xref and path."""
        pix = self.recover_pix(pdf, xref, smask, colorspace)
        ext, img_data = self.get_img_data(pix)
        img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
        with open(img_save_path, "wb") as f:
            f.write(img_data)
        self.xref_set.add(xref)
        self.img_path_list.append(img_save_path)

    @staticmethod
    def split_il(il):
        """Group a page's image list into runs of fragments to be merged.

        Each item of ``il`` follows the layout
        (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, ...).

        Returns:
            True  - some image uses a filter from ADOBE_FILTER_SET
                    (the caller renders the whole page);
            None  - some image reaches both WH_COUPLE_2 thresholds
                    (the caller extracts images individually, no merging);
            list  - consecutive slices of ``il``; a new group starts where the
                    width changes, and a group ends right after an image whose
                    height differs from its predecessor's.
        """
        # The filter sits at index 8 of the layout above; indexing it
        # explicitly (instead of [-1]) keeps working if a trailing field is present.
        if any(img[8] in ADOBE_FILTER_SET for img in il):
            return True

        broken_il = []
        start = 0
        last = len(il) - 1
        for i, img in enumerate(il):
            width, height = img[2], img[3]
            # A sufficiently large image means fragments are not merged at all.
            if width >= WH_COUPLE_2[0] and height >= WH_COUPLE_2[1]:
                return None
            if i == start:
                # First image of a group: only close it if it is also the last image.
                if i == last:
                    broken_il.append(il[start:])
                continue
            prev_width, prev_height = il[i - 1][2], il[i - 1][3]
            if i == last:
                if width == prev_width:
                    broken_il.append(il[start:])
                else:
                    broken_il.append(il[start:i])
                    broken_il.append(il[i:])
                continue
            if width != prev_width:
                broken_il.append(il[start:i])
                start = i
            elif height != prev_height:
                # Same width but different height: this image closes the group.
                broken_il.append(il[start:i + 1])
                start = i + 1
        return broken_il

    def merge_il(self, pdf, pno, il):
        """Handle a page that has more than one image object.

        ``il`` is sorted by xref in place, grouped with ``split_il`` and then:
        3.1 large image present: every non-small, not-yet-extracted image is
            extracted individually;
        3.2 at most two groups: single-image groups are filtered or extracted,
            multi-image groups are stacked vertically into one image;
        3.3 otherwise (more than two groups, everything filtered, or a special
            filter): the whole page is rendered as PNG.
        """
        il.sort(key=lambda x: x[0])
        broken_il = self.split_il(il)

        page_to_png = True
        if broken_il is None:
            # 3.1 plain extraction, no fragment merging
            page_to_png = False
            for img_index, img in enumerate(il):
                xref, smask, width, height, _, colorspace = img[:6]
                if width < WH_COUPLE_3[0] or height < WH_COUPLE_3[1]:
                    continue  # skip small images (e.g. QR codes)
                if xref not in self.xref_set:
                    self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
        elif isinstance(broken_il, list) and len(broken_il) <= 2:
            # 3.2 merge fragments group by group
            for img_index, img_il in enumerate(broken_il):
                if len(img_il) == 1:
                    # 3.2.1 a single fragment: filter it out or extract it directly
                    xref, smask, width, height, _, colorspace = img_il[0][:6]
                    too_small = (
                        width < WH_COUPLE_4[0] or height < WH_COUPLE_4[1]
                        or (width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1])
                    )
                    if too_small:
                        continue  # skip small images (e.g. QR codes)
                    if xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno, img_index)
                        page_to_png = False
                else:
                    # 3.2.2 several fragments: stack them vertically
                    height_sum = sum(img[3] for img in img_il)
                    width = img_il[0][2]
                    # Skip small results and unusually tall ones.
                    if width < WH_COUPLE_5[0] or height_sum < WH_COUPLE_5[1] or \
                            (width > 1000 and height_sum > width * 3):
                        continue
                    im_list = []
                    for img in img_il:
                        xref, smask, _, height, _, colorspace = img[:6]
                        pix = self.recover_pix(pdf, xref, smask, colorspace)
                        ext, img_data = self.get_img_data(pix)
                        im_list.append((height, Image.open(BytesIO(img_data)), ext))
                    # Canvas mode and file extension are taken from the first fragment.
                    new_img = Image.new(im_list[0][1].mode, (width, height_sum))
                    h_now = 0
                    for h, m, _ in im_list:
                        new_img.paste(m, box=(0, h_now))
                        h_now += h
                    img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
                    new_img.save(img_save_path)
                    page_to_png = False
                    self.img_path_list.append(img_save_path)

        # 3.3 nothing was written for this page: render the whole page instead
        if page_to_png:
            page = pdf.loadPage(pno)
            self.page_to_png(page)

    def extract_image(self):
        """Process every page of the PDF, writing images into ``img_dir_path``.

        Per page:
        1. no image object: render the whole page as PNG;
        2. exactly one image object: render the page if the image is small
           (e.g. a stamp on an electronic statement), otherwise extract the image;
        3. several image objects: delegate to ``merge_il``.
        """
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            for pno in range(pdf.pageCount):
                # Items: (xref, smask, width, height, bpc, colorspace,
                #         alt.colorspace, name, filter, invoker)
                il = pdf.getPageImageList(pno)

                if not il:
                    self.page_to_png(pdf.loadPage(pno))
                elif len(il) == 1:
                    xref, smask, width, height, _, colorspace = il[0][:6]
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        # small image
                        self.page_to_png(pdf.loadPage(pno))
                    elif xref not in self.xref_set:
                        # large image
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                else:
                    self.merge_il(pdf, pno, il)
src/common/tools/pdf_tools.py
deleted
100644 → 0
1 | import fitz | ||
2 | import os | ||
3 | from PIL import Image | ||
4 | from io import BytesIO | ||
5 | |||
6 | |||
class PdfHandler:
    """Experimental helpers for converting a PDF into images.

    Offers whole-page rendering (PNG / SVG) and extraction of the embedded
    image objects, optionally stitching stacked fragments back together.
    """

    def __init__(self, pdf_path):
        """Remember ``pdf_path`` and its base name without extension."""
        self.pdf_path = pdf_path
        self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
        """Render every page as ``<pdf_name>_<page number>.png`` in ``save_dir_path``.

        ``zoom_x`` / ``zoom_y`` are the zoom factors of the render matrix.
        """
        trans = fitz.Matrix(zoom_x, zoom_y).preRotate(0)  # zoom only, rotation 0
        with fitz.Document(self.pdf_path) as pdf:
            for page in pdf:
                pm = page.getPixmap(matrix=trans, alpha=False)
                save_path = os.path.join(save_dir_path, '{0}_{1}.png'.format(self.pdf_name, page.number))
                pm.writeImage(save_path)

    def page_to_svg_img(self, save_dir_path):
        """Write every page as ``<pdf_name>_<page number>.svg`` in ``save_dir_path``."""
        with fitz.Document(self.pdf_path) as pdf:
            for page in pdf:
                svg = page.getSVGimage(matrix=fitz.Identity)  # SVG document as a string
                save_path = os.path.join(save_dir_path, '{0}_{1}.svg'.format(self.pdf_name, page.number))
                # Explicit encoding: the platform default may not be able to
                # encode non-ASCII page text.
                with open(save_path, 'w', encoding='utf-8') as f:
                    f.write(svg)

    @staticmethod
    def getimage(pix):
        """Return ``pix`` unchanged unless its colorspace has 4 components.

        A 4-component pixmap (presumably CMYK) is converted to RGB.
        """
        colorspace = pix.colorspace
        # NOTE(review): PyMuPDF documents that ``colorspace`` may be None for
        # image masks; return such pixmaps as-is instead of raising - confirm.
        if colorspace is None or colorspace.n != 4:
            return pix
        return fitz.Pixmap(fitz.csRGB, pix)

    @staticmethod
    def _get_img_data(pix):
        """Return ``(ext, img_data)`` for a raw-image dict or a pixmap (PNG-encoded)."""
        if isinstance(pix, dict):  # raw image as returned by extractImage
            return pix["ext"], pix["image"]
        return 'png', pix.getPNGData()

    def recoverpix(self, doc, item):
        """Load the image described by ``item`` (an entry of the page image list).

        ``item[0]`` is the image xref, ``item[1]`` the xref of its /SMask
        (0 if none) and ``item[5]`` the colorspace name. Returns the raw-image
        dict from ``doc.extractImage`` for a 'DeviceRGB' image without mask,
        otherwise a ``fitz.Pixmap``.
        """
        x = item[0]  # xref of PDF image
        s = item[1]  # xref of its /SMask
        is_rgb = item[5] == 'DeviceRGB'

        if is_rgb:
            if s == 0:
                return doc.extractImage(x)
            # The alpha channel has to be reconstructed from the /SMask entry.
            pix1 = fitz.Pixmap(doc, x)
            pix2 = fitz.Pixmap(doc, s)

            # Sanity check; on mismatch fall back to the bare image.
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                return self.getimage(pix1)

            pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
            pix.setAlpha(pix2.samples)  # treat the mask samples as alpha values
            return self.getimage(pix)

        # Any other colorspace: convert to RGB at the end.
        pix1 = fitz.Pixmap(doc, x)
        pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added

        if s != 0:
            pix2 = fitz.Pixmap(doc, s)

            # Sanity check; on mismatch fall back to the bare image.
            if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
                return self.getimage(pix1)

            pix.setAlpha(pix2.samples)

        pix = fitz.Pixmap(fitz.csRGB, pix)
        return self.getimage(pix)

    def extract_images(self, save_dir_path):
        """Write every distinct image object of the PDF to ``save_dir_path``.

        Files are named ``img-<xref>.<ext>``; an xref seen on an earlier page
        is not written again.
        """
        seen_xrefs = set()
        with fitz.Document(self.pdf_path) as pdf:
            for pno in range(pdf.pageCount):
                for img in pdf.getPageImageList(pno):
                    print(img)  # debug output kept from the original
                    xref = img[0]
                    if xref in seen_xrefs:
                        continue
                    width = img[2]
                    height = img[3]
                    print(xref, width, height)  # debug output kept from the original
                    pix = self.recoverpix(pdf, img)
                    ext, imgdata = self._get_img_data(pix)
                    imgfile = os.path.join(save_dir_path, "img-%i.%s" % (xref, ext))
                    with open(imgfile, "wb") as fout:
                        fout.write(imgdata)
                    seen_xrefs.add(xref)

    @staticmethod
    def split_il(il):
        """Split a page image list into groups of consecutive fragments.

        Items are compared by width (index 2) and height (index 3): a new
        group starts where the width changes, and a group ends right after an
        image whose height differs from its predecessor's. A final image with
        a different width forms its own group instead of being glued onto the
        previous one.
        """
        img_il_list = []
        start = 0
        last = len(il) - 1
        for i, img in enumerate(il):
            if i == start:
                # First image of a group: only close it if it is also the last image.
                if i == last:
                    img_il_list.append(il[start:])
                continue
            width_changed = img[2] != il[i - 1][2]
            if i == last:
                if width_changed:
                    img_il_list.append(il[start:i])
                    img_il_list.append(il[i:])
                else:
                    img_il_list.append(il[start:])
                continue
            if width_changed:
                img_il_list.append(il[start:i])
                start = i
            elif img[3] != il[i - 1][3]:
                # Same width but different height: this image closes the group.
                img_il_list.append(il[start:i + 1])
                start = i + 1
        return img_il_list

    def extract_images_pro(self, save_dir_path):
        """Extract images page by page, stitching fragment groups vertically.

        Each group from ``split_il`` becomes one file named
        ``page_<pno>_img_<group index>.<ext>`` in ``save_dir_path``; the
        extension and (for merged groups) the canvas mode and width come from
        the group's first image. Progress is printed to stdout.
        """
        with fitz.Document(self.pdf_path) as pdf:
            print('----------------------------')
            print(self.pdf_name)
            print(pdf.metadata)
            for pno in range(pdf.pageCount):
                print('========================')
                # Items: (xref, smask, width, height, bpc, colorspace,
                #         alt.colorspace, name, filter, invoker)
                il = pdf.getPageImageList(pno)
                il.sort(key=lambda x: x[0])

                img_il_list = self.split_il(il)
                print(img_il_list)
                print(len(img_il_list))

                for img_count, img_il in enumerate(img_il_list):
                    print(img_il)
                    height_sum = 0
                    im_list = []
                    for img in img_il:
                        width = img[2]
                        height = img[3]
                        pix = self.recoverpix(pdf, img)
                        ext, img_data = self._get_img_data(pix)
                        im = Image.open(BytesIO(img_data))
                        im_list.append((width, height, im, ext))
                        height_sum += height

                    print(im_list)
                    save_path = os.path.join(save_dir_path, 'page_{0}_img_{1}.{2}'.format(
                        pno, img_count, im_list[0][3]))
                    if len(im_list) == 1:
                        # A single image needs no stitching.
                        im_list[0][2].save(save_path)
                    else:
                        # Several images: stack them vertically.
                        res = Image.new(im_list[0][2].mode, (im_list[0][0], height_sum))
                        h_now = 0
                        for _, h, m, _ in im_list:
                            res.paste(m, box=(0, h_now))
                            h_now += h
                        res.save(save_path)
206 | |||
if __name__ == '__main__':
    # Manual smoke test: run extract_images_pro on every file of a local
    # sample directory, writing each PDF's images into its own sub-directory.
    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
    out_root = os.path.join('/Users/clay/Desktop/biz/pdf_test/', 'test')
    for file_name in os.listdir(dir_path):
        if file_name == '.DS_Store':
            continue
        pdf_handler = PdfHandler(os.path.join(dir_path, file_name))
        save_path = os.path.join(out_root, os.path.splitext(file_name)[0])
        # makedirs + exist_ok: creates the missing 'test' parent and does not
        # fail when the script is run a second time.
        os.makedirs(save_path, exist_ok=True)
        pdf_handler.extract_images_pro(save_path)
-
Please register or sign in to post a comment