update pdf process
Showing
5 changed files
with
226 additions
and
14 deletions
... | @@ -102,11 +102,18 @@ definitions: | ... | @@ -102,11 +102,18 @@ definitions: |
102 | documentScheme: | 102 | documentScheme: |
103 | description: 文件格式? | 103 | description: 文件格式? |
104 | type: string | 104 | type: string |
105 | example: CO00001 | 105 | example: Acceptance |
106 | enum: | ||
107 | - Acceptance | ||
108 | - Settlement | ||
109 | - Contract Management | ||
106 | businessType: | 110 | businessType: |
107 | description: 业务类型 | 111 | description: 业务类型 |
108 | type: string | 112 | type: string |
109 | example: HIL | 113 | example: CO00001 |
114 | enum: | ||
115 | - CO00001 | ||
116 | - CO00002 | ||
110 | uploadFinishTime: | 117 | uploadFinishTime: |
111 | description: 上传完成时间 | 118 | description: 上传完成时间 |
112 | type: string | 119 | type: string |
... | @@ -115,6 +122,10 @@ definitions: | ... | @@ -115,6 +122,10 @@ definitions: |
115 | description: 数据源 | 122 | description: 数据源 |
116 | type: string | 123 | type: string |
117 | example: POS | 124 | example: POS |
125 | enum: | ||
126 | - POS | ||
127 | - EAPP | ||
128 | - Econtract | ||
118 | metadataVersionId: | 129 | metadataVersionId: |
119 | description: 元数据版本ID | 130 | description: 元数据版本ID |
120 | type: string | 131 | type: string | ... | ... |
... | @@ -25,10 +25,11 @@ applicant_data_args = { | ... | @@ -25,10 +25,11 @@ applicant_data_args = { |
25 | 25 | ||
# Field definitions for the document section. Every field is required; string
# fields are only length-checked here, so the value lists noted below are
# informational and are NOT enforced by these validators.
document_args = {
    'documentName': fields.Str(required=True, validate=validate.Length(max=255)),
    # Expected values: Acceptance / Settlement / Contract Management
    'documentScheme': fields.Str(required=True, validate=validate.Length(max=64)),
    'businessType': fields.Str(required=True, validate=validate.Length(max=64)),  # expected values: CO00001 / CO00002
    'uploadFinishTime': fields.DateTime(required=True),
    'dataSource': fields.Str(required=True, validate=validate.Length(max=64)),  # expected values: POS / EAPP / Econtract
    'metadataVersionId': fields.Str(required=True, validate=validate.Length(max=64)),
}
34 | 35 | ... | ... |
... | @@ -78,11 +78,13 @@ Doc: | ... | @@ -78,11 +78,13 @@ Doc: |
78 | documentScheme: | 78 | documentScheme: |
79 | description: 文件格式? | 79 | description: 文件格式? |
80 | type: string | 80 | type: string |
81 | example: CO00001 | 81 | example: Acceptance |
82 | enum: [Acceptance, Settlement, Contract Management] | ||
82 | businessType: | 83 | businessType: |
83 | description: 业务类型 | 84 | description: 业务类型 |
84 | type: string | 85 | type: string |
85 | example: HIL | 86 | example: CO00001 |
87 | enum: [CO00001, CO00002] | ||
86 | uploadFinishTime: | 88 | uploadFinishTime: |
87 | description: 上传完成时间 | 89 | description: 上传完成时间 |
88 | type: string | 90 | type: string |
... | @@ -91,6 +93,7 @@ Doc: | ... | @@ -91,6 +93,7 @@ Doc: |
91 | description: 数据源 | 93 | description: 数据源 |
92 | type: string | 94 | type: string |
93 | example: POS | 95 | example: POS |
96 | enum: [POS, EAPP, Econtract] | ||
94 | metadataVersionId: | 97 | metadataVersionId: |
95 | description: 元数据版本ID | 98 | description: 元数据版本ID |
96 | type: string | 99 | type: string | ... | ... |
1 | import fitz | 1 | import fitz |
2 | import os | 2 | import os |
3 | from PIL import Image, ImageCms | ||
4 | from io import BytesIO | ||
3 | 5 | ||
4 | 6 | ||
5 | class PdfHandler: | 7 | class PdfHandler: |
... | @@ -8,16 +10,24 @@ class PdfHandler: | ... | @@ -8,16 +10,24 @@ class PdfHandler: |
8 | self.pdf_path = pdf_path | 10 | self.pdf_path = pdf_path |
9 | self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0] | 11 | self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0] |
10 | 12 | ||
def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
    """Render every page of the PDF to an image file in ``save_dir_path``.

    Each page is written to ``<pdf_name>_<page.number>.png``.

    :param save_dir_path: directory the image files are written to.
    :param zoom_x: horizontal scale factor passed to ``fitz.Matrix``.
    :param zoom_y: vertical scale factor passed to ``fitz.Matrix``.
    """
    trans = fitz.Matrix(zoom_x, zoom_y).preRotate(0)  # scale by the requested zoom factors, rotate by 0 degrees
    with fitz.Document(self.pdf_path) as pdf:
        # Debugging aids on the document: pdf.metadata, pdf.getPageImageList(0),
        # pdf.getToC() (table of contents).
        for page in pdf:
            pm = page.getPixmap(matrix=trans, alpha=False)
            # Debugging aids on the pixmap: pm.samples (raw image bytes),
            # pm.width, pm.height, pm.stride (bytes per horizontal image line).

            save_path = os.path.join(save_dir_path, '{0}_{1}.png'.format(self.pdf_name, page.number))
            # pm.writePNG(save_path) was used here before writeImage.
            pm.writeImage(save_path)
19 | 29 | ||
20 | def to_svg_img(self, save_dir_path): | 30 | def page_to_svg_img(self, save_dir_path): |
21 | with fitz.Document(self.pdf_path) as pdf: | 31 | with fitz.Document(self.pdf_path) as pdf: |
22 | for page in pdf: | 32 | for page in pdf: |
23 | svg = page.getSVGimage(matrix=fitz.Identity) # UTF-8 string svg | 33 | svg = page.getSVGimage(matrix=fitz.Identity) # UTF-8 string svg |
... | @@ -25,8 +35,195 @@ class PdfHandler: | ... | @@ -25,8 +35,195 @@ class PdfHandler: |
25 | with open(save_path, 'w') as f: | 35 | with open(save_path, 'w') as f: |
26 | f.write(svg) | 36 | f.write(svg) |
27 | 37 | ||
38 | @staticmethod | ||
39 | def getimage(pix): | ||
40 | if pix.colorspace.n != 4: | ||
41 | return pix | ||
42 | tpix = fitz.Pixmap(fitz.csRGB, pix) | ||
43 | return tpix | ||
44 | |||
def recoverpix(self, doc, item):
    """Recover one embedded image, merging its soft mask when there is one.

    :param doc: open ``fitz.Document`` that owns the image.
    :param item: one entry of ``doc.getPageImageList(...)``; index 0 is the
        image xref, index 1 the xref of its /SMask (0 when absent) and
        index 5 the colorspace name.
    :return: the dict produced by ``doc.extractImage`` for a ``DeviceRGB``
        image without a soft mask, otherwise a ``fitz.Pixmap``.
    """
    xref = item[0]   # xref of the PDF image
    smask = item[1]  # xref of its /SMask, 0 when there is none
    is_rgb = item[5] == 'DeviceRGB'

    # RGB image without a mask: the raw embedded image can be used as is.
    if is_rgb and smask == 0:
        return doc.extractImage(xref)

    base = fitz.Pixmap(doc, xref)
    mask = fitz.Pixmap(doc, smask) if smask != 0 else None

    if mask is not None:
        # Sanity check: the mask must cover the same rectangle, neither
        # pixmap may already carry alpha, and the mask must be single-channel.
        usable = (base.irect == mask.irect
                  and base.alpha == mask.alpha == 0
                  and mask.n == 1)
        if not usable:
            # Fall back to the unmasked image.
            return self.getimage(base)

    merged = fitz.Pixmap(base)  # copy of the base pixmap that receives the alpha values
    if mask is not None:
        merged.setAlpha(mask.samples)  # treat the mask samples as alpha values

    if not is_rgb:
        merged = fitz.Pixmap(fitz.csRGB, merged)  # GRAY/CMYK to RGB
    return self.getimage(merged)
86 | |||
def extract_images(self, save_dir_path):
    """Write every distinct embedded image of the PDF into ``save_dir_path``.

    An image referenced more than once (same xref) is written only once.
    Raw images keep the extension reported by the extraction; pixmaps are
    written as PNG. Files are named ``img-<xref>.<ext>``. No size filtering
    is applied.

    :param save_dir_path: existing directory that receives the image files.
    """
    seen_xrefs = set()  # xrefs already written, for O(1) duplicate checks
    with fitz.Document(self.pdf_path) as pdf:
        for pno in range(pdf.pageCount):
            for img in pdf.getPageImageList(pno):
                xref = img[0]
                if xref in seen_xrefs:
                    continue

                pix = self.recoverpix(pdf, img)
                if isinstance(pix, dict):  # we got a raw image
                    ext = pix["ext"]
                    imgdata = pix["image"]
                else:  # we got a pixmap
                    ext = "png"
                    imgdata = pix.getPNGData()

                imgfile = os.path.join(save_dir_path, "img-%i.%s" % (xref, ext))
                # The context manager closes the file even if the write fails.
                with open(imgfile, "wb") as fout:
                    fout.write(imgdata)
                seen_xrefs.add(xref)
128 | |||
def split_il(self, il):
    """Split a page image list into runs of consecutive entries.

    Entries are compared with their predecessor on width (index 2) and
    height (index 3):

    * a different width closes the running group and the entry starts a
      new one;
    * the same width but a different height adds the entry to the running
      group and then closes it;
    * otherwise the entry simply joins the running group.

    The same rules apply to the last entry, so a trailing entry of a
    different width ends up in a group of its own.

    :param il: sequence of image entries with width at index 2 and height
        at index 3.
    :return: list of non-empty lists of entries, in the original order.
    """
    groups = []
    current = []
    for img in il:
        if not current:
            current = [img]
            continue
        previous = current[-1]
        if img[2] != previous[2]:
            # Width changed: the entry belongs to a new group.
            groups.append(current)
            current = [img]
        elif img[3] != previous[3]:
            # Same width, different height: the entry ends the running group.
            current.append(img)
            groups.append(current)
            current = []
        else:
            current.append(img)
    if current:
        groups.append(current)
    return groups
148 | |||
def extract_images_pro(self, save_dir_path):
    """Extract the images of each page, stitching grouped images vertically.

    For every page the image list is sorted by xref and split with
    ``split_il``. A group with one image is saved directly; a group with
    several images is pasted top to bottom onto one canvas that uses the
    mode and width of the group's first image. Files are named
    ``page_<pno>_img_<index>.<ext>`` with the extension of the first image
    of the group.

    :param save_dir_path: existing directory that receives the image files.
    """
    with fitz.Document(self.pdf_path) as pdf:
        # TODO: de-duplicate images (by xref).
        for pno in range(pdf.pageCount):
            # Entry layout:
            # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
            il = sorted(pdf.getPageImageList(pno), key=lambda entry: entry[0])
            img_il_list = self.split_il(il)
            # TODO: when a single page holds too many images, render the page to an image instead.

            for img_count, img_il in enumerate(img_il_list):
                if not img_il:
                    # Nothing to save for an empty group.
                    continue

                height_sum = 0
                im_list = []  # (width, height, PIL image, extension) per image
                for img in img_il:
                    width = img[2]
                    height = img[3]
                    pix = self.recoverpix(pdf, img)
                    if isinstance(pix, dict):  # we got a raw image
                        ext = pix["ext"]
                        img_data = pix["image"]
                    else:  # we got a pixmap
                        ext = 'png'
                        img_data = pix.getPNGData()

                    im_list.append((width, height, Image.open(BytesIO(img_data)), ext))
                    height_sum += height

                first_width, _, first_im, first_ext = im_list[0]
                save_path = os.path.join(save_dir_path, 'page_{0}_img_{1}.{2}'.format(
                    pno, img_count, first_ext))

                if len(im_list) == 1:
                    # Single image: save it as is.
                    first_im.save(save_path)
                else:
                    # Several images: paste them below each other.
                    res = Image.new(first_im.mode, (first_width, height_sum))
                    h_now = 0
                    for _, h, m, _ in im_list:
                        res.paste(m, box=(0, h_now))
                        h_now += h
                    res.save(save_path)
204 | |||
28 | 205 | ||
if __name__ == '__main__':
    # Ad-hoc driver: run extract_images_pro on every entry of ``dir_path`` and
    # store the results in one folder per document below ``.../pdf_test/test``.
    # The other entry points can be tried the same way, e.g.
    # ``page_to_pix_img(save_dir, 3.0, 3.0)`` or ``page_to_svg_img(save_dir)``.
    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
    for path in os.listdir(dir_path):
        if path == '.DS_Store':
            continue
        pdf_handler = PdfHandler(os.path.join(dir_path, path))
        save_path = os.path.join('/Users/clay/Desktop/biz/pdf_test/', 'test', os.path.splitext(os.path.basename(path))[0])
        # makedirs with exist_ok=True creates the intermediate 'test' folder
        # when it is missing and does not fail when the script is run again.
        os.makedirs(save_path, exist_ok=True)
        pdf_handler.extract_images_pro(save_path)
-
Please register or sign in to post a comment