c364c248 by 周伟奇

update pdf process

1 parent 7594db7e
...@@ -102,11 +102,18 @@ definitions: ...@@ -102,11 +102,18 @@ definitions:
102 documentScheme: 102 documentScheme:
103 description: 文件格式? 103 description: 文件格式?
104 type: string 104 type: string
105 example: CO00001 105 example: Acceptance
106 enum:
107 - Acceptance
108 - Settlement
109 - Contract Management
106 businessType: 110 businessType:
107 description: 业务类型 111 description: 业务类型
108 type: string 112 type: string
109 example: HIL 113 example: CO00001
114 enum:
115 - CO00001
116 - CO00002
110 uploadFinishTime: 117 uploadFinishTime:
111 description: 上传完成时间 118 description: 上传完成时间
112 type: string 119 type: string
...@@ -115,6 +122,10 @@ definitions: ...@@ -115,6 +122,10 @@ definitions:
115 description: 数据源 122 description: 数据源
116 type: string 123 type: string
117 example: POS 124 example: POS
125 enum:
126 - POS
127 - EAPP
128 - Econtract
118 metadataVersionId: 129 metadataVersionId:
119 description: 元数据版本ID 130 description: 元数据版本ID
120 type: string 131 type: string
......
...@@ -23,5 +23,5 @@ class Command(BaseCommand): ...@@ -23,5 +23,5 @@ class Command(BaseCommand):
23 # PDF文件分页转化为图片 23 # PDF文件分页转化为图片
24 # 图片调用算法判断是否为银行流水 24 # 图片调用算法判断是否为银行流水
25 # 图片调用算法OCR为excel文件 25 # 图片调用算法OCR为excel文件
26 # excel文件上传至EDMS 26 # 整合excel文件上传至EDMS
27 pass 27 pass
......
...@@ -25,10 +25,11 @@ applicant_data_args = { ...@@ -25,10 +25,11 @@ applicant_data_args = {
25 25
# Field definitions for a document; every entry is declared as required.
# NOTE(review): the value lists in the comments below are documentation only;
# the validators attached here check the string length, not membership.
document_args = {
    'documentName': fields.Str(required=True, validate=validate.Length(max=255)),
    # Acceptance/Settlement/Contract Management
    'documentScheme': fields.Str(required=True, validate=validate.Length(max=64)),
    'businessType': fields.Str(required=True, validate=validate.Length(max=64)),  # CO00001/CO00002
    'uploadFinishTime': fields.DateTime(required=True),
    'dataSource': fields.Str(required=True, validate=validate.Length(max=64)),  # POS/EAPP/Econtract
    'metadataVersionId': fields.Str(required=True, validate=validate.Length(max=64)),
}
34 35
......
...@@ -78,11 +78,13 @@ Doc: ...@@ -78,11 +78,13 @@ Doc:
78 documentScheme: 78 documentScheme:
79 description: 文件格式? 79 description: 文件格式?
80 type: string 80 type: string
81 example: CO00001 81 example: Acceptance
82 enum: [Acceptance, Settlement, Contract Management]
82 businessType: 83 businessType:
83 description: 业务类型 84 description: 业务类型
84 type: string 85 type: string
85 example: HIL 86 example: CO00001
87 enum: [CO00001, CO00002]
86 uploadFinishTime: 88 uploadFinishTime:
87 description: 上传完成时间 89 description: 上传完成时间
88 type: string 90 type: string
...@@ -91,6 +93,7 @@ Doc: ...@@ -91,6 +93,7 @@ Doc:
91 description: 数据源 93 description: 数据源
92 type: string 94 type: string
93 example: POS 95 example: POS
96 enum: [POS, EAPP, Econtract]
94 metadataVersionId: 97 metadataVersionId:
95 description: 元数据版本ID 98 description: 元数据版本ID
96 type: string 99 type: string
......
1 import fitz 1 import fitz
2 import os 2 import os
3 from PIL import Image, ImageCms
4 from io import BytesIO
3 5
4 6
5 class PdfHandler: 7 class PdfHandler:
...@@ -8,16 +10,24 @@ class PdfHandler: ...@@ -8,16 +10,24 @@ class PdfHandler:
8 self.pdf_path = pdf_path 10 self.pdf_path = pdf_path
9 self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0] 11 self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
10 12
def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
    """Render every page of the PDF and write it as an image file.

    Each page is rendered through a matrix built from ``zoom_x`` and
    ``zoom_y`` (the zoom factor per dimension) without an alpha channel,
    then written to ``save_dir_path`` as ``<pdf_name>_<page number>.png``.
    """
    scale = fitz.Matrix(zoom_x, zoom_y).preRotate(0)
    with fitz.Document(self.pdf_path) as document:
        for page in document:
            pixmap = page.getPixmap(matrix=scale, alpha=False)
            file_name = '{0}_{1}.png'.format(self.pdf_name, page.number)
            pixmap.writeImage(os.path.join(save_dir_path, file_name))
19 29
20 def to_svg_img(self, save_dir_path): 30 def page_to_svg_img(self, save_dir_path):
21 with fitz.Document(self.pdf_path) as pdf: 31 with fitz.Document(self.pdf_path) as pdf:
22 for page in pdf: 32 for page in pdf:
23 svg = page.getSVGimage(matrix=fitz.Identity) # UTF-8 string svg 33 svg = page.getSVGimage(matrix=fitz.Identity) # UTF-8 string svg
...@@ -25,8 +35,195 @@ class PdfHandler: ...@@ -25,8 +35,195 @@ class PdfHandler:
25 with open(save_path, 'w') as f: 35 with open(save_path, 'w') as f:
26 f.write(svg) 36 f.write(svg)
27 37
@staticmethod
def getimage(pix):
    """Convert a four-component (CMYK) pixmap to RGB.

    Any other pixmap is returned unchanged. A pixmap without a colorspace
    is also returned unchanged, since there is nothing to convert.
    """
    colorspace = pix.colorspace
    # PyMuPDF documents ``colorspace`` as None for image/stencil masks;
    # reading ``.n`` on it would raise AttributeError.
    if colorspace is None or colorspace.n != 4:
        return pix
    return fitz.Pixmap(fitz.csRGB, pix)
44
def recoverpix(self, doc, item):
    """Recover one embedded image, applying its soft mask when it has one.

    ``item`` is an entry of the page image list: index 0 is the image
    xref, index 1 the xref of its /SMask (0 when absent) and index 5 the
    colorspace name.

    Returns the raw ``doc.extractImage`` result for an unmasked DeviceRGB
    image; otherwise a pixmap passed through ``getimage``.
    """
    image_xref = item[0]
    smask_xref = item[1]
    is_rgb = item[5] == 'DeviceRGB'

    if is_rgb:
        if smask_xref == 0:
            # Unmasked RGB image: hand back the raw image unchanged.
            return doc.extractImage(image_xref)

        base = fitz.Pixmap(doc, image_xref)
        mask = fitz.Pixmap(doc, smask_xref)  # pixmap of the /SMask entry

        # The mask is only usable if it covers the same rectangle, neither
        # pixmap already carries alpha, and the mask has one component.
        mask_usable = base.irect == mask.irect and base.alpha == mask.alpha == 0 and mask.n == 1
        if not mask_usable:
            return self.getimage(base)

        with_alpha = fitz.Pixmap(base)  # copy of the base with an alpha channel
        with_alpha.setAlpha(mask.samples)  # mask samples become the alpha values
        return self.getimage(with_alpha)

    # GRAY/CMYK
    base = fitz.Pixmap(doc, image_xref)
    with_alpha = fitz.Pixmap(base)  # copy of the base with an alpha channel

    if smask_xref != 0:
        mask = fitz.Pixmap(doc, smask_xref)  # pixmap of the /SMask entry
        mask_usable = base.irect == mask.irect and base.alpha == mask.alpha == 0 and mask.n == 1
        if not mask_usable:
            return self.getimage(base)
        with_alpha.setAlpha(mask.samples)

    # GRAY/CMYK to RGB
    return self.getimage(fitz.Pixmap(fitz.csRGB, with_alpha))
86
def extract_images(self, save_dir_path):
    """Write every distinct embedded image of the PDF into ``save_dir_path``.

    Each image-list entry is recovered with ``recoverpix``. A raw image
    (dict result) is written as ``img-<xref>.<ext>`` using its own
    extension; a pixmap result is written as ``img-<xref>.png``. An xref
    that shows up more than once is written only the first time. No
    size-based filtering is applied.
    """
    seen_xrefs = set()  # constant-time duplicate check
    with fitz.Document(self.pdf_path) as pdf:
        for pno in range(pdf.pageCount):
            for img in pdf.getPageImageList(pno):
                print(img)
                xref = img[0]
                if xref in seen_xrefs:
                    continue
                width = img[2]
                height = img[3]
                print(xref, width, height)

                pix = self.recoverpix(pdf, img)
                if isinstance(pix, dict):  # we got a raw image
                    imgfile = os.path.join(save_dir_path, "img-%i.%s" % (xref, pix["ext"]))
                    imgdata = pix["image"]
                else:  # we got a pixmap
                    imgfile = os.path.join(save_dir_path, "img-%i.png" % xref)
                    imgdata = pix.getPNGData()

                # The context manager closes the file even if the write fails.
                with open(imgfile, "wb") as fout:
                    fout.write(imgdata)
                seen_xrefs.add(xref)
128
def split_il(self, il):
    """Split an image list into runs of consecutive entries to be joined.

    Entries are compared with their predecessor using index 2 (width) and
    index 3 (height):

    * a different width closes the current run before the entry, which
      then starts a new run;
    * the same width but a different height closes the current run
      including the entry, and the next entry starts a new run.

    The rule is applied to every entry, the last one included, so a
    trailing entry with a different width forms its own run.

    Returns a list of slices of ``il``; an empty ``il`` yields ``[]``.
    """
    runs = []
    start = 0
    total = len(il)
    for i in range(1, total):
        if i == start:
            # First entry of a fresh run: nothing to compare against yet.
            continue
        previous, current = il[i - 1], il[i]
        if current[2] != previous[2]:
            runs.append(il[start:i])
            start = i
        elif current[3] != previous[3]:
            runs.append(il[start:i + 1])
            start = i + 1
    # Flush whatever is still open after the last entry.
    if start < total:
        runs.append(il[start:total])
    return runs
148
def extract_images_pro(self, save_dir_path):
    """Extract the embedded images page by page, stitching grouped images.

    For every page the image list is sorted by xref and split into groups
    with ``split_il``. A group with a single image is saved as is; a group
    with several images is pasted top to bottom onto one canvas that uses
    the first image's mode and width. Files are written to
    ``save_dir_path`` as ``page_<page>_img_<group>.<ext>``, where the
    extension is that of the group's first image.
    """
    with fitz.Document(self.pdf_path) as pdf:
        print('----------------------------')
        print(self.pdf_name)
        print(pdf.metadata)
        # TODO: de-duplicate images across pages
        for pno in range(pdf.pageCount):
            print('========================')
            # entry layout:
            # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
            page_images = pdf.getPageImageList(pno)
            page_images.sort(key=lambda entry: entry[0])

            groups = self.split_il(page_images)
            del page_images
            print(groups)
            print(len(groups))  # TODO: when a page holds too many images, render the page instead

            for group_no, group in enumerate(groups):
                print(group)
                tiles = []
                for entry in group:
                    width, height = entry[2], entry[3]
                    recovered = self.recoverpix(pdf, entry)
                    if type(recovered) is dict:  # raw image
                        ext, payload = recovered["ext"], recovered["image"]
                    else:  # pixmap
                        ext, payload = 'png', recovered.getPNGData()
                    tiles.append((width, height, Image.open(BytesIO(payload)), ext))

                print(tiles)
                first_width, _, first_image, first_ext = tiles[0]
                target = os.path.join(save_dir_path, 'page_{0}_img_{1}.{2}'.format(
                    pno, group_no, first_ext))

                # A single image needs no stitching.
                if len(tiles) == 1:
                    first_image.save(target)
                    continue

                # Several images: stack them vertically.
                total_height = sum(tile[1] for tile in tiles)
                canvas = Image.new(first_image.mode, (first_width, total_height))
                offset = 0
                for _, tile_height, tile_image, _ in tiles:
                    canvas.paste(tile_image, box=(0, offset))
                    offset += tile_height
                canvas.save(target)
204
28 205
if __name__ == '__main__':
    # Ad-hoc driver: run extract_images_pro on every file of a local sample
    # directory, writing each PDF's images into its own output directory.
    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
    pdf_list = os.listdir(dir_path)
    for path in pdf_list:
        if path == '.DS_Store':
            continue
        pdf_handler = PdfHandler(os.path.join(dir_path, path))
        save_path = os.path.join('/Users/clay/Desktop/biz/pdf_test/', 'test', os.path.splitext(os.path.basename(path))[0])
        # makedirs creates the missing parent 'test' directory as well and
        # does not fail when the script is run a second time.
        os.makedirs(save_path, exist_ok=True)
        pdf_handler.extract_images_pro(save_path)

    # Other entry points that can be tried on a single file:
    # pdf_handler.page_to_pix_img('/Users/clay/Desktop/biz/pdf_test/', 3.0, 3.0)
    # pdf_handler.page_to_svg_img('/Users/clay/Desktop/biz/pdf_test/')
    # pdf_handler.extract_images_pro('/Users/clay/Desktop/biz/pdf_test/test')

    # Reference snippet: turning a pixmap into a PIL image.
    # pix = fitz.Pixmap(sys.argv[1])  # read image file
    # rgb = "RGB"  # set PIL parameter
    # if pix.alpha:  # JPEG cannot have alpha!
    #     pix0 = fitz.Pixmap(pix, 0)  # drop alpha channel
    #     pix = pix0  # rename pixmap
    #
    # img = Image.frombuffer(rgb, [pix.width, pix.height], pix.samples, "raw", rgb, 0, 1)
    # img.save(outputFileName)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!