c364c248 by 周伟奇

update pdf process

1 parent 7594db7e
......@@ -102,11 +102,18 @@ definitions:
documentScheme:
description: 文件格式?
type: string
example: CO00001
example: Acceptance
enum:
- Acceptance
- Settlement
- Contract Management
businessType:
description: 业务类型
type: string
example: HIL
example: CO00001
enum:
- CO00001
- CO00002
uploadFinishTime:
description: 上传完成时间
type: string
......@@ -115,6 +122,10 @@ definitions:
description: 数据源
type: string
example: POS
enum:
- POS
- EAPP
- Econtract
metadataVersionId:
description: 元数据版本ID
type: string
......
......@@ -23,5 +23,5 @@ class Command(BaseCommand):
# PDF文件分页转化为图片
# 图片调用算法判断是否为银行流水
# 图片调用算法OCR为excel文件
# excel文件上传至EDMS
# 整合excel文件上传至EDMS
pass
......
......@@ -25,10 +25,11 @@ applicant_data_args = {
# Request-argument schema for the document payload. Every field is required;
# string fields are only length-checked here (the accepted values listed in
# the comments below are not enforced by these validators).
document_args = {
    'documentName': fields.Str(required=True, validate=validate.Length(max=255)),
    # Acceptance/Settlement/Contract Management
    'documentScheme': fields.Str(required=True, validate=validate.Length(max=64)),
    # CO00001/CO00002
    'businessType': fields.Str(required=True, validate=validate.Length(max=64)),
    'uploadFinishTime': fields.DateTime(required=True),
    # POS/EAPP/Econtract
    'dataSource': fields.Str(required=True, validate=validate.Length(max=64)),
    'metadataVersionId': fields.Str(required=True, validate=validate.Length(max=64)),
}
......
......@@ -78,11 +78,13 @@ Doc:
documentScheme:
description: 文件格式?
type: string
example: CO00001
example: Acceptance
enum: [Acceptance, Settlement, Contract Management]
businessType:
description: 业务类型
type: string
example: HIL
example: CO00001
enum: [CO00001, CO00002]
uploadFinishTime:
description: 上传完成时间
type: string
......@@ -91,6 +93,7 @@ Doc:
description: 数据源
type: string
example: POS
enum: [POS, EAPP, Econtract]
metadataVersionId:
description: 元数据版本ID
type: string
......
import fitz
import os
from PIL import Image, ImageCms
from io import BytesIO
class PdfHandler:
......@@ -8,16 +10,24 @@ class PdfHandler:
self.pdf_path = pdf_path
self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
def to_pix_img(self, save_dir_path, zoom_x, zoom_y):
def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
    """Render every page of the PDF to a raster image file.

    Each page is rendered once with the given zoom factors and written to
    ``<save_dir_path>/<pdf_name>_<page number>.png``.

    :param save_dir_path: directory that receives the rendered images.
    :param zoom_x: horizontal zoom factor passed to ``fitz.Matrix``.
    :param zoom_y: vertical zoom factor passed to ``fitz.Matrix``.
    """
    # Scaling matrix; the rotation of 0 degrees leaves orientation untouched.
    trans = fitz.Matrix(zoom_x, zoom_y).preRotate(0)
    with fitz.Document(self.pdf_path) as pdf:
        for page in pdf:
            # Render once per page, without an alpha channel.
            pm = page.getPixmap(matrix=trans, alpha=False)
            save_path = os.path.join(save_dir_path, '{0}_{1}.png'.format(self.pdf_name, page.number))
            # A single write is enough: a preceding writePNG to the same path
            # would only be overwritten by this call.
            pm.writeImage(save_path)
def to_svg_img(self, save_dir_path):
def page_to_svg_img(self, save_dir_path):
with fitz.Document(self.pdf_path) as pdf:
for page in pdf:
svg = page.getSVGimage(matrix=fitz.Identity) # UTF-8 string svg
......@@ -25,8 +35,195 @@ class PdfHandler:
with open(save_path, 'w') as f:
f.write(svg)
@staticmethod
def getimage(pix):
    """Return a pixmap whose colorspace does not have four components.

    ``pix`` is handed back unchanged unless ``pix.colorspace.n`` equals 4
    (presumably CMYK - confirm against PyMuPDF), in which case a new
    pixmap converted to ``fitz.csRGB`` is returned instead.
    """
    four_component = pix.colorspace.n == 4
    return fitz.Pixmap(fitz.csRGB, pix) if four_component else pix
def recoverpix(self, doc, item):
    """Build the image described by one page-image-list entry.

    :param doc: the open document the image belongs to.
    :param item: image-list entry; index 0 is the image xref, index 1 the
        xref of its /SMask (0 when there is none), index 5 the colorspace
        name.
    :return: the dict from ``doc.extractImage`` for a DeviceRGB image
        without a mask, otherwise a pixmap passed through ``getimage``.
    """
    img_xref, mask_xref = item[0], item[1]
    has_mask = mask_xref != 0

    if item[5] == 'DeviceRGB':
        if not has_mask:
            # Nothing to reconstruct: hand back the raw embedded image.
            return doc.extractImage(img_xref)

        # The alpha channel has to be rebuilt from the soft mask.
        base = fitz.Pixmap(doc, img_xref)
        mask = fitz.Pixmap(doc, mask_xref)
        # Sanity check: same rectangle, neither has alpha, mask is 1-component.
        usable = base.irect == mask.irect and base.alpha == mask.alpha == 0 and mask.n == 1
        if not usable:
            return self.getimage(base)
        merged = fitz.Pixmap(base)  # copy of the base image
        merged.setAlpha(mask.samples)  # mask samples become the alpha values
        return self.getimage(merged)

    # Any other colorspace (GRAY/CMYK): always ends with an RGB conversion.
    base = fitz.Pixmap(doc, img_xref)
    merged = fitz.Pixmap(base)  # copy of the base image
    if has_mask:
        mask = fitz.Pixmap(doc, mask_xref)
        usable = base.irect == mask.irect and base.alpha == mask.alpha == 0 and mask.n == 1
        if not usable:
            return self.getimage(base)
        merged.setAlpha(mask.samples)
    return self.getimage(fitz.Pixmap(fitz.csRGB, merged))
def extract_images(self, save_dir_path):
    """Write every distinct embedded image of the PDF into ``save_dir_path``.

    Images are de-duplicated by xref across all pages. A raw image (dict
    returned by ``recoverpix``) keeps its own extension and is saved as
    ``img-<xref>.<ext>``; a pixmap is saved as ``img-<xref>.png``.
    No size-based filtering is applied.

    :param save_dir_path: existing directory that receives the image files.
    """
    seen_xrefs = set()  # xrefs already written; set gives O(1) membership
    with fitz.Document(self.pdf_path) as pdf:
        for pno in range(pdf.pageCount):
            for img in pdf.getPageImageList(pno):
                print(img)
                xref = img[0]
                if xref in seen_xrefs:
                    continue
                print(xref, img[2], img[3])  # xref, width, height
                pix = self.recoverpix(pdf, img)
                if isinstance(pix, dict):  # we got a raw image
                    imgfile = os.path.join(save_dir_path, "img-%i.%s" % (xref, pix["ext"]))
                    imgdata = pix["image"]
                else:  # we got a pixmap
                    imgfile = os.path.join(save_dir_path, "img-%i.png" % xref)
                    imgdata = pix.getPNGData()
                # Context manager closes the file even if the write fails.
                with open(imgfile, "wb") as fout:
                    fout.write(imgdata)
                seen_xrefs.add(xref)
def split_il(self, il):
    """Split an image list into runs of consecutive entries to be stitched.

    Entries are compared with their predecessor using index 2 (width) and
    index 3 (height):

    * a width change closes the current run *before* the entry, which then
      opens a new run;
    * a height change closes the current run *including* the entry;
    * the final entry always closes whatever run is still open.

    The width rule is applied to the final entry as well, so a last image
    of different width is not merged into the preceding run.

    :param il: sequence of image entries (indexable, sliceable).
    :return: list of slices of ``il``; empty when ``il`` is empty.
    """
    groups = []
    start = 0
    last = len(il) - 1
    for i in range(len(il)):
        if i == start:
            # First entry of a run has no predecessor inside the run.
            if i == last:
                groups.append(il[start:])
            continue
        if il[i][2] != il[i - 1][2]:
            # Width changed: entry i does not belong to the open run.
            groups.append(il[start:i])
            start = i
            if i == last:
                groups.append(il[start:])
        elif il[i][3] != il[i - 1][3] or i == last:
            # Height changed (or end of list): entry i terminates the run.
            groups.append(il[start:i + 1])
            start = i + 1
    return groups
def extract_images_pro(self, save_dir_path):
    """Extract embedded images page by page, stitching grouped images.

    For each page the image list is sorted by xref and partitioned with
    ``split_il``. Every group is written to
    ``<save_dir_path>/page_<pno>_img_<group index>.<ext>`` where ``ext`` is
    the extension of the group's first image: a single-image group is saved
    as is, a multi-image group is pasted top to bottom onto one canvas.

    :param save_dir_path: existing directory that receives the image files.
    """
    with fitz.Document(self.pdf_path) as pdf:
        print('----------------------------')
        print(self.pdf_name)
        print(pdf.metadata)
        # TODO: de-duplicate images by xref
        for pno in range(pdf.pageCount):
            print('========================')
            il = pdf.getPageImageList(pno)
            il.sort(key=lambda x: x[0])
            # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
            img_il_list = self.split_il(il)
            print(img_il_list)
            # TODO: when a page holds too many images, render the whole page instead
            print(len(img_il_list))

            for img_count, img_il in enumerate(img_il_list):
                print(img_il)
                height_sum = 0
                im_list = []
                for img in img_il:
                    width, height = img[2], img[3]
                    pix = self.recoverpix(pdf, img)
                    if isinstance(pix, dict):  # we got a raw image
                        ext = pix["ext"]
                        img_data = pix["image"]
                    else:  # we got a pixmap
                        ext = 'png'
                        img_data = pix.getPNGData()
                    im = Image.open(BytesIO(img_data))
                    im_list.append((width, height, im, ext))
                    height_sum += height

                print(im_list)
                save_path = os.path.join(save_dir_path, 'page_{0}_img_{1}.{2}'.format(
                    pno, img_count, im_list[0][3]))
                if len(im_list) == 1:
                    # Single image: save it directly.
                    im_list[0][2].save(save_path)
                else:
                    # Several images: stack them vertically on one canvas
                    # that takes mode and width from the first image.
                    res = Image.new(im_list[0][2].mode, (im_list[0][0], height_sum))
                    h_now = 0
                    for _, h, m, _ in im_list:
                        res.paste(m, box=(0, h_now))
                        h_now += h
                    res.save(save_path)
if __name__ == '__main__':
    # Developer driver: run extract_images_pro over every file in a local
    # sample directory, writing each PDF's images to its own output folder.
    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
    output_root = os.path.join('/Users/clay/Desktop/biz/pdf_test/', 'test')
    for path in os.listdir(dir_path):
        if path == '.DS_Store':
            continue
        pdf_handler = PdfHandler(os.path.join(dir_path, path))
        save_path = os.path.join(output_root, os.path.splitext(os.path.basename(path))[0])
        # makedirs + exist_ok: reruns and a missing 'test' parent no longer raise.
        os.makedirs(save_path, exist_ok=True)
        pdf_handler.extract_images_pro(save_path)
# pdf_handler = PdfHandler('/Users/clay/Desktop/biz/pdf_test/银行流水/竖版-特殊-邮储银行-一本通绿卡通交易明细(客户).pdf')
# pdf_handler = PdfHandler('/Users/clay/Desktop/biz/pdf_test/银行流水/横版-表格-工商银行 借记卡账户历史明细清单 .pdf')
# pdf_handler.page_to_pix_img('/Users/clay/Desktop/biz/pdf_test/', 3.0, 3.0)
# pdf_handler.page_to_svg_img('/Users/clay/Desktop/biz/pdf_test/')
# pdf_handler.extract_images_pro('/Users/clay/Desktop/biz/pdf_test/test')
# pix = fitz.Pixmap(sys.argv[1]) # read image file
# rgb = "RGB" # set PIL parameter
# if pix.alpha: # JPEG cannot have alpha!
# pix0 = fitz.Pixmap(pix, 0) # drop alpha channel
# pix = pix0 # rename pixmap
#
# img = Image.frombuffer(rgb, [pix.width, pix.height], pix.samples, "raw", rgb, 0, 1)
# img.save(outputFileName)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!