update pdf process

周伟奇
Showing 5 changed files with 226 additions and 14 deletions
docs/main.yaml
src/apps/doc/management/commands/doc_process.py
src/apps/doc/views.py
src/common/api_doc.py
src/common/tools/pdf_tools.py
--- a/docs/main.yaml
View file @c364c24
+++ b/docs/main.yaml
View file @c364c24
@@ -102,11 +102,18 @@ definitions:
          documentScheme:
            description: 文件格式？
            type: string
-            example: CO00001
+            example: Acceptance
+            enum:
+            - Acceptance
+            - Settlement
+            - Contract Management
          businessType:
            description: 业务类型
            type: string
-            example: HIL
+            example: CO00001
+            enum:
+            - CO00001
+            - CO00002
          uploadFinishTime:
            description: 上传完成时间
            type: string
@@ -115,6 +122,10 @@ definitions:
            description: 数据源
            type: string
            example: POS
+            enum:
+            - POS
+            - EAPP
+            - Econtract
          metadataVersionId:
            description: 元数据版本ID
            type: string
--- a/src/apps/doc/management/commands/doc_process.py
View file @c364c24
+++ b/src/apps/doc/management/commands/doc_process.py
View file @c364c24
@@ -23,5 +23,5 @@ class Command(BaseCommand):
            # PDF文件分页转化为图片
            # 图片调用算法判断是否为银行流水
            # 图片调用算法OCR为excel文件
-            # excel文件上传至EDMS
+            # 整合excel文件上传至EDMS
            pass
--- a/src/apps/doc/views.py
View file @c364c24
+++ b/src/apps/doc/views.py
View file @c364c24
@@ -25,10 +25,11 @@ applicant_data_args = {

 document_args = {
    'documentName': fields.Str(required=True, validate=validate.Length(max=255)),
+    # Acceptance/Settlement/Contract Management
    'documentScheme': fields.Str(required=True, validate=validate.Length(max=64)),
-    'businessType': fields.Str(required=True, validate=validate.Length(max=64)),
+    'businessType': fields.Str(required=True, validate=validate.Length(max=64)),    # CO00001/CO00002
    'uploadFinishTime': fields.DateTime(required=True),
-    'dataSource': fields.Str(required=True, validate=validate.Length(max=64)),
+    'dataSource': fields.Str(required=True, validate=validate.Length(max=64)),  # POS/EAPP/Econtract
    'metadataVersionId': fields.Str(required=True, validate=validate.Length(max=64)),
 }

--- a/src/common/api_doc.py
View file @c364c24
+++ b/src/common/api_doc.py
View file @c364c24
@@ -78,11 +78,13 @@ Doc:
        documentScheme:
          description: 文件格式？
          type: string
-          example: CO00001
+          example: Acceptance
+          enum: [Acceptance, Settlement, Contract Management]
        businessType:
          description: 业务类型
          type: string
-          example: HIL
+          example: CO00001
+          enum: [CO00001, CO00002]
        uploadFinishTime:
          description: 上传完成时间
          type: string
@@ -91,6 +93,7 @@ Doc:
          description: 数据源
          type: string
          example: POS
+          enum: [POS, EAPP, Econtract]
        metadataVersionId:
          description: 元数据版本ID
          type: string
--- a/src/common/tools/pdf_tools.py
View file @c364c24
+++ b/src/common/tools/pdf_tools.py
View file @c364c24
 import fitz
 import os
+from PIL import Image, ImageCms
+from io import BytesIO


 class PdfHandler:
@@ -8,16 +10,24 @@ class PdfHandler:
        self.pdf_path = pdf_path
        self.pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

-    def to_pix_img(self, save_dir_path, zoom_x, zoom_y):
+    def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
        trans = fitz.Matrix(zoom_x, zoom_y).preRotate(0)  # zoom factor 2 in each dimension
        with fitz.Document(self.pdf_path) as pdf:
            # print(pdf.metadata)
+            # print(pdf.getPageImageList(0))
+            # print(pdf.getToC())  # 获取大纲
            for page in pdf:
-                pm = page.getPixmap(matrix=trans, alpha=False)  # 获得每一页的流对象
+                pm = page.getPixmap(matrix=trans, alpha=False)
+                # print(pm.samples)  # a rectangular area of bytes representing the image data (a Python bytes object).
+                # print(pm.width)
+                # print(pm.height)
+                # print(pm.stride)  # number of bytes of one horizontal image line)
+
                save_path = os.path.join(save_dir_path, '{0}_{1}.png'.format(self.pdf_name, page.number))
-                pm.writePNG(save_path)
+                # pm.writePNG(save_path)
+                pm.writeImage(save_path)

-    def to_svg_img(self, save_dir_path):
+    def page_to_svg_img(self, save_dir_path):
        with fitz.Document(self.pdf_path) as pdf:
            for page in pdf:
                svg = page.getSVGimage(matrix=fitz.Identity)  # UTF-8 string svg
@@ -25,8 +35,195 @@ class PdfHandler:
                with open(save_path, 'w') as f:
                    f.write(svg)

+    @staticmethod
+    def getimage(pix):
+        """Return ``pix``, converted to RGB when its colorspace has 4 components.
+
+        :param pix: a ``fitz.Pixmap``.
+        :return: ``pix`` itself when its colorspace is missing or does not
+            have exactly four components; otherwise a new pixmap built with
+            ``fitz.Pixmap(fitz.csRGB, pix)``.
+        """
+        colorspace = pix.colorspace
+        # PyMuPDF documents that ``colorspace`` may be None (image / stencil
+        # masks); dereferencing ``.n`` on it would raise AttributeError, so
+        # such pixmaps are passed through untouched.
+        if colorspace is None or colorspace.n != 4:
+            return pix
+        return fitz.Pixmap(fitz.csRGB, pix)
+
+    def recoverpix(self, doc, item):
+        """Rebuild one embedded PDF image, applying its soft mask when usable.
+
+        :param doc: the open ``fitz.Document`` the image belongs to.
+        :param item: one entry of ``getPageImageList``; index 0 is the image
+            xref, index 1 the xref of its /SMask (0 when absent) and index 5
+            the colorspace name.
+        :return: the dict produced by ``doc.extractImage`` for a
+            ``DeviceRGB`` image without soft mask, otherwise a pixmap that
+            has been passed through ``self.getimage``.
+        """
+        xref = item[0]
+        smask_xref = item[1]
+        rgb_source = item[5] == 'DeviceRGB'
+
+        def mask_is_usable(base, mask):
+            # Same rectangle, neither pixmap carries alpha, mask is single-channel.
+            return base.irect == mask.irect and base.alpha == mask.alpha == 0 and mask.n == 1
+
+        if rgb_source:
+            if smask_xref == 0:
+                # No soft mask: hand back the raw embedded image.
+                return doc.extractImage(xref)
+
+            # The alpha channel has to be reconstructed from the soft mask.
+            base_pix = fitz.Pixmap(doc, xref)
+            mask_pix = fitz.Pixmap(doc, smask_xref)
+            if not mask_is_usable(base_pix, mask_pix):
+                mask_pix = None
+                return self.getimage(base_pix)
+
+            merged = fitz.Pixmap(base_pix)
+            merged.setAlpha(mask_pix.samples)  # mask samples become the alpha values
+            base_pix = mask_pix = None  # drop the temporary pixmaps
+            return self.getimage(merged)
+
+        # Any other colorspace (e.g. GRAY / CMYK): always go through a pixmap.
+        base_pix = fitz.Pixmap(doc, xref)
+        merged = fitz.Pixmap(base_pix)
+
+        if smask_xref != 0:
+            mask_pix = fitz.Pixmap(doc, smask_xref)
+            if not mask_is_usable(base_pix, mask_pix):
+                mask_pix = None
+                return self.getimage(base_pix)
+            merged.setAlpha(mask_pix.samples)
+
+        base_pix = mask_pix = None  # drop the temporary pixmaps
+
+        # Convert the result to RGB before handing it back.
+        merged = fitz.Pixmap(fitz.csRGB, merged)
+        return self.getimage(merged)
+
+    def extract_images(self, save_dir_path):
+        """Write each distinct embedded image of the PDF into ``save_dir_path``.
+
+        Every page's image list is walked; an image xref that was already
+        written is skipped. Raw images returned by ``recoverpix`` as a dict
+        keep their own extension (``img-<xref>.<ext>``); pixmaps are written
+        as PNG (``img-<xref>.png``).
+
+        NOTE: the size filters that used to sit here (minimum side of 100 px,
+        minimum payload of 2 KB, minimum 5% image : pixmap size ratio) were
+        already disabled, so every image is written.
+
+        :param save_dir_path: existing directory that receives the image files.
+        """
+        seen_xrefs = set()  # set membership instead of a linear list scan
+        with fitz.Document(self.pdf_path) as pdf:
+            for pno in range(pdf.pageCount):
+                for img in pdf.getPageImageList(pno):
+                    print(img)
+                    xref = img[0]
+                    if xref in seen_xrefs:
+                        continue
+                    width = img[2]
+                    height = img[3]
+                    print(xref, width, height)
+
+                    pix = self.recoverpix(pdf, img)
+                    if isinstance(pix, dict):  # we got a raw image
+                        imgfile = os.path.join(save_dir_path, "img-%i.%s" % (xref, pix["ext"]))
+                        imgdata = pix["image"]
+                    else:  # we got a pixmap
+                        imgfile = os.path.join(save_dir_path, "img-%i.png" % xref)
+                        imgdata = pix.getPNGData()
+
+                    # The context manager closes the handle even if the write fails.
+                    with open(imgfile, "wb") as fout:
+                        fout.write(imgdata)
+                    seen_xrefs.add(xref)
+
+    def split_il(self, il):
+        """Split a page image list into runs of consecutive, related entries.
+
+        Entries are compared with their predecessor on width (index 2) and
+        height (index 3):
+
+        * a different width closes the current run *before* the entry, which
+          then starts a new run;
+        * the same width but a different height closes the current run
+          *including* the entry (presumably the shorter last strip of a
+          sliced picture -- TODO confirm).
+
+        Previously the final entry was always appended to the open run, even
+        when its width differed from the entry before it; it now starts its
+        own run in that case, like any other entry.
+
+        :param il: list of image entries (indexable; width at 2, height at 3).
+        :return: list of slices of ``il``; empty when ``il`` is empty.
+        """
+        groups = []
+        start = 0
+        length = len(il)
+        for i in range(length):
+            # The first entry of a run has nothing to be compared with.
+            if i > start:
+                if il[i][2] != il[i - 1][2]:
+                    groups.append(il[start: i])
+                    start = i
+                elif il[i][3] != il[i - 1][3]:
+                    groups.append(il[start: i + 1])
+                    start = i + 1
+            # Flush whatever is still open once the last entry is reached.
+            if i == length - 1 and start < length:
+                groups.append(il[start:])
+        return groups
+
+    def extract_images_pro(self, save_dir_path):
+        """Extract the images of every page, stitching grouped images vertically.
+
+        For each page the image list is sorted by xref and partitioned with
+        ``split_il``. Every group is saved as
+        ``page_<pno>_img_<index>.<ext>`` inside ``save_dir_path``, where the
+        extension is the one of the group's first image. A single-image group
+        is saved as is; a larger group is pasted top to bottom onto a canvas
+        that uses the first image's mode and width. Progress information is
+        printed along the way.
+
+        :param save_dir_path: existing directory that receives the image files.
+        """
+        with fitz.Document(self.pdf_path) as pdf:
+            print('----------------------------')
+            print(self.pdf_name)
+            print(pdf.metadata)
+            # TODO: de-duplicate images (by xref)
+            for pno in range(pdf.pageCount):
+                print('========================')
+                # entry layout:
+                # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
+                page_images = sorted(pdf.getPageImageList(pno), key=lambda entry: entry[0])
+
+                groups = self.split_il(page_images)
+                page_images = None
+                print(groups)
+                # TODO: when a page holds too many images, render the page to an image instead
+                print(len(groups))
+
+                for img_count, group in enumerate(groups):
+                    print(group)
+                    im_list = []
+                    for entry in group:
+                        width, height = entry[2], entry[3]
+                        pix = self.recoverpix(pdf, entry)
+                        if type(pix) is dict:  # raw image
+                            ext = pix["ext"]
+                            img_data = pix["image"]
+                        else:  # pixmap
+                            ext = 'png'
+                            img_data = pix.getPNGData()
+                        im_list.append((width, height, Image.open(BytesIO(img_data)), ext))
+
+                    print(im_list)
+                    first = im_list[0]
+                    file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_count, first[3])
+                    save_path = os.path.join(save_dir_path, file_name)
+
+                    # Single image: save it directly.
+                    if len(im_list) == 1:
+                        first[2].save(save_path)
+                        continue
+
+                    # Several images: stitch them vertically.
+                    total_height = sum(h for _, h, _, _ in im_list)
+                    canvas = Image.new(first[2].mode, (first[0], total_height))
+                    offset = 0
+                    for _, h, im, _ in im_list:
+                        canvas.paste(im, box=(0, offset))
+                        offset += h
+                    canvas.save(save_path)
+

 if __name__ == '__main__':
-    pdf_handler = PdfHandler('/Users/clay/Desktop/biz/pdf_test/test.pdf')
-    # pdf_handler.to_pix_img('/Users/clay/Desktop/biz/pdf_test/', 1.0, 1.0)
-    # pdf_handler.to_svg_img('/Users/clay/Desktop/biz/pdf_test/')
+    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
+    pdf_list = os.listdir(dir_path)
+    for path in pdf_list:
+        if path == '.DS_Store':
+            continue
+        pdf_handler = PdfHandler(os.path.join(dir_path, path))
+        save_path = os.path.join('/Users/clay/Desktop/biz/pdf_test/', 'test', os.path.splitext(os.path.basename(path))[0])
+        os.mkdir(save_path)
+        pdf_handler.extract_images_pro(save_path)
+    # pdf_handler = PdfHandler('/Users/clay/Desktop/biz/pdf_test/银行流水/竖版-特殊-邮储银行-一本通绿卡通交易明细(客户).pdf')
+    # pdf_handler = PdfHandler('/Users/clay/Desktop/biz/pdf_test/银行流水/横版-表格-工商银行 借记卡账户历史明细清单 .pdf')
+    # pdf_handler.page_to_pix_img('/Users/clay/Desktop/biz/pdf_test/', 3.0, 3.0)
+    # pdf_handler.page_to_svg_img('/Users/clay/Desktop/biz/pdf_test/')
+    # pdf_handler.extract_images_pro('/Users/clay/Desktop/biz/pdf_test/test')
+
+    # pix = fitz.Pixmap(sys.argv[1])  # read image file
+    # rgb = "RGB"  # set PIL parameter
+    # if pix.alpha:  # JPEG cannot have alpha!
+    #     pix0 = fitz.Pixmap(pix, 0)  # drop alpha channel
+    #     pix = pix0  # rename pixmap
+    #
+    # img = Image.frombuffer(rgb, [pix.width, pix.height], pix.samples, "raw", rgb, 0, 1)
+    # img.save(outputFileName)