update wb build

周伟奇
Showing 6 changed files with 8 additions and 4 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/mixins.py
src/apps/doc/named_enum.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/consts.py
View file @793920a
+++ b/src/apps/doc/consts.py
View file @793920a
--- a/src/apps/doc/management/commands/doc_ocr_process.py
View file @793920a
+++ b/src/apps/doc/management/commands/doc_ocr_process.py
View file @793920a
--- a/src/apps/doc/mixins.py
View file @793920a
+++ b/src/apps/doc/mixins.py
View file @793920a
@@ -11,6 +11,8 @@ class DocHandler:
            return '/data/{1}/{0}/{0}.pdf'.format(doc_id, business_type)
        elif file == 'img':
            return '/data/{1}/{0}/{0}_img.zip'.format(doc_id, business_type)
+        elif file == 'src_excel':
+            return '/data/{1}/{0}/src.xlsx'.format(doc_id, business_type)
        else:
            return '/data/{1}/{0}/{0}.xlsx'.format(doc_id, business_type)

@@ -22,6 +24,7 @@ class DocHandler:
            doc_dict['pdf_link'] = self.get_link(doc_id, business_type)
            doc_dict['img_link'] = self.get_link(doc_id, business_type, file='img')
            doc_dict['excel_link'] = self.get_link(doc_id, business_type, file='excel')
+            doc_dict['src_excel_link'] = self.get_link(doc_id, business_type, file='src_excel')
        return list(doc_queryset)

    @staticmethod
--- a/src/apps/doc/named_enum.py
View file @793920a
+++ b/src/apps/doc/named_enum.py
View file @793920a
@@ -13,3 +13,4 @@ class KeywordsType(NamedEnum):
    INTEREST = (0, "利息")
    SALARY = (1, '薪资')
    LOAN = (2, '贷款')
+    ALI_WECHART = (3, '微信/支付宝')
--- a/src/apps/doc/ocr/wb.py
View file @793920a
+++ b/src/apps/doc/ocr/wb.py
View file @793920a
--- a/src/common/tools/pdf_to_img.py
View file @793920a
+++ b/src/common/tools/pdf_to_img.py
View file @793920a
@@ -25,7 +25,7 @@ class PDFHandler:
    def __init__(self, path, img_dir_path):
        self.path = path
        self.img_dir_path = img_dir_path
-        self.img_path_list = []
+        self.img_info_list = []
        self.xref_set = set()

    def get_img_save_path(self, pno, img_index=0, ext='png'):
@@ -38,7 +38,7 @@ class PDFHandler:
            pm = page.getPixmap(matrix=trans_2, alpha=False)
        img_save_path = self.get_img_save_path(page.number)
        pm.writePNG(img_save_path)
-        self.img_path_list.append(img_save_path)
+        self.img_info_list.append((img_save_path, page.number, 0))

    @staticmethod
    def getimage(pix):
@@ -88,7 +88,7 @@ class PDFHandler:
        with open(img_save_path, "wb") as f:
            f.write(img_data)
        self.xref_set.add(xref)
-        self.img_path_list.append(img_save_path)
+        self.img_info_list.append((img_save_path, pno, img_index))

    @staticmethod
    def split_il(il):
@@ -179,7 +179,7 @@ class PDFHandler:
                    img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
                    new_img.save(img_save_path)
                    page_to_png = False
-                    self.img_path_list.append(img_save_path)
+                    self.img_info_list.append((img_save_path, pno, img_index))

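        # 3.3 More than 2 fragment-image groups, everything filtered out, or a special filter present -> special handling: save the whole page as a PNG image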
        if page_to_png: