special zfb

周伟奇
Showing 1 changed file with 29 additions and 6 deletions
src/common/tools/pdf_to_img.py
--- a/src/common/tools/pdf_to_img.py
View file @090b26b
+++ b/src/common/tools/pdf_to_img.py
View file @090b26b
@@ -63,8 +63,8 @@ class PDFHandler:
        self.page_text_list = []
        self.pdf_info = {}
        self.img_path_pno_list = []
+        # 注意影响self.title_idx
        self.ebank_title_list = [
-            # '微信支付交易明细证明',
            '支付宝收支明细证明',
            '招商银行交易流水',
            '中国工商银行借记账户历史明细',
@@ -72,10 +72,13 @@ class PDFHandler:
            '平安银行个人账户交易明细清单',
            '中国农业银行账戶活期交易明细清单',
            '支付宝（中国）网络技术有限公司   交易流水证明',
-            '支付宝(中国)网络技术有限公司   交易流水证明'
+            '支付宝(中国)网络技术有限公司   交易流水证明',
+            # '微信支付交易明细证明',
        ]
        self.page_count = None
        self.metadata = None
+        self.title_idx = None
+        self.date_pattern = re.compile(r'^\d+ \d{4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)$')

    @staticmethod
    def get_pwd_list(doc_name, pwd_list):
@@ -133,9 +136,28 @@ class PDFHandler:
                y0 = y0 * height_scale
                x1 = x1 * width_scale
                y1 = y1 * height_scale
-                rebuild_text_list.append(
-                    ((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text))
-                )
+
+                # 支付宝 交易流水证明 '46428471991912802930901 2022-01-22' 切分日期
+                if self.title_idx in {6, 7} and self.date_pattern.match(text):
+                    try:
+                        split_x = x0 + ((x1 - x0) * (10 / len(text)))
+
+                        date_str = text[-10:]
+                        other_str = text[:-10]
+                        rebuild_text_list.append(
+                            ((split_x, y0, x1, y0, x1, y1, split_x, y1), normalize('NFKC', date_str))
+                        )
+                        rebuild_text_list.append(
+                            ((x0, y0, split_x, y0, split_x, y1, x0, y1), normalize('NFKC', other_str))
+                        )
+                    except Exception as e:
+                        rebuild_text_list.append(
+                            ((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text))
+                        )
+                else:
+                    rebuild_text_list.append(
+                        ((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text))
+                    )
            self.page_text_list[pno]['rebuild_text'] = rebuild_text_list
        except Exception as e:
            pass
@@ -305,8 +327,9 @@ class PDFHandler:

    def title_is_ebank(self, char):
        new_char = normalize('NFKC', char)
-        for title in self.ebank_title_list:
+        for title_idx, title in enumerate(self.ebank_title_list):
            if new_char.find(title) != -1:
+                self.title_idx = title_idx
                return True
        return False