090b26b2 by 周伟奇

special zfb

1 parent 38da4b7e
...@@ -63,8 +63,8 @@ class PDFHandler: ...@@ -63,8 +63,8 @@ class PDFHandler:
63 self.page_text_list = [] 63 self.page_text_list = []
64 self.pdf_info = {} 64 self.pdf_info = {}
65 self.img_path_pno_list = [] 65 self.img_path_pno_list = []
66 # 注意影响self.title_idx
66 self.ebank_title_list = [ 67 self.ebank_title_list = [
67 # '微信支付交易明细证明',
68 '支付宝收支明细证明', 68 '支付宝收支明细证明',
69 '招商银行交易流水', 69 '招商银行交易流水',
70 '中国工商银行借记账户历史明细', 70 '中国工商银行借记账户历史明细',
...@@ -72,10 +72,13 @@ class PDFHandler: ...@@ -72,10 +72,13 @@ class PDFHandler:
72 '平安银行个人账户交易明细清单', 72 '平安银行个人账户交易明细清单',
73 '中国农业银行账戶活期交易明细清单', 73 '中国农业银行账戶活期交易明细清单',
74 '支付宝(中国)网络技术有限公司 交易流水证明', 74 '支付宝(中国)网络技术有限公司 交易流水证明',
75 '支付宝(中国)网络技术有限公司 交易流水证明' 75 '支付宝(中国)网络技术有限公司 交易流水证明',
76 # '微信支付交易明细证明',
76 ] 77 ]
77 self.page_count = None 78 self.page_count = None
78 self.metadata = None 79 self.metadata = None
80 self.title_idx = None
81 self.date_pattern = re.compile(r'^\d+ \d{4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)$')
79 82
80 @staticmethod 83 @staticmethod
81 def get_pwd_list(doc_name, pwd_list): 84 def get_pwd_list(doc_name, pwd_list):
...@@ -133,9 +136,28 @@ class PDFHandler: ...@@ -133,9 +136,28 @@ class PDFHandler:
133 y0 = y0 * height_scale 136 y0 = y0 * height_scale
134 x1 = x1 * width_scale 137 x1 = x1 * width_scale
135 y1 = y1 * height_scale 138 y1 = y1 * height_scale
136 rebuild_text_list.append( 139
137 ((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text)) 140 # 支付宝 交易流水证明 '46428471991912802930901 2022-01-22' 切分日期
138 ) 141 if self.title_idx in {6, 7} and self.date_pattern.match(text):
142 try:
143 split_x = x0 + ((x1 - x0) * (10 / len(text)))
144
145 date_str = text[-10:]
146 other_str = text[:-10]
147 rebuild_text_list.append(
148 ((split_x, y0, x1, y0, x1, y1, split_x, y1), normalize('NFKC', date_str))
149 )
150 rebuild_text_list.append(
151 ((x0, y0, split_x, y0, split_x, y1, x0, y1), normalize('NFKC', other_str))
152 )
153 except Exception as e:
154 rebuild_text_list.append(
155 ((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text))
156 )
157 else:
158 rebuild_text_list.append(
159 ((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text))
160 )
139 self.page_text_list[pno]['rebuild_text'] = rebuild_text_list 161 self.page_text_list[pno]['rebuild_text'] = rebuild_text_list
140 except Exception as e: 162 except Exception as e:
141 pass 163 pass
...@@ -305,8 +327,9 @@ class PDFHandler: ...@@ -305,8 +327,9 @@ class PDFHandler:
305 327
306 def title_is_ebank(self, char): 328 def title_is_ebank(self, char):
307 new_char = normalize('NFKC', char) 329 new_char = normalize('NFKC', char)
308 for title in self.ebank_title_list: 330 for title_idx, title in enumerate(self.ebank_title_list):
309 if new_char.find(title) != -1: 331 if new_char.find(title) != -1:
332 self.title_idx = title_idx
310 return True 333 return True
311 return False 334 return False
312 335
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!