special zfb
Showing 1 changed file with 29 additions and 6 deletions
| ... | @@ -63,8 +63,8 @@ class PDFHandler: | ... | @@ -63,8 +63,8 @@ class PDFHandler: |
| 63 | self.page_text_list = [] | 63 | self.page_text_list = [] |
| 64 | self.pdf_info = {} | 64 | self.pdf_info = {} |
| 65 | self.img_path_pno_list = [] | 65 | self.img_path_pno_list = [] |
| 66 | # 注意影响self.title_idx | ||
| 66 | self.ebank_title_list = [ | 67 | self.ebank_title_list = [ |
| 67 | # '微信支付交易明细证明', | ||
| 68 | '支付宝收支明细证明', | 68 | '支付宝收支明细证明', |
| 69 | '招商银行交易流水', | 69 | '招商银行交易流水', |
| 70 | '中国工商银行借记账户历史明细', | 70 | '中国工商银行借记账户历史明细', |
| ... | @@ -72,10 +72,13 @@ class PDFHandler: | ... | @@ -72,10 +72,13 @@ class PDFHandler: |
| 72 | '平安银行个人账户交易明细清单', | 72 | '平安银行个人账户交易明细清单', |
| 73 | '中国农业银行账戶活期交易明细清单', | 73 | '中国农业银行账戶活期交易明细清单', |
| 74 | '支付宝(中国)网络技术有限公司 交易流水证明', | 74 | '支付宝(中国)网络技术有限公司 交易流水证明', |
| 75 | '支付宝(中国)网络技术有限公司 交易流水证明' | 75 | '支付宝(中国)网络技术有限公司 交易流水证明', |
| 76 | # '微信支付交易明细证明', | ||
| 76 | ] | 77 | ] |
| 77 | self.page_count = None | 78 | self.page_count = None |
| 78 | self.metadata = None | 79 | self.metadata = None |
| 80 | self.title_idx = None | ||
| 81 | self.date_pattern = re.compile(r'^\d+ \d{4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)$') | ||
| 79 | 82 | ||
| 80 | @staticmethod | 83 | @staticmethod |
| 81 | def get_pwd_list(doc_name, pwd_list): | 84 | def get_pwd_list(doc_name, pwd_list): |
| ... | @@ -133,9 +136,28 @@ class PDFHandler: | ... | @@ -133,9 +136,28 @@ class PDFHandler: |
| 133 | y0 = y0 * height_scale | 136 | y0 = y0 * height_scale |
| 134 | x1 = x1 * width_scale | 137 | x1 = x1 * width_scale |
| 135 | y1 = y1 * height_scale | 138 | y1 = y1 * height_scale |
| 136 | rebuild_text_list.append( | 139 | |
| 137 | ((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text)) | 140 | # 支付宝 交易流水证明 '46428471991912802930901 2022-01-22' 切分日期 |
| 138 | ) | 141 | if self.title_idx in {6, 7} and self.date_pattern.match(text): |
| 142 | try: | ||
| 143 | split_x = x0 + ((x1 - x0) * (10 / len(text))) | ||
| 144 | |||
| 145 | date_str = text[-10:] | ||
| 146 | other_str = text[:-10] | ||
| 147 | rebuild_text_list.append( | ||
| 148 | ((split_x, y0, x1, y0, x1, y1, split_x, y1), normalize('NFKC', date_str)) | ||
| 149 | ) | ||
| 150 | rebuild_text_list.append( | ||
| 151 | ((x0, y0, split_x, y0, split_x, y1, x0, y1), normalize('NFKC', other_str)) | ||
| 152 | ) | ||
| 153 | except Exception as e: | ||
| 154 | rebuild_text_list.append( | ||
| 155 | ((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text)) | ||
| 156 | ) | ||
| 157 | else: | ||
| 158 | rebuild_text_list.append( | ||
| 159 | ((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text)) | ||
| 160 | ) | ||
| 139 | self.page_text_list[pno]['rebuild_text'] = rebuild_text_list | 161 | self.page_text_list[pno]['rebuild_text'] = rebuild_text_list |
| 140 | except Exception as e: | 162 | except Exception as e: |
| 141 | pass | 163 | pass |
| ... | @@ -305,8 +327,9 @@ class PDFHandler: | ... | @@ -305,8 +327,9 @@ class PDFHandler: |
| 305 | 327 | ||
| 306 | def title_is_ebank(self, char): | 328 | def title_is_ebank(self, char): |
| 307 | new_char = normalize('NFKC', char) | 329 | new_char = normalize('NFKC', char) |
| 308 | for title in self.ebank_title_list: | 330 | for title_idx, title in enumerate(self.ebank_title_list): |
| 309 | if new_char.find(title) != -1: | 331 | if new_char.find(title) != -1: |
| 332 | self.title_idx = title_idx | ||
| 310 | return True | 333 | return True |
| 311 | return False | 334 | return False |
| 312 | 335 | ... | ... |
-
Please register or sign in to post a comment