090b26b2 by 周伟奇

special zfb

1 parent 38da4b7e
......@@ -63,8 +63,8 @@ class PDFHandler:
self.page_text_list = []
self.pdf_info = {}
self.img_path_pno_list = []
# NOTE: the position of each title in this list becomes self.title_idx, so inserting or reordering entries changes it
self.ebank_title_list = [
# '微信支付交易明细证明',
'支付宝收支明细证明',
'招商银行交易流水',
'中国工商银行借记账户历史明细',
......@@ -72,10 +72,13 @@ class PDFHandler:
'平安银行个人账户交易明细清单',
'中国农业银行账戶活期交易明细清单',
'支付宝(中国)网络技术有限公司 交易流水证明',
'支付宝(中国)网络技术有限公司 交易流水证明'
'支付宝(中国)网络技术有限公司 交易流水证明',
# '微信支付交易明细证明',
]
self.page_count = None
self.metadata = None
self.title_idx = None
self.date_pattern = re.compile(r'^\d+ \d{4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)$')
@staticmethod
def get_pwd_list(doc_name, pwd_list):
......@@ -133,9 +136,28 @@ class PDFHandler:
y0 = y0 * height_scale
x1 = x1 * width_scale
y1 = y1 * height_scale
rebuild_text_list.append(
((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text))
)
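# Alipay transaction-flow certificate: a text item such as '46428471991912802930901 2022-01-22' fuses a leading number with a date; split the date off into its own box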
if self.title_idx in {6, 7} and self.date_pattern.match(text):
try:
split_x = x0 + ((x1 - x0) * (10 / len(text)))
date_str = text[-10:]
other_str = text[:-10]
rebuild_text_list.append(
((split_x, y0, x1, y0, x1, y1, split_x, y1), normalize('NFKC', date_str))
)
rebuild_text_list.append(
((x0, y0, split_x, y0, split_x, y1, x0, y1), normalize('NFKC', other_str))
)
except Exception as e:
rebuild_text_list.append(
((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text))
)
else:
rebuild_text_list.append(
((x0, y0, x1, y0, x1, y1, x0, y1), normalize('NFKC', text))
)
self.page_text_list[pno]['rebuild_text'] = rebuild_text_list
except Exception as e:
pass
......@@ -305,8 +327,9 @@ class PDFHandler:
def title_is_ebank(self, char):
    """Report whether ``char`` contains one of the known e-bank titles.

    The text is NFKC-normalized first, so compatibility forms such as
    full-width parentheses compare equal to their plain counterparts.

    Args:
        char: Text to search for a title.

    Returns:
        True if any entry of ``self.ebank_title_list`` occurs in the
        normalized text, otherwise False.

    Side effects:
        On a match, ``self.title_idx`` is set to the index of the first
        matching entry in ``self.ebank_title_list``. When nothing matches,
        ``self.title_idx`` is left untouched.
    """
    new_char = normalize('NFKC', char)
    # A single pass is enough: enumerate yields both the title and the
    # index that has to be remembered for the first match.
    for title_idx, title in enumerate(self.ebank_title_list):
        if title in new_char:
            self.title_idx = title_idx
            return True
    return False
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!