ebank part 2
Showing
1 changed file
with
38 additions
and
21 deletions
... | @@ -39,6 +39,14 @@ class PDFHandler: | ... | @@ -39,6 +39,14 @@ class PDFHandler: |
39 | self.page_text_list = [] | 39 | self.page_text_list = [] |
40 | self.pdf_info = {} | 40 | self.pdf_info = {} |
41 | self.img_path_pno_list = [] | 41 | self.img_path_pno_list = [] |
42 | self.ebank_title_list = [ | ||
43 | '微信支付交易明细证明', | ||
44 | '支付宝收支明细证明', | ||
45 | '招商银行交易流水', | ||
46 | '中国工商银行借记账户历史明细', | ||
47 | '中国建设银行个人活期账户全部交易明细', | ||
48 | '平安银行个人账户交易明细清单', | ||
49 | ] | ||
42 | 50 | ||
43 | def get_suffix(self, file_name): | 51 | def get_suffix(self, file_name): |
44 | if file_name is None: | 52 | if file_name is None: |
... | @@ -260,20 +268,27 @@ class PDFHandler: | ... | @@ -260,20 +268,27 @@ class PDFHandler: |
260 | page = pdf.loadPage(pno) | 268 | page = pdf.loadPage(pno) |
261 | self.page_to_png(page) | 269 | self.page_to_png(page) |
262 | 270 | ||
def title_is_ebank(self, char):
    """Return True when ``char`` matches a known e-bank statement title.

    The comparison is a substring test in both directions against every
    entry of ``self.ebank_title_list``: text that contains a whole title
    matches, and so does text that is only a fragment of a title.

    Args:
        char: Text fragment to test (``check_ebank`` passes the text it
            collected alongside each bbox on the first page).

    Returns:
        bool: True on the first title that contains, or is contained
        in, ``char``; False otherwise, including for empty/None input.
    """
    # An empty string is a substring of every title, so without this
    # guard '' would be reported as an e-bank title.
    if not char:
        return False
    return any(
        char in title or title in char
        for title in self.ebank_title_list
    )
276 | |||
263 | def check_ebank(self, pdf): | 277 | def check_ebank(self, pdf): |
264 | # page_text_list = [] | 278 | page_text_list = [] |
265 | text_item_sum = 0 | 279 | text_item_sum = 0 |
280 | in_ebank_set = False | ||
266 | for pno in range(pdf.pageCount): | 281 | for pno in range(pdf.pageCount): |
267 | page = pdf.loadPage(pno) | 282 | page = pdf.loadPage(pno) |
268 | # if page.rotation is None: | 283 | if page.rotation is None: |
269 | # rotation = 0 | 284 | rotation = 0 |
270 | # elif isinstance(page.rotation, int): | 285 | elif isinstance(page.rotation, int): |
271 | # divisor, remainder = divmod(page.rotation, 90) | 286 | divisor, remainder = divmod(page.rotation, 90) |
272 | # if remainder != 0: | 287 | if remainder != 0: |
273 | # return | 288 | return |
274 | # rotation = divmod(divisor, 4)[1] | 289 | rotation = divmod(divisor, 4)[1] |
275 | # else: | 290 | else: |
276 | # return | 291 | return |
277 | textpage = page.getTextPage() | 292 | textpage = page.getTextPage() |
278 | text = textpage.extractDICT() | 293 | text = textpage.extractDICT() |
279 | text_list = [] | 294 | text_list = [] |
... | @@ -284,22 +299,24 @@ class PDFHandler: | ... | @@ -284,22 +299,24 @@ class PDFHandler: |
284 | bbox = span.get('bbox') | 299 | bbox = span.get('bbox') |
285 | if char.strip() == '': | 300 | if char.strip() == '': |
286 | continue | 301 | continue |
302 | if pno == 0 and self.title_is_ebank(char): | ||
303 | in_ebank_set = True | ||
287 | text_list.append((bbox, char)) | 304 | text_list.append((bbox, char)) |
288 | text_item_sum += len(text_list) | 305 | text_item_sum += len(text_list) |
289 | if text_item_sum < (pno + 1) * 5: | 306 | if text_item_sum < (pno + 1) * 5: |
290 | return | 307 | return |
291 | # else: | 308 | else: |
292 | # page_text_list.append( | 309 | page_text_list.append( |
293 | # { | 310 | { |
294 | # 'width': text.get('width'), | 311 | 'width': text.get('width'), |
295 | # 'height': text.get('height'), | 312 | 'height': text.get('height'), |
296 | # 'rotation': rotation, | 313 | 'rotation': rotation, |
297 | # 'text': text_list | 314 | 'text': text_list |
298 | # } | 315 | } |
299 | # ) | 316 | ) |
300 | # self.is_ebank = True | 317 | self.is_ebank = in_ebank_set |
301 | self.is_e_pdf = True | 318 | self.is_e_pdf = True |
302 | # self.page_text_list = page_text_list | 319 | self.page_text_list = page_text_list |
303 | 320 | ||
304 | def e_contract_process(self): | 321 | def e_contract_process(self): |
305 | os.makedirs(self.img_dir_path, exist_ok=True) | 322 | os.makedirs(self.img_dir_path, exist_ok=True) | ... | ... |
-
Please register or sign in to post a comment