cb4acc51 by 周伟奇

ebank part 2

1 parent d24fcf2c
...@@ -39,6 +39,14 @@ class PDFHandler: ...@@ -39,6 +39,14 @@ class PDFHandler:
39 self.page_text_list = [] 39 self.page_text_list = []
40 self.pdf_info = {} 40 self.pdf_info = {}
41 self.img_path_pno_list = [] 41 self.img_path_pno_list = []
42 self.ebank_title_list = [
43 '微信支付交易明细证明',
44 '支付宝收支明细证明',
45 '招商银行交易流水',
46 '中国工商银行借记账户历史明细',
47 '中国建设银行个人活期账户全部交易明细',
48 '平安银行个人账户交易明细清单',
49 ]
42 50
43 def get_suffix(self, file_name): 51 def get_suffix(self, file_name):
44 if file_name is None: 52 if file_name is None:
...@@ -260,20 +268,27 @@ class PDFHandler: ...@@ -260,20 +268,27 @@ class PDFHandler:
260 page = pdf.loadPage(pno) 268 page = pdf.loadPage(pno)
261 self.page_to_png(page) 269 self.page_to_png(page)
262 270
def title_is_ebank(self, char):
    """Return True when ``char`` overlaps one of the known e-bank titles.

    A match is bidirectional: ``char`` may contain a whole title, or be a
    fragment of one. The fragment case means any substring of a title
    (even a very short one) counts as a match.

    Args:
        char: Text to compare against ``self.ebank_title_list``.

    Returns:
        bool: True if ``char`` contains, or is contained in, any entry of
        ``self.ebank_title_list``; False otherwise, including when
        ``char`` is ``None`` or empty.
    """
    # An empty string is a substring of every title (''.find -> 0), which
    # would wrongly report a match; None has no string methods at all.
    if not char:
        return False
    return any(
        char in title or title in char
        for title in self.ebank_title_list
    )
276
263 def check_ebank(self, pdf): 277 def check_ebank(self, pdf):
264 # page_text_list = [] 278 page_text_list = []
265 text_item_sum = 0 279 text_item_sum = 0
280 in_ebank_set = False
266 for pno in range(pdf.pageCount): 281 for pno in range(pdf.pageCount):
267 page = pdf.loadPage(pno) 282 page = pdf.loadPage(pno)
268 # if page.rotation is None: 283 if page.rotation is None:
269 # rotation = 0 284 rotation = 0
270 # elif isinstance(page.rotation, int): 285 elif isinstance(page.rotation, int):
271 # divisor, remainder = divmod(page.rotation, 90) 286 divisor, remainder = divmod(page.rotation, 90)
272 # if remainder != 0: 287 if remainder != 0:
273 # return 288 return
274 # rotation = divmod(divisor, 4)[1] 289 rotation = divmod(divisor, 4)[1]
275 # else: 290 else:
276 # return 291 return
277 textpage = page.getTextPage() 292 textpage = page.getTextPage()
278 text = textpage.extractDICT() 293 text = textpage.extractDICT()
279 text_list = [] 294 text_list = []
...@@ -284,22 +299,24 @@ class PDFHandler: ...@@ -284,22 +299,24 @@ class PDFHandler:
284 bbox = span.get('bbox') 299 bbox = span.get('bbox')
285 if char.strip() == '': 300 if char.strip() == '':
286 continue 301 continue
302 if pno == 0 and self.title_is_ebank(char):
303 in_ebank_set = True
287 text_list.append((bbox, char)) 304 text_list.append((bbox, char))
288 text_item_sum += len(text_list) 305 text_item_sum += len(text_list)
289 if text_item_sum < (pno + 1) * 5: 306 if text_item_sum < (pno + 1) * 5:
290 return 307 return
291 # else: 308 else:
292 # page_text_list.append( 309 page_text_list.append(
293 # { 310 {
294 # 'width': text.get('width'), 311 'width': text.get('width'),
295 # 'height': text.get('height'), 312 'height': text.get('height'),
296 # 'rotation': rotation, 313 'rotation': rotation,
297 # 'text': text_list 314 'text': text_list
298 # } 315 }
299 # ) 316 )
300 # self.is_ebank = True 317 self.is_ebank = in_ebank_set
301 self.is_e_pdf = True 318 self.is_e_pdf = True
302 # self.page_text_list = page_text_list 319 self.page_text_list = page_text_list
303 320
304 def e_contract_process(self): 321 def e_contract_process(self):
305 os.makedirs(self.img_dir_path, exist_ok=True) 322 os.makedirs(self.img_dir_path, exist_ok=True)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!