272692c8 by 冯轩

Merge branch 'feature/weixin-bs-2'

2 parents e08e5c00 df94248b
......@@ -11,6 +11,8 @@ from openpyxl import Workbook
from openpyxl.styles import PatternFill, numbers
from openpyxl.utils import get_column_letter
from apps.doc import consts
import logging
online_log = logging.getLogger('online')
class BSWorkbook(Workbook):
......@@ -562,6 +564,8 @@ class BSWorkbook(Workbook):
borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx]
# Read the summary text defensively: the cell itself may be absent (None), and
# only string values can be stripped. Testing the already-extracted value
# avoids dereferencing a missing cell and avoids calling .strip() on a
# non-string value (presumably numbers/dates can occur here -- confirm
# against the workbook data).
summary_cell_value = None if summary_cell is None else summary_cell.value
if isinstance(summary_cell_value, str):
    summary_cell_value = summary_cell_value.strip()
date_cell_value = None if date_cell is None else date_cell.value
amount_cell_value = None if amount_cell is None else amount_cell.value
over_cell_value = None if over_cell is None else over_cell.value
......@@ -638,6 +642,7 @@ class BSWorkbook(Workbook):
# 3.2.提取信息、高亮
# row = summary_cell.row
# online_log.info('[ti qu xin xi gao liang =========== >] [summary_cell_value={0}]'.format(summary_cell_value))
if summary_cell is not None:
# 关键词1提取
if summary_cell_value in self.interest_keyword:
......
......@@ -693,6 +693,14 @@ class UploadDocView(GenericView, DocHandler):
classify_1 = classify_1_tmp
break
if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
classify_1 = 12
self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
if classify_1 == 0 and (document_name.startswith("dzfp_")):
classify_1 = 0
self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
or document_name.endswith('.RAR'):
......@@ -1247,6 +1255,14 @@ class DocView(DocGenericView, DocHandler):
if keyword in document_name:
classify_1 = classify_1_tmp
break
if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
classify_1 = 12
self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
if classify_1 == 0 and (document_name.startswith("dzfp_")):
classify_1 = 0
self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
# tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
......
......@@ -69,6 +69,7 @@ class PDFHandler:
self.suffix = self.get_suffix(document_name)
self.is_ebank = False
self.is_e_pdf = False
self.is_e_weixin_bs = False
self.page_text_list = []
self.pdf_info = {}
self.img_path_pno_list = []
......@@ -186,6 +187,8 @@ class PDFHandler:
self.img_path_list.append(img_save_path)
if self.is_ebank:
self.rebuild_bbox(pm.width, pm.height, page.number)
if self.is_e_weixin_bs:
self.rebuild_bbox(pm.width, pm.height, page.number)
@staticmethod
def getimage(pix):
......@@ -407,6 +410,57 @@ class PDFHandler:
self.is_e_pdf = True
self.page_text_list = page_text_list
def put_text(self, pdf):
    """Collect the text spans of every page of ``pdf``.

    For each page the rotation is reduced to a quarter-turn count (0-3) and
    every non-blank text span is recorded as a ``(bbox, text)`` tuple.

    On success the method sets:

    * ``self.page_text_list`` -- one dict per page with the keys ``width``,
      ``height``, ``rotation`` and ``text`` (the list of span tuples);
    * ``self.is_e_pdf`` and ``self.is_e_weixin_bs`` -- both ``True``.

    The method returns early, leaving those three attributes untouched, when

    * a page rotation is neither ``None`` nor an ``int`` multiple of 90, or
    * after any page the running total of collected spans is lower than
      ``(pno + 1) * 5``.

    :param pdf: opened document exposing ``pageCount`` and ``loadPage(pno)``
        (the visible caller passes a ``fitz.Document``).
    :return: ``None``
    """
    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)

        page_rotation = page.rotation
        if page_rotation is None:
            rotation = 0
        elif isinstance(page_rotation, int):
            quarter_turns, remainder = divmod(page_rotation, 90)
            if remainder != 0:
                return
            rotation = quarter_turns % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # Blocks without a 'lines' entry (e.g. image blocks) carry no text;
        # "or ()" skips them instead of failing on iteration over None.
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text')
                    if not char or not char.strip():
                        continue
                    # Skip spans that cannot be encoded (special emoji that
                    # arrive as lone surrogates). This is a side-effect-free
                    # replacement for probing with print(), which wrote every
                    # span of the document to stdout.
                    try:
                        char.encode('utf-8')
                    except UnicodeEncodeError:
                        continue
                    text_list.append((span.get('bbox'), char))

        text_item_sum += len(text_list)
        # Too little extractable text so far: give up without touching the
        # instance flags.
        if text_item_sum < (pno + 1) * 5:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list,
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
def e_contract_process(self):
os.makedirs(self.img_dir_path, exist_ok=True)
with fitz.Document(self.path) as pdf:
......@@ -473,6 +527,59 @@ class PDFHandler:
self.merge_il(pdf, pno, il)
self.img_count = len(self.img_path_list)
def extract_image_for_weixin(self, max_img_count=None):
    """Produce the page images of ``self.path`` and record their paths.

    ``self.img_path_list`` and ``self.xref_set`` are reset and
    ``self.img_dir_path`` is created if needed.

    * If ``self.suffix`` is one of ``self.img_suffixs`` the file is copied
      to the save path of page 0 and registered as the only image.
    * Otherwise the file is opened with ``fitz``; each entry of
      ``self.pwd_list`` is tried while the document reports itself
      encrypted, ``self.metadata`` and ``self.page_count`` are stored, and
      ``self.put_text`` is called before the pages are walked.

    :param max_img_count: when an ``int`` and the document has at least that
        many pages, ``self.img_count`` is set to the page count and the
        method returns without producing any image.
    :return: ``None``; on the normal path ``self.img_count`` ends up as
        ``len(self.img_path_list)``.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    # The input is already an image file: copy it as page 0 and finish.
    if self.suffix in self.img_suffixs:
        target_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, target_path)
        self.img_path_list.append(target_path)
        self.img_count = len(self.img_path_list)
        return

    with fitz.Document(self.path) as pdf:
        # Decrypt: try the known passwords until the document opens.
        for pwd in self.pwd_list:
            if not pdf.isEncrypted:
                break
            pdf.authenticate(pwd)

        self.metadata = pdf.metadata
        self.page_count = pdf.pageCount
        if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
            self.img_count = pdf.pageCount
            return

        self.put_text(pdf)

        for pno in range(pdf.pageCount):
            # Image objects of the page; each entry is unpacked below as
            # (xref, smask, width, height, bpc, colorspace, alt.colorspace,
            # name, filter).
            il = pdf.getPageImageList(pno)

            # 1. Text-based document or no image object: render the page.
            if self.is_e_pdf or self.is_ebank or len(il) == 0:
                self.page_to_png(pdf.loadPage(pno))
                continue

            # 3. More than one image object: special handling.
            if len(il) != 1:
                self.merge_il(pdf, pno, il)
                continue

            # 2. Exactly one image object.
            xref, smask, width, height, _, colorspace, _, _, _ = il[0]
            if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                # Small image (e.g. the stamp on an e-statement): render the
                # whole page instead.
                self.page_to_png(pdf.loadPage(pno))
            elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                # Large image: render the page, passing along whether the
                # image stays under the WH_COUPLE_7 bounds (guards against
                # oversized images).
                self.is_new_modify = 1
                under_couple_7 = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1])
                self.page_to_png(pdf.loadPage(pno), is_big_img=under_couple_7)
            elif xref not in self.xref_set:
                self.extract_single_image(pdf, xref, smask, colorspace, pno)

    self.img_count = len(self.img_path_list)
def extract_page_image(self):
self.img_path_list = []
self.xref_set = set()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!