793920a0 by 周伟奇

update wb build

1 parent f3d6e429
PAGE_DEFAULT = 1
PAGE_SIZE_DEFAULT = 10
TRANS = str.maketrans('Cc((oODlLmAsSbg', '000000011345569')
CARD_RATIO = 0.9
UNKNOWN_CARD = '未知卡号'
UNKNOWN_ROLE = '未知户名'
DATE_FORMAT = ['%Y年%m月%d日', '%Y/%m/%d', '%Y-%m-%d', '%Y%m%d']
FIXED_APPLICATION_ID_PREFIX = 'CH-S'
DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT']
DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT']
DATA_SOURCE_LIST = ['POS', 'E-APP', 'ECONTRACT']
HIL_PREFIX = 'HIL'
AFC_PREFIX = 'AFC'
......@@ -39,11 +46,33 @@ PROOF_COL_TITLE = '核对结果'
PROOF_RES = ('对', '错')
META_SHEET_TITLE = '关键信息提取和展示'
FIXED_HEADERS = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果')
FIXED_HEADERS = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出')
FIXED_COL_AMOUNT = len(FIXED_HEADERS)
BASE_HEADERS_MAPPING = {label: idx+1 for idx, label in enumerate(FIXED_HEADERS)}
BORROW_HEADER_COL = BASE_HEADERS_MAPPING['借贷']
INCOME_HEADER_COL = BASE_HEADERS_MAPPING['收入']
OUTLAY_HEADER_COL = BASE_HEADERS_MAPPING['支出']
RESULT_HEADER_COL = BASE_HEADERS_MAPPING['核对结果']
BORROW_IDX = BORROW_HEADER_COL - 1
INCOME_IDX = INCOME_HEADER_COL - 1
OUTLAY_IDX = OUTLAY_HEADER_COL - 1
SUMMARY_IDX = FIXED_HEADERS.index('附言')
DATE_IDX = FIXED_HEADERS.index('记账日期')
AMOUNT_IDX = FIXED_HEADERS.index('金额')
OVER_IDX = FIXED_HEADERS.index('余额')
RESULT_IDX = FIXED_HEADERS.index('核对结果')
# '借贷': ('贷', '借'), # 竖版-无表格-广发银行
# '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行
# '收/支': ('收入', '支出'), # 横版-表格-北京银行
BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支'}
BORROW_INCOME_SET = {'贷', '收入'}
BORROW_OUTLAY_SET = {'借', '支出'}
INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'}
OUTLAY_HEADERS_SET = {'支出金额', '支出', '支取金额(借)', '支取金额(借)'}
# ------------------普通打印-全格线--------------------------------------------------------------------------------------
HEADERS_MAPPING = {}
# 中国银行
# 横版-表格-中国银行(不规则)
HEADERS_MAPPING.update(
{
'记账日期': BASE_HEADERS_MAPPING['记账日期'],
......@@ -57,37 +86,294 @@ HEADERS_MAPPING.update(
'对方开户行': BASE_HEADERS_MAPPING['对方开户行'],
}
)
# 竖版-表格-建设银行
# 横版-表格-农业银行-中国农业银行个人账户明细
HEADERS_MAPPING.update(
{
'交易日期': BASE_HEADERS_MAPPING['记账日期'],
'交易金额': BASE_HEADERS_MAPPING['金额'],
'账户余额': BASE_HEADERS_MAPPING['余额'],
'存入': BASE_HEADERS_MAPPING['金额'],
'对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'],
'对方名称': BASE_HEADERS_MAPPING['对方账户名'],
'摘要': BASE_HEADERS_MAPPING['附言'],
'对方账号与户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# 横版-表格-农业银行
# 横版-表格-北京银行
HEADERS_MAPPING.update(
{
'存入': BASE_HEADERS_MAPPING['金额'],
'对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'],
'对方名称': BASE_HEADERS_MAPPING['对方账户名'],
'业务摘要': BASE_HEADERS_MAPPING['附言'],
'发生额': BASE_HEADERS_MAPPING['金额'],
'对方户名': BASE_HEADERS_MAPPING['对方账户名'],
}
)
# 横版-表格-工商银行
# 横版-表格-工商银行 借记卡账户历史明细清单
# 横版-表格-工商银行-机打验证码 借记卡账户历史明细清单
# 横版-表格-工商银行CH-B008802400
# 横版-表格-工商银行 工资明细清单
# 工商银行历史明细(申请单号:20042501303039397888)
HEADERS_MAPPING.update(
{
'对方户名': BASE_HEADERS_MAPPING['对方账户名'],
'收入/支出金额': BASE_HEADERS_MAPPING['金额'],
'工作日期': BASE_HEADERS_MAPPING['记账日期'],
}
)
# 横版-表格-北京银行
# 横版-表格-建设银行-个人活期账户交易明细
# 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604
# 竖版-表格-建设银行-工资账单CH-B008786812
# 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604 (2)
HEADERS_MAPPING.update(
{
'业务摘要': BASE_HEADERS_MAPPING['附言'],
'发生额': BASE_HEADERS_MAPPING['金额'],
'交易金额': BASE_HEADERS_MAPPING['金额'],
'账户余额': BASE_HEADERS_MAPPING['余额'],
'对方账号与户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# 微信
HEADERS_MAPPING.update(
{
'交易时间': BASE_HEADERS_MAPPING['记账时间'],
'交易类型': BASE_HEADERS_MAPPING['附言'],
'金额(元)': BASE_HEADERS_MAPPING['金额'],
'金额(元)': BASE_HEADERS_MAPPING['金额'],
'交易对方': BASE_HEADERS_MAPPING['对方账户名'],
}
)
# 支付宝
HEADERS_MAPPING.update(
{
'时间': BASE_HEADERS_MAPPING['记账日期'],
'名称/备注': BASE_HEADERS_MAPPING['附言'],
}
)
# ------------普通打印-部分格线-------------------------------------------------------------------------------------------
# 竖版-无表格-农业银行
# 竖版-无表格-农业银行CH-B008805428
HEADERS_MAPPING.update(
{
'摘要/附言': BASE_HEADERS_MAPPING['附言'],
'交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# 竖版-特殊-农商行
HEADERS_MAPPING.update(
{
'交易发生额': BASE_HEADERS_MAPPING['金额'],
}
)
# 横版-特殊-中信银行-账户交易明细
HEADERS_MAPPING.update(
{
'对方银行': BASE_HEADERS_MAPPING['对方开户行'],
'交易摘要': BASE_HEADERS_MAPPING['附言'],
}
)
# 平安电子账单
HEADERS_MAPPING.update(
{
'借贷发生额(借:-贷:+)': BASE_HEADERS_MAPPING['金额'],
}
)
# ------------普通打印-无格线--------------------------------------------------------------------------------------------
# 竖版-无表格-招商银行(略歪)
# 竖版-无表格-招商银行账户历史交易明细表
HEADERS_MAPPING.update(
{
'联机余额': BASE_HEADERS_MAPPING['余额'],
}
)
# 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
# 竖版-无表格-邮储银行 账户对账单
# 竖版-无表格-邮储银行-电子章 邮储银行 账户对账单
HEADERS_MAPPING.update(
{
'交易金额(元)': BASE_HEADERS_MAPPING['金额'],
'交易金额(元)': BASE_HEADERS_MAPPING['金额'],
'账户余额(元)': BASE_HEADERS_MAPPING['余额'],
'账户余额(元)': BASE_HEADERS_MAPPING['余额'],
'对手方户名': BASE_HEADERS_MAPPING['对方账户名'],
'对手方账户': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# 横版-无表格-广发银行-账户交易历史 --> 已废弃
# 竖版-无表格-广发银行-账户交易历史 --> 已废弃
HEADERS_MAPPING.update(
{
'会计日期': BASE_HEADERS_MAPPING['记账日期'],
'对手户名': BASE_HEADERS_MAPPING['对方账户名'],
'对手账号': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# 招行电子账单 TODO 有英文,需测试
HEADERS_MAPPING.update(
{
'对手信息': BASE_HEADERS_MAPPING['对方账户名'],
'摘要代码': BASE_HEADERS_MAPPING['附言'],
}
)
# 横版-无表格-民生银行-中国民生银行个人账户对账单(客户卡号)
# 横版-无表格-民生银行-无标题(客户账户)
# 横版-无表格-民生银行
HEADERS_MAPPING.update(
{
'摘要信息': BASE_HEADERS_MAPPING['附言'],
'对方行名': BASE_HEADERS_MAPPING['对方开户行'],
}
)
# 竖版-无表格-农业银行整数
# 竖版-无表格-农业银行-中国农业银行银行卡交易明细清单
HEADERS_MAPPING.update(
{
'对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# 竖版-无表格-农业银行-中国农业银行银行卡活期存折交易明细清单.pdf
# 竖版-无表格-农业银行-扩张.pdf
# 竖版-无表格-农业银行-缩进.pdf
HEADERS_MAPPING.update(
{
'日期': BASE_HEADERS_MAPPING['记账日期'],
'短摘要': BASE_HEADERS_MAPPING['附言'],
'本次余额': BASE_HEADERS_MAPPING['余额'],
}
)
# 竖版-无表格-农业银行-无标题(对手帐号)
HEADERS_MAPPING.update(
{
'交易后余额': BASE_HEADERS_MAPPING['余额'],
'对手帐号': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# 竖版-无表格-农商行(非常规)
HEADERS_MAPPING.update(
{
'交易说明': BASE_HEADERS_MAPPING['附言'],
}
)
# 竖版-无表格-工商银行 抬头三行 活期历史明细清单
HEADERS_MAPPING.update(
{
'对方账户': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# -----------针式打印-全格线--------------------------------------------------------------------------------------------
# 竖版-表格-建设银行-中国建设银行活期账户交易明细
# 竖版-表格-建设银行-中国建设银行活期账户明细清单
# 竖版-表格-建设银行-对私活期账户明细- (1).pdf
HEADERS_MAPPING.update(
{
'帐户余额': BASE_HEADERS_MAPPING['余额'],
'对方帐户名称': BASE_HEADERS_MAPPING['对方账户名'],
}
)
# 竖版-特殊-交通银行 零售客户交易清单 5000以上交易记录
HEADERS_MAPPING.update(
{
'交易日期 记账日期': BASE_HEADERS_MAPPING['记账日期'],
}
)
# ----------针式打印-部分格线------------------------------------------------------------------------------------------
# 竖版-特殊-邮储银行-一本通绿卡通交易明细(客户)
# 竖版-特殊-邮储银行-账户交易明细(客户)
HEADERS_MAPPING.update(
{
'对方账号/卡号/汇票号': BASE_HEADERS_MAPPING['对方卡号/账号'],
}
)
# --------------------------------------------------------------------------------------------------------------------
# ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出')
# CLASSIFY_LIST = [
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则)
# ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
#
# # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行
# # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行
# ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
#
# # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号
#
# # 支付宝:流水号 时间 名称/备注 收入 支出 账户余额 资金渠道
#
# # -----------------
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注
#
# # 中信银行:交易日期 交易摘要 收入金额 支出金额 账户余额 对方户名 对方账号 对方银行 交易流水号
# ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
#
# # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# # -------------------------
#
# # 招商银行:记账日期 货币 交易金额 联机余额 冲补账 交易摘要
#
# # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
# # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单
#
# # 工商银行电子版:交易日期 账号 储种 序号 币种 妙汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
#
# # 招商银行电子版:记账日期 货币 交易金额 联机余额 交易摘要 对手信息
#
# # 民生银行:凭证类型 凭证号码 摘要信息 交易时间 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名
# # 凭证类型 凭证号码 交易时间 摘要 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名
#
# # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
#
# # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言
#
# # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额
#
# # ===================================
#
# # 建设银行:摘要、交易日期、交易金额、账户余额、商户/网点号及其名称、对方账号、对方户名
# # 交易日期、摘要、币种、钞汇、交易金额、帐户余额、对方账号、对方帐户名称
#
#
# # ===================================
#
# # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称
#
# # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
# ]
# {
# "0": "全表格-中国农业银行个人账户明细",
# "1": "全表格-中国银行",
# "2": "全表格-北京银行",
# "3": "全表格-工商银行",
# "4": "全表格-建设银行",
# "5": "部分格线-横版-中信银行账户交易明细",
# "6": "部分格线-横版-中信银行账户交易明细特殊",
# "7": "部分格线-竖版-中国农业银行",
# "8": "部分格线-竖版-中国农业银行分账户(窄页)",
# "9": "部分格线-竖版-平安电子账单"
# }
# Positional fallback layouts, indexed by the OCR `classify` id (the id -> template
# table in the comment block just above lists ids 0-9 in this same order).
# Each entry is (bank name, positions). `positions` has one slot per FIXED_HEADERS
# column; BSWorkbook.sheet_prune reads CLASSIFY_LIST[classify][1][header_col - 1]
# for headers it could not match by text, treating the value as a column number
# in the raw OCR sheet (None = the template has no fixed column for that field).
# NOTE(review): entries 5/6 and 7/8 are intentionally identical - distinct
# classify ids that share one layout; do not de-duplicate, the index matters.
CLASSIFY_LIST = [
('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)),  # 0: full grid - Agricultural Bank of China personal account details
('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),  # 1: full grid - Bank of China
('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),  # 2: full grid - Bank of Beijing
('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),  # 3: full grid - ICBC
('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),  # 4: full grid - China Construction Bank
('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),  # 5: partial grid, landscape - CITIC account transaction details
('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),  # 6: partial grid, landscape - CITIC account transaction details (special)
('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),  # 7: partial grid, portrait - Agricultural Bank of China
('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),  # 8: partial grid, portrait - Agricultural Bank of China sub-account (narrow page)
('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),  # 9: partial grid, portrait - Ping An e-statement
]
......
import os
import time
import fitz
import signal
import base64
import asyncio
import aiohttp
import difflib
import requests
from datetime import datetime
from collections import Counter
from apps.doc.ocr.wb import BSWorkbook, Workbook
from django.core.management import BaseCommand
......@@ -65,8 +66,6 @@ class Command(BaseCommand, LoggerMixin):
return doc, business_type
def pdf_download(self, doc, business_type):
if doc is None:
return None, None, None, None
doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
os.makedirs(doc_data_path, exist_ok=True)
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
......@@ -80,20 +79,96 @@ class Command(BaseCommand, LoggerMixin):
return doc_data_path, excel_path, src_excel_path, pdf_path
@staticmethod
def append_sheet(wb, sheets_list, img_name, role_summary):
for i, sheet in enumerate(sheets_list):
sheet_name = '{0}_{1}'.format(img_name, i)
role_summary['银行-户名'].append((sheet_name, 1, None, None, None, None, None))
def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence):
    """Record every OCR sheet of one image in the summaries and copy its cells into ``wb``.

    Each sheet's ``summary`` is laid out as
    [account holder, card number, page number, e-receipt verification code,
    print time, start date, end date].

    Sheets with a card number are accumulated in ``bs_summary[card]``; sheets
    without one are accumulated in ``unknown_summary[classify][role]``, where
    ``role`` is the account holder or ``consts.UNKNOWN_ROLE``. Both summaries
    are mutated in place. A worksheet named ``page_<pno>_img_<img_idx>_<i>`` is
    created for every sheet and filled from its ``cells``.
    """
    list_fields = ('code', 'print_time', 'start_date', 'end_date')
    date_positions = (('print_time', 4), ('start_date', 5), ('end_date', 6))

    for sheet_no, sheet in enumerate(sheets):
        title = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, sheet_no)
        summary = sheet.get('summary')
        card_no = summary[1]
        holder = summary[0]

        if card_no is not None:
            # Known card: aggregate per card number.
            bucket = bs_summary.setdefault(card_no, {})
            bucket['count'] = bucket.get('count', 0) + 1
            bucket.setdefault('classify', []).append(classify)
            bucket.setdefault('confidence', []).append(confidence)
            bucket.setdefault('sheet', []).append(title)
            holders = bucket.setdefault('role', [])
            holder_set = bucket.setdefault('role_set', set())
            if holder is not None:
                holders.append(holder)
                holder_set.add(holder)
        else:
            # No card number: aggregate per (classify, account holder).
            role = holder if holder is not None else consts.UNKNOWN_ROLE
            bucket = unknown_summary.setdefault(classify, {}).setdefault(role, {})
            bucket['classify'] = classify
            bucket['role'] = role
            bucket.setdefault('sheet', []).append(title)
            bucket.setdefault('confidence', []).append(confidence)

        # Make sure all list slots exist even when this sheet contributes nothing.
        for field in list_fields:
            bucket.setdefault(field, [])
        if summary[3] is not None:
            bucket['code'].append((summary[2], summary[3]))
        for field, position in date_positions:
            if summary[position] is not None:
                bucket[field].append(summary[position])

        # Cell coordinates from OCR are 0-based; worksheet cells are 1-based.
        ws = wb.create_sheet(title)
        for cell in sheet.get('cells'):
            ws.cell(
                row=cell.get('start_row') + 1,
                column=cell.get('start_column') + 1,
                value=cell.get('words'),
            )
def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary):
    """Feed one OCR response into the workbook and the summary accumulators.

    Expected response layout::

        {
            'code': 1,
            'msg': 'success',
            'data': {
                'classify': 0,
                'confidence': 0.999,
                'sheets': [
                    {
                        'summary': [account holder, card number, page number,
                                    e-receipt verification code, print time,
                                    start date, end date],
                        'cells': []
                    },
                    ...
                ]
            }
        }

    Nothing happens when the response carries no ``classify`` or no sheets.
    ``confidence`` falls back to 1 when absent. ``license_summary`` is accepted
    but not used here yet.
    """
    payload = res.get('data', {})
    classify = payload.get('classify')
    sheets = payload.get('sheets', [])
    if classify is None or not sheets:
        return
    self.append_bs_sheet(
        wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, payload.get('confidence', 1))
# async def fetch_ocr_result(self, img_path):
# async with aiohttp.ClientSession(
# headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
......@@ -102,35 +177,170 @@ class Command(BaseCommand, LoggerMixin):
# async with session.post(self.ocr_url, json=json_data) as response:
# return await response.json()
#
# async def img_ocr_excel(self, wb, img_path, role_summary):
# async def img_2_ocr_2_wb(self, wb, img_path, summary):
# res = await self.fetch_ocr_result(img_path)
# self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
# sheets_list = res.get('result').get('res')
# img_name = os.path.basename(img_path)
# self.append_sheet(wb, sheets_list, img_name, role_summary)
# self.append_sheet(wb, sheets_list, img_name, summary)
def fetch_ocr_result(self, img_path):
    """Upload the image at ``img_path`` to the OCR service and return its JSON reply.

    The image is sent as the multipart form field ``img`` in a POST request to
    ``self.ocr_url``.

    :param img_path: path of the image file to upload.
    :return: the response body decoded by ``response.json()``.
    :raises OSError: if the image file cannot be opened.
    """
    # Close the image once the upload has finished; the previous version left
    # one file handle open for every image processed by the worker loop.
    with open(img_path, 'rb') as img_file:
        response = requests.request("POST", self.ocr_url, files=[('img', img_file)])
    return response.json()
def img_ocr_excel(self, wb, img_path, role_summary):
res = self.fetch_ocr_result(img_path)
self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary):
res = self.fetch_ocr_result(img_info[0])
self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(
self.log_base, img_info[0], res))
if res.get('code') == 1:
sheets_list = res.get('data')
if not sheets_list:
return
img_name = os.path.basename(img_path)
self.append_sheet(wb, sheets_list, img_name, role_summary)
self.ocr_2_wb(res, wb, img_info[1], img_info[2], bs_summary, unknown_summary, license_summary)
@staticmethod
def get_most(value_list):
if value_list:
most_common = Counter(value_list).most_common(1)
return most_common[0][0] if most_common else None
@staticmethod
def date_format(date_str, format_str):
try:
date = datetime.strptime(date_str, format_str)
except Exception as e:
return
else:
return date
def get_validate_date(self, date_list):
    """Return the first datetime obtained by trying every ``consts.DATE_FORMAT`` pattern
    on every string of ``date_list`` (strings in order, patterns in order); None if none parses.
    """
    attempts = (
        self.date_format(text, pattern)
        for text in date_list
        for pattern in consts.DATE_FORMAT
    )
    return next((parsed for parsed in attempts if isinstance(parsed, datetime)), None)
def merge_card(self, bs_summary):
    """Fold near-identical card numbers (OCR variants of one card) into a single entry.

    Cards are visited from most to least frequently seen (``count``). Every
    remaining card whose ``difflib.SequenceMatcher.quick_ratio()`` against the
    current main card exceeds ``consts.CARD_RATIO`` is merged into it: all list
    fields are concatenated and ``role_set`` is unioned. After merging,
    ``classify`` and ``role`` are reduced to their most common value.

    :param bs_summary: ``{card: {'count', 'classify', 'confidence', 'sheet',
        'role', 'role_set', 'code', 'print_time', 'start_date', 'end_date'}}``.
        It is consumed: entries are popped as they are merged.
    :return: ``{main_card: merged summary}`` without the ``count`` key.
    """
    list_fields = (
        'classify', 'confidence', 'sheet', 'role',
        'code', 'print_time', 'start_date', 'end_date',
    )
    merged_bs_summary = {}
    sorted_card = sorted(bs_summary, key=lambda card: bs_summary[card]['count'], reverse=True)
    for main_card in sorted_card:
        if main_card not in bs_summary:
            # Already absorbed by a more frequent card.
            continue
        main_summary = bs_summary.pop(main_card)
        del main_summary['count']

        similar_cards = [
            card for card in bs_summary
            if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO
        ]
        for card in similar_cards:
            other = bs_summary.pop(card)
            # Each field is merged from the same field of the other card. The
            # previous version filled 'code' from the other card's 'sheet'
            # list, mixing sheet names into the (page, code) tuples and
            # dropping the real verification codes.
            for field in list_fields:
                main_summary[field].extend(other[field])
            main_summary['role_set'].update(other['role_set'])

        main_summary['classify'] = self.get_most(main_summary['classify'])
        main_summary['role'] = self.get_most(main_summary['role'])
        merged_bs_summary[main_card] = main_summary
    return merged_bs_summary
def prune_bs_summary(self, bs_summary):
    """Finalize every card summary in place: drop ``count`` and collapse the
    ``classify`` and ``role`` lists to their most common value. Returns ``bs_summary``.
    """
    for card_info in bs_summary.values():
        card_info.pop('count')
        for field in ('classify', 'role'):
            card_info[field] = self.get_most(card_info[field])
    return bs_summary
def rebuild_bs_summary(self, bs_summary, unknown_summary):
    """Combine per-card and card-less OCR summaries into one final summary per statement.

    Input shapes::

        bs_summary = {
            card_number: {
                'count': 100, 'classify': [], 'confidence': [], 'role': [],
                'role_set': set(), 'code': [(page, code)], 'print_time': [],
                'start_date': [], 'end_date': [], 'sheet': [sheet_name],
            }
        }
        unknown_summary = {
            classify: {
                role: {
                    'classify': classify, 'confidence': [], 'role': role,
                    'code': [(page, code)], 'print_time': [],
                    'start_date': [], 'end_date': [], 'sheet': [sheet_name],
                }
            }
        }

    Steps:
      1. No card at all -> start from an empty result. One card -> prune it.
         Several cards -> merge look-alike card numbers.
      2. For each resulting card, absorb the card-less entries of the same
         ``classify`` whose role is one of the card's known roles; absorbed
         entries are removed from ``unknown_summary``.
      3. Every card-less entry still left becomes its own result keyed
         ``<consts.UNKNOWN_CARD>_<n>`` (n counts from 1).
      4. Finalize: drop ``role_set``, reduce each date list to the first
         parseable datetime (or None), reduce ``confidence`` to its maximum.

    Both arguments are mutated in place.

    :return: ``{card: finalized summary}``.
    """
    if not bs_summary:
        merged_bs_summary = {}
    else:
        if len(bs_summary) == 1:
            merged_bs_summary = self.prune_bs_summary(bs_summary)
        else:
            merged_bs_summary = self.merge_card(bs_summary)

        absorbed_fields = ('sheet', 'code', 'print_time', 'start_date', 'end_date')
        for card_summary in merged_bs_summary.values():
            classify_summary = unknown_summary.get(card_summary['classify'], {})
            matched_roles = [role for role in classify_summary if role in card_summary['role_set']]
            for role in matched_roles:
                summary = classify_summary.pop(role)
                # Field-to-field merge. The previous version extended 'code'
                # with summary['sheet'], which put sheet names into the
                # verification-code list and lost the real (page, code) pairs.
                for field in absorbed_fields:
                    card_summary[field].extend(summary[field])

    # Whatever could not be attached to a card is reported under a synthetic key.
    card_num = 1
    for role_dict in unknown_summary.values():
        for summary in role_dict.values():
            merged_bs_summary['{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)] = summary
            card_num += 1

    for summary in merged_bs_summary.values():
        if summary.get('role_set') is not None:
            del summary['role_set']
        summary['print_time'] = self.get_validate_date(summary['print_time'])
        summary['start_date'] = self.get_validate_date(summary['start_date'])
        summary['end_date'] = self.get_validate_date(summary['end_date'])
        summary['confidence'] = max(summary['confidence'])
    return merged_bs_summary
# TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理
# TODO 调用接口重试
# TODO 协程异步发送OCR请求
# TODO 异常邮件通知
# TODO 数据库断联问题
# TODO 非流水证件处理,Excel模板
def handle(self, *args, **kwargs):
sleep_second = int(conf.SLEEP_SECOND)
max_sleep_second = int(conf.MAX_SLEEP_SECOND)
......@@ -138,17 +348,19 @@ class Command(BaseCommand, LoggerMixin):
while self.switch:
# 1. 从队列获取文件信息
doc, business_type = self.get_doc_info()
# 队列为空时的处理
if doc is None:
time.sleep(sleep_second)
sleep_second = min(max_sleep_second, sleep_second + 5)
continue
sleep_second = int(conf.SLEEP_SECOND)
try:
start_time = time.time()
# 2. 从EDMS获取PDF文件
doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type)
# 队列为空时的处理
if pdf_path is None:
time.sleep(sleep_second)
sleep_second = min(max_sleep_second, sleep_second+5)
continue
sleep_second = int(conf.SLEEP_SECOND)
# 3.PDF文件提取图片
start_time = time.time()
img_save_path = os.path.join(doc_data_path, 'img')
self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format(
self.log_base, business_type, doc.id))
......@@ -158,28 +370,42 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, business_type, doc.id))
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
# 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件
role_summary = {
'银行-户名': []
}
# interest_keyword = Keywords.objects.filter(
# type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
# salary_keyword = Keywords.objects.filter(
# type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
# loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True)
# wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
wb = Workbook()
# 4.获取OCR结果并且构建excel文件
bs_summary = {}
license_summary = {}
unknown_summary = []
interest_keyword = Keywords.objects.filter(
type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
salary_keyword = Keywords.objects.filter(
type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
loan_keyword = Keywords.objects.filter(
type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value]).values_list(
'keyword', flat=True)
wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
# wb = Workbook()
# 4.1 获取OCR结果
# loop = asyncio.get_event_loop()
# tasks = [self.img_ocr_excel(wb, img_path, role_summary) for img_path in pdf_handler.img_path_list]
# tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list]
# loop.run_until_complete(asyncio.wait(tasks))
# loop.close()
for img_path in pdf_handler.img_path_list:
self.img_ocr_excel(wb, img_path, role_summary)
for img_info in pdf_handler.img_info_list:
self.img_2_ocr_2_wb(wb, img_info, bs_summary, unknown_summary, license_summary)
self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format(
self.log_base, bs_summary, unknown_summary, license_summary))
merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
# 整合excel文件
# wb.save(src_excel_path)
# wb.rebuild(role_summary)
self.cronjob_log.info('{0} [merged_bs_summary={1}] [unknown_summary={2}]'.format(
self.log_base, merged_bs_summary, unknown_summary))
del unknown_summary
# 4.2 重构Excel文件
wb.save(src_excel_path)
wb.rebuild(merged_bs_summary, license_summary)
wb.save(excel_path)
except Exception as e:
doc.status = DocStatus.PROCESS_FAILED.value
......@@ -194,14 +420,16 @@ class Command(BaseCommand, LoggerMixin):
except Exception as e:
doc.status = DocStatus.UPLOAD_FAILED.value
doc.save()
self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
self.log_base, business_type, doc.id, e))
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] '
'[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e))
else:
doc.status = DocStatus.COMPLETE.value
doc.save()
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.info('{0} [doc process complete] [business_type={1}] [doc_id={2}] '
self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] '
'[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time))
self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
......
......@@ -11,6 +11,8 @@ class DocHandler:
return '/data/{1}/{0}/{0}.pdf'.format(doc_id, business_type)
elif file == 'img':
return '/data/{1}/{0}/{0}_img.zip'.format(doc_id, business_type)
elif file == 'src_excel':
return '/data/{1}/{0}/src.xlsx'.format(doc_id, business_type)
else:
return '/data/{1}/{0}/{0}.xlsx'.format(doc_id, business_type)
......@@ -22,6 +24,7 @@ class DocHandler:
doc_dict['pdf_link'] = self.get_link(doc_id, business_type)
doc_dict['img_link'] = self.get_link(doc_id, business_type, file='img')
doc_dict['excel_link'] = self.get_link(doc_id, business_type, file='excel')
doc_dict['src_excel_link'] = self.get_link(doc_id, business_type, file='src_excel')
return list(doc_queryset)
@staticmethod
......
......@@ -13,3 +13,4 @@ class KeywordsType(NamedEnum):
INTEREST = (0, "利息")
SALARY = (1, '薪资')
LOAN = (2, '贷款')
ALI_WECHART = (3, '微信/支付宝')
......
......@@ -13,6 +13,7 @@ class BSWorkbook(Workbook):
def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs):
super().__init__(*args, **kwargs)
locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')
self.meta_sheet_title = '关键信息提取和展示'
self.blank_row = (None,)
self.code_header = ('页数', '电子回单验证码')
......@@ -24,26 +25,59 @@ class BSWorkbook(Workbook):
self.proof_res = ('对', '错')
self.loan_fill = PatternFill("solid", fgColor="00FFCC00")
self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
self.bd = Side(style='thin', color="000000")
self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)
# self.bd = Side(style='thin', color="000000")
# self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)
self.MAX_MEAN = 31
@staticmethod
def sheet_prune(ws):
def sheet_prune(ws, classify):
ws.insert_cols(1, amount=consts.FIXED_COL_AMOUNT)
moved_col_set = set()
header_col_set = set()
# 根据第一行关键词排列
for col in range(consts.FIXED_COL_AMOUNT + 1, ws.max_column + 1):
header_value = ws.cell(1, col).value
header_idx = consts.HEADERS_MAPPING.get(header_value)
# TODO 关键字段再次查找
# TODO 支付宝、微信流水第一行非表头,怎么处理
if header_idx is None:
header_col = consts.HEADERS_MAPPING.get(header_value)
if header_col is not None:
letter = get_column_letter(col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_col - col)
moved_col_set.add(col)
header_col_set.add(header_col)
elif header_value in consts.BORROW_HEADERS_SET:
letter = get_column_letter(col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.BORROW_HEADER_COL - col)
moved_col_set.add(col)
header_col_set.add(consts.BORROW_HEADER_COL)
elif header_value in consts.INCOME_HEADERS_SET:
letter = get_column_letter(col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.INCOME_HEADER_COL - col)
moved_col_set.add(col)
header_col_set.add(consts.INCOME_HEADER_COL)
elif header_value in consts.OUTLAY_HEADERS_SET:
letter = get_column_letter(col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.OUTLAY_HEADER_COL - col)
moved_col_set.add(col)
header_col_set.add(consts.OUTLAY_HEADER_COL)
# 缺失表头再次查找
for header_col in range(1, consts.FIXED_COL_AMOUNT + 1):
if header_col in header_col_set or header_col == consts.RESULT_HEADER_COL:
continue
fix_col = consts.CLASSIFY_LIST[classify][1][header_col - 1] # TODO 合并分类情况
if fix_col is None:
continue
letter = get_column_letter(col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col)
fix_col = fix_col + consts.FIXED_COL_AMOUNT
if fix_col in moved_col_set:
break
letter = get_column_letter(fix_col)
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_col - fix_col)
ws.delete_cols(consts.FIXED_COL_AMOUNT + 1, amount=ws.max_column)
min_row = 1 if len(moved_col_set) == 0 else 2
return min_row
@staticmethod
def month_split(dti, date_list):
def month_split(dti, date_list, date_statistics):
month_list = []
idx_list = []
month_pre = None
......@@ -53,15 +87,17 @@ class BSWorkbook(Workbook):
if month_str != month_pre:
month_list.append(month_str)
if month_pre is None:
date_list.append(dti[idx].date())
if date_statistics:
date_list.append(dti[idx].date())
idx = 0
idx_list.append(idx)
month_pre = month_str
for idx in range(len(dti)-1, -1, -1):
if isinstance(dti[idx], NaTType):
continue
date_list.append(dti[idx].date())
break
if date_statistics:
for idx in range(len(dti) - 1, -1, -1):
if isinstance(dti[idx], NaTType):
continue
date_list.append(dti[idx].date())
break
return month_list, idx_list
@staticmethod
......@@ -86,8 +122,8 @@ class BSWorkbook(Workbook):
reverse_trend = -1
return reverse_trend
def sheet_split(self, ws, month_mapping, date_list, reverse_trend_list):
for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
def sheet_split(self, ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics):
for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=min_row, values_only=True):
date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src]
dt_array, tz_parsed = tslib.array_to_datetime(
np.array(date_tuple, copy=False, dtype=np.object_),
......@@ -95,16 +131,16 @@ class BSWorkbook(Workbook):
utc=False,
dayfirst=False,
yearfirst=False,
require_iso8601=False,
require_iso8601=True,
)
dti = DatetimeIndex(dt_array, tz=None, name=None)
month_list, idx_list = self.month_split(dti, date_list)
month_list, idx_list = self.month_split(dti, date_list, date_statistics)
if len(month_list) == 0:
# month_info process
month_info = month_mapping.setdefault('xxxx-xx', [])
month_info.append((ws.title, 2, ws.max_row, 0))
month_info.append((ws.title, min_row, ws.max_row, 0))
elif len(month_list) == 1:
# reverse_trend_list process
reverse_trend = self.get_reverse_trend(dti.day, idx_list)
......@@ -113,14 +149,14 @@ class BSWorkbook(Workbook):
month_info = month_mapping.setdefault(month_list[0], [])
day_mean = np.mean(dti.day.dropna())
if len(month_info) == 0:
month_info.append((ws.title, 2, ws.max_row, day_mean))
month_info.append((ws.title, min_row, ws.max_row, day_mean))
else:
for i, item in enumerate(month_info):
if day_mean <= item[-1]:
month_info.insert(i, (ws.title, 2, ws.max_row, day_mean))
month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean))
break
else:
month_info.append((ws.title, 2, ws.max_row, day_mean))
month_info.append((ws.title, min_row, ws.max_row, day_mean))
else:
# reverse_trend_list process
reverse_trend = self.get_reverse_trend(dti.day, idx_list)
......@@ -128,34 +164,41 @@ class BSWorkbook(Workbook):
# month_info process
for i, item in enumerate(month_list[:-1]):
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN))
(ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN))
month_mapping.setdefault(month_list[-1], []).insert(
0, (ws.title, idx_list[-1] + 2, ws.max_row, 0))
0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0))
def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval):
metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header]
metadata_rows.extend(code_list)
def build_metadata_rows(self, classify, confidence, role, code, print_time, start_date, end_date):
metadata_rows = [
('流水识别置信度', confidence),
self.blank_row,
('分类结果', classify),
self.blank_row,
('户名', role),
self.blank_row,
self.code_header,
]
metadata_rows.extend(code)
metadata_rows.extend(
[self.blank_row,
self.date_header,
(print_time, start_date, end_date, date_interval),
(print_time, start_date, end_date, (end_date - start_date).days),
self.blank_row,
self.keyword_header]
)
return metadata_rows
def create_meta_sheet(self, role):
def create_meta_sheet(self, card):
if self.worksheets[0].title == 'Sheet':
ms = self.worksheets[0]
ms.title = '{0}({1})'.format(self.meta_sheet_title, role)
ms.title = '{0}({1})'.format(self.meta_sheet_title, card)
else:
ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, role))
ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, card))
return ms
def build_meta_sheet(self, card, classify, confidence, role, code, print_time, start_date, end_date):
    """Create the metadata sheet for ``card`` and fill in its header rows.

    Args:
        card: identifier passed to ``create_meta_sheet`` for the sheet title.
        classify, confidence, role, code, print_time, start_date, end_date:
            forwarded unchanged, in this order, to ``build_metadata_rows``.

    Returns:
        The metadata worksheet with every metadata row appended.
    """
    metadata_rows = self.build_metadata_rows(
        classify, confidence, role, code, print_time, start_date, end_date)
    ms = self.create_meta_sheet(card)
    for row in metadata_rows:
        ms.append(row)
    return ms
......@@ -169,55 +212,84 @@ class BSWorkbook(Workbook):
new_ws.append(consts.FIXED_HEADERS)
for part in parts:
ws = self.get_sheet_by_name(part[0])
for row in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True):
new_ws.append(row)
for row_value in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True):
new_ws.append(row_value)
# 3.2.提取信息、高亮
amount_mapping = {}
amount_fill_row = set()
for rows in new_ws.iter_rows():
summary_cell = rows[5]
date_cell = rows[0]
for rows in new_ws.iter_rows(min_row=2):
summary_cell = rows[consts.SUMMARY_IDX]
date_cell = rows[consts.DATE_IDX]
amount_cell = rows[consts.AMOUNT_IDX]
row = summary_cell.row
# 关键词1提取
if summary_cell.value in self.interest_keyword:
ms.append((summary_cell.value, date_cell.value, rows[2].value))
ms.append((summary_cell.value, date_cell.value, amount_cell.value))
# 关键词2提取至临时表
elif summary_cell.value in self.salary_keyword:
tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value))
tmp_ws.append((summary_cell.value, date_cell.value, amount_cell.value))
# 贷款关键词高亮
elif summary_cell.value in self.loan_keyword:
summary_cell.fill = self.loan_fill
for i, cell in enumerate(rows):
cell.border = self.border
if (i == 2 or i == 3) and cell.row > 1:
# 3.3.余额转数值
over_cell = rows[consts.OVER_IDX]
try:
if isinstance(over_cell.value, str):
over_cell.value = over_cell.value.translate(consts.TRANS)
over_cell.value = locale.atof(over_cell.value)
except Exception as e:
continue
else:
over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
# 3.4.金额转数值
try:
try:
if isinstance(amount_cell.value, str):
amount_cell.value = amount_cell.value.translate(consts.TRANS)
amount_cell.value = locale.atof(amount_cell.value)
except Exception as e:
try:
# 3.3.金额、余额转数值
cell.value = locale.atof(cell.value)
except Exception:
continue
else:
cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
if i == 2:
same_amount_mapping = amount_mapping.get(date_cell.value, {})
fill_rows = same_amount_mapping.get(-cell.value)
if fill_rows:
amount_fill_row.add(cell.row)
amount_fill_row.update(fill_rows)
amount_mapping.setdefault(date_cell.value, {}).setdefault(
cell.value, []).append(cell.row)
# 3.4.核对结果
# TODO 借贷、开支类型银行流水,需要手动添加+-号
if i == 9 and cell.row > 2:
if is_reverse:
cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
cell.row - 1, cell.row, *self.proof_res)
else:
cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
cell.row, cell.row - 1, *self.proof_res)
if isinstance(rows[consts.INCOME_IDX].value, str):
rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS)
amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
except Exception as e:
if isinstance(rows[consts.OUTLAY_IDX].value, str):
rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS)
amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
if amount_cell.value > 0:
amount_cell.value = -amount_cell.value
except Exception as e:
continue
else:
if rows[consts.BORROW_IDX].value in consts.BORROW_OUTLAY_SET:
amount_cell.value = -amount_cell.value
amount_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
same_amount_mapping = amount_mapping.get(date_cell.value, {})
fill_rows = same_amount_mapping.get(-amount_cell.value)
if fill_rows:
amount_fill_row.add(row)
amount_fill_row.update(fill_rows)
amount_mapping.setdefault(date_cell.value, {}).setdefault(
amount_cell.value, []).append(row)
# 3.5.同一天相同进出账高亮
# 3.5.核对结果
if row > 2:
if is_reverse:
rows[consts.RESULT_IDX].value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
row - 1, row, *self.proof_res)
else:
rows[consts.RESULT_IDX].value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
row, row - 1, *self.proof_res)
# 删除金额辅助列
new_ws.delete_cols(consts.BORROW_HEADER_COL, amount=new_ws.max_column)
# 3.6.同一天相同进出账高亮
del amount_mapping
for row in amount_fill_row:
new_ws[row][2].fill = self.amount_fill
new_ws[row][consts.AMOUNT_IDX].fill = self.amount_fill
# 关键词2信息提取
ms.append(self.blank_row)
......@@ -226,34 +298,51 @@ class BSWorkbook(Workbook):
ms.append(row)
self.remove(tmp_ws)
def rebuild(self, role_summary):
# (sheet_name, confidence, page, code, print_time, start_date, end_date) # TODO 表名简化,+卡号
for role, summary_list in role_summary.items():
def bs_rebuild(self, bs_summary):
# bs_summary = {
# '卡号': {
# 'classify': 0,
# 'confidence': 0.9,
# 'role': '柳雪',
# 'code': [('page', 'code')],
# 'print_time': 'datetime',
# 'start_date': 'datetime',
# 'end_date': 'datetime',
# 'sheet': ['sheet_name']
# }
# }
for card, summary in bs_summary.items():
# 1.原表修剪、排列、按照月份分割
reverse_trend_list = []
confidence_max = 0
code_list = []
month_mapping = {}
start_date = summary['start_date']
end_date = summary['end_date']
date_statistics = False
if start_date is None or end_date is None:
date_statistics = True
date_list = []
start_date = end_date = date_interval = print_time = None
for summary in summary_list:
sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary
ws = self.get_sheet_by_name(sheet_name)
month_mapping = {}
reverse_trend_list = []
for sheet in summary['sheet']:
ws = self.get_sheet_by_name(sheet)
# 1.1.删除多余列、排列
self.sheet_prune(ws)
min_row = self.sheet_prune(ws, summary['classify'])
# 1.2.按月份分割
self.sheet_split(ws, month_mapping, date_list, reverse_trend_list)
# 1.3.元数据处理 TODO 时间与日期处理
confidence_max = max(confidence, confidence_max)
if code is not None:
code_list.append((page, code))
self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics)
if date_statistics is True and len(date_list) > 1:
start_date = min(date_list) if start_date is None else start_date
end_date = max(date_list) if end_date is None else end_date
if len(date_list) > 1:
start_date = min(date_list)
end_date = max(date_list)
date_interval = (end_date - start_date).days
# 2.元信息提取表
ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)
bank_name = consts.CLASSIFY_LIST[summary['classify']][0]
base_sheet_name = '{0}_{1}'.format(bank_name, summary['role'])
ms = self.build_meta_sheet(card,
summary['classify'],
summary['confidence'],
summary['role'],
summary['code'],
summary['print_time'],
start_date,
end_date)
# 3.创建月份表、提取/高亮关键行
is_reverse = False
......@@ -261,8 +350,11 @@ class BSWorkbook(Workbook):
is_reverse = True
for month_list in month_mapping.values():
month_list.sort(key=lambda x: x[-1], reverse=True)
self.build_month_sheet(role, month_mapping, ms, is_reverse)
self.build_month_sheet(base_sheet_name, month_mapping, ms, is_reverse)
# 4.删除原表
for sheet in summary['sheet']:
self.remove(self.get_sheet_by_name(sheet))
# 删除原表
for summary in summary_list:
self.remove(self.get_sheet_by_name(summary[0]))
def rebuild(self, bs_summary, license_summary):
    """Rebuild the workbook from the collected summaries.

    Args:
        bs_summary: summary mapping handed to ``bs_rebuild``.
        license_summary: accepted for the caller's interface; not used by
            this method.
    """
    self.bs_rebuild(bs_summary)
\ No newline at end of file
......
......@@ -25,7 +25,7 @@ class PDFHandler:
# Store the two paths given by the caller and start with empty collections.
def __init__(self, path, img_dir_path):
self.path = path
self.img_dir_path = img_dir_path
# Line removed by this commit: the former list of saved image paths.
self.img_path_list = []
# Line added by this commit: the extraction code appends
# (image save path, page number, image index) tuples to this list.
self.img_info_list = []
# Collects the xref values of images that have already been written out.
self.xref_set = set()
def get_img_save_path(self, pno, img_index=0, ext='png'):
......@@ -38,7 +38,7 @@ class PDFHandler:
pm = page.getPixmap(matrix=trans_2, alpha=False)
img_save_path = self.get_img_save_path(page.number)
pm.writePNG(img_save_path)
self.img_path_list.append(img_save_path)
self.img_info_list.append((img_save_path, page.number, 0))
@staticmethod
def getimage(pix):
......@@ -88,7 +88,7 @@ class PDFHandler:
with open(img_save_path, "wb") as f:
f.write(img_data)
self.xref_set.add(xref)
self.img_path_list.append(img_save_path)
self.img_info_list.append((img_save_path, pno, img_index))
@staticmethod
def split_il(il):
......@@ -179,7 +179,7 @@ class PDFHandler:
img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
new_img.save(img_save_path)
page_to_png = False
self.img_path_list.append(img_save_path)
self.img_info_list.append((img_save_path, pno, img_index))
# 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
if page_to_png:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!