1526125c by 周伟奇

update excel process & add keywords admin

1 parent b2945296
......@@ -2,4 +2,4 @@ from django.apps import AppConfig
# Django application configuration for the account app.
class AccountConfig(AppConfig):
# NOTE(review): the two consecutive assignments to `name` look like diff
# residue (old value followed by new value) - confirm which one is live.
# Read as plain Python, the second assignment is the one that takes effect.
name = 'account'
name = 'apps.account'
......
# Dotted path of the AppConfig subclass Django should use for this app.
default_app_config = 'apps.doc.apps.DocConfig'
......
from django.contrib import admin
from .models import Keywords
from .named_enum import KeywordsType
# Register your models here.
class KeywordsAdmin(admin.ModelAdmin):
    """Admin options for the ``Keywords`` model."""

    # Changelist columns; 'type_verbose_name' refers to the method below.
    list_display = ('keyword', 'type_verbose_name', 'on_off')
    search_fields = ('keyword',)
    list_filter = ('type', 'on_off')

    def type_verbose_name(self, obj):
        """Return the name ``KeywordsType.get_verbose_name`` gives for ``obj.type``."""
        type_code = obj.type
        return KeywordsType.get_verbose_name(type_code)

    # Column header used for the computed column above.
    type_verbose_name.short_description = '类型'
# Register Keywords with the admin site, using KeywordsAdmin as its options class.
admin.site.register(Keywords, KeywordsAdmin)
# Header and title text of the admin site.
admin.site.site_header = '宝马OCR'
admin.site.site_title = '宝马OCR'
......
......@@ -2,4 +2,5 @@ from django.apps import AppConfig
# Django application configuration for the doc app.
class DocConfig(AppConfig):
# NOTE(review): the two consecutive assignments to `name` look like diff
# residue (old value followed by new value) - confirm which one is live.
# Read as plain Python, the second assignment is the one that takes effect.
name = 'doc'
name = 'apps.doc'
# Human-readable name of the app.
verbose_name = '文件'
......
......@@ -8,13 +8,13 @@ import aiohttp
from openpyxl import Workbook
from django.core.management import BaseCommand
from settings import conf
from common.mixins import LoggerMixin
from common.tools.file_tools import write_zip_file
from common.tools.pdf_to_img import PDFHandler
from apps.doc.models import DocStatus, HILDoc, AFCDoc
from apps.doc import consts
from settings import conf
from apps.doc.edms import EDMS, rh
from apps.doc.ocr.edms import EDMS, rh
class Command(BaseCommand, LoggerMixin):
......@@ -95,8 +95,8 @@ class Command(BaseCommand, LoggerMixin):
# c2 = cell.get('end_column')
r1 = cell.get('start_row')
# r2 = cell.get('end_row')
label = cell.get('words')
ws.cell(row=r1+1, column=c1+1, value=label)
words = cell.get('words')
ws.cell(row=r1+1, column=c1+1, value=words)
@staticmethod
def get_ocr_json(img_path):
......
from django.db import models
from .named_enum import DocStatus
from .named_enum import DocStatus, KeywordsType
# Create your models here.
......@@ -101,3 +101,19 @@ class PriorityApplication(models.Model):
situ_db_label = 'afc'
db_table = 'priority_application'
# Row of the `keywords` table: a keyword, its KeywordsType code and an on/off flag.
class Keywords(models.Model):
    id = models.AutoField(verbose_name="id", primary_key=True)
    keyword = models.CharField(verbose_name="关键词", max_length=64)
    # Choices come from the KeywordsType enum.
    type = models.SmallIntegerField(verbose_name="类型", choices=KeywordsType.get_choices_lst())
    on_off = models.BooleanField(verbose_name="是否有效", default=True)
    # auto_now refreshes the value on every save; auto_now_add sets it once on creation.
    update_time = models.DateTimeField(verbose_name='修改时间', auto_now=True)
    create_time = models.DateTimeField(verbose_name='创建时间', auto_now_add=True)

    class Meta:
        verbose_name = '银行流水关键词'
        verbose_name_plural = verbose_name
        db_table = 'keywords'
        # Django does not create or migrate this table.
        managed = False
        # NOTE(review): non-standard Meta option; presumably consumed by a
        # project-level database router - confirm.
        situ_db_label = 'afc'
......
......@@ -7,3 +7,9 @@ class DocStatus(NamedEnum):
PROCESS_FAILED = (2, '识别失败')
UPLOAD_FAILED = (3, '同步失败')
COMPLETE = (4, '已完成')
class KeywordsType(NamedEnum):
    """Keyword categories; every member is an ``(integer code, display name)`` pair."""

    INTEREST = (0, '利息')
    SALARY = (1, '薪资')
    LOAN = (2, '贷款')
......
......@@ -2,7 +2,7 @@ import os
import requests
from zeep import Client, xsd
from settings import conf
from . import consts
from apps.doc import consts
from common.redis_cache import redis_handler as rh
......
import numpy as np
import locale
from pandas._libs import tslib
from pandas.core.indexes.datetimes import DatetimeIndex
from openpyxl import Workbook
from openpyxl.styles import Border, Side, PatternFill, numbers
from openpyxl.utils import get_column_letter
class BSWorkbook(Workbook):
    """Workbook that rearranges raw statement sheets into a meta sheet plus month sheets.

    ``rebuild`` is the entry point. For every role it prunes and reorders the
    columns of the source sheets, builds one meta-information sheet, builds one
    sheet per month listed in the month mapping, and finally removes the
    source sheets.
    """

    def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs):
        """Create the workbook and store layout constants and keyword collections.

        Args:
            interest_keyword: container tested with ``in`` against the value of
                the sixth column; matching rows are copied to the meta sheet.
            salary_keyword: same test; matching rows are collected in a
                temporary sheet and appended to the meta sheet at the end.
            loan_keyword: same test; matching rows are highlighted.
            *args, **kwargs: forwarded unchanged to ``Workbook.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Canonical column order of every month sheet.
        self.fixed_headers = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名',
                              '对方卡号/账号', '对方开户行', '核对结果')
        self.fixed_col_amount = len(self.fixed_headers)
        # Source header text -> 1-based target column inside the fixed layout.
        self.headers_mapping = {
            '记账日期': 1,
            '交易日期': 1,
            '记账时间': 2,
            '金额': 3,
            '交易金额': 3,
            '余额': 4,
            '账户余额': 4,
            '交易名称': 5,
            '附言': 6,
            '摘要': 6,
            '对方账户名': 7,
            '对方卡号/账号': 8,
            '对方账号与户名': 8,
            '对方开户行': 9,
        }
        self.meta_sheet_title = '关键信息提取和展示'
        self.blank_row = (None,)
        self.code_header = ('页数', '电子回单验证码')
        self.date_header = ('打印时间', '起始日期', '终止日期', '流水区间结果')
        self.keyword_header = ('关键词', '记账日期', '金额')
        self.interest_keyword = interest_keyword
        self.salary_keyword = salary_keyword
        self.loan_keyword = loan_keyword
        # Texts written by the balance-check formula: (matches, does not match).
        self.proof_res = ('对', '错')
        # Fill for rows whose sixth column matches a loan keyword.
        self.loan_fill = PatternFill("solid", fgColor="00FFCC00")
        # Fill for rows that have an opposite amount on the same date.
        self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
        self.bd = Side(style='thin', color="000000")
        self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)

    def sheet_prune(self, ws):
        """Move recognised columns of ``ws`` into the fixed layout and drop the rest.

        ``fixed_col_amount`` empty columns are inserted on the left; every
        original column whose row-1 header is a key of ``headers_mapping`` is
        moved to its mapped position; everything to the right of the fixed
        columns is then deleted.
        """
        ws.insert_cols(1, amount=self.fixed_col_amount)
        for col in range(self.fixed_col_amount + 1, ws.max_column + 1):
            header_value = ws.cell(1, col).value
            header_idx = self.headers_mapping.get(header_value)
            # TODO: look the key fields up a second time
            if header_idx is None:
                continue
            # move_range shifts the range it is given by `cols`, so the range
            # must name the source column (`col`); `header_idx - col` is the
            # negative offset that lands it on the target column.
            letter = get_column_letter(col)
            ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col)
        ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column)

    def sheet_split(self, ws, month_mapping):
        """Parse the first-column values of ``ws`` (row 2 onward) as datetimes.

        Unfinished: the parsed index is discarded and ``month_mapping`` is
        never written, so this currently has no effect on the workbook.
        """
        for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
            # NOTE(review): private pandas API; its keyword set may differ
            # between pandas versions - confirm against the pinned version.
            dt_array, tz_parsed = tslib.array_to_datetime(
                # asarray instead of array(copy=False): a tuple always needs a
                # copy, which NumPy 2 rejects when copy=False is passed.
                np.asarray(date_tuple, dtype=np.object_),
                errors="coerce",
                utc=False,
                dayfirst=False,
                yearfirst=False,
                require_iso8601=False,
            )
            dti = DatetimeIndex(dt_array, tz=None, name=None)

    def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval):
        """Return the rows of the meta sheet, in order, as a list of tuples.

        Layout: confidence row, blank, code header, the rows of ``code_list``,
        blank, date header, the date row, blank, keyword header.
        """
        return [
            ('流水识别置信度', confidence_max),
            self.blank_row,
            self.code_header,
            *code_list,
            self.blank_row,
            self.date_header,
            (print_time, start_date, end_date, date_interval),
            self.blank_row,
            self.keyword_header,
        ]

    def create_meta_sheet(self, role):
        """Return the meta sheet for ``role``.

        The first worksheet is renamed and reused while it still carries the
        title 'Sheet'; otherwise a new sheet is created.
        """
        title = '{0}({1})'.format(self.meta_sheet_title, role)
        first_sheet = self.worksheets[0]
        if first_sheet.title == 'Sheet':
            first_sheet.title = title
            return first_sheet
        return self.create_sheet(title)

    def build_meta_sheet(self, role, confidence_max, code_list, print_time, start_date, end_date, date_interval):
        """Create the meta sheet for ``role``, fill it with the metadata rows and return it."""
        metadata_rows = self.build_metadata_rows(confidence_max, code_list, print_time,
                                                 start_date, end_date, date_interval)
        ms = self.create_meta_sheet(role)
        for row in metadata_rows:
            ms.append(row)
        return ms

    def build_month_sheet(self, role, month_mapping, ms):
        """Create one sheet per month and extract / highlight keyword rows.

        Args:
            role: text put in parentheses after the month in each sheet title.
            month_mapping: mapping of month -> iterable of parts, where a part
                is indexed as ``(sheet_name, first_row, last_row)``.
            ms: meta sheet that receives the extracted keyword rows.
        """
        # Salary-keyword rows are parked here and appended to `ms` at the end.
        tmp_ws = self.create_sheet('tmp_ws')
        for month, parts in month_mapping.items():
            # 3.1 copy the data
            new_ws = self.create_sheet('{0}({1})'.format(month, role))
            new_ws.append(self.fixed_headers)
            for part in parts:
                ws = self[part[0]]
                for row in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True):
                    new_ws.append(row)

            # 3.2 extract information and highlight
            amount_mapping = {}  # date -> {amount -> [row numbers]}
            amount_fill_row = set()
            for rows in new_ws.iter_rows():
                is_fill = False
                summary_cell = rows[5]
                date_cell = rows[0]
                if summary_cell.value in self.interest_keyword:
                    # interest keyword: straight to the meta sheet
                    ms.append((summary_cell.value, date_cell.value, rows[2].value))
                elif summary_cell.value in self.salary_keyword:
                    # salary keyword: to the temporary sheet
                    tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value))
                elif summary_cell.value in self.loan_keyword:
                    # loan keyword: highlight the whole row
                    is_fill = True

                for i, cell in enumerate(rows):
                    cell.border = self.border
                    if is_fill:
                        cell.fill = self.loan_fill

                    if i in (2, 3) and cell.row > 1:
                        # 3.3 convert amount / balance to numbers; cells that
                        # cannot be parsed are left untouched.
                        try:
                            cell.value = locale.atof(cell.value)
                        except (AttributeError, TypeError, ValueError):
                            continue
                        cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
                        if i == 2:
                            # Remember rows by (date, amount) and flag pairs
                            # with opposite amounts on the same date.
                            same_day = amount_mapping.setdefault(date_cell.value, {})
                            opposite_rows = same_day.get(-cell.value)
                            if opposite_rows:
                                amount_fill_row.add(cell.row)
                                amount_fill_row.update(opposite_rows)
                            same_day.setdefault(cell.value, []).append(cell.row)

                    # 3.4 check result: does this row's balance (D) equal the
                    # previous balance plus this row's amount (C)?
                    # TODO: statements of the debit/credit or income/expense
                    #       kind need +/- signs added manually
                    # TODO: statements in reverse order need a different formula
                    if i == 9 and cell.row > 2:
                        cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
                            cell.row, cell.row - 1, *self.proof_res)

            # 3.5 highlight same-day opposite amounts
            for row_idx in amount_fill_row:
                for cell in new_ws[row_idx]:
                    cell.fill = self.amount_fill

        # salary keyword rows go below a second keyword header
        ms.append(self.blank_row)
        ms.append(self.keyword_header)
        for row in tmp_ws.iter_rows(values_only=True):
            ms.append(row)
        self.remove(tmp_ws)

    def rebuild(self, role_summary):
        """Rebuild the workbook role by role.

        Args:
            role_summary: mapping of role -> list of 7-tuples
                ``(sheet_name, confidence, page, code, print_time, start_date, end_date)``.
        """
        for role, summary_list in role_summary.items():
            # 1. prune / reorder the source sheets and split them by month
            confidence_max = 0
            code_list = []
            month_mapping = {}
            # The date values of the last summary in the list are the ones
            # that reach the meta sheet.
            print_time = start_date = end_date = date_interval = None
            for summary in summary_list:
                sheet_name, confidence, page, code, print_time, start_date, end_date = summary
                ws = self[sheet_name]
                # 1.1 drop surplus columns and reorder
                self.sheet_prune(ws)
                # 1.2 TODO: split by month
                self.sheet_split(ws, month_mapping)
                # 1.3 TODO: metadata handling (time and date processing);
                #     confidence_max and code_list are not updated yet.

            # 2. meta-information sheet
            ms = self.build_meta_sheet(role, confidence_max, code_list, print_time,
                                       start_date, end_date, date_interval)

            # 3. month sheets, keyword extraction and highlighting
            self.build_month_sheet(role, month_mapping, ms)

            # remove the source sheets
            for summary in summary_list:
                self.remove(self[summary[0]])
......@@ -132,6 +132,14 @@ class NamedEnum(enum.Enum):
def raw_value(self):
    """Return this member as a ``(value, verbose_name)`` tuple."""
    return self.value, self.verbose_name
@classmethod
@lru_cache()
def get_choices_lst(cls):
    """Return ``[(value, verbose_name), ...]`` for every entry of ``cls._member_map_``.

    The result is memoised by ``lru_cache``, so repeated calls on the same
    class hand back the same list object; callers should treat it as read-only.
    """
    choices = []
    for member in cls._member_map_.values():
        choices.append((member.value, member.verbose_name))
    return choices
def extend(cls, sub_cls_name, names, unique=False):
assert issubclass(cls, NamedEnum)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!