update excel process & add keywords admin
Showing
10 changed files
with
251 additions
and
8 deletions
| 1 | from django.contrib import admin | 1 | from django.contrib import admin |
| 2 | from .models import Keywords | ||
| 3 | from .named_enum import KeywordsType | ||
| 4 | |||
| 2 | 5 | ||
| 3 | # Register your models here. | 6 | # Register your models here. |
class KeywordsAdmin(admin.ModelAdmin):
    """Admin options for the ``Keywords`` model.

    The change list shows the keyword, a computed label for its type and
    the ``on_off`` flag; it can be searched by keyword and filtered by
    ``type`` and ``on_off``.
    """

    list_display = ('keyword', 'type_verbose_name', 'on_off')
    search_fields = ('keyword',)
    list_filter = ('type', 'on_off')

    def type_verbose_name(self, obj):
        """Return what ``KeywordsType.get_verbose_name`` yields for ``obj.type``."""
        type_code = obj.type
        return KeywordsType.get_verbose_name(type_code)

    # Column title used for the computed column in the change list.
    type_verbose_name.short_description = '类型'
| 15 | |||
| 16 | |||
# Register Keywords with its admin options, then set the admin site's
# header and title to the same text.
admin.site.register(Keywords, KeywordsAdmin)
admin.site.site_header = admin.site.site_title = '宝马OCR'
| ... | @@ -8,13 +8,13 @@ import aiohttp | ... | @@ -8,13 +8,13 @@ import aiohttp |
| 8 | from openpyxl import Workbook | 8 | from openpyxl import Workbook |
| 9 | from django.core.management import BaseCommand | 9 | from django.core.management import BaseCommand |
| 10 | 10 | ||
| 11 | from settings import conf | ||
| 11 | from common.mixins import LoggerMixin | 12 | from common.mixins import LoggerMixin |
| 12 | from common.tools.file_tools import write_zip_file | 13 | from common.tools.file_tools import write_zip_file |
| 13 | from common.tools.pdf_to_img import PDFHandler | 14 | from common.tools.pdf_to_img import PDFHandler |
| 14 | from apps.doc.models import DocStatus, HILDoc, AFCDoc | 15 | from apps.doc.models import DocStatus, HILDoc, AFCDoc |
| 15 | from apps.doc import consts | 16 | from apps.doc import consts |
| 16 | from settings import conf | 17 | from apps.doc.ocr.edms import EDMS, rh |
| 17 | from apps.doc.edms import EDMS, rh | ||
| 18 | 18 | ||
| 19 | 19 | ||
| 20 | class Command(BaseCommand, LoggerMixin): | 20 | class Command(BaseCommand, LoggerMixin): |
| ... | @@ -95,8 +95,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -95,8 +95,8 @@ class Command(BaseCommand, LoggerMixin): |
| 95 | # c2 = cell.get('end_column') | 95 | # c2 = cell.get('end_column') |
| 96 | r1 = cell.get('start_row') | 96 | r1 = cell.get('start_row') |
| 97 | # r2 = cell.get('end_row') | 97 | # r2 = cell.get('end_row') |
| 98 | label = cell.get('words') | 98 | words = cell.get('words') |
| 99 | ws.cell(row=r1+1, column=c1+1, value=label) | 99 | ws.cell(row=r1+1, column=c1+1, value=words) |
| 100 | 100 | ||
| 101 | @staticmethod | 101 | @staticmethod |
| 102 | def get_ocr_json(img_path): | 102 | def get_ocr_json(img_path): | ... | ... |
| 1 | from django.db import models | 1 | from django.db import models |
| 2 | from .named_enum import DocStatus | 2 | from .named_enum import DocStatus, KeywordsType |
| 3 | 3 | ||
| 4 | # Create your models here. | 4 | # Create your models here. |
| 5 | 5 | ||
| ... | @@ -101,3 +101,19 @@ class PriorityApplication(models.Model): | ... | @@ -101,3 +101,19 @@ class PriorityApplication(models.Model): |
| 101 | situ_db_label = 'afc' | 101 | situ_db_label = 'afc' |
| 102 | db_table = 'priority_application' | 102 | db_table = 'priority_application' |
| 103 | 103 | ||
| 104 | |||
class Keywords(models.Model):
    """A keyword together with its ``KeywordsType`` category and an on/off flag.

    The model is unmanaged (``managed = False``), so Django does not create
    or migrate the ``keywords`` table.
    """

    id = models.AutoField(verbose_name="id", primary_key=True)
    keyword = models.CharField(verbose_name="关键词", max_length=64)
    # Allowed values come from KeywordsType.get_choices_lst().
    type = models.SmallIntegerField(
        verbose_name="类型",
        choices=KeywordsType.get_choices_lst(),
    )
    on_off = models.BooleanField(verbose_name="是否有效", default=True)
    # auto_now: refreshed on every save; auto_now_add: set once on creation.
    update_time = models.DateTimeField(verbose_name='修改时间', auto_now=True)
    create_time = models.DateTimeField(verbose_name='创建时间', auto_now_add=True)

    class Meta:
        managed = False
        # NOTE(review): situ_db_label is not a stock Django Meta option;
        # presumably a project extension selecting the database - confirm.
        situ_db_label = 'afc'
        db_table = 'keywords'
        verbose_name = '银行流水关键词'
        verbose_name_plural = verbose_name
| 119 | ... | ... |
| ... | @@ -7,3 +7,9 @@ class DocStatus(NamedEnum): | ... | @@ -7,3 +7,9 @@ class DocStatus(NamedEnum): |
| 7 | PROCESS_FAILED = (2, '识别失败') | 7 | PROCESS_FAILED = (2, '识别失败') |
| 8 | UPLOAD_FAILED = (3, '同步失败') | 8 | UPLOAD_FAILED = (3, '同步失败') |
| 9 | COMPLETE = (4, '已完成') | 9 | COMPLETE = (4, '已完成') |
| 10 | |||
| 11 | |||
class KeywordsType(NamedEnum):
    """Keyword categories, each declared as ``(number, label)``."""

    INTEREST = (0, '利息')
    SALARY = (1, '薪资')
    LOAN = (2, '贷款')
| ... | @@ -2,7 +2,7 @@ import os | ... | @@ -2,7 +2,7 @@ import os |
| 2 | import requests | 2 | import requests |
| 3 | from zeep import Client, xsd | 3 | from zeep import Client, xsd |
| 4 | from settings import conf | 4 | from settings import conf |
| 5 | from . import consts | 5 | from apps.doc import consts |
| 6 | from common.redis_cache import redis_handler as rh | 6 | from common.redis_cache import redis_handler as rh |
| 7 | 7 | ||
| 8 | 8 | ... | ... |
src/apps/doc/ocr/wb.py
0 → 100644
| 1 | import numpy as np | ||
| 2 | import locale | ||
| 3 | from pandas._libs import tslib | ||
| 4 | from pandas.core.indexes.datetimes import DatetimeIndex | ||
| 5 | from openpyxl import Workbook | ||
| 6 | from openpyxl.styles import Border, Side, PatternFill, numbers | ||
| 7 | from openpyxl.utils import get_column_letter | ||
| 8 | |||
| 9 | |||
class BSWorkbook(Workbook):
    """Workbook that reshapes raw statement sheets into a review report.

    ``rebuild`` is the entry point: for every role it normalises the
    columns of the source sheets, writes a metadata sheet, builds one sheet
    per month with keyword extraction and highlighting, and finally removes
    the source sheets.
    """

    def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs):
        """Create the workbook and store layout constants and keyword sets.

        Args:
            interest_keyword: container tested with ``in`` against the
                summary column; matches are copied to the metadata sheet.
            salary_keyword: container tested with ``in``; matches are
                collected and appended to the metadata sheet at the end.
            loan_keyword: container tested with ``in``; matching rows are
                highlighted with ``loan_fill``.
            *args, **kwargs: forwarded unchanged to ``Workbook.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Canonical column layout of every month sheet (1-based positions
        # are the values of ``headers_mapping`` below).
        self.fixed_headers = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名',
                              '对方卡号/账号', '对方开户行', '核对结果')
        self.fixed_col_amount = len(self.fixed_headers)
        # Recognised source header text -> 1-based canonical column index.
        # Several source headers may map to the same canonical column.
        self.headers_mapping = {
            '记账日期': 1,
            '交易日期': 1,
            '记账时间': 2,
            '金额': 3,
            '交易金额': 3,
            '余额': 4,
            '账户余额': 4,
            '交易名称': 5,
            '附言': 6,
            '摘要': 6,
            '对方账户名': 7,
            '对方卡号/账号': 8,
            '对方账号与户名': 8,
            '对方开户行': 9,
        }
        self.meta_sheet_title = '关键信息提取和展示'
        self.blank_row = (None,)
        self.code_header = ('页数', '电子回单验证码')
        self.date_header = ('打印时间', '起始日期', '终止日期', '流水区间结果')
        self.keyword_header = ('关键词', '记账日期', '金额')
        self.interest_keyword = interest_keyword
        self.salary_keyword = salary_keyword
        self.loan_keyword = loan_keyword
        # Texts written by the verification formula: (match, mismatch).
        self.proof_res = ('对', '错')
        self.loan_fill = PatternFill("solid", fgColor="00FFCC00")
        self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
        self.bd = Side(style='thin', color="000000")
        self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)

    def sheet_prune(self, ws):
        """Rearrange ``ws`` in place into the canonical column layout.

        ``fixed_col_amount`` empty columns are inserted at the front, every
        source column whose row-1 header is known to ``headers_mapping`` is
        moved into its canonical slot, and all remaining source columns are
        deleted.
        """
        ws.insert_cols(1, amount=self.fixed_col_amount)
        for col in range(self.fixed_col_amount + 1, ws.max_column + 1):
            header_value = ws.cell(1, col).value
            header_idx = self.headers_mapping.get(header_value)
            # TODO: second-pass lookup for key fields whose header is unknown.
            if header_idx is None:
                continue
            # The range to move is the *source* column ``col``; the negative
            # offset ``header_idx - col`` lands it on the canonical column.
            # (Addressing the destination column here would shift an empty
            # inserted column off the sheet and leave the data untouched.)
            letter = get_column_letter(col)
            ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col)
        # Everything right of the canonical columns is leftover source data.
        ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column)

    def sheet_split(self, ws, month_mapping):
        """Parse the date column of ``ws`` (unfinished month split).

        Column 1 from row 2 downwards is converted to a ``DatetimeIndex``
        with unparsable values coerced. The result is not used yet and
        ``month_mapping`` is left untouched.

        TODO: split the rows by month and record the parts in
        ``month_mapping``.
        """
        for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
            # np.asarray instead of np.array(..., copy=False): building an
            # array from a tuple always needs a copy, which NumPy 2 rejects
            # when copy=False is passed.
            # NOTE(review): tslib.array_to_datetime is a private pandas API
            # and its keyword set differs between pandas versions - confirm
            # against the pinned version.
            dt_array, tz_parsed = tslib.array_to_datetime(
                np.asarray(date_tuple, dtype=np.object_),
                errors="coerce",
                utc=False,
                dayfirst=False,
                yearfirst=False,
                require_iso8601=False,
            )
            dti = DatetimeIndex(dt_array, tz=None, name=None)

    def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval):
        """Return the list of rows that make up the top of the metadata sheet.

        Order: confidence row, blank, code header, the rows of ``code_list``,
        blank, date header, the date values, blank, keyword header.
        """
        metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header]
        metadata_rows.extend(code_list)
        metadata_rows.extend(
            [self.blank_row,
             self.date_header,
             (print_time, start_date, end_date, date_interval),
             self.blank_row,
             self.keyword_header]
        )
        return metadata_rows

    def create_meta_sheet(self, role):
        """Return the metadata sheet for ``role``.

        If the first worksheet still has the title ``'Sheet'`` it is renamed
        and reused; otherwise a new sheet is created.
        """
        title = '{0}({1})'.format(self.meta_sheet_title, role)
        if self.worksheets[0].title == 'Sheet':
            ms = self.worksheets[0]
            ms.title = title
        else:
            ms = self.create_sheet(title)
        return ms

    def build_meta_sheet(self, role, confidence_max, code_list, print_time, start_date, end_date, date_interval):
        """Create the metadata sheet for ``role``, fill it and return it."""
        metadata_rows = self.build_metadata_rows(confidence_max, code_list, print_time,
                                                 start_date, end_date, date_interval)
        ms = self.create_meta_sheet(role)
        for row in metadata_rows:
            ms.append(row)
        return ms

    def build_month_sheet(self, role, month_mapping, ms):
        """Create one sheet per month and extract/highlight key rows.

        Args:
            role: label appended to every created sheet title.
            month_mapping: maps a month label to an iterable of parts; each
                part is indexed as ``(sheet name, first row, last row)``.
            ms: metadata sheet that receives the extracted keyword rows.
        """
        # Rows matching the salary keywords are parked here so they can be
        # written to ``ms`` as one group after all months are processed.
        tmp_ws = self.create_sheet('tmp_ws')
        for month, parts in month_mapping.items():
            # 3.1 Copy the data of every part into the month sheet.
            new_ws = self.create_sheet('{0}({1})'.format(month, role))
            new_ws.append(self.fixed_headers)
            for part in parts:
                ws = self[part[0]]
                for row in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True):
                    new_ws.append(row)

            # 3.2 Extract information and highlight.
            # date -> amount -> rows holding that amount on that date.
            amount_mapping = {}
            amount_fill_row = set()
            for rows in new_ws.iter_rows():
                is_fill = False
                summary_cell = rows[5]
                date_cell = rows[0]
                if summary_cell.value in self.interest_keyword:
                    # Keyword group 1: copy straight to the metadata sheet.
                    ms.append((summary_cell.value, date_cell.value, rows[2].value))
                elif summary_cell.value in self.salary_keyword:
                    # Keyword group 2: collect in the temporary sheet.
                    tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value))
                elif summary_cell.value in self.loan_keyword:
                    # Loan keywords: highlight the whole row.
                    is_fill = True

                for i, cell in enumerate(rows):
                    cell.border = self.border
                    if is_fill:
                        cell.fill = self.loan_fill
                    if (i == 2 or i == 3) and cell.row > 1:
                        try:
                            # 3.3 Convert amount and balance to numbers.
                            cell.value = locale.atof(cell.value)
                        except (TypeError, ValueError, AttributeError):
                            # Empty or non-numeric cell: keep the raw value.
                            continue
                        else:
                            cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
                            if i == 2:
                                # Remember rows whose amount is the exact
                                # negative of an earlier amount on the same date.
                                same_amount_mapping = amount_mapping.get(date_cell.value, {})
                                fill_rows = same_amount_mapping.get(-cell.value)
                                if fill_rows:
                                    amount_fill_row.add(cell.row)
                                    amount_fill_row.update(fill_rows)
                                amount_mapping.setdefault(date_cell.value, {}).setdefault(
                                    cell.value, []).append(cell.row)
                    # 3.4 Verification result.
                    # TODO: statements of the debit/credit or expense type
                    #       need the +/- sign added manually.
                    # TODO: statements in reverse order need a different formula.
                    if i == 9 and cell.row > 2:
                        cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(cell.row, cell.row - 1,
                                                                                     *self.proof_res)

            # 3.5 Highlight matching in/out amounts of the same day.
            for row in amount_fill_row:
                for cell in new_ws[row]:
                    cell.fill = self.amount_fill

        # Keyword group 2: append the collected rows as their own section.
        ms.append(self.blank_row)
        ms.append(self.keyword_header)
        for row in tmp_ws.iter_rows(values_only=True):
            ms.append(row)
        self.remove(tmp_ws)

    def rebuild(self, role_summary):
        """Rebuild the workbook role by role.

        Args:
            role_summary: maps a role to a list of 7-tuples
                ``(sheet_name, confidence, page, code, print_time,
                start_date, end_date)``, one per source sheet.
        """
        for role, summary_list in role_summary.items():
            # 1. Prune and arrange the source sheets, split them by month.
            confidence_max = 0
            code_list = []
            month_mapping = {}
            print_time = start_date = end_date = date_interval = None
            for summary in summary_list:
                sheet_name, confidence, page, code, print_time, start_date, end_date = summary
                ws = self[sheet_name]
                # 1.1 Drop surplus columns and arrange the rest.
                self.sheet_prune(ws)
                # 1.2 TODO: split by month.
                self.sheet_split(ws, month_mapping)
                # 1.3 Metadata handling. TODO: time and date handling.
                # confidence_max = max(confidence, confidence_max)
                # if code is not None:
                #     code_list.append((page, code))

            # 2. Metadata sheet.
            ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)

            # 3. Month sheets, keyword extraction and highlighting.
            self.build_month_sheet(role, month_mapping, ms)

            # Remove the source sheets.
            for summary in summary_list:
                self.remove(self[summary[0]])
| ... | @@ -132,6 +132,14 @@ class NamedEnum(enum.Enum): | ... | @@ -132,6 +132,14 @@ class NamedEnum(enum.Enum): |
| 132 | def raw_value(self): | 132 | def raw_value(self): |
| 133 | return (self.value, self.verbose_name) | 133 | return (self.value, self.verbose_name) |
| 134 | 134 | ||
@classmethod
@lru_cache()
def get_choices_lst(cls):
    """Return ``[(value, verbose_name), ...]`` for the members of ``cls``.

    ``lru_cache`` memoises the result per class, so repeated calls hand
    back the same list object.
    """
    members = cls.__members__.values()
    return [(member.value, member.verbose_name) for member in members]
| 142 | |||
| 135 | 143 | ||
| 136 | def extend(cls, sub_cls_name, names, unique=False): | 144 | def extend(cls, sub_cls_name, names, unique=False): |
| 137 | assert issubclass(cls, NamedEnum) | 145 | assert issubclass(cls, NamedEnum) | ... | ... |
-
Please register or sign in to post a comment