1526125c by 周伟奇

update excel process & add keywords admin

1 parent b2945296
...@@ -2,4 +2,4 @@ from django.apps import AppConfig ...@@ -2,4 +2,4 @@ from django.apps import AppConfig
2 2
3 3
4 class AccountConfig(AppConfig): 4 class AccountConfig(AppConfig):
5 name = 'account' 5 name = 'apps.account'
......
1
2
3 default_app_config = 'apps.doc.apps.DocConfig'
......
1 from django.contrib import admin 1 from django.contrib import admin
2 from .models import Keywords
3 from .named_enum import KeywordsType
4
2 5
3 # Register your models here. 6 # Register your models here.
@admin.register(Keywords)
class KeywordsAdmin(admin.ModelAdmin):
    """Admin configuration for the ``Keywords`` model.

    The change list shows the keyword, a readable name for its type and the
    on/off flag; it can be searched by keyword and filtered by type and flag.
    """

    list_display = ('keyword', 'type_verbose_name', 'on_off')
    search_fields = ('keyword',)
    list_filter = ('type', 'on_off',)

    def type_verbose_name(self, obj):
        """Return what ``KeywordsType.get_verbose_name`` yields for ``obj.type``."""
        return KeywordsType.get_verbose_name(obj.type)

    # Column header used for the computed ``type_verbose_name`` column.
    type_verbose_name.short_description = '类型'


# Branding text for the admin site header and title.
admin.site.site_header = '宝马OCR'
admin.site.site_title = '宝马OCR'
......
...@@ -2,4 +2,5 @@ from django.apps import AppConfig ...@@ -2,4 +2,5 @@ from django.apps import AppConfig
2 2
3 3
4 class DocConfig(AppConfig): 4 class DocConfig(AppConfig):
5 name = 'doc' 5 name = 'apps.doc'
6 verbose_name = '文件'
......
...@@ -8,13 +8,13 @@ import aiohttp ...@@ -8,13 +8,13 @@ import aiohttp
8 from openpyxl import Workbook 8 from openpyxl import Workbook
9 from django.core.management import BaseCommand 9 from django.core.management import BaseCommand
10 10
11 from settings import conf
11 from common.mixins import LoggerMixin 12 from common.mixins import LoggerMixin
12 from common.tools.file_tools import write_zip_file 13 from common.tools.file_tools import write_zip_file
13 from common.tools.pdf_to_img import PDFHandler 14 from common.tools.pdf_to_img import PDFHandler
14 from apps.doc.models import DocStatus, HILDoc, AFCDoc 15 from apps.doc.models import DocStatus, HILDoc, AFCDoc
15 from apps.doc import consts 16 from apps.doc import consts
16 from settings import conf 17 from apps.doc.ocr.edms import EDMS, rh
17 from apps.doc.edms import EDMS, rh
18 18
19 19
20 class Command(BaseCommand, LoggerMixin): 20 class Command(BaseCommand, LoggerMixin):
...@@ -95,8 +95,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -95,8 +95,8 @@ class Command(BaseCommand, LoggerMixin):
95 # c2 = cell.get('end_column') 95 # c2 = cell.get('end_column')
96 r1 = cell.get('start_row') 96 r1 = cell.get('start_row')
97 # r2 = cell.get('end_row') 97 # r2 = cell.get('end_row')
98 label = cell.get('words') 98 words = cell.get('words')
99 ws.cell(row=r1+1, column=c1+1, value=label) 99 ws.cell(row=r1+1, column=c1+1, value=words)
100 100
101 @staticmethod 101 @staticmethod
102 def get_ocr_json(img_path): 102 def get_ocr_json(img_path):
......
1 from django.db import models 1 from django.db import models
2 from .named_enum import DocStatus 2 from .named_enum import DocStatus, KeywordsType
3 3
4 # Create your models here. 4 # Create your models here.
5 5
...@@ -101,3 +101,19 @@ class PriorityApplication(models.Model): ...@@ -101,3 +101,19 @@ class PriorityApplication(models.Model):
101 situ_db_label = 'afc' 101 situ_db_label = 'afc'
102 db_table = 'priority_application' 102 db_table = 'priority_application'
103 103
104
class Keywords(models.Model):
    """A keyword row stored in the ``keywords`` table.

    Each row holds the keyword text, a type restricted to the choices
    provided by ``KeywordsType``, an on/off flag, and automatically
    maintained creation / modification timestamps. The table is not managed
    by Django migrations (``managed = False``).
    """

    id = models.AutoField(primary_key=True, verbose_name="id")
    keyword = models.CharField(max_length=64, verbose_name="关键词")
    type = models.SmallIntegerField(choices=KeywordsType.get_choices_lst(), verbose_name="类型")
    on_off = models.BooleanField(default=True, verbose_name="是否有效")
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        managed = False
        # NOTE(review): ``situ_db_label`` is not a stock Django Meta option;
        # presumably a project extension selecting the database - confirm.
        situ_db_label = 'afc'
        db_table = 'keywords'
        verbose_name = '银行流水关键词'
        verbose_name_plural = verbose_name

    def __str__(self):
        """Show the keyword text instead of the generic "Keywords object (n)"."""
        return self.keyword
119
......
...@@ -7,3 +7,9 @@ class DocStatus(NamedEnum): ...@@ -7,3 +7,9 @@ class DocStatus(NamedEnum):
7 PROCESS_FAILED = (2, '识别失败') 7 PROCESS_FAILED = (2, '识别失败')
8 UPLOAD_FAILED = (3, '同步失败') 8 UPLOAD_FAILED = (3, '同步失败')
9 COMPLETE = (4, '已完成') 9 COMPLETE = (4, '已完成')
10
11
class KeywordsType(NamedEnum):
    """Keyword categories; each member is a ``(value, verbose name)`` pair."""

    # Interest
    INTEREST = (0, '利息')
    # Salary
    SALARY = (1, '薪资')
    # Loan
    LOAN = (2, '贷款')
......
...@@ -2,7 +2,7 @@ import os ...@@ -2,7 +2,7 @@ import os
2 import requests 2 import requests
3 from zeep import Client, xsd 3 from zeep import Client, xsd
4 from settings import conf 4 from settings import conf
5 from . import consts 5 from apps.doc import consts
6 from common.redis_cache import redis_handler as rh 6 from common.redis_cache import redis_handler as rh
7 7
8 8
......
1 import numpy as np
2 import locale
3 from pandas._libs import tslib
4 from pandas.core.indexes.datetimes import DatetimeIndex
5 from openpyxl import Workbook
6 from openpyxl.styles import Border, Side, PatternFill, numbers
7 from openpyxl.utils import get_column_letter
8
9
class BSWorkbook(Workbook):
    """Workbook that rearranges statement sheets into per-month sheets plus a summary sheet.

    The entry point is :meth:`rebuild`. For every role it prunes and
    re-orders the columns of the source sheets, builds a metadata sheet,
    builds one sheet per month (extracting / highlighting keyword rows), and
    finally removes the source sheets.
    """

    def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs):
        """Create the workbook and store layout constants and keyword sets.

        Args:
            interest_keyword: Collection tested with ``in`` against the
                summary column; matching rows are copied to the meta sheet.
            salary_keyword: Collection tested with ``in``; matching rows are
                collected and appended to the meta sheet as a second section.
            loan_keyword: Collection tested with ``in``; matching rows are
                highlighted with ``loan_fill``.
            *args, **kwargs: Forwarded unchanged to ``Workbook.__init__``.

        NOTE(review): the concrete type of the keyword arguments is not
        visible here (list / set / str) - confirm against the caller.
        """
        super().__init__(*args, **kwargs)
        # Headers of the fixed, normalised column layout (columns 1..10).
        self.fixed_headers = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名',
                              '对方卡号/账号', '对方开户行', '核对结果')
        self.fixed_col_amount = len(self.fixed_headers)
        # Recognised source header text -> 1-based column index in the fixed layout.
        self.headers_mapping = {
            '记账日期': 1,
            '交易日期': 1,
            '记账时间': 2,
            '金额': 3,
            '交易金额': 3,
            '余额': 4,
            '账户余额': 4,
            '交易名称': 5,
            '附言': 6,
            '摘要': 6,
            '对方账户名': 7,
            '对方卡号/账号': 8,
            '对方账号与户名': 8,
            '对方开户行': 9,
        }
        self.meta_sheet_title = '关键信息提取和展示'
        self.blank_row = (None,)
        self.code_header = ('页数', '电子回单验证码')
        self.date_header = ('打印时间', '起始日期', '终止日期', '流水区间结果')
        self.keyword_header = ('关键词', '记账日期', '金额')
        self.interest_keyword = interest_keyword
        self.salary_keyword = salary_keyword
        self.loan_keyword = loan_keyword
        # Texts written by the check-result formula: (match, mismatch).
        self.proof_res = ('对', '错')
        self.loan_fill = PatternFill("solid", fgColor="00FFCC00")
        self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
        self.bd = Side(style='thin', color="000000")
        self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)

    def sheet_prune(self, ws):
        """Move recognised columns of ``ws`` into the fixed layout and drop the rest.

        ``fixed_col_amount`` empty columns are inserted on the left, every
        original column whose row-1 header appears in ``headers_mapping`` is
        moved into its mapped slot, and all columns to the right of the fixed
        layout are deleted.
        """
        ws.insert_cols(1, amount=self.fixed_col_amount)
        for col in range(self.fixed_col_amount + 1, ws.max_column + 1):
            header_value = ws.cell(1, col).value
            header_idx = self.headers_mapping.get(header_value)
            # TODO: second-pass lookup for key fields
            if header_idx is None:
                continue
            # The range to move is the SOURCE column ``col``; shifting it by
            # ``header_idx - col`` columns lands it on the fixed slot.
            letter = get_column_letter(col)
            ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col)
        ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column)

    def sheet_split(self, ws, month_mapping):
        """Parse the first column of ``ws`` (row 2 downwards) as datetimes.

        Unparseable values are coerced (``errors="coerce"``). The resulting
        ``DatetimeIndex`` is currently not used and ``month_mapping`` is not
        modified; the month split itself is still to be implemented.

        NOTE(review): relies on the private ``pandas._libs.tslib`` API, whose
        signature may differ between pandas versions.
        """
        for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
            dt_array, tz_parsed = tslib.array_to_datetime(
                np.array(date_tuple, copy=False, dtype=np.object_),
                errors="coerce",
                utc=False,
                dayfirst=False,
                yearfirst=False,
                require_iso8601=False,
            )
            dti = DatetimeIndex(dt_array, tz=None, name=None)

    def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval):
        """Return the list of rows that make up the top of the meta sheet.

        Layout: confidence row, blank, code header, the ``code_list`` rows,
        blank, date header, the date values row, blank, keyword header.
        """
        metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header]
        metadata_rows.extend(code_list)
        metadata_rows.extend(
            [self.blank_row,
             self.date_header,
             (print_time, start_date, end_date, date_interval),
             self.blank_row,
             self.keyword_header]
        )
        return metadata_rows

    def create_meta_sheet(self, role):
        """Return the meta sheet for ``role``.

        The workbook's first sheet is reused (and renamed) while it still has
        the title 'Sheet'; otherwise a new sheet is created.
        """
        if self.worksheets[0].title == 'Sheet':
            ms = self.worksheets[0]
            ms.title = '{0}({1})'.format(self.meta_sheet_title, role)
        else:
            ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, role))
        return ms

    def build_meta_sheet(self, role, confidence_max, code_list, print_time, start_date, end_date, date_interval):
        """Create the meta sheet for ``role``, fill in the metadata rows and return it."""
        metadata_rows = self.build_metadata_rows(confidence_max, code_list, print_time,
                                                 start_date, end_date, date_interval)
        ms = self.create_meta_sheet(role)
        for row in metadata_rows:
            ms.append(row)
        return ms

    def build_month_sheet(self, role, month_mapping, ms):
        """Create one sheet per month and extract / highlight keyword rows.

        Args:
            role: Label appended to each month sheet title.
            month_mapping: Mapping of month label to an iterable of parts;
                each part is indexed as ``(sheet_name, min_row, max_row)``.
            ms: Meta sheet that receives the extracted keyword rows.
        """
        # Rows matching ``salary_keyword`` are buffered here and appended to
        # the meta sheet as a separate section once all months are processed.
        tmp_ws = self.create_sheet('tmp_ws')
        for month, parts in month_mapping.items():
            # 3.1 Copy the data rows into the month sheet.
            new_ws = self.create_sheet('{0}({1})'.format(month, role))
            new_ws.append(self.fixed_headers)
            for part in parts:
                ws = self[part[0]]
                for row in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True):
                    new_ws.append(row)
            # 3.2 Extract information and highlight.
            # date -> amount -> rows carrying that amount on that date
            amount_mapping = {}
            amount_fill_row = set()
            for rows in new_ws.iter_rows():
                is_fill = False
                summary_cell = rows[5]
                date_cell = rows[0]
                # Keyword group 1: copy straight to the meta sheet.
                if summary_cell.value in self.interest_keyword:
                    ms.append((summary_cell.value, date_cell.value, rows[2].value))
                # Keyword group 2: buffer in the temporary sheet.
                elif summary_cell.value in self.salary_keyword:
                    tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value))
                # Loan keywords: highlight the whole row.
                elif summary_cell.value in self.loan_keyword:
                    is_fill = True
                for i, cell in enumerate(rows):
                    cell.border = self.border
                    if is_fill:
                        cell.fill = self.loan_fill
                    if (i == 2 or i == 3) and cell.row > 1:
                        try:
                            # 3.3 Convert amount / balance text to a number.
                            cell.value = locale.atof(cell.value)
                        except (AttributeError, TypeError, ValueError):
                            # Not a parseable numeric string: leave the cell as is.
                            continue
                        else:
                            cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
                            if i == 2:
                                # Earlier rows of the same date with the
                                # opposite amount form an in/out pair.
                                same_amount_mapping = amount_mapping.get(date_cell.value, {})
                                fill_rows = same_amount_mapping.get(-cell.value)
                                if fill_rows:
                                    amount_fill_row.add(cell.row)
                                    amount_fill_row.update(fill_rows)
                                amount_mapping.setdefault(date_cell.value, {}).setdefault(
                                    cell.value, []).append(cell.row)
                    # 3.4 Check result: balance must equal previous balance + amount.
                    # TODO: debit/credit and expense-type statements need +/- signs added manually
                    # TODO: statements in reverse order need a different formula
                    if i == 9 and cell.row > 2:
                        cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(cell.row, cell.row - 1,
                                                                                     *self.proof_res)

            # 3.5 Highlight same-day rows with matching in/out amounts.
            for row in amount_fill_row:
                for cell in new_ws[row]:
                    cell.fill = self.amount_fill

        # Append the buffered keyword group 2 rows as a second section.
        ms.append(self.blank_row)
        ms.append(self.keyword_header)
        for row in tmp_ws.iter_rows(values_only=True):
            ms.append(row)
        self.remove(tmp_ws)

    def rebuild(self, role_summary):
        """Rebuild the workbook role by role.

        Args:
            role_summary: Mapping of role to a list of summaries, each a
                7-tuple ``(sheet_name, confidence, page, code, print_time,
                start_date, end_date)``.
        """
        for role, summary_list in role_summary.items():
            # 1. Prune and re-order the source sheets, split them by month.
            confidence_max = 0
            code_list = []
            month_mapping = {}
            print_time = start_date = end_date = date_interval = None
            for summary in summary_list:
                sheet_name, confidence, page, code, print_time, start_date, end_date = summary
                ws = self[sheet_name]
                # 1.1 Drop superfluous columns and re-order the rest.
                self.sheet_prune(ws)
                # 1.2 TODO: split by month
                self.sheet_split(ws, month_mapping)
                # 1.3 Metadata handling. TODO: time and date handling
                # confidence_max = max(confidence, confidence_max)
                # if code is not None:
                #     code_list.append((page, code))

            # 2. Meta information sheet.
            ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)

            # 3. Month sheets; extract / highlight key rows.
            self.build_month_sheet(role, month_mapping, ms)

            # Remove the source sheets.
            for summary in summary_list:
                self.remove(self[summary[0]])
...@@ -132,6 +132,14 @@ class NamedEnum(enum.Enum): ...@@ -132,6 +132,14 @@ class NamedEnum(enum.Enum):
132 def raw_value(self): 132 def raw_value(self):
133 return (self.value, self.verbose_name) 133 return (self.value, self.verbose_name)
134 134
@classmethod
@lru_cache()
def get_choices_lst(cls):
    """Return ``(value, verbose_name)`` pairs for every entry of ``cls._member_map_``.

    The result is memoised by ``lru_cache`` with the class as the key, so
    repeated calls on the same class return the same list object.
    """
    members = cls._member_map_.values()
    return [(member.value, member.verbose_name) for member in members]
142
135 143
136 def extend(cls, sub_cls_name, names, unique=False): 144 def extend(cls, sub_cls_name, names, unique=False):
137 assert issubclass(cls, NamedEnum) 145 assert issubclass(cls, NamedEnum)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!