update excel process & add keywords admin
Showing 10 changed files with 251 additions and 8 deletions
1 | from django.contrib import admin | 1 | from django.contrib import admin |
2 | from .models import Keywords | ||
3 | from .named_enum import KeywordsType | ||
4 | |||
2 | 5 | ||
3 | # Register your models here. | 6 | # Register your models here. |
class KeywordsAdmin(admin.ModelAdmin):
    """Admin options for the ``Keywords`` model."""

    # Changelist columns; the raw ``type`` code is replaced by the value
    # returned from ``type_verbose_name`` below.
    list_display = ('keyword', 'type_verbose_name', 'on_off')
    list_filter = ('type', 'on_off',)
    search_fields = ('keyword',)

    def type_verbose_name(self, obj):
        """Return ``KeywordsType.get_verbose_name`` for the row's type code."""
        type_code = obj.type
        return KeywordsType.get_verbose_name(type_code)

    # Column title shown in the changelist for the computed column.
    type_verbose_name.short_description = '类型'
15 | |||
16 | |||
# Expose Keywords in the admin and set the site header/title text.
admin.site.register(Keywords, admin_class=KeywordsAdmin)
admin.site.site_header = admin.site.site_title = '宝马OCR'
... | @@ -8,13 +8,13 @@ import aiohttp | ... | @@ -8,13 +8,13 @@ import aiohttp |
8 | from openpyxl import Workbook | 8 | from openpyxl import Workbook |
9 | from django.core.management import BaseCommand | 9 | from django.core.management import BaseCommand |
10 | 10 | ||
11 | from settings import conf | ||
11 | from common.mixins import LoggerMixin | 12 | from common.mixins import LoggerMixin |
12 | from common.tools.file_tools import write_zip_file | 13 | from common.tools.file_tools import write_zip_file |
13 | from common.tools.pdf_to_img import PDFHandler | 14 | from common.tools.pdf_to_img import PDFHandler |
14 | from apps.doc.models import DocStatus, HILDoc, AFCDoc | 15 | from apps.doc.models import DocStatus, HILDoc, AFCDoc |
15 | from apps.doc import consts | 16 | from apps.doc import consts |
16 | from settings import conf | 17 | from apps.doc.ocr.edms import EDMS, rh |
17 | from apps.doc.edms import EDMS, rh | ||
18 | 18 | ||
19 | 19 | ||
20 | class Command(BaseCommand, LoggerMixin): | 20 | class Command(BaseCommand, LoggerMixin): |
... | @@ -95,8 +95,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -95,8 +95,8 @@ class Command(BaseCommand, LoggerMixin): |
95 | # c2 = cell.get('end_column') | 95 | # c2 = cell.get('end_column') |
96 | r1 = cell.get('start_row') | 96 | r1 = cell.get('start_row') |
97 | # r2 = cell.get('end_row') | 97 | # r2 = cell.get('end_row') |
98 | label = cell.get('words') | 98 | words = cell.get('words') |
99 | ws.cell(row=r1+1, column=c1+1, value=label) | 99 | ws.cell(row=r1+1, column=c1+1, value=words) |
100 | 100 | ||
101 | @staticmethod | 101 | @staticmethod |
102 | def get_ocr_json(img_path): | 102 | def get_ocr_json(img_path): | ... | ... |
1 | from django.db import models | 1 | from django.db import models |
2 | from .named_enum import DocStatus | 2 | from .named_enum import DocStatus, KeywordsType |
3 | 3 | ||
4 | # Create your models here. | 4 | # Create your models here. |
5 | 5 | ||
... | @@ -101,3 +101,19 @@ class PriorityApplication(models.Model): | ... | @@ -101,3 +101,19 @@ class PriorityApplication(models.Model): |
101 | situ_db_label = 'afc' | 101 | situ_db_label = 'afc' |
102 | db_table = 'priority_application' | 102 | db_table = 'priority_application' |
103 | 103 | ||
104 | |||
class Keywords(models.Model):
    """Keyword row stored in the ``keywords`` table.

    Each row holds one keyword, the ``KeywordsType`` code it belongs to and
    an on/off flag. Django does not manage the table (``managed = False``).
    """

    id = models.AutoField(primary_key=True, verbose_name="id")
    keyword = models.CharField(max_length=64, verbose_name="关键词")
    # Choices come from the KeywordsType enum as (value, verbose_name) pairs.
    type = models.SmallIntegerField(choices=KeywordsType.get_choices_lst(), verbose_name="类型")
    on_off = models.BooleanField(default=True, verbose_name="是否有效")
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        managed = False
        # NOTE(review): ``situ_db_label`` is not a stock Django Meta option;
        # presumably the project registers it for database routing - confirm.
        situ_db_label = 'afc'
        db_table = 'keywords'
        verbose_name = '银行流水关键词'
        verbose_name_plural = verbose_name

    def __str__(self):
        """Show the keyword text instead of the default ``Keywords object (N)``."""
        return str(self.keyword)
119 | ... | ... |
... | @@ -7,3 +7,9 @@ class DocStatus(NamedEnum): | ... | @@ -7,3 +7,9 @@ class DocStatus(NamedEnum): |
7 | PROCESS_FAILED = (2, '识别失败') | 7 | PROCESS_FAILED = (2, '识别失败') |
8 | UPLOAD_FAILED = (3, '同步失败') | 8 | UPLOAD_FAILED = (3, '同步失败') |
9 | COMPLETE = (4, '已完成') | 9 | COMPLETE = (4, '已完成') |
10 | |||
11 | |||
class KeywordsType(NamedEnum):
    """Keyword categories as ``(code, label)`` pairs."""

    INTEREST = (0, '利息')
    SALARY = (1, '薪资')
    LOAN = (2, '贷款')
... | @@ -2,7 +2,7 @@ import os | ... | @@ -2,7 +2,7 @@ import os |
2 | import requests | 2 | import requests |
3 | from zeep import Client, xsd | 3 | from zeep import Client, xsd |
4 | from settings import conf | 4 | from settings import conf |
5 | from . import consts | 5 | from apps.doc import consts |
6 | from common.redis_cache import redis_handler as rh | 6 | from common.redis_cache import redis_handler as rh |
7 | 7 | ||
8 | 8 | ... | ... |
src/apps/doc/ocr/wb.py
0 → 100644
1 | import numpy as np | ||
2 | import locale | ||
3 | from pandas._libs import tslib | ||
4 | from pandas.core.indexes.datetimes import DatetimeIndex | ||
5 | from openpyxl import Workbook | ||
6 | from openpyxl.styles import Border, Side, PatternFill, numbers | ||
7 | from openpyxl.utils import get_column_letter | ||
8 | |||
9 | |||
class BSWorkbook(Workbook):
    """Workbook that rearranges statement sheets into a fixed layout.

    ``rebuild`` is the entry point: for every role it normalises the source
    sheets to ``fixed_headers``, writes a metadata sheet, builds one sheet per
    month with keyword extraction / highlighting, and finally removes the
    source sheets.
    """

    def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs):
        """Store keyword collections and prepare header layouts and styles.

        Args:
            interest_keyword: container tested with ``in`` against the remark
                cell of each row; matching rows are copied to the meta sheet.
            salary_keyword: same kind of container; matching rows are listed
                in a second section of the meta sheet.
            loan_keyword: same kind of container; matching rows are filled
                with ``loan_fill``.
            *args, **kwargs: forwarded unchanged to ``Workbook.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Target column layout of every rebuilt sheet (10 columns).
        self.fixed_headers = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名',
                              '对方卡号/账号', '对方开户行', '核对结果')
        self.fixed_col_amount = len(self.fixed_headers)
        # Source header text -> 1-based target column; several source
        # spellings may map to the same target column.
        self.headers_mapping = {
            '记账日期': 1,
            '交易日期': 1,
            '记账时间': 2,
            '金额': 3,
            '交易金额': 3,
            '余额': 4,
            '账户余额': 4,
            '交易名称': 5,
            '附言': 6,
            '摘要': 6,
            '对方账户名': 7,
            '对方卡号/账号': 8,
            '对方账号与户名': 8,
            '对方开户行': 9,
        }
        self.meta_sheet_title = '关键信息提取和展示'
        self.blank_row = (None,)
        self.code_header = ('页数', '电子回单验证码')
        self.date_header = ('打印时间', '起始日期', '终止日期', '流水区间结果')
        self.keyword_header = ('关键词', '记账日期', '金额')
        self.interest_keyword = interest_keyword
        self.salary_keyword = salary_keyword
        self.loan_keyword = loan_keyword
        # (value when the balance check passes, value when it fails)
        self.proof_res = ('对', '错')
        self.loan_fill = PatternFill("solid", fgColor="00FFCC00")
        self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
        self.bd = Side(style='thin', color="000000")
        self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)

    def sheet_prune(self, ws):
        """Rearrange ``ws`` in place so it only holds the fixed columns.

        Blank columns for the fixed layout are inserted on the left, every
        source column whose first-row header is in ``headers_mapping`` is
        moved into its target column, and everything to the right of the
        fixed layout is deleted.

        Args:
            ws: worksheet to rearrange.
        """
        ws.insert_cols(1, amount=self.fixed_col_amount)
        for col in range(self.fixed_col_amount + 1, ws.max_column + 1):
            header_value = ws.cell(1, col).value
            header_idx = self.headers_mapping.get(header_value)
            # TODO: second-pass lookup for key fields that were not matched
            if header_idx is None:
                continue
            # The range to move is the *source* column (``col``); shifting it
            # by ``header_idx - col`` lands it on the target column.
            # NOTE: if two source headers map to the same target column the
            # later one overwrites the earlier one.
            letter = get_column_letter(col)
            ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col)
        ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column)

    def sheet_split(self, ws, month_mapping):
        """Parse the first column (from row 2 down) of ``ws`` as datetimes.

        NOTE(review): unfinished - the resulting ``DatetimeIndex`` is
        discarded and ``month_mapping`` is never filled here, so callers get
        back an unchanged mapping. It also relies on private pandas internals
        (``tslib.array_to_datetime``) - confirm against the installed pandas
        version before completing it.

        Args:
            ws: worksheet whose first column holds the date values.
            month_mapping: mapping intended to receive the per-month parts.
        """
        for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
            dt_array, tz_parsed = tslib.array_to_datetime(
                np.array(date_tuple, copy=False, dtype=np.object_),
                errors="coerce",
                utc=False,
                dayfirst=False,
                yearfirst=False,
                require_iso8601=False,
            )
            dti = DatetimeIndex(dt_array, tz=None, name=None)

    def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval):
        """Return the rows written at the top of a metadata sheet.

        Layout: confidence row, blank, code header, ``code_list`` rows, blank,
        date header, one date row, blank, keyword header.

        Args:
            confidence_max: value placed next to the confidence label.
            code_list: iterable of rows placed under ``code_header``.
            print_time, start_date, end_date, date_interval: values of the
                single row placed under ``date_header``.

        Returns:
            list of row tuples ready for ``worksheet.append``.
        """
        metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header]
        metadata_rows.extend(code_list)
        metadata_rows.extend(
            [self.blank_row,
             self.date_header,
             (print_time, start_date, end_date, date_interval),
             self.blank_row,
             self.keyword_header]
        )
        return metadata_rows

    def create_meta_sheet(self, role):
        """Return the metadata sheet for ``role``.

        The first worksheet is reused (and renamed) while it still carries the
        title 'Sheet'; otherwise a new sheet is created.
        """
        title = '{0}({1})'.format(self.meta_sheet_title, role)
        if self.worksheets[0].title == 'Sheet':
            ms = self.worksheets[0]
            ms.title = title
        else:
            ms = self.create_sheet(title)
        return ms

    def build_meta_sheet(self, role, confidence_max, code_list, print_time, start_date, end_date, date_interval):
        """Create the metadata sheet for ``role``, fill it and return it."""
        metadata_rows = self.build_metadata_rows(confidence_max, code_list, print_time,
                                                 start_date, end_date, date_interval)
        ms = self.create_meta_sheet(role)
        for row in metadata_rows:
            ms.append(row)
        return ms

    def build_month_sheet(self, role, month_mapping, ms):
        """Create one sheet per month and extract / highlight keyword rows.

        Args:
            role: label appended to every month sheet title.
            month_mapping: maps a month label to a list of parts; each part is
                indexed as ``(sheet_name, first_row, last_row)``.
            ms: metadata sheet that receives the extracted keyword rows.
        """
        # Salary matches are buffered here so they can be appended to ``ms``
        # as one section after all interest matches.
        tmp_ws = self.create_sheet('tmp_ws')
        for month, parts in month_mapping.items():
            # 3.1. copy the data
            new_ws = self.create_sheet('{0}({1})'.format(month, role))
            new_ws.append(self.fixed_headers)
            for part in parts:
                # ``self[name]`` replaces the deprecated get_sheet_by_name().
                ws = self[part[0]]
                for row in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True):
                    new_ws.append(row)
            # 3.2. extract information, highlight
            amount_mapping = {}  # date -> {amount -> [row numbers]}
            amount_fill_row = set()
            for rows in new_ws.iter_rows():
                is_fill = False
                summary_cell = rows[5]  # remark column
                date_cell = rows[0]  # posting-date column
                # interest keyword: copy straight to the meta sheet
                if summary_cell.value in self.interest_keyword:
                    ms.append((summary_cell.value, date_cell.value, rows[2].value))
                # salary keyword: buffer in the temporary sheet
                elif summary_cell.value in self.salary_keyword:
                    tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value))
                # loan keyword: highlight the whole row
                elif summary_cell.value in self.loan_keyword:
                    is_fill = True
                for i, cell in enumerate(rows):
                    cell.border = self.border
                    if is_fill:
                        cell.fill = self.loan_fill
                    if (i == 2 or i == 3) and cell.row > 1:
                        try:
                            # 3.3. convert amount / balance text to numbers
                            cell.value = locale.atof(cell.value)
                        except (ValueError, TypeError, AttributeError):
                            # Unparseable or non-text value: leave the cell as is.
                            continue
                        else:
                            cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
                            if i == 2:
                                # Remember rows whose amount cancels an earlier
                                # amount booked on the same date.
                                same_amount_mapping = amount_mapping.get(date_cell.value, {})
                                fill_rows = same_amount_mapping.get(-cell.value)
                                if fill_rows:
                                    amount_fill_row.add(cell.row)
                                    amount_fill_row.update(fill_rows)
                                amount_mapping.setdefault(date_cell.value, {}).setdefault(
                                    cell.value, []).append(cell.row)
                    # 3.4. check-result column
                    # TODO: debit/credit style statements need the +/- sign added manually
                    # TODO: statements in reverse order need a different formula
                    if i == 9 and cell.row > 2:
                        # Balance must equal the previous balance plus this amount.
                        cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(cell.row, cell.row - 1,
                                                                                     *self.proof_res)

            # 3.5. highlight opposite amounts booked on the same day
            del amount_mapping
            for row in amount_fill_row:
                for cell in new_ws[row]:
                    cell.fill = self.amount_fill

        # salary keyword section of the meta sheet
        ms.append(self.blank_row)
        ms.append(self.keyword_header)
        for row in tmp_ws.iter_rows(values_only=True):
            ms.append(row)
        self.remove(tmp_ws)

    def rebuild(self, role_summary):
        """Rebuild the workbook role by role and drop the source sheets.

        Args:
            role_summary: maps a role to a list of summaries, each unpacked as
                ``(sheet_name, confidence, page, code, print_time,
                start_date, end_date)``.
        """
        for role, summary_list in role_summary.items():
            # 1. prune / arrange the source sheets and split them by month
            confidence_max = 0
            code_list = []
            month_mapping = {}
            print_time = start_date = end_date = date_interval = None
            for summary in summary_list:
                sheet_name, confidence, page, code, print_time, start_date, end_date = summary
                ws = self[sheet_name]
                # 1.1. delete surplus columns, arrange the rest
                self.sheet_prune(ws)
                # 1.2. TODO: split by month
                self.sheet_split(ws, month_mapping)
                # 1.3. metadata handling. TODO: time and date handling
                # confidence_max = max(confidence, confidence_max)
                # if code is not None:
                #     code_list.append((page, code))

            # 2. metadata sheet
            ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)

            # 3. month sheets, keyword extraction / highlighting
            self.build_month_sheet(role, month_mapping, ms)

            # remove the source sheets
            for summary in summary_list:
                self.remove(self[summary[0]])
... | @@ -132,6 +132,14 @@ class NamedEnum(enum.Enum): | ... | @@ -132,6 +132,14 @@ class NamedEnum(enum.Enum): |
132 | def raw_value(self): | 132 | def raw_value(self): |
133 | return (self.value, self.verbose_name) | 133 | return (self.value, self.verbose_name) |
134 | 134 | ||
@classmethod
@lru_cache()
def get_choices_lst(cls):
    """Return ``[(value, verbose_name), ...]`` for every member of ``cls``.

    The result is cached per class, so every caller receives the same list
    object - treat it as read-only.
    """
    # ``__members__`` is the public, read-only view of the member map
    # (aliases included), in definition order.
    return [
        (member.value, member.verbose_name)
        for member in cls.__members__.values()
    ]
142 | |||
135 | 143 | ||
136 | def extend(cls, sub_cls_name, names, unique=False): | 144 | def extend(cls, sub_cls_name, names, unique=False): |
137 | assert issubclass(cls, NamedEnum) | 145 | assert issubclass(cls, NamedEnum) | ... | ... |
-
Please register or sign in to post a comment