793920a0 by 周伟奇

update wb build

1 parent f3d6e429
1 PAGE_DEFAULT = 1 1 PAGE_DEFAULT = 1
2 PAGE_SIZE_DEFAULT = 10 2 PAGE_SIZE_DEFAULT = 10
3 3
4 TRANS = str.maketrans('Cc((oODlLmAsSbg', '000000011345569')
5
6 CARD_RATIO = 0.9
7 UNKNOWN_CARD = '未知卡号'
8 UNKNOWN_ROLE = '未知户名'
9 DATE_FORMAT = ['%Y年%m月%d日', '%Y/%m/%d', '%Y-%m-%d', '%Y%m%d']
10
4 FIXED_APPLICATION_ID_PREFIX = 'CH-S' 11 FIXED_APPLICATION_ID_PREFIX = 'CH-S'
5 12
6 DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT'] 13 DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT']
7 DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] 14 DATA_SOURCE_LIST = ['POS', 'E-APP', 'ECONTRACT']
8 15
9 HIL_PREFIX = 'HIL' 16 HIL_PREFIX = 'HIL'
10 AFC_PREFIX = 'AFC' 17 AFC_PREFIX = 'AFC'
...@@ -39,11 +46,33 @@ PROOF_COL_TITLE = '核对结果' ...@@ -39,11 +46,33 @@ PROOF_COL_TITLE = '核对结果'
39 PROOF_RES = ('对', '错') 46 PROOF_RES = ('对', '错')
40 META_SHEET_TITLE = '关键信息提取和展示' 47 META_SHEET_TITLE = '关键信息提取和展示'
41 48
42 FIXED_HEADERS = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果') 49 FIXED_HEADERS = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出')
43 FIXED_COL_AMOUNT = len(FIXED_HEADERS) 50 FIXED_COL_AMOUNT = len(FIXED_HEADERS)
44 BASE_HEADERS_MAPPING = {label: idx+1 for idx, label in enumerate(FIXED_HEADERS)} 51 BASE_HEADERS_MAPPING = {label: idx+1 for idx, label in enumerate(FIXED_HEADERS)}
52 BORROW_HEADER_COL = BASE_HEADERS_MAPPING['借贷']
53 INCOME_HEADER_COL = BASE_HEADERS_MAPPING['收入']
54 OUTLAY_HEADER_COL = BASE_HEADERS_MAPPING['支出']
55 RESULT_HEADER_COL = BASE_HEADERS_MAPPING['核对结果']
56 BORROW_IDX = BORROW_HEADER_COL - 1
57 INCOME_IDX = INCOME_HEADER_COL - 1
58 OUTLAY_IDX = OUTLAY_HEADER_COL - 1
59 SUMMARY_IDX = FIXED_HEADERS.index('附言')
60 DATE_IDX = FIXED_HEADERS.index('记账日期')
61 AMOUNT_IDX = FIXED_HEADERS.index('金额')
62 OVER_IDX = FIXED_HEADERS.index('余额')
63 RESULT_IDX = FIXED_HEADERS.index('核对结果')
64 # '借贷': ('贷', '借'), # 竖版-无表格-广发银行
65 # '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行
66 # '收/支': ('收入', '支出'), # 横版-表格-北京银行
67 BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支'}
68 BORROW_INCOME_SET = {'贷', '收入'}
69 BORROW_OUTLAY_SET = {'借', '支出'}
70 INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'}
71 OUTLAY_HEADERS_SET = {'支出金额', '支出', '支取金额(借)', '支取金额(借)'}
72
73 # ------------------普通打印-全格线--------------------------------------------------------------------------------------
45 HEADERS_MAPPING = {} 74 HEADERS_MAPPING = {}
46 # 中国银行 75 # 横版-表格-中国银行(不规则)
47 HEADERS_MAPPING.update( 76 HEADERS_MAPPING.update(
48 { 77 {
49 '记账日期': BASE_HEADERS_MAPPING['记账日期'], 78 '记账日期': BASE_HEADERS_MAPPING['记账日期'],
...@@ -57,37 +86,294 @@ HEADERS_MAPPING.update( ...@@ -57,37 +86,294 @@ HEADERS_MAPPING.update(
57 '对方开户行': BASE_HEADERS_MAPPING['对方开户行'], 86 '对方开户行': BASE_HEADERS_MAPPING['对方开户行'],
58 } 87 }
59 ) 88 )
60 # 竖版-表格-建设银行 89 # 横版-表格-农业银行-中国农业银行个人账户明细
61 HEADERS_MAPPING.update( 90 HEADERS_MAPPING.update(
62 { 91 {
63 '交易日期': BASE_HEADERS_MAPPING['记账日期'], 92 '交易日期': BASE_HEADERS_MAPPING['记账日期'],
64 '交易金额': BASE_HEADERS_MAPPING['金额'], 93 '存入': BASE_HEADERS_MAPPING['金额'],
65 '账户余额': BASE_HEADERS_MAPPING['余额'], 94 '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'],
95 '对方名称': BASE_HEADERS_MAPPING['对方账户名'],
66 '摘要': BASE_HEADERS_MAPPING['附言'], 96 '摘要': BASE_HEADERS_MAPPING['附言'],
67 '对方账号与户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
68 } 97 }
69 ) 98 )
70 # 横版-表格-农业银行 99 # 横版-表格-北京银行
71 HEADERS_MAPPING.update( 100 HEADERS_MAPPING.update(
72 { 101 {
73 '存入': BASE_HEADERS_MAPPING['金额'], 102 '业务摘要': BASE_HEADERS_MAPPING['附言'],
74 '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], 103 '发生额': BASE_HEADERS_MAPPING['金额'],
75 '对方名称': BASE_HEADERS_MAPPING['对方账户名'], 104 '对方户名': BASE_HEADERS_MAPPING['对方账户名'],
76 } 105 }
77 ) 106 )
78 # 横版-表格-工商银行 107 # 横版-表格-工商银行 借记卡账户历史明细清单
108 # 横版-表格-工商银行-机打验证码 借记卡账户历史明细清单
109 # 横版-表格-工商银行CH-B008802400
110 # 横版-表格-工商银行 工资明细清单
111 # 工商银行历史明细(申请单号:20042501303039397888)
79 HEADERS_MAPPING.update( 112 HEADERS_MAPPING.update(
80 { 113 {
81 '对方户名': BASE_HEADERS_MAPPING['对方账户名'],
82 '收入/支出金额': BASE_HEADERS_MAPPING['金额'], 114 '收入/支出金额': BASE_HEADERS_MAPPING['金额'],
83 '工作日期': BASE_HEADERS_MAPPING['记账日期'], 115 '工作日期': BASE_HEADERS_MAPPING['记账日期'],
84 } 116 }
85 ) 117 )
86 # 横版-表格-北京银行 118
119 # 横版-表格-建设银行-个人活期账户交易明细
120 # 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604
121 # 竖版-表格-建设银行-工资账单CH-B008786812
122 # 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604 (2)
87 HEADERS_MAPPING.update( 123 HEADERS_MAPPING.update(
88 { 124 {
89 '业务摘要': BASE_HEADERS_MAPPING['附言'], 125 '交易金额': BASE_HEADERS_MAPPING['金额'],
90 '发生额': BASE_HEADERS_MAPPING['金额'], 126 '账户余额': BASE_HEADERS_MAPPING['余额'],
127 '对方账号与户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
128 }
129 )
130 # 微信
131 HEADERS_MAPPING.update(
132 {
133 '交易时间': BASE_HEADERS_MAPPING['记账时间'],
134 '交易类型': BASE_HEADERS_MAPPING['附言'],
135 '金额(元)': BASE_HEADERS_MAPPING['金额'],
136 '金额(元)': BASE_HEADERS_MAPPING['金额'],
137 '交易对方': BASE_HEADERS_MAPPING['对方账户名'],
91 } 138 }
92 ) 139 )
140 # 支付宝
141 HEADERS_MAPPING.update(
142 {
143 '时间': BASE_HEADERS_MAPPING['记账日期'],
144 '名称/备注': BASE_HEADERS_MAPPING['附言'],
145 }
146 )
147
148 # ------------普通打印-部分格线-------------------------------------------------------------------------------------------
149
150 # 竖版-无表格-农业银行
151 # 竖版-无表格-农业银行CH-B008805428
152 HEADERS_MAPPING.update(
153 {
154 '摘要/附言': BASE_HEADERS_MAPPING['附言'],
155 '交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
156 }
157 )
158 # 竖版-特殊-农商行
159 HEADERS_MAPPING.update(
160 {
161 '交易发生额': BASE_HEADERS_MAPPING['金额'],
162 }
163 )
164 # 横版-特殊-中信银行-账户交易明细
165 HEADERS_MAPPING.update(
166 {
167 '对方银行': BASE_HEADERS_MAPPING['对方开户行'],
168 '交易摘要': BASE_HEADERS_MAPPING['附言'],
169 }
170 )
171 # 平安电子账单
172 HEADERS_MAPPING.update(
173 {
174 '借贷发生额(借:-贷:+)': BASE_HEADERS_MAPPING['金额'],
175 }
176 )
177
178 # ------------普通打印-无格线--------------------------------------------------------------------------------------------
179
180 # 竖版-无表格-招商银行(略歪)
181 # 竖版-无表格-招商银行账户历史交易明细表
182 HEADERS_MAPPING.update(
183 {
184 '联机余额': BASE_HEADERS_MAPPING['余额'],
185 }
186 )
187 # 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
188 # 竖版-无表格-邮储银行 账户对账单
189 # 竖版-无表格-邮储银行-电子章 邮储银行 账户对账单
190 HEADERS_MAPPING.update(
191 {
192 '交易金额(元)': BASE_HEADERS_MAPPING['金额'],
193 '交易金额(元)': BASE_HEADERS_MAPPING['金额'],
194 '账户余额(元)': BASE_HEADERS_MAPPING['余额'],
195 '账户余额(元)': BASE_HEADERS_MAPPING['余额'],
196 '对手方户名': BASE_HEADERS_MAPPING['对方账户名'],
197 '对手方账户': BASE_HEADERS_MAPPING['对方卡号/账号'],
198 }
199 )
200 # 横版-无表格-广发银行-账户交易历史 --> 已废弃
201 # 竖版-无表格-广发银行-账户交易历史 --> 已废弃
202 HEADERS_MAPPING.update(
203 {
204 '会计日期': BASE_HEADERS_MAPPING['记账日期'],
205 '对手户名': BASE_HEADERS_MAPPING['对方账户名'],
206 '对手账号': BASE_HEADERS_MAPPING['对方卡号/账号'],
207 }
208 )
209 # 招行电子账单 TODO 有英文,需测试
210 HEADERS_MAPPING.update(
211 {
212 '对手信息': BASE_HEADERS_MAPPING['对方账户名'],
213 '摘要代码': BASE_HEADERS_MAPPING['附言'],
214 }
215 )
216 # 横版-无表格-民生银行-中国民生银行个人账户对账单(客户卡号)
217 # 横版-无表格-民生银行-无标题(客户账户)
218 # 横版-无表格-民生银行
219 HEADERS_MAPPING.update(
220 {
221 '摘要信息': BASE_HEADERS_MAPPING['附言'],
222 '对方行名': BASE_HEADERS_MAPPING['对方开户行'],
223 }
224 )
225 # 竖版-无表格-农业银行整数
226 # 竖版-无表格-农业银行-中国农业银行银行卡交易明细清单
227 HEADERS_MAPPING.update(
228 {
229 '对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
230 }
231 )
232 # 竖版-无表格-农业银行-中国农业银行银行卡活期存折交易明细清单.pdf
233 # 竖版-无表格-农业银行-扩张.pdf
234 # 竖版-无表格-农业银行-缩进.pdf
235 HEADERS_MAPPING.update(
236 {
237 '日期': BASE_HEADERS_MAPPING['记账日期'],
238 '短摘要': BASE_HEADERS_MAPPING['附言'],
239 '本次余额': BASE_HEADERS_MAPPING['余额'],
240 }
241 )
242 # 竖版-无表格-农业银行-无标题(对手帐号)
243 HEADERS_MAPPING.update(
244 {
245 '交易后余额': BASE_HEADERS_MAPPING['余额'],
246 '对手帐号': BASE_HEADERS_MAPPING['对方卡号/账号'],
247 }
248 )
249 # 竖版-无表格-农商行(非常规)
250 HEADERS_MAPPING.update(
251 {
252 '交易说明': BASE_HEADERS_MAPPING['附言'],
253 }
254 )
255 # 竖版-无表格-工商银行 抬头三行 活期历史明细清单
256 HEADERS_MAPPING.update(
257 {
258 '对方账户': BASE_HEADERS_MAPPING['对方卡号/账号'],
259 }
260 )
261
262 # -----------针式打印-全格线--------------------------------------------------------------------------------------------
263 # 竖版-表格-建设银行-中国建设银行活期账户交易明细
264 # 竖版-表格-建设银行-中国建设银行活期账户明细清单
265 # 竖版-表格-建设银行-对私活期账户明细- (1).pdf
266 HEADERS_MAPPING.update(
267 {
268 '帐户余额': BASE_HEADERS_MAPPING['余额'],
269 '对方帐户名称': BASE_HEADERS_MAPPING['对方账户名'],
270 }
271 )
272 # 竖版-特殊-交通银行 零售客户交易清单 5000以上交易记录
273 HEADERS_MAPPING.update(
274 {
275 '交易日期 记账日期': BASE_HEADERS_MAPPING['记账日期'],
276 }
277 )
278
279 # ----------针式打印-部分格线------------------------------------------------------------------------------------------
280 # 竖版-特殊-邮储银行-一本通绿卡通交易明细(客户)
281 # 竖版-特殊-邮储银行-账户交易明细(客户)
282 HEADERS_MAPPING.update(
283 {
284 '对方账号/卡号/汇票号': BASE_HEADERS_MAPPING['对方卡号/账号'],
285 }
286 )
287
288 # --------------------------------------------------------------------------------------------------------------------
289
290 # ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出')
291 # CLASSIFY_LIST = [
292 # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则)
293 # ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细
294 # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行
295 #
296 # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
297 # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
298 #
299 # # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行
300 # # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行
301 # ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
302 #
303 # # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号
304 #
305 # # 支付宝:流水号 时间 名称/备注 收入 支出 账户余额 资金渠道
306 #
307 # # -----------------
308 #
309 # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
310 # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
311 #
312 # # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注
313 #
314 # # 中信银行:交易日期 交易摘要 收入金额 支出金额 账户余额 对方户名 对方账号 对方银行 交易流水号
315 # ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
316 #
317 # # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额
318 # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
319 #
320 # # -------------------------
321 #
322 # # 招商银行:记账日期 货币 交易金额 联机余额 冲补账 交易摘要
323 #
324 # # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
325 # # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单
326 #
327 # # 工商银行电子版:交易日期 账号 储种 序号 币种 妙汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
328 #
329 # # 招商银行电子版:记账日期 货币 交易金额 联机余额 交易摘要 对手信息
330 #
331 # # 民生银行:凭证类型 凭证号码 摘要信息 交易时间 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名
332 # # 凭证类型 凭证号码 交易时间 摘要 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名
333 #
334 # # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名
335 #
336 # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
337 #
338 # # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言
339 #
340 # # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额
341 #
342 # # ===================================
343 #
344 # # 建设银行:摘要、交易日期、交易金额、账户余额、商户/网点号及其名称、对方账号、对方户名
345 # # 交易日期、摘要、币种、钞汇、交易金额、帐户余额、对方账号、对方帐户名称
346 #
347 #
348 # # ===================================
349 #
350 # # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称
351 #
352 # # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
353 # ]
354
355 # {
356 # "0": "全表格-中国农业银行个人账户明细",
357 # "1": "全表格-中国银行",
358 # "2": "全表格-北京银行",
359 # "3": "全表格-工商银行",
360 # "4": "全表格-建设银行",
361 # "5": "部分格线-横版-中信银行账户交易明细",
362 # "6": "部分格线-横版-中信银行账户交易明细特殊",
363 # "7": "部分格线-竖版-中国农业银行",
364 # "8": "部分格线-竖版-中国农业银行分账户(窄页)",
365 # "9": "部分格线-竖版-平安电子账单"
366 # }
# One entry per statement layout: (bank name, column tuple).
# Each column tuple has one slot per FIXED_HEADERS label (13 slots); a slot
# holds a column number or None.
# NOTE(review): entries appear to be ordered by the OCR `classify` id
# (0..9, see the id -> layout table in the comment above this list) and the
# numbers look like 1-based source column positions -- confirm with the
# code that consumes this list.
CLASSIFY_LIST = [
    # 0
    ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)),
    # 1
    ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),
    # 2
    ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),
    # 3
    ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
    # 4
    ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
    # 5 and 6 share the same layout
    ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
    ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
    # 7 and 8 share the same layout
    ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
    ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
    # 9
    ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
]
93 379
......
1 import os 1 import os
2 import time 2 import time
3 import fitz
4 import signal 3 import signal
5 import base64
6 import asyncio 4 import asyncio
7 import aiohttp 5 import aiohttp
6 import difflib
8 import requests 7 import requests
8 from datetime import datetime
9 from collections import Counter
9 from apps.doc.ocr.wb import BSWorkbook, Workbook 10 from apps.doc.ocr.wb import BSWorkbook, Workbook
10 from django.core.management import BaseCommand 11 from django.core.management import BaseCommand
11 12
...@@ -65,8 +66,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -65,8 +66,6 @@ class Command(BaseCommand, LoggerMixin):
65 return doc, business_type 66 return doc, business_type
66 67
67 def pdf_download(self, doc, business_type): 68 def pdf_download(self, doc, business_type):
68 if doc is None:
69 return None, None, None, None
70 doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) 69 doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
71 os.makedirs(doc_data_path, exist_ok=True) 70 os.makedirs(doc_data_path, exist_ok=True)
72 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 71 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
...@@ -80,20 +79,96 @@ class Command(BaseCommand, LoggerMixin): ...@@ -80,20 +79,96 @@ class Command(BaseCommand, LoggerMixin):
80 return doc_data_path, excel_path, src_excel_path, pdf_path 79 return doc_data_path, excel_path, src_excel_path, pdf_path
81 80
@staticmethod
def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence):
    """Record each OCR sheet's summary and copy its cells into ``wb``.

    Every sheet becomes a worksheet named ``page_<pno>_img_<img_idx>_<i>``.
    ``sheet['summary']`` is read positionally as
    ``[role, card, page, receipt code, print time, start date, end date]``.

    * When the card (position 1) is ``None`` the metadata is accumulated in
      ``unknown_summary[classify][role]``; a missing role falls back to
      ``consts.UNKNOWN_ROLE``.
    * Otherwise it is accumulated in ``bs_summary[card]``, which also keeps
      a per-card sheet ``count`` plus the list and set of roles seen.

    Both summary dicts are mutated in place; nothing is returned.
    """
    for sheet_idx, sheet in enumerate(sheets):
        sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, sheet_idx)
        summary = sheet.get('summary')
        card = summary[1]
        role = summary[0]

        if card is None:
            role_name = consts.UNKNOWN_ROLE if role is None else role
            target = unknown_summary.setdefault(classify, {}).setdefault(role_name, {})
            target['classify'] = classify
            target['role'] = role_name
            target.setdefault('sheet', []).append(sheet_name)
            target.setdefault('confidence', []).append(confidence)
        else:
            target = bs_summary.setdefault(card, {})
            target['count'] = target.get('count', 0) + 1
            target.setdefault('classify', []).append(classify)
            target.setdefault('confidence', []).append(confidence)
            target.setdefault('sheet', []).append(sheet_name)
            seen_roles = target.setdefault('role', [])
            unique_roles = target.setdefault('role_set', set())
            if role is not None:
                seen_roles.append(role)
                unique_roles.add(role)

        # Both branches collect the same four optional fields; create all
        # accumulators first, then fill in whichever values are present.
        codes, print_times, start_dates, end_dates = [
            target.setdefault(key, [])
            for key in ('code', 'print_time', 'start_date', 'end_date')
        ]
        if summary[3] is not None:
            # The receipt code is stored together with its page value.
            codes.append((summary[2], summary[3]))
        for bucket, position in ((print_times, 4), (start_dates, 5), (end_dates, 6)):
            if summary[position] is not None:
                bucket.append(summary[position])

        # OCR cell coordinates are shifted by one when written to the sheet.
        ws = wb.create_sheet(sheet_name)
        for cell in sheet.get('cells'):
            col = cell.get('start_column')
            row = cell.get('start_row')
            text = cell.get('words')
            ws.cell(row=row + 1, column=col + 1, value=text)
96 139
def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary):
    """Feed one OCR response into the workbook and the summary dicts.

    Expected response shape::

        {
            'code': 1,
            'msg': 'success',
            'data': {
                'classify': 0,
                'confidence': 0.999,
                'sheets': [
                    {'summary': [role, card, page, receipt_code,
                                 print_time, start_date, end_date],
                     'cells': []},
                ],
            },
        }

    Nothing happens when ``data.classify`` is missing or ``data.sheets`` is
    empty. ``confidence`` defaults to 1 when absent. ``license_summary`` is
    accepted but not used by this method.
    """
    payload = res.get('data', {})
    classify = payload.get('classify')
    if classify is not None:
        sheets = payload.get('sheets', [])
        if sheets:
            confidence = payload.get('confidence', 1)
            self.append_bs_sheet(
                wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence)
171
97 # async def fetch_ocr_result(self, img_path): 172 # async def fetch_ocr_result(self, img_path):
98 # async with aiohttp.ClientSession( 173 # async with aiohttp.ClientSession(
99 # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False) 174 # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
...@@ -102,35 +177,170 @@ class Command(BaseCommand, LoggerMixin): ...@@ -102,35 +177,170 @@ class Command(BaseCommand, LoggerMixin):
102 # async with session.post(self.ocr_url, json=json_data) as response: 177 # async with session.post(self.ocr_url, json=json_data) as response:
103 # return await response.json() 178 # return await response.json()
104 # 179 #
105 # async def img_ocr_excel(self, wb, img_path, role_summary): 180 # async def img_2_ocr_2_wb(self, wb, img_path, summary):
106 # res = await self.fetch_ocr_result(img_path) 181 # res = await self.fetch_ocr_result(img_path)
107 # self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) 182 # self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
108 # sheets_list = res.get('result').get('res') 183 # sheets_list = res.get('result').get('res')
109 # img_name = os.path.basename(img_path) 184 # img_name = os.path.basename(img_path)
110 # self.append_sheet(wb, sheets_list, img_name, role_summary) 185 # self.append_sheet(wb, sheets_list, img_name, summary)
111 186
def fetch_ocr_result(self, img_path):
    """Upload the image at ``img_path`` to the OCR service and return its JSON reply.

    The file is sent as the multipart field ``img`` in a POST to
    ``self.ocr_url``. It is opened in a ``with`` block so the handle is
    closed once the request has completed, including when the request
    raises.
    """
    with open(img_path, 'rb') as img_file:
        files = [
            ('img', img_file)
        ]
        response = requests.request("POST", self.ocr_url, files=files)
    return response.json()
119 193
def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary):
    """Run OCR on one image and merge a successful result into ``wb``.

    ``img_info`` is indexed positionally: item 0 is the image path sent to
    ``fetch_ocr_result``; items 1 and 2 are forwarded to ``ocr_2_wb`` as
    ``pno`` and ``img_idx``. The raw response is always logged; it is only
    processed further when its ``code`` equals 1.
    """
    img_path = img_info[0]
    res = self.fetch_ocr_result(img_path)
    log_line = '{0} [fetch ocr result success] [img={1}] [res={2}]'.format(
        self.log_base, img_path, res)
    self.cronjob_log.info(log_line)
    if res.get('code') != 1:
        return
    self.ocr_2_wb(res, wb, img_info[1], img_info[2], bs_summary, unknown_summary, license_summary)
125 if not sheets_list: 200
@staticmethod
def get_most(value_list):
    """Return the most frequent item of ``value_list``.

    Returns ``None`` when ``value_list`` is empty or otherwise falsy.
    """
    if not value_list:
        return None
    ranked = Counter(value_list).most_common(1)
    if not ranked:
        return None
    top_value, _ = ranked[0]
    return top_value
206
@staticmethod
def date_format(date_str, format_str):
    """Parse ``date_str`` with ``format_str``.

    Returns the parsed ``datetime``, or ``None`` when the text does not
    match the format (``ValueError``) or is not a string (``TypeError``).
    """
    try:
        return datetime.strptime(date_str, format_str)
    except (TypeError, ValueError):
        # A mismatch is expected here: callers try several formats in turn.
        return None
215
def get_validate_date(self, date_list):
    """Return the first date in ``date_list`` that can be parsed.

    Each entry is tried against every pattern in ``consts.DATE_FORMAT``, in
    order, via ``self.date_format``; the first ``datetime`` produced wins.
    Returns ``None`` when nothing parses.
    """
    attempts = (
        self.date_format(date_str, format_str)
        for date_str in date_list
        for format_str in consts.DATE_FORMAT
    )
    return next((parsed for parsed in attempts if isinstance(parsed, datetime)), None)
222
def merge_card(self, bs_summary):
    """Merge per-card summaries whose card numbers are near-duplicates.

    Cards are visited from the highest ``count`` to the lowest. Each card
    still present becomes a "main" card and absorbs every remaining card
    whose ``difflib.SequenceMatcher(...).quick_ratio()`` against it exceeds
    ``consts.CARD_RATIO``. After merging, ``classify`` and ``role`` are
    collapsed to their most common value via ``self.get_most`` and the
    ``count`` key is dropped.

    Fix: the absorbed card's ``code`` entries are now appended to the main
    card's ``code`` list. Sheet names were appended there before, which
    lost the real codes.

    ``bs_summary`` is consumed: entries are popped from it as they are
    merged. Returns a new dict ``{main_card: merged_summary}``.
    """
    list_fields = (
        'classify', 'confidence', 'sheet', 'role',
        'code', 'print_time', 'start_date', 'end_date',
    )
    merged_bs_summary = {}
    # Snapshot the visiting order before bs_summary starts shrinking.
    sorted_card = sorted(bs_summary, key=lambda card: bs_summary[card]['count'], reverse=True)
    for main_card in sorted_card:
        if main_card not in bs_summary:
            # Already absorbed into an earlier main card.
            continue
        main_summary = bs_summary.pop(main_card)
        del main_summary['count']

        # Collect the matches first so bs_summary is not resized while iterated.
        similar_cards = [
            card for card in bs_summary
            if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO
        ]
        for card in similar_cards:
            absorbed = bs_summary.pop(card)
            for field in list_fields:
                main_summary[field].extend(absorbed[field])
            main_summary['role_set'].update(absorbed['role_set'])

        main_summary['classify'] = self.get_most(main_summary['classify'])
        main_summary['role'] = self.get_most(main_summary['role'])
        merged_bs_summary[main_card] = main_summary
    return merged_bs_summary
250
def prune_bs_summary(self, bs_summary):
    """Finalise per-card summaries in place when no card merging is needed.

    For every card the ``count`` key is removed and the ``classify`` and
    ``role`` lists are replaced by their most common value, as chosen by
    ``self.get_most``. Returns the same ``bs_summary`` object.
    """
    for card in bs_summary:
        entry = bs_summary[card]
        entry.pop('count')
        entry['classify'] = self.get_most(entry['classify'])
        entry['role'] = self.get_most(entry['role'])
    return bs_summary
257
258
def rebuild_bs_summary(self, bs_summary, unknown_summary):
    """Combine card-keyed and card-less summaries into one final dict.

    Input shapes::

        bs_summary = {
            card: {
                'count': 100,
                'classify': [], 'confidence': [], 'role': [],
                'role_set': set(),
                'code': [(page, code)],
                'print_time': [], 'start_date': [], 'end_date': [],
                'sheet': [sheet_name],
            }
        }

        unknown_summary = {
            classify: {
                role: {
                    'classify': 0, 'confidence': [], 'role': role,
                    'code': [(page, code)],
                    'print_time': [], 'start_date': [], 'end_date': [],
                    'sheet': [sheet_name],
                }
            }
        }

    Steps:

    1. With one known card, ``prune_bs_summary`` finalises it; with
       several, ``merge_card`` merges similar card numbers.
    2. A card-less summary with the same ``classify`` and a role found in
       a card's ``role_set`` is folded into that card and removed from
       ``unknown_summary``.
    3. Every card-less summary still left is added under a synthetic key
       ``'<consts.UNKNOWN_CARD>_<n>'``, numbered from 1.
    4. ``role_set`` is dropped, the three date lists are reduced to a
       single parsed value by ``get_validate_date`` and ``confidence`` to
       its maximum.

    Fix: step 2 now appends the card-less summary's ``code`` entries to the
    card's ``code`` list. Its sheet names were appended there before.

    ``unknown_summary`` must be a dict (``.get`` and ``.values`` are called
    on it). Both arguments are mutated. Returns the merged summary dict.
    """
    if not bs_summary:
        # No card number was recognised anywhere.
        merged_bs_summary = {}
    else:
        if len(bs_summary) == 1:
            merged_bs_summary = self.prune_bs_summary(bs_summary)
        else:
            merged_bs_summary = self.merge_card(bs_summary)

        for card_summary in merged_bs_summary.values():
            classify_summary = unknown_summary.get(card_summary['classify'], {})
            # Collect the matches first so the dict is not resized while iterated.
            matched_roles = [
                role for role in classify_summary if role in card_summary['role_set']
            ]
            for role in matched_roles:
                summary = classify_summary.pop(role)
                card_summary['sheet'].extend(summary['sheet'])
                card_summary['code'].extend(summary['code'])
                card_summary['print_time'].extend(summary['print_time'])
                card_summary['start_date'].extend(summary['start_date'])
                card_summary['end_date'].extend(summary['end_date'])

    # Whatever is still card-less gets a placeholder card key.
    card_num = 1
    for role_dict in unknown_summary.values():
        for summary in role_dict.values():
            card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
            card_num += 1
            merged_bs_summary[card] = summary

    for summary in merged_bs_summary.values():
        if summary.get('role_set') is not None:
            del summary['role_set']
        summary['print_time'] = self.get_validate_date(summary['print_time'])
        summary['start_date'] = self.get_validate_date(summary['start_date'])
        summary['end_date'] = self.get_validate_date(summary['end_date'])
        summary['confidence'] = max(summary['confidence'])
    return merged_bs_summary
129 337
130 # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 338 # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理
131 # TODO 调用接口重试 339 # TODO 调用接口重试
340 # TODO 协程异步发送OCR请求
132 # TODO 异常邮件通知 341 # TODO 异常邮件通知
133 # TODO 数据库断联问题 342 # TODO 数据库断联问题
343 # TODO 非流水证件处理,Excel模板
134 def handle(self, *args, **kwargs): 344 def handle(self, *args, **kwargs):
135 sleep_second = int(conf.SLEEP_SECOND) 345 sleep_second = int(conf.SLEEP_SECOND)
136 max_sleep_second = int(conf.MAX_SLEEP_SECOND) 346 max_sleep_second = int(conf.MAX_SLEEP_SECOND)
...@@ -138,17 +348,19 @@ class Command(BaseCommand, LoggerMixin): ...@@ -138,17 +348,19 @@ class Command(BaseCommand, LoggerMixin):
138 while self.switch: 348 while self.switch:
139 # 1. 从队列获取文件信息 349 # 1. 从队列获取文件信息
140 doc, business_type = self.get_doc_info() 350 doc, business_type = self.get_doc_info()
351 # 队列为空时的处理
352 if doc is None:
353 time.sleep(sleep_second)
354 sleep_second = min(max_sleep_second, sleep_second + 5)
355 continue
356 sleep_second = int(conf.SLEEP_SECOND)
357
141 try: 358 try:
359 start_time = time.time()
142 # 2. 从EDMS获取PDF文件 360 # 2. 从EDMS获取PDF文件
143 doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type) 361 doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type)
144 # 队列为空时的处理 362
145 if pdf_path is None:
146 time.sleep(sleep_second)
147 sleep_second = min(max_sleep_second, sleep_second+5)
148 continue
149 sleep_second = int(conf.SLEEP_SECOND)
150 # 3.PDF文件提取图片 363 # 3.PDF文件提取图片
151 start_time = time.time()
152 img_save_path = os.path.join(doc_data_path, 'img') 364 img_save_path = os.path.join(doc_data_path, 'img')
153 self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format( 365 self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format(
154 self.log_base, business_type, doc.id)) 366 self.log_base, business_type, doc.id))
...@@ -158,28 +370,42 @@ class Command(BaseCommand, LoggerMixin): ...@@ -158,28 +370,42 @@ class Command(BaseCommand, LoggerMixin):
158 self.log_base, business_type, doc.id)) 370 self.log_base, business_type, doc.id))
159 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) 371 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
160 372
161 # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 373 # 4.获取OCR结果并且构建excel文件
162 role_summary = { 374 bs_summary = {}
163 '银行-户名': [] 375 license_summary = {}
164 } 376 unknown_summary = []
165 # interest_keyword = Keywords.objects.filter( 377 interest_keyword = Keywords.objects.filter(
166 # type=KeywordsType.INTEREST.value).values_list('keyword', flat=True) 378 type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
167 # salary_keyword = Keywords.objects.filter( 379 salary_keyword = Keywords.objects.filter(
168 # type=KeywordsType.SALARY.value).values_list('keyword', flat=True) 380 type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
169 # loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True) 381 loan_keyword = Keywords.objects.filter(
170 # wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) 382 type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value]).values_list(
171 wb = Workbook() 383 'keyword', flat=True)
384 wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
385
386 # wb = Workbook()
387
388 # 4.1 获取OCR结果
172 # loop = asyncio.get_event_loop() 389 # loop = asyncio.get_event_loop()
173 # tasks = [self.img_ocr_excel(wb, img_path, role_summary) for img_path in pdf_handler.img_path_list] 390 # tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list]
174 # loop.run_until_complete(asyncio.wait(tasks)) 391 # loop.run_until_complete(asyncio.wait(tasks))
175 # loop.close() 392 # loop.close()
176 393
177 for img_path in pdf_handler.img_path_list: 394 for img_info in pdf_handler.img_info_list:
178 self.img_ocr_excel(wb, img_path, role_summary) 395 self.img_2_ocr_2_wb(wb, img_info, bs_summary, unknown_summary, license_summary)
396
397 self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format(
398 self.log_base, bs_summary, unknown_summary, license_summary))
399
400 merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
179 401
180 # 整合excel文件 402 self.cronjob_log.info('{0} [merged_bs_summary={1}] [unknown_summary={2}]'.format(
181 # wb.save(src_excel_path) 403 self.log_base, merged_bs_summary, unknown_summary))
182 # wb.rebuild(role_summary) 404 del unknown_summary
405
406 # 4.2 重构Excel文件
407 wb.save(src_excel_path)
408 wb.rebuild(merged_bs_summary, license_summary)
183 wb.save(excel_path) 409 wb.save(excel_path)
184 except Exception as e: 410 except Exception as e:
185 doc.status = DocStatus.PROCESS_FAILED.value 411 doc.status = DocStatus.PROCESS_FAILED.value
...@@ -194,14 +420,16 @@ class Command(BaseCommand, LoggerMixin): ...@@ -194,14 +420,16 @@ class Command(BaseCommand, LoggerMixin):
194 except Exception as e: 420 except Exception as e:
195 doc.status = DocStatus.UPLOAD_FAILED.value 421 doc.status = DocStatus.UPLOAD_FAILED.value
196 doc.save() 422 doc.save()
197 self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( 423 end_time = time.time()
198 self.log_base, business_type, doc.id, e)) 424 speed_time = int(end_time - start_time)
425 self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] '
426 '[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e))
199 else: 427 else:
200 doc.status = DocStatus.COMPLETE.value 428 doc.status = DocStatus.COMPLETE.value
201 doc.save() 429 doc.save()
202 end_time = time.time() 430 end_time = time.time()
203 speed_time = int(end_time - start_time) 431 speed_time = int(end_time - start_time)
204 self.cronjob_log.info('{0} [doc process complete] [business_type={1}] [doc_id={2}] ' 432 self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] '
205 '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) 433 '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time))
206 434
207 self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) 435 self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
......
...@@ -11,6 +11,8 @@ class DocHandler: ...@@ -11,6 +11,8 @@ class DocHandler:
11 return '/data/{1}/{0}/{0}.pdf'.format(doc_id, business_type) 11 return '/data/{1}/{0}/{0}.pdf'.format(doc_id, business_type)
12 elif file == 'img': 12 elif file == 'img':
13 return '/data/{1}/{0}/{0}_img.zip'.format(doc_id, business_type) 13 return '/data/{1}/{0}/{0}_img.zip'.format(doc_id, business_type)
14 elif file == 'src_excel':
15 return '/data/{1}/{0}/src.xlsx'.format(doc_id, business_type)
14 else: 16 else:
15 return '/data/{1}/{0}/{0}.xlsx'.format(doc_id, business_type) 17 return '/data/{1}/{0}/{0}.xlsx'.format(doc_id, business_type)
16 18
...@@ -22,6 +24,7 @@ class DocHandler: ...@@ -22,6 +24,7 @@ class DocHandler:
22 doc_dict['pdf_link'] = self.get_link(doc_id, business_type) 24 doc_dict['pdf_link'] = self.get_link(doc_id, business_type)
23 doc_dict['img_link'] = self.get_link(doc_id, business_type, file='img') 25 doc_dict['img_link'] = self.get_link(doc_id, business_type, file='img')
24 doc_dict['excel_link'] = self.get_link(doc_id, business_type, file='excel') 26 doc_dict['excel_link'] = self.get_link(doc_id, business_type, file='excel')
27 doc_dict['src_excel_link'] = self.get_link(doc_id, business_type, file='src_excel')
25 return list(doc_queryset) 28 return list(doc_queryset)
26 29
27 @staticmethod 30 @staticmethod
......
...@@ -13,3 +13,4 @@ class KeywordsType(NamedEnum): ...@@ -13,3 +13,4 @@ class KeywordsType(NamedEnum):
13 INTEREST = (0, "利息") 13 INTEREST = (0, "利息")
14 SALARY = (1, '薪资') 14 SALARY = (1, '薪资')
15 LOAN = (2, '贷款') 15 LOAN = (2, '贷款')
16 ALI_WECHART = (3, '微信/支付宝')
......
...@@ -13,6 +13,7 @@ class BSWorkbook(Workbook): ...@@ -13,6 +13,7 @@ class BSWorkbook(Workbook):
13 13
14 def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs): 14 def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs):
15 super().__init__(*args, **kwargs) 15 super().__init__(*args, **kwargs)
16 locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')
16 self.meta_sheet_title = '关键信息提取和展示' 17 self.meta_sheet_title = '关键信息提取和展示'
17 self.blank_row = (None,) 18 self.blank_row = (None,)
18 self.code_header = ('页数', '电子回单验证码') 19 self.code_header = ('页数', '电子回单验证码')
...@@ -24,26 +25,59 @@ class BSWorkbook(Workbook): ...@@ -24,26 +25,59 @@ class BSWorkbook(Workbook):
24 self.proof_res = ('对', '错') 25 self.proof_res = ('对', '错')
25 self.loan_fill = PatternFill("solid", fgColor="00FFCC00") 26 self.loan_fill = PatternFill("solid", fgColor="00FFCC00")
26 self.amount_fill = PatternFill("solid", fgColor="00FFFF00") 27 self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
27 self.bd = Side(style='thin', color="000000") 28 # self.bd = Side(style='thin', color="000000")
28 self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd) 29 # self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)
29 self.MAX_MEAN = 31 30 self.MAX_MEAN = 31
30 31
31 @staticmethod 32 @staticmethod
32 def sheet_prune(ws): 33 def sheet_prune(ws, classify):
33 ws.insert_cols(1, amount=consts.FIXED_COL_AMOUNT) 34 ws.insert_cols(1, amount=consts.FIXED_COL_AMOUNT)
35 moved_col_set = set()
36 header_col_set = set()
37 # 根据第一行关键词排列
34 for col in range(consts.FIXED_COL_AMOUNT + 1, ws.max_column + 1): 38 for col in range(consts.FIXED_COL_AMOUNT + 1, ws.max_column + 1):
35 header_value = ws.cell(1, col).value 39 header_value = ws.cell(1, col).value
36 header_idx = consts.HEADERS_MAPPING.get(header_value) 40 header_col = consts.HEADERS_MAPPING.get(header_value)
37 # TODO 关键字段再次查找 41 if header_col is not None:
38 # TODO 支付宝、微信流水第一行非表头,怎么处理 42 letter = get_column_letter(col)
39 if header_idx is None: 43 ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_col - col)
44 moved_col_set.add(col)
45 header_col_set.add(header_col)
46 elif header_value in consts.BORROW_HEADERS_SET:
47 letter = get_column_letter(col)
48 ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.BORROW_HEADER_COL - col)
49 moved_col_set.add(col)
50 header_col_set.add(consts.BORROW_HEADER_COL)
51 elif header_value in consts.INCOME_HEADERS_SET:
52 letter = get_column_letter(col)
53 ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.INCOME_HEADER_COL - col)
54 moved_col_set.add(col)
55 header_col_set.add(consts.INCOME_HEADER_COL)
56 elif header_value in consts.OUTLAY_HEADERS_SET:
57 letter = get_column_letter(col)
58 ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.OUTLAY_HEADER_COL - col)
59 moved_col_set.add(col)
60 header_col_set.add(consts.OUTLAY_HEADER_COL)
61
62 # 缺失表头再次查找
63 for header_col in range(1, consts.FIXED_COL_AMOUNT + 1):
64 if header_col in header_col_set or header_col == consts.RESULT_HEADER_COL:
65 continue
66 fix_col = consts.CLASSIFY_LIST[classify][1][header_col - 1] # TODO 合并分类情况
67 if fix_col is None:
40 continue 68 continue
41 letter = get_column_letter(col) 69 fix_col = fix_col + consts.FIXED_COL_AMOUNT
42 ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col) 70 if fix_col in moved_col_set:
71 break
72 letter = get_column_letter(fix_col)
73 ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_col - fix_col)
74
43 ws.delete_cols(consts.FIXED_COL_AMOUNT + 1, amount=ws.max_column) 75 ws.delete_cols(consts.FIXED_COL_AMOUNT + 1, amount=ws.max_column)
76 min_row = 1 if len(moved_col_set) == 0 else 2
77 return min_row
44 78
45 @staticmethod 79 @staticmethod
46 def month_split(dti, date_list): 80 def month_split(dti, date_list, date_statistics):
47 month_list = [] 81 month_list = []
48 idx_list = [] 82 idx_list = []
49 month_pre = None 83 month_pre = None
...@@ -53,15 +87,17 @@ class BSWorkbook(Workbook): ...@@ -53,15 +87,17 @@ class BSWorkbook(Workbook):
53 if month_str != month_pre: 87 if month_str != month_pre:
54 month_list.append(month_str) 88 month_list.append(month_str)
55 if month_pre is None: 89 if month_pre is None:
56 date_list.append(dti[idx].date()) 90 if date_statistics:
91 date_list.append(dti[idx].date())
57 idx = 0 92 idx = 0
58 idx_list.append(idx) 93 idx_list.append(idx)
59 month_pre = month_str 94 month_pre = month_str
60 for idx in range(len(dti)-1, -1, -1): 95 if date_statistics:
61 if isinstance(dti[idx], NaTType): 96 for idx in range(len(dti) - 1, -1, -1):
62 continue 97 if isinstance(dti[idx], NaTType):
63 date_list.append(dti[idx].date()) 98 continue
64 break 99 date_list.append(dti[idx].date())
100 break
65 return month_list, idx_list 101 return month_list, idx_list
66 102
67 @staticmethod 103 @staticmethod
...@@ -86,8 +122,8 @@ class BSWorkbook(Workbook): ...@@ -86,8 +122,8 @@ class BSWorkbook(Workbook):
86 reverse_trend = -1 122 reverse_trend = -1
87 return reverse_trend 123 return reverse_trend
88 124
89 def sheet_split(self, ws, month_mapping, date_list, reverse_trend_list): 125 def sheet_split(self, ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics):
90 for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True): 126 for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=min_row, values_only=True):
91 date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src] 127 date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src]
92 dt_array, tz_parsed = tslib.array_to_datetime( 128 dt_array, tz_parsed = tslib.array_to_datetime(
93 np.array(date_tuple, copy=False, dtype=np.object_), 129 np.array(date_tuple, copy=False, dtype=np.object_),
...@@ -95,16 +131,16 @@ class BSWorkbook(Workbook): ...@@ -95,16 +131,16 @@ class BSWorkbook(Workbook):
95 utc=False, 131 utc=False,
96 dayfirst=False, 132 dayfirst=False,
97 yearfirst=False, 133 yearfirst=False,
98 require_iso8601=False, 134 require_iso8601=True,
99 ) 135 )
100 dti = DatetimeIndex(dt_array, tz=None, name=None) 136 dti = DatetimeIndex(dt_array, tz=None, name=None)
101 137
102 month_list, idx_list = self.month_split(dti, date_list) 138 month_list, idx_list = self.month_split(dti, date_list, date_statistics)
103 139
104 if len(month_list) == 0: 140 if len(month_list) == 0:
105 # month_info process 141 # month_info process
106 month_info = month_mapping.setdefault('xxxx-xx', []) 142 month_info = month_mapping.setdefault('xxxx-xx', [])
107 month_info.append((ws.title, 2, ws.max_row, 0)) 143 month_info.append((ws.title, min_row, ws.max_row, 0))
108 elif len(month_list) == 1: 144 elif len(month_list) == 1:
109 # reverse_trend_list process 145 # reverse_trend_list process
110 reverse_trend = self.get_reverse_trend(dti.day, idx_list) 146 reverse_trend = self.get_reverse_trend(dti.day, idx_list)
...@@ -113,14 +149,14 @@ class BSWorkbook(Workbook): ...@@ -113,14 +149,14 @@ class BSWorkbook(Workbook):
113 month_info = month_mapping.setdefault(month_list[0], []) 149 month_info = month_mapping.setdefault(month_list[0], [])
114 day_mean = np.mean(dti.day.dropna()) 150 day_mean = np.mean(dti.day.dropna())
115 if len(month_info) == 0: 151 if len(month_info) == 0:
116 month_info.append((ws.title, 2, ws.max_row, day_mean)) 152 month_info.append((ws.title, min_row, ws.max_row, day_mean))
117 else: 153 else:
118 for i, item in enumerate(month_info): 154 for i, item in enumerate(month_info):
119 if day_mean <= item[-1]: 155 if day_mean <= item[-1]:
120 month_info.insert(i, (ws.title, 2, ws.max_row, day_mean)) 156 month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean))
121 break 157 break
122 else: 158 else:
123 month_info.append((ws.title, 2, ws.max_row, day_mean)) 159 month_info.append((ws.title, min_row, ws.max_row, day_mean))
124 else: 160 else:
125 # reverse_trend_list process 161 # reverse_trend_list process
126 reverse_trend = self.get_reverse_trend(dti.day, idx_list) 162 reverse_trend = self.get_reverse_trend(dti.day, idx_list)
...@@ -128,34 +164,41 @@ class BSWorkbook(Workbook): ...@@ -128,34 +164,41 @@ class BSWorkbook(Workbook):
128 # month_info process 164 # month_info process
129 for i, item in enumerate(month_list[:-1]): 165 for i, item in enumerate(month_list[:-1]):
130 month_mapping.setdefault(item, []).append( 166 month_mapping.setdefault(item, []).append(
131 (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN)) 167 (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN))
132 month_mapping.setdefault(month_list[-1], []).insert( 168 month_mapping.setdefault(month_list[-1], []).insert(
133 0, (ws.title, idx_list[-1] + 2, ws.max_row, 0)) 169 0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0))
134 170
135 def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval): 171 def build_metadata_rows(self, classify, confidence, role, code, print_time, start_date, end_date):
136 metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header] 172 metadata_rows = [
137 metadata_rows.extend(code_list) 173 ('流水识别置信度', confidence),
174 self.blank_row,
175 ('分类结果', classify),
176 self.blank_row,
177 ('户名', role),
178 self.blank_row,
179 self.code_header,
180 ]
181 metadata_rows.extend(code)
138 metadata_rows.extend( 182 metadata_rows.extend(
139 [self.blank_row, 183 [self.blank_row,
140 self.date_header, 184 self.date_header,
141 (print_time, start_date, end_date, date_interval), 185 (print_time, start_date, end_date, (end_date - start_date).days),
142 self.blank_row, 186 self.blank_row,
143 self.keyword_header] 187 self.keyword_header]
144 ) 188 )
145 return metadata_rows 189 return metadata_rows
146 190
147 def create_meta_sheet(self, role): 191 def create_meta_sheet(self, card):
148 if self.worksheets[0].title == 'Sheet': 192 if self.worksheets[0].title == 'Sheet':
149 ms = self.worksheets[0] 193 ms = self.worksheets[0]
150 ms.title = '{0}({1})'.format(self.meta_sheet_title, role) 194 ms.title = '{0}({1})'.format(self.meta_sheet_title, card)
151 else: 195 else:
152 ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, role)) 196 ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, card))
153 return ms 197 return ms
154 198
155 def build_meta_sheet(self, role, confidence_max, code_list, print_time, start_date, end_date, date_interval): 199 def build_meta_sheet(self, card, classify, confidence, role, code, print_time, start_date, end_date):
156 metadata_rows = self.build_metadata_rows(confidence_max, code_list, print_time, 200 metadata_rows = self.build_metadata_rows(classify, confidence, role, code, print_time, start_date, end_date)
157 start_date, end_date, date_interval) 201 ms = self.create_meta_sheet(card)
158 ms = self.create_meta_sheet(role)
159 for row in metadata_rows: 202 for row in metadata_rows:
160 ms.append(row) 203 ms.append(row)
161 return ms 204 return ms
...@@ -169,55 +212,84 @@ class BSWorkbook(Workbook): ...@@ -169,55 +212,84 @@ class BSWorkbook(Workbook):
169 new_ws.append(consts.FIXED_HEADERS) 212 new_ws.append(consts.FIXED_HEADERS)
170 for part in parts: 213 for part in parts:
171 ws = self.get_sheet_by_name(part[0]) 214 ws = self.get_sheet_by_name(part[0])
172 for row in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True): 215 for row_value in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True):
173 new_ws.append(row) 216 new_ws.append(row_value)
174 # 3.2.提取信息、高亮 217 # 3.2.提取信息、高亮
175 amount_mapping = {} 218 amount_mapping = {}
176 amount_fill_row = set() 219 amount_fill_row = set()
177 for rows in new_ws.iter_rows(): 220 for rows in new_ws.iter_rows(min_row=2):
178 summary_cell = rows[5] 221 summary_cell = rows[consts.SUMMARY_IDX]
179 date_cell = rows[0] 222 date_cell = rows[consts.DATE_IDX]
223 amount_cell = rows[consts.AMOUNT_IDX]
224 row = summary_cell.row
180 # 关键词1提取 225 # 关键词1提取
181 if summary_cell.value in self.interest_keyword: 226 if summary_cell.value in self.interest_keyword:
182 ms.append((summary_cell.value, date_cell.value, rows[2].value)) 227 ms.append((summary_cell.value, date_cell.value, amount_cell.value))
183 # 关键词2提取至临时表 228 # 关键词2提取至临时表
184 elif summary_cell.value in self.salary_keyword: 229 elif summary_cell.value in self.salary_keyword:
185 tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value)) 230 tmp_ws.append((summary_cell.value, date_cell.value, amount_cell.value))
186 # 贷款关键词高亮 231 # 贷款关键词高亮
187 elif summary_cell.value in self.loan_keyword: 232 elif summary_cell.value in self.loan_keyword:
188 summary_cell.fill = self.loan_fill 233 summary_cell.fill = self.loan_fill
189 for i, cell in enumerate(rows): 234
190 cell.border = self.border 235 # 3.3.余额转数值
191 if (i == 2 or i == 3) and cell.row > 1: 236 over_cell = rows[consts.OVER_IDX]
237 try:
238 if isinstance(over_cell.value, str):
239 over_cell.value = over_cell.value.translate(consts.TRANS)
240 over_cell.value = locale.atof(over_cell.value)
241 except Exception as e:
242 continue
243 else:
244 over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
245
246 # 3.4.余额转数值
247 try:
248 try:
249 if isinstance(amount_cell.value, str):
250 amount_cell.value = amount_cell.value.translate(consts.TRANS)
251 amount_cell.value = locale.atof(amount_cell.value)
252 except Exception as e:
192 try: 253 try:
193 # 3.3.金额、余额转数值 254 if isinstance(rows[consts.INCOME_IDX].value, str):
194 cell.value = locale.atof(cell.value) 255 rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS)
195 except Exception: 256 amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
196 continue 257 except Exception as e:
197 else: 258 if isinstance(rows[consts.OUTLAY_IDX].value, str):
198 cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 259 rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS)
199 if i == 2: 260 amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
200 same_amount_mapping = amount_mapping.get(date_cell.value, {}) 261 if amount_cell.value > 0:
201 fill_rows = same_amount_mapping.get(-cell.value) 262 amount_cell.value = -amount_cell.value
202 if fill_rows: 263 except Exception as e:
203 amount_fill_row.add(cell.row) 264 continue
204 amount_fill_row.update(fill_rows) 265 else:
205 amount_mapping.setdefault(date_cell.value, {}).setdefault( 266 if rows[consts.BORROW_IDX].value in consts.BORROW_OUTLAY_SET:
206 cell.value, []).append(cell.row) 267 amount_cell.value = -amount_cell.value
207 # 3.4.核对结果 268 amount_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
208 # TODO 借贷、开支类型银行流水,需要手动添加+-号 269 same_amount_mapping = amount_mapping.get(date_cell.value, {})
209 if i == 9 and cell.row > 2: 270 fill_rows = same_amount_mapping.get(-amount_cell.value)
210 if is_reverse: 271 if fill_rows:
211 cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( 272 amount_fill_row.add(row)
212 cell.row - 1, cell.row, *self.proof_res) 273 amount_fill_row.update(fill_rows)
213 else: 274 amount_mapping.setdefault(date_cell.value, {}).setdefault(
214 cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( 275 amount_cell.value, []).append(row)
215 cell.row, cell.row - 1, *self.proof_res)
216 276
217 # 3.5.同一天相同进出账高亮 277 # 3.5.核对结果
278 if row > 2:
279 if is_reverse:
280 rows[consts.RESULT_IDX].value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
281 row - 1, row, *self.proof_res)
282 else:
283 rows[consts.RESULT_IDX].value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
284 row, row - 1, *self.proof_res)
285
286 # 删除金额辅助列
287 new_ws.delete_cols(consts.BORROW_HEADER_COL, amount=new_ws.max_column)
288
289 # 3.6.同一天相同进出账高亮
218 del amount_mapping 290 del amount_mapping
219 for row in amount_fill_row: 291 for row in amount_fill_row:
220 new_ws[row][2].fill = self.amount_fill 292 new_ws[row][consts.AMOUNT_IDX].fill = self.amount_fill
221 293
222 # 关键词2信息提取 294 # 关键词2信息提取
223 ms.append(self.blank_row) 295 ms.append(self.blank_row)
...@@ -226,34 +298,51 @@ class BSWorkbook(Workbook): ...@@ -226,34 +298,51 @@ class BSWorkbook(Workbook):
226 ms.append(row) 298 ms.append(row)
227 self.remove(tmp_ws) 299 self.remove(tmp_ws)
228 300
229 def rebuild(self, role_summary): 301 def bs_rebuild(self, bs_summary):
230 # (sheet_name, confidence, page, code, print_time, start_date, end_date) # TODO 表名简化,+卡号 302 # bs_summary = {
231 for role, summary_list in role_summary.items(): 303 # '卡号': {
304 # 'classify': 0,
305 # 'confidence': 0.9,
306 # 'role': '柳雪',
307 # 'code': [('page', 'code')],
308 # 'print_time': 'datetime',
309 # 'start_date': 'datetime',
310 # 'end_date': 'datetime',
311 # 'sheet': ['sheet_name']
312 # }
313 # }
314 for card, summary in bs_summary.items():
232 # 1.原表修剪、排列、按照月份分割 315 # 1.原表修剪、排列、按照月份分割
233 reverse_trend_list = [] 316 start_date = summary['start_date']
234 confidence_max = 0 317 end_date = summary['end_date']
235 code_list = [] 318 date_statistics = False
236 month_mapping = {} 319 if start_date is None or end_date is None:
320 date_statistics = True
237 date_list = [] 321 date_list = []
238 start_date = end_date = date_interval = print_time = None 322 month_mapping = {}
239 for summary in summary_list: 323 reverse_trend_list = []
240 sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary 324 for sheet in summary['sheet']:
241 ws = self.get_sheet_by_name(sheet_name) 325 ws = self.get_sheet_by_name(sheet)
242 # 1.1.删除多余列、排列 326 # 1.1.删除多余列、排列
243 self.sheet_prune(ws) 327 min_row = self.sheet_prune(ws, summary['classify'])
244 # 1.2.按月份分割 328 # 1.2.按月份分割
245 self.sheet_split(ws, month_mapping, date_list, reverse_trend_list) 329 self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics)
246 # 1.3.元数据处理 TODO 时间与日期处理 330
247 confidence_max = max(confidence, confidence_max) 331 if date_statistics is True and len(date_list) > 1:
248 if code is not None: 332 start_date = min(date_list) if start_date is None else start_date
249 code_list.append((page, code)) 333 end_date = max(date_list) if end_date is None else end_date
250 334
251 if len(date_list) > 1:
252 start_date = min(date_list)
253 end_date = max(date_list)
254 date_interval = (end_date - start_date).days
255 # 2.元信息提取表 335 # 2.元信息提取表
256 ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval) 336 bank_name = consts.CLASSIFY_LIST[summary['classify']][0]
337 base_sheet_name = '{0}_{1}'.format(bank_name, summary['role'])
338 ms = self.build_meta_sheet(card,
339 summary['classify'],
340 summary['confidence'],
341 summary['role'],
342 summary['code'],
343 summary['print_time'],
344 start_date,
345 end_date)
257 346
258 # 3.创建月份表、提取/高亮关键行 347 # 3.创建月份表、提取/高亮关键行
259 is_reverse = False 348 is_reverse = False
...@@ -261,8 +350,11 @@ class BSWorkbook(Workbook): ...@@ -261,8 +350,11 @@ class BSWorkbook(Workbook):
261 is_reverse = True 350 is_reverse = True
262 for month_list in month_mapping.values(): 351 for month_list in month_mapping.values():
263 month_list.sort(key=lambda x: x[-1], reverse=True) 352 month_list.sort(key=lambda x: x[-1], reverse=True)
264 self.build_month_sheet(role, month_mapping, ms, is_reverse) 353 self.build_month_sheet(base_sheet_name, month_mapping, ms, is_reverse)
354
355 # 4.删除原表
356 for sheet in summary['sheet']:
357 self.remove(self.get_sheet_by_name(sheet))
265 358
266 # 删除原表 359 def rebuild(self, bs_summary, license_summary):
267 for summary in summary_list: 360 self.bs_rebuild(bs_summary)
268 self.remove(self.get_sheet_by_name(summary[0]))
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -25,7 +25,7 @@ class PDFHandler: ...@@ -25,7 +25,7 @@ class PDFHandler:
25 def __init__(self, path, img_dir_path): 25 def __init__(self, path, img_dir_path):
26 self.path = path 26 self.path = path
27 self.img_dir_path = img_dir_path 27 self.img_dir_path = img_dir_path
28 self.img_path_list = [] 28 self.img_info_list = []
29 self.xref_set = set() 29 self.xref_set = set()
30 30
31 def get_img_save_path(self, pno, img_index=0, ext='png'): 31 def get_img_save_path(self, pno, img_index=0, ext='png'):
...@@ -38,7 +38,7 @@ class PDFHandler: ...@@ -38,7 +38,7 @@ class PDFHandler:
38 pm = page.getPixmap(matrix=trans_2, alpha=False) 38 pm = page.getPixmap(matrix=trans_2, alpha=False)
39 img_save_path = self.get_img_save_path(page.number) 39 img_save_path = self.get_img_save_path(page.number)
40 pm.writePNG(img_save_path) 40 pm.writePNG(img_save_path)
41 self.img_path_list.append(img_save_path) 41 self.img_info_list.append((img_save_path, page.number, 0))
42 42
43 @staticmethod 43 @staticmethod
44 def getimage(pix): 44 def getimage(pix):
...@@ -88,7 +88,7 @@ class PDFHandler: ...@@ -88,7 +88,7 @@ class PDFHandler:
88 with open(img_save_path, "wb") as f: 88 with open(img_save_path, "wb") as f:
89 f.write(img_data) 89 f.write(img_data)
90 self.xref_set.add(xref) 90 self.xref_set.add(xref)
91 self.img_path_list.append(img_save_path) 91 self.img_info_list.append((img_save_path, pno, img_index))
92 92
93 @staticmethod 93 @staticmethod
94 def split_il(il): 94 def split_il(il):
...@@ -179,7 +179,7 @@ class PDFHandler: ...@@ -179,7 +179,7 @@ class PDFHandler:
179 img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) 179 img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
180 new_img.save(img_save_path) 180 new_img.save(img_save_path)
181 page_to_png = False 181 page_to_png = False
182 self.img_path_list.append(img_save_path) 182 self.img_info_list.append((img_save_path, pno, img_index))
183 183
184 # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 184 # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
185 if page_to_png: 185 if page_to_png:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!