update wb build
Showing
6 changed files
with
776 additions
and
166 deletions
1 | PAGE_DEFAULT = 1 | 1 | PAGE_DEFAULT = 1 |
2 | PAGE_SIZE_DEFAULT = 10 | 2 | PAGE_SIZE_DEFAULT = 10 |
3 | 3 | ||
4 | TRANS = str.maketrans('Cc((oODlLmAsSbg', '000000011345569') | ||
5 | |||
6 | CARD_RATIO = 0.9 | ||
7 | UNKNOWN_CARD = '未知卡号' | ||
8 | UNKNOWN_ROLE = '未知户名' | ||
9 | DATE_FORMAT = ['%Y年%m月%d日', '%Y/%m/%d', '%Y-%m-%d', '%Y%m%d'] | ||
10 | |||
4 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' | 11 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' |
5 | 12 | ||
6 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT'] | 13 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT'] |
7 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] | 14 | DATA_SOURCE_LIST = ['POS', 'E-APP', 'ECONTRACT'] |
8 | 15 | ||
9 | HIL_PREFIX = 'HIL' | 16 | HIL_PREFIX = 'HIL' |
10 | AFC_PREFIX = 'AFC' | 17 | AFC_PREFIX = 'AFC' |
... | @@ -39,11 +46,33 @@ PROOF_COL_TITLE = '核对结果' | ... | @@ -39,11 +46,33 @@ PROOF_COL_TITLE = '核对结果' |
39 | PROOF_RES = ('对', '错') | 46 | PROOF_RES = ('对', '错') |
40 | META_SHEET_TITLE = '关键信息提取和展示' | 47 | META_SHEET_TITLE = '关键信息提取和展示' |
41 | 48 | ||
42 | FIXED_HEADERS = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果') | 49 | FIXED_HEADERS = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出') |
43 | FIXED_COL_AMOUNT = len(FIXED_HEADERS) | 50 | FIXED_COL_AMOUNT = len(FIXED_HEADERS) |
44 | BASE_HEADERS_MAPPING = {label: idx+1 for idx, label in enumerate(FIXED_HEADERS)} | 51 | BASE_HEADERS_MAPPING = {label: idx+1 for idx, label in enumerate(FIXED_HEADERS)} |
52 | BORROW_HEADER_COL = BASE_HEADERS_MAPPING['借贷'] | ||
53 | INCOME_HEADER_COL = BASE_HEADERS_MAPPING['收入'] | ||
54 | OUTLAY_HEADER_COL = BASE_HEADERS_MAPPING['支出'] | ||
55 | RESULT_HEADER_COL = BASE_HEADERS_MAPPING['核对结果'] | ||
56 | BORROW_IDX = BORROW_HEADER_COL - 1 | ||
57 | INCOME_IDX = INCOME_HEADER_COL - 1 | ||
58 | OUTLAY_IDX = OUTLAY_HEADER_COL - 1 | ||
59 | SUMMARY_IDX = FIXED_HEADERS.index('附言') | ||
60 | DATE_IDX = FIXED_HEADERS.index('记账日期') | ||
61 | AMOUNT_IDX = FIXED_HEADERS.index('金额') | ||
62 | OVER_IDX = FIXED_HEADERS.index('余额') | ||
63 | RESULT_IDX = FIXED_HEADERS.index('核对结果') | ||
64 | # '借贷': ('贷', '借'), # 竖版-无表格-广发银行 | ||
65 | # '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行 | ||
66 | # '收/支': ('收入', '支出'), # 横版-表格-北京银行 | ||
67 | BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支'} | ||
68 | BORROW_INCOME_SET = {'贷', '收入'} | ||
69 | BORROW_OUTLAY_SET = {'借', '支出'} | ||
70 | INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'} | ||
71 | OUTLAY_HEADERS_SET = {'支出金额', '支出', '支取金额(借)', '支取金额(借)'} | ||
72 | |||
73 | # ------------------普通打印-全格线-------------------------------------------------------------------------------------- | ||
45 | HEADERS_MAPPING = {} | 74 | HEADERS_MAPPING = {} |
46 | # 中国银行 | 75 | # 横版-表格-中国银行(不规则) |
47 | HEADERS_MAPPING.update( | 76 | HEADERS_MAPPING.update( |
48 | { | 77 | { |
49 | '记账日期': BASE_HEADERS_MAPPING['记账日期'], | 78 | '记账日期': BASE_HEADERS_MAPPING['记账日期'], |
... | @@ -57,37 +86,294 @@ HEADERS_MAPPING.update( | ... | @@ -57,37 +86,294 @@ HEADERS_MAPPING.update( |
57 | '对方开户行': BASE_HEADERS_MAPPING['对方开户行'], | 86 | '对方开户行': BASE_HEADERS_MAPPING['对方开户行'], |
58 | } | 87 | } |
59 | ) | 88 | ) |
60 | # 竖版-表格-建设银行 | 89 | # 横版-表格-农业银行-中国农业银行个人账户明细 |
61 | HEADERS_MAPPING.update( | 90 | HEADERS_MAPPING.update( |
62 | { | 91 | { |
63 | '交易日期': BASE_HEADERS_MAPPING['记账日期'], | 92 | '交易日期': BASE_HEADERS_MAPPING['记账日期'], |
64 | '交易金额': BASE_HEADERS_MAPPING['金额'], | 93 | '存入': BASE_HEADERS_MAPPING['金额'], |
65 | '账户余额': BASE_HEADERS_MAPPING['余额'], | 94 | '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], |
95 | '对方名称': BASE_HEADERS_MAPPING['对方账户名'], | ||
66 | '摘要': BASE_HEADERS_MAPPING['附言'], | 96 | '摘要': BASE_HEADERS_MAPPING['附言'], |
67 | '对方账号与户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
68 | } | 97 | } |
69 | ) | 98 | ) |
70 | # 横版-表格-农业银行 | 99 | # 横版-表格-北京银行 |
71 | HEADERS_MAPPING.update( | 100 | HEADERS_MAPPING.update( |
72 | { | 101 | { |
73 | '存入': BASE_HEADERS_MAPPING['金额'], | 102 | '业务摘要': BASE_HEADERS_MAPPING['附言'], |
74 | '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], | 103 | '发生额': BASE_HEADERS_MAPPING['金额'], |
75 | '对方名称': BASE_HEADERS_MAPPING['对方账户名'], | 104 | '对方户名': BASE_HEADERS_MAPPING['对方账户名'], |
76 | } | 105 | } |
77 | ) | 106 | ) |
78 | # 横版-表格-工商银行 | 107 | # 横版-表格-工商银行 借记卡账户历史明细清单 |
108 | # 横版-表格-工商银行-机打验证码 借记卡账户历史明细清单 | ||
109 | # 横版-表格-工商银行CH-B008802400 | ||
110 | # 横版-表格-工商银行 工资明细清单 | ||
111 | # 工商银行历史明细(申请单号:20042501303039397888) | ||
79 | HEADERS_MAPPING.update( | 112 | HEADERS_MAPPING.update( |
80 | { | 113 | { |
81 | '对方户名': BASE_HEADERS_MAPPING['对方账户名'], | ||
82 | '收入/支出金额': BASE_HEADERS_MAPPING['金额'], | 114 | '收入/支出金额': BASE_HEADERS_MAPPING['金额'], |
83 | '工作日期': BASE_HEADERS_MAPPING['记账日期'], | 115 | '工作日期': BASE_HEADERS_MAPPING['记账日期'], |
84 | } | 116 | } |
85 | ) | 117 | ) |
86 | # 横版-表格-北京银行 | 118 | |
119 | # 横版-表格-建设银行-个人活期账户交易明细 | ||
120 | # 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604 | ||
121 | # 竖版-表格-建设银行-工资账单CH-B008786812 | ||
122 | # 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604 (2) | ||
87 | HEADERS_MAPPING.update( | 123 | HEADERS_MAPPING.update( |
88 | { | 124 | { |
89 | '业务摘要': BASE_HEADERS_MAPPING['附言'], | 125 | '交易金额': BASE_HEADERS_MAPPING['金额'], |
90 | '发生额': BASE_HEADERS_MAPPING['金额'], | 126 | '账户余额': BASE_HEADERS_MAPPING['余额'], |
127 | '对方账号与户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
128 | } | ||
129 | ) | ||
130 | # 微信 | ||
131 | HEADERS_MAPPING.update( | ||
132 | { | ||
133 | '交易时间': BASE_HEADERS_MAPPING['记账时间'], | ||
134 | '交易类型': BASE_HEADERS_MAPPING['附言'], | ||
135 | '金额(元)': BASE_HEADERS_MAPPING['金额'], | ||
136 | '金额(元)': BASE_HEADERS_MAPPING['金额'], | ||
137 | '交易对方': BASE_HEADERS_MAPPING['对方账户名'], | ||
91 | } | 138 | } |
92 | ) | 139 | ) |
140 | # 支付宝 | ||
141 | HEADERS_MAPPING.update( | ||
142 | { | ||
143 | '时间': BASE_HEADERS_MAPPING['记账日期'], | ||
144 | '名称/备注': BASE_HEADERS_MAPPING['附言'], | ||
145 | } | ||
146 | ) | ||
147 | |||
148 | # ------------普通打印-部分格线------------------------------------------------------------------------------------------- | ||
149 | |||
150 | # 竖版-无表格-农业银行 | ||
151 | # 竖版-无表格-农业银行CH-B008805428 | ||
152 | HEADERS_MAPPING.update( | ||
153 | { | ||
154 | '摘要/附言': BASE_HEADERS_MAPPING['附言'], | ||
155 | '交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
156 | } | ||
157 | ) | ||
158 | # 竖版-特殊-农商行 | ||
159 | HEADERS_MAPPING.update( | ||
160 | { | ||
161 | '交易发生额': BASE_HEADERS_MAPPING['金额'], | ||
162 | } | ||
163 | ) | ||
164 | # 横版-特殊-中信银行-账户交易明细 | ||
165 | HEADERS_MAPPING.update( | ||
166 | { | ||
167 | '对方银行': BASE_HEADERS_MAPPING['对方开户行'], | ||
168 | '交易摘要': BASE_HEADERS_MAPPING['附言'], | ||
169 | } | ||
170 | ) | ||
171 | # 平安电子账单 | ||
172 | HEADERS_MAPPING.update( | ||
173 | { | ||
174 | '借贷发生额(借:-贷:+)': BASE_HEADERS_MAPPING['金额'], | ||
175 | } | ||
176 | ) | ||
177 | |||
178 | # ------------普通打印-无格线-------------------------------------------------------------------------------------------- | ||
179 | |||
180 | # 竖版-无表格-招商银行(略歪) | ||
181 | # 竖版-无表格-招商银行账户历史交易明细表 | ||
182 | HEADERS_MAPPING.update( | ||
183 | { | ||
184 | '联机余额': BASE_HEADERS_MAPPING['余额'], | ||
185 | } | ||
186 | ) | ||
187 | # 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 | ||
188 | # 竖版-无表格-邮储银行 账户对账单 | ||
189 | # 竖版-无表格-邮储银行-电子章 邮储银行 账户对账单 | ||
190 | HEADERS_MAPPING.update( | ||
191 | { | ||
192 | '交易金额(元)': BASE_HEADERS_MAPPING['金额'], | ||
193 | '交易金额(元)': BASE_HEADERS_MAPPING['金额'], | ||
194 | '账户余额(元)': BASE_HEADERS_MAPPING['余额'], | ||
195 | '账户余额(元)': BASE_HEADERS_MAPPING['余额'], | ||
196 | '对手方户名': BASE_HEADERS_MAPPING['对方账户名'], | ||
197 | '对手方账户': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
198 | } | ||
199 | ) | ||
200 | # 横版-无表格-广发银行-账户交易历史 --> 已废弃 | ||
201 | # 竖版-无表格-广发银行-账户交易历史 --> 已废弃 | ||
202 | HEADERS_MAPPING.update( | ||
203 | { | ||
204 | '会计日期': BASE_HEADERS_MAPPING['记账日期'], | ||
205 | '对手户名': BASE_HEADERS_MAPPING['对方账户名'], | ||
206 | '对手账号': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
207 | } | ||
208 | ) | ||
209 | # 招行电子账单 TODO 有英文,需测试 | ||
210 | HEADERS_MAPPING.update( | ||
211 | { | ||
212 | '对手信息': BASE_HEADERS_MAPPING['对方账户名'], | ||
213 | '摘要代码': BASE_HEADERS_MAPPING['附言'], | ||
214 | } | ||
215 | ) | ||
216 | # 横版-无表格-民生银行-中国民生银行个人账户对账单(客户卡号) | ||
217 | # 横版-无表格-民生银行-无标题(客户账户) | ||
218 | # 横版-无表格-民生银行 | ||
219 | HEADERS_MAPPING.update( | ||
220 | { | ||
221 | '摘要信息': BASE_HEADERS_MAPPING['附言'], | ||
222 | '对方行名': BASE_HEADERS_MAPPING['对方开户行'], | ||
223 | } | ||
224 | ) | ||
225 | # 竖版-无表格-农业银行整数 | ||
226 | # 竖版-无表格-农业银行-中国农业银行银行卡交易明细清单 | ||
227 | HEADERS_MAPPING.update( | ||
228 | { | ||
229 | '对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
230 | } | ||
231 | ) | ||
232 | # 竖版-无表格-农业银行-中国农业银行银行卡活期存折交易明细清单.pdf | ||
233 | # 竖版-无表格-农业银行-扩张.pdf | ||
234 | # 竖版-无表格-农业银行-缩进.pdf | ||
235 | HEADERS_MAPPING.update( | ||
236 | { | ||
237 | '日期': BASE_HEADERS_MAPPING['记账日期'], | ||
238 | '短摘要': BASE_HEADERS_MAPPING['附言'], | ||
239 | '本次余额': BASE_HEADERS_MAPPING['余额'], | ||
240 | } | ||
241 | ) | ||
242 | # 竖版-无表格-农业银行-无标题(对手帐号) | ||
243 | HEADERS_MAPPING.update( | ||
244 | { | ||
245 | '交易后余额': BASE_HEADERS_MAPPING['余额'], | ||
246 | '对手帐号': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
247 | } | ||
248 | ) | ||
249 | # 竖版-无表格-农商行(非常规) | ||
250 | HEADERS_MAPPING.update( | ||
251 | { | ||
252 | '交易说明': BASE_HEADERS_MAPPING['附言'], | ||
253 | } | ||
254 | ) | ||
255 | # 竖版-无表格-工商银行 抬头三行 活期历史明细清单 | ||
256 | HEADERS_MAPPING.update( | ||
257 | { | ||
258 | '对方账户': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
259 | } | ||
260 | ) | ||
261 | |||
262 | # -----------针式打印-全格线-------------------------------------------------------------------------------------------- | ||
263 | # 竖版-表格-建设银行-中国建设银行活期账户交易明细 | ||
264 | # 竖版-表格-建设银行-中国建设银行活期账户明细清单 | ||
265 | # 竖版-表格-建设银行-对私活期账户明细- (1).pdf | ||
266 | HEADERS_MAPPING.update( | ||
267 | { | ||
268 | '帐户余额': BASE_HEADERS_MAPPING['余额'], | ||
269 | '对方帐户名称': BASE_HEADERS_MAPPING['对方账户名'], | ||
270 | } | ||
271 | ) | ||
272 | # 竖版-特殊-交通银行 零售客户交易清单 5000以上交易记录 | ||
273 | HEADERS_MAPPING.update( | ||
274 | { | ||
275 | '交易日期 记账日期': BASE_HEADERS_MAPPING['记账日期'], | ||
276 | } | ||
277 | ) | ||
278 | |||
279 | # ----------针式打印-部分格线------------------------------------------------------------------------------------------ | ||
280 | # 竖版-特殊-邮储银行-一本通绿卡通交易明细(客户) | ||
281 | # 竖版-特殊-邮储银行-账户交易明细(客户) | ||
282 | HEADERS_MAPPING.update( | ||
283 | { | ||
284 | '对方账号/卡号/汇票号': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
285 | } | ||
286 | ) | ||
287 | |||
288 | # -------------------------------------------------------------------------------------------------------------------- | ||
289 | |||
290 | # ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出') | ||
291 | # CLASSIFY_LIST = [ | ||
292 | # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则) | ||
293 | # ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细 | ||
294 | # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行 | ||
295 | # | ||
296 | # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 | ||
297 | # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | ||
298 | # | ||
299 | # # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行 | ||
300 | # # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行 | ||
301 | # ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | ||
302 | # | ||
303 | # # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号 | ||
304 | # | ||
305 | # # 支付宝:流水号 时间 名称/备注 收入 支出 账户余额 资金渠道 | ||
306 | # | ||
307 | # # ----------------- | ||
308 | # | ||
309 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 | ||
310 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
311 | # | ||
312 | # # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注 | ||
313 | # | ||
314 | # # 中信银行:交易日期 交易摘要 收入金额 支出金额 账户余额 对方户名 对方账号 对方银行 交易流水号 | ||
315 | # ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | ||
316 | # | ||
317 | # # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额 | ||
318 | # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
319 | # | ||
320 | # # ------------------------- | ||
321 | # | ||
322 | # # 招商银行:记账日期 货币 交易金额 联机余额 冲补账 交易摘要 | ||
323 | # | ||
324 | # # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 | ||
325 | # # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单 | ||
326 | # | ||
327 | # # 工商银行电子版:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 | ||
328 | # | ||
329 | # # 招商银行电子版:记账日期 货币 交易金额 联机余额 交易摘要 对手信息 | ||
330 | # | ||
331 | # # 民生银行:凭证类型 凭证号码 摘要信息 交易时间 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名 | ||
332 | # # 凭证类型 凭证号码 交易时间 摘要 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名 | ||
333 | # | ||
334 | # # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名 | ||
335 | # | ||
336 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 | ||
337 | # | ||
338 | # # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言 | ||
339 | # | ||
340 | # # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额 | ||
341 | # | ||
342 | # # =================================== | ||
343 | # | ||
344 | # # 建设银行:摘要、交易日期、交易金额、账户余额、商户/网点号及其名称、对方账号、对方户名 | ||
345 | # # 交易日期、摘要、币种、钞汇、交易金额、帐户余额、对方账号、对方帐户名称 | ||
346 | # | ||
347 | # | ||
348 | # # =================================== | ||
349 | # | ||
350 | # # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称 | ||
351 | # | ||
352 | # # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 | ||
353 | # ] | ||
354 | |||
355 | # { | ||
356 | # "0": "全表格-中国农业银行个人账户明细", | ||
357 | # "1": "全表格-中国银行", | ||
358 | # "2": "全表格-北京银行", | ||
359 | # "3": "全表格-工商银行", | ||
360 | # "4": "全表格-建设银行", | ||
361 | # "5": "部分格线-横版-中信银行账户交易明细", | ||
362 | # "6": "部分格线-横版-中信银行账户交易明细特殊", | ||
363 | # "7": "部分格线-竖版-中国农业银行", | ||
364 | # "8": "部分格线-竖版-中国农业银行分账户(窄页)", | ||
365 | # "9": "部分格线-竖版-平安电子账单" | ||
366 | # } | ||
367 | CLASSIFY_LIST = [ | ||
368 | ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), | ||
369 | ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), | ||
370 | ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), | ||
371 | ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | ||
372 | ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | ||
373 | ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | ||
374 | ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | ||
375 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
376 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
377 | ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
378 | ] | ||
93 | 379 | ... | ... |
1 | import os | 1 | import os |
2 | import time | 2 | import time |
3 | import fitz | ||
4 | import signal | 3 | import signal |
5 | import base64 | ||
6 | import asyncio | 4 | import asyncio |
7 | import aiohttp | 5 | import aiohttp |
6 | import difflib | ||
8 | import requests | 7 | import requests |
8 | from datetime import datetime | ||
9 | from collections import Counter | ||
9 | from apps.doc.ocr.wb import BSWorkbook, Workbook | 10 | from apps.doc.ocr.wb import BSWorkbook, Workbook |
10 | from django.core.management import BaseCommand | 11 | from django.core.management import BaseCommand |
11 | 12 | ||
... | @@ -65,8 +66,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -65,8 +66,6 @@ class Command(BaseCommand, LoggerMixin): |
65 | return doc, business_type | 66 | return doc, business_type |
66 | 67 | ||
67 | def pdf_download(self, doc, business_type): | 68 | def pdf_download(self, doc, business_type): |
68 | if doc is None: | ||
69 | return None, None, None, None | ||
70 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) | 69 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) |
71 | os.makedirs(doc_data_path, exist_ok=True) | 70 | os.makedirs(doc_data_path, exist_ok=True) |
72 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 71 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
... | @@ -80,20 +79,96 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -80,20 +79,96 @@ class Command(BaseCommand, LoggerMixin): |
80 | return doc_data_path, excel_path, src_excel_path, pdf_path | 79 | return doc_data_path, excel_path, src_excel_path, pdf_path |
81 | 80 | ||
82 | @staticmethod | 81 | @staticmethod |
83 | def append_sheet(wb, sheets_list, img_name, role_summary): | 82 | def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence): |
84 | for i, sheet in enumerate(sheets_list): | 83 | for i, sheet in enumerate(sheets): |
85 | sheet_name = '{0}_{1}'.format(img_name, i) | 84 | sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, i) |
86 | role_summary['银行-户名'].append((sheet_name, 1, None, None, None, None, None)) | 85 | # ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'] |
86 | summary = sheet.get('summary') | ||
87 | card = summary[1] | ||
88 | if card is None: | ||
89 | classify_dict = unknown_summary.setdefault(classify, {}) | ||
90 | role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0] | ||
91 | role_dict = classify_dict.setdefault(role, {}) | ||
92 | role_dict['classify'] = classify | ||
93 | role_dict['role'] = role | ||
94 | role_dict.setdefault('sheet', []).append(sheet_name) | ||
95 | role_dict.setdefault('confidence', []).append(confidence) | ||
96 | code_list = role_dict.setdefault('code', []) | ||
97 | pt_list = role_dict.setdefault('print_time', []) | ||
98 | sd_list = role_dict.setdefault('start_date', []) | ||
99 | ed_list = role_dict.setdefault('end_date', []) | ||
100 | if summary[3] is not None: | ||
101 | code_list.append((summary[2], summary[3])) | ||
102 | if summary[4] is not None: | ||
103 | pt_list.append(summary[4]) | ||
104 | if summary[5] is not None: | ||
105 | sd_list.append(summary[5]) | ||
106 | if summary[6] is not None: | ||
107 | ed_list.append(summary[6]) | ||
108 | else: | ||
109 | card_dict = bs_summary.setdefault(card, {}) | ||
110 | card_dict['count'] = card_dict.get('count', 0) + 1 | ||
111 | card_dict.setdefault('classify', []).append(classify) | ||
112 | card_dict.setdefault('confidence', []).append(confidence) | ||
113 | card_dict.setdefault('sheet', []).append(sheet_name) | ||
114 | role_list = card_dict.setdefault('role', []) | ||
115 | role_set = card_dict.setdefault('role_set', set()) | ||
116 | code_list = card_dict.setdefault('code', []) | ||
117 | pt_list = card_dict.setdefault('print_time', []) | ||
118 | sd_list = card_dict.setdefault('start_date', []) | ||
119 | ed_list = card_dict.setdefault('end_date', []) | ||
120 | if summary[0] is not None: | ||
121 | role_list.append(summary[0]) | ||
122 | role_set.add(summary[0]) | ||
123 | if summary[3] is not None: | ||
124 | code_list.append((summary[2], summary[3])) | ||
125 | if summary[4] is not None: | ||
126 | pt_list.append(summary[4]) | ||
127 | if summary[5] is not None: | ||
128 | sd_list.append(summary[5]) | ||
129 | if summary[6] is not None: | ||
130 | ed_list.append(summary[6]) | ||
131 | |||
87 | ws = wb.create_sheet(sheet_name) | 132 | ws = wb.create_sheet(sheet_name) |
88 | cells = sheet.get('cells') | 133 | cells = sheet.get('cells') |
89 | for cell in cells: | 134 | for cell in cells: |
90 | c1 = cell.get('start_column') | 135 | c1 = cell.get('start_column') |
91 | # c2 = cell.get('end_column') | ||
92 | r1 = cell.get('start_row') | 136 | r1 = cell.get('start_row') |
93 | # r2 = cell.get('end_row') | ||
94 | words = cell.get('words') | 137 | words = cell.get('words') |
95 | ws.cell(row=r1+1, column=c1+1, value=words) | 138 | ws.cell(row=r1+1, column=c1+1, value=words) |
96 | 139 | ||
def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary):
    """Feed one OCR response into the workbook and the summary accumulators.

    Expected response layout::

        {
            'code': 1,
            'msg': 'success',
            'data': {
                'classify': 0,
                'confidence': 0.999,
                'sheets': [
                    {'summary': [role, card, page, code, print_time, start_date, end_date],
                     'cells': []},
                ]
            }
        }

    Nothing is written when ``data``, ``classify`` or ``sheets`` is missing
    or empty. ``confidence`` defaults to 1 when absent.

    Args:
        res: Decoded OCR response dict.
        wb: Workbook receiving one worksheet per OCR sheet.
        pno: Page number of the source image.
        img_idx: Index of the image within the page.
        bs_summary: Per-card accumulator, mutated by ``append_bs_sheet``.
        unknown_summary: Accumulator for sheets without a card number.
        license_summary: Accepted but not used by this method.
    """
    # `or {}` also covers a response carrying an explicit null for 'data',
    # which `res.get('data', {})` would pass through as None.
    data = res.get('data') or {}
    classify = data.get('classify')
    if classify is None:
        return
    sheets = data.get('sheets') or []
    if not sheets:
        return
    confidence = data.get('confidence', 1)
    self.append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence)
171 | |||
97 | # async def fetch_ocr_result(self, img_path): | 172 | # async def fetch_ocr_result(self, img_path): |
98 | # async with aiohttp.ClientSession( | 173 | # async with aiohttp.ClientSession( |
99 | # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False) | 174 | # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False) |
... | @@ -102,35 +177,170 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -102,35 +177,170 @@ class Command(BaseCommand, LoggerMixin): |
102 | # async with session.post(self.ocr_url, json=json_data) as response: | 177 | # async with session.post(self.ocr_url, json=json_data) as response: |
103 | # return await response.json() | 178 | # return await response.json() |
104 | # | 179 | # |
105 | # async def img_ocr_excel(self, wb, img_path, role_summary): | 180 | # async def img_2_ocr_2_wb(self, wb, img_path, summary): |
106 | # res = await self.fetch_ocr_result(img_path) | 181 | # res = await self.fetch_ocr_result(img_path) |
107 | # self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) | 182 | # self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) |
108 | # sheets_list = res.get('result').get('res') | 183 | # sheets_list = res.get('result').get('res') |
109 | # img_name = os.path.basename(img_path) | 184 | # img_name = os.path.basename(img_path) |
110 | # self.append_sheet(wb, sheets_list, img_name, role_summary) | 185 | # self.append_sheet(wb, sheets_list, img_name, summary) |
111 | 186 | ||
def fetch_ocr_result(self, img_path):
    """POST the image at ``img_path`` to ``self.ocr_url`` and return the decoded JSON body.

    The image is uploaded as the multipart field ``img``. The file is opened
    in a ``with`` block so its handle is closed once the request has been
    sent, instead of being leaked for every processed image.
    """
    with open(img_path, 'rb') as img_file:
        response = requests.request("POST", self.ocr_url, files=[('img', img_file)])
    return response.json()
119 | 193 | ||
def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary):
    """Run OCR on one image and, when it succeeds, merge the result into ``wb``.

    ``img_info`` is indexed positionally: ``[0]`` is the image path passed to
    ``fetch_ocr_result``, ``[1]`` and ``[2]`` are forwarded to ``ocr_2_wb`` as
    page number and image index. The OCR response is always logged; it is
    forwarded only when its ``code`` equals 1.
    """
    img_path = img_info[0]
    ocr_res = self.fetch_ocr_result(img_path)
    log_line = '{0} [fetch ocr result success] [img={1}] [res={2}]'.format(
        self.log_base, img_path, ocr_res)
    self.cronjob_log.info(log_line)
    if ocr_res.get('code') != 1:
        return
    self.ocr_2_wb(ocr_res, wb, img_info[1], img_info[2], bs_summary, unknown_summary, license_summary)
126 | return | 201 | @staticmethod |
127 | img_name = os.path.basename(img_path) | 202 | def get_most(value_list): |
128 | self.append_sheet(wb, sheets_list, img_name, role_summary) | 203 | if value_list: |
204 | most_common = Counter(value_list).most_common(1) | ||
205 | return most_common[0][0] if most_common else None | ||
206 | |||
207 | @staticmethod | ||
208 | def date_format(date_str, format_str): | ||
209 | try: | ||
210 | date = datetime.strptime(date_str, format_str) | ||
211 | except Exception as e: | ||
212 | return | ||
213 | else: | ||
214 | return date | ||
215 | |||
def get_validate_date(self, date_list):
    """Return the first parseable date found in ``date_list`` as a ``datetime``.

    Candidates are tried in order, each against every pattern in
    ``consts.DATE_FORMAT`` in order; the first successful parse wins.
    Returns None when nothing parses.
    """
    attempts = (
        self.date_format(raw_date, pattern)
        for raw_date in date_list
        for pattern in consts.DATE_FORMAT
    )
    # The generator is lazy, so parsing stops at the first hit.
    return next((parsed for parsed in attempts if isinstance(parsed, datetime)), None)
222 | |||
def merge_card(self, bs_summary):
    """Merge summaries whose card numbers are near-duplicates of each other.

    Cards are visited from the highest ``count`` to the lowest. Each card
    still present becomes a main card and absorbs every remaining card whose
    ``difflib.SequenceMatcher(...).quick_ratio()`` against it exceeds
    ``consts.CARD_RATIO``. After merging, ``classify`` and ``role`` are
    reduced to their most frequent value via ``get_most``.

    Args:
        bs_summary: Dict keyed by card number; it is consumed (entries are
            popped) while merging.

    Returns:
        A new dict keyed by main card number. ``count`` is removed from each
        entry; ``role_set`` is kept.
    """
    list_fields = ('classify', 'confidence', 'sheet', 'role',
                   'code', 'print_time', 'start_date', 'end_date')
    merged_bs_summary = {}
    sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True)
    for main_card in sorted_card:
        # Already absorbed into an earlier main card.
        if bs_summary.get(main_card) is None:
            continue
        main_summary = bs_summary.pop(main_card)
        del main_summary['count']

        # Collect matches first so the dict is not mutated while iterating.
        similar_cards = [
            card for card in bs_summary
            if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO
        ]
        for card in similar_cards:
            other_summary = bs_summary.pop(card)
            # Every list field is extended from the SAME field of the merged
            # card; 'code' was previously extended with the 'sheet' list.
            for field in list_fields:
                main_summary[field].extend(other_summary[field])
            main_summary['role_set'].update(other_summary['role_set'])

        main_summary['classify'] = self.get_most(main_summary['classify'])
        main_summary['role'] = self.get_most(main_summary['role'])
        merged_bs_summary[main_card] = main_summary
    return merged_bs_summary
250 | |||
def prune_bs_summary(self, bs_summary):
    """Finalize every card entry in place and return the same dict.

    For each entry the ``count`` key is removed, and ``classify`` and
    ``role`` are replaced by their most frequent value (``get_most``).
    """
    for card_summary in bs_summary.values():
        card_summary.pop('count')
        card_summary.update(
            classify=self.get_most(card_summary['classify']),
            role=self.get_most(card_summary['role']),
        )
    return bs_summary
257 | |||
258 | |||
def rebuild_bs_summary(self, bs_summary, unknown_summary):
    """Combine the per-card and unknown-card accumulators into one summary.

    Input shapes::

        bs_summary = {
            card: {
                'count': 100, 'classify': [], 'confidence': [], 'role': [],
                'role_set': set(), 'code': [(page, code)], 'print_time': [],
                'start_date': [], 'end_date': [], 'sheet': [sheet_name],
            }
        }
        unknown_summary = {
            classify: {
                role: {
                    'classify': classify, 'confidence': [], 'role': role,
                    'code': [(page, code)], 'print_time': [],
                    'start_date': [], 'end_date': [], 'sheet': [sheet_name],
                }
            }
        }

    Steps:
        1. No card: start from an empty result. One card: finalize it with
           ``prune_bs_summary``. Several cards: merge them with ``merge_card``.
        2. For each known card, absorb the unknown entries that share its
           classify and whose role is in the card's ``role_set``.
        3. Register every remaining unknown entry under
           ``'<consts.UNKNOWN_CARD>_<n>'`` with n starting at 1.
        4. Drop ``role_set``, reduce the three date lists to the first
           parseable datetime (or None) and ``confidence`` to its maximum.

    Both input dicts are mutated.

    Returns:
        Dict keyed by card number (or generated unknown-card key).
    """
    if not bs_summary:
        merged_bs_summary = {}
    else:
        if len(bs_summary) == 1:
            merged_bs_summary = self.prune_bs_summary(bs_summary)
        else:
            merged_bs_summary = self.merge_card(bs_summary)

        for card_summary in merged_bs_summary.values():
            classify_summary = unknown_summary.get(card_summary['classify'], {})
            matched_roles = [role for role in classify_summary if role in card_summary['role_set']]
            for role in matched_roles:
                summary = classify_summary.pop(role)
                card_summary['sheet'].extend(summary['sheet'])
                # 'code' was previously extended with summary['sheet'].
                card_summary['code'].extend(summary['code'])
                card_summary['print_time'].extend(summary['print_time'])
                card_summary['start_date'].extend(summary['start_date'])
                card_summary['end_date'].extend(summary['end_date'])
                # NOTE(review): the absorbed entry's 'confidence' values are
                # not merged (unchanged from the original) - confirm intent.

    # Whatever is left has no card number: give each entry a generated key.
    card_num = 1
    for role_dict in unknown_summary.values():
        for summary in role_dict.values():
            card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
            card_num += 1
            merged_bs_summary[card] = summary

    for summary in merged_bs_summary.values():
        if summary.get('role_set') is not None:
            del summary['role_set']
        summary['print_time'] = self.get_validate_date(summary['print_time'])
        summary['start_date'] = self.get_validate_date(summary['start_date'])
        summary['end_date'] = self.get_validate_date(summary['end_date'])
        summary['confidence'] = max(summary['confidence'])
    return merged_bs_summary
129 | 337 | ||
130 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 | 338 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 |
131 | # TODO 调用接口重试 | 339 | # TODO 调用接口重试 |
340 | # TODO 协程异步发送OCR请求 | ||
132 | # TODO 异常邮件通知 | 341 | # TODO 异常邮件通知 |
133 | # TODO 数据库断联问题 | 342 | # TODO 数据库断联问题 |
343 | # TODO 非流水证件处理,Excel模板 | ||
134 | def handle(self, *args, **kwargs): | 344 | def handle(self, *args, **kwargs): |
135 | sleep_second = int(conf.SLEEP_SECOND) | 345 | sleep_second = int(conf.SLEEP_SECOND) |
136 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) | 346 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) |
... | @@ -138,17 +348,19 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -138,17 +348,19 @@ class Command(BaseCommand, LoggerMixin): |
138 | while self.switch: | 348 | while self.switch: |
139 | # 1. 从队列获取文件信息 | 349 | # 1. 从队列获取文件信息 |
140 | doc, business_type = self.get_doc_info() | 350 | doc, business_type = self.get_doc_info() |
351 | # 队列为空时的处理 | ||
352 | if doc is None: | ||
353 | time.sleep(sleep_second) | ||
354 | sleep_second = min(max_sleep_second, sleep_second + 5) | ||
355 | continue | ||
356 | sleep_second = int(conf.SLEEP_SECOND) | ||
357 | |||
141 | try: | 358 | try: |
359 | start_time = time.time() | ||
142 | # 2. 从EDMS获取PDF文件 | 360 | # 2. 从EDMS获取PDF文件 |
143 | doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type) | 361 | doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type) |
144 | # 队列为空时的处理 | 362 | |
145 | if pdf_path is None: | ||
146 | time.sleep(sleep_second) | ||
147 | sleep_second = min(max_sleep_second, sleep_second+5) | ||
148 | continue | ||
149 | sleep_second = int(conf.SLEEP_SECOND) | ||
150 | # 3.PDF文件提取图片 | 363 | # 3.PDF文件提取图片 |
151 | start_time = time.time() | ||
152 | img_save_path = os.path.join(doc_data_path, 'img') | 364 | img_save_path = os.path.join(doc_data_path, 'img') |
153 | self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format( | 365 | self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format( |
154 | self.log_base, business_type, doc.id)) | 366 | self.log_base, business_type, doc.id)) |
... | @@ -158,28 +370,42 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -158,28 +370,42 @@ class Command(BaseCommand, LoggerMixin): |
158 | self.log_base, business_type, doc.id)) | 370 | self.log_base, business_type, doc.id)) |
159 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | 371 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) |
160 | 372 | ||
161 | # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 | 373 | # 4.获取OCR结果并且构建excel文件 |
162 | role_summary = { | 374 | bs_summary = {} |
163 | '银行-户名': [] | 375 | license_summary = {} |
164 | } | 376 | unknown_summary = [] |
165 | # interest_keyword = Keywords.objects.filter( | 377 | interest_keyword = Keywords.objects.filter( |
166 | # type=KeywordsType.INTEREST.value).values_list('keyword', flat=True) | 378 | type=KeywordsType.INTEREST.value).values_list('keyword', flat=True) |
167 | # salary_keyword = Keywords.objects.filter( | 379 | salary_keyword = Keywords.objects.filter( |
168 | # type=KeywordsType.SALARY.value).values_list('keyword', flat=True) | 380 | type=KeywordsType.SALARY.value).values_list('keyword', flat=True) |
169 | # loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True) | 381 | loan_keyword = Keywords.objects.filter( |
170 | # wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) | 382 | type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value]).values_list( |
171 | wb = Workbook() | 383 | 'keyword', flat=True) |
384 | wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) | ||
385 | |||
386 | # wb = Workbook() | ||
387 | |||
388 | # 4.1 获取OCR结果 | ||
172 | # loop = asyncio.get_event_loop() | 389 | # loop = asyncio.get_event_loop() |
173 | # tasks = [self.img_ocr_excel(wb, img_path, role_summary) for img_path in pdf_handler.img_path_list] | 390 | # tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list] |
174 | # loop.run_until_complete(asyncio.wait(tasks)) | 391 | # loop.run_until_complete(asyncio.wait(tasks)) |
175 | # loop.close() | 392 | # loop.close() |
176 | 393 | ||
177 | for img_path in pdf_handler.img_path_list: | 394 | for img_info in pdf_handler.img_info_list: |
178 | self.img_ocr_excel(wb, img_path, role_summary) | 395 | self.img_2_ocr_2_wb(wb, img_info, bs_summary, unknown_summary, license_summary) |
396 | |||
397 | self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format( | ||
398 | self.log_base, bs_summary, unknown_summary, license_summary)) | ||
399 | |||
400 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | ||
179 | 401 | ||
180 | # 整合excel文件 | 402 | self.cronjob_log.info('{0} [merged_bs_summary={1}] [unknown_summary={2}]'.format( |
181 | # wb.save(src_excel_path) | 403 | self.log_base, merged_bs_summary, unknown_summary)) |
182 | # wb.rebuild(role_summary) | 404 | del unknown_summary |
405 | |||
406 | # 4.2 重构Excel文件 | ||
407 | wb.save(src_excel_path) | ||
408 | wb.rebuild(merged_bs_summary, license_summary) | ||
183 | wb.save(excel_path) | 409 | wb.save(excel_path) |
184 | except Exception as e: | 410 | except Exception as e: |
185 | doc.status = DocStatus.PROCESS_FAILED.value | 411 | doc.status = DocStatus.PROCESS_FAILED.value |
... | @@ -194,14 +420,16 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -194,14 +420,16 @@ class Command(BaseCommand, LoggerMixin): |
194 | except Exception as e: | 420 | except Exception as e: |
195 | doc.status = DocStatus.UPLOAD_FAILED.value | 421 | doc.status = DocStatus.UPLOAD_FAILED.value |
196 | doc.save() | 422 | doc.save() |
197 | self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( | 423 | end_time = time.time() |
198 | self.log_base, business_type, doc.id, e)) | 424 | speed_time = int(end_time - start_time) |
425 | self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] ' | ||
426 | '[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e)) | ||
199 | else: | 427 | else: |
200 | doc.status = DocStatus.COMPLETE.value | 428 | doc.status = DocStatus.COMPLETE.value |
201 | doc.save() | 429 | doc.save() |
202 | end_time = time.time() | 430 | end_time = time.time() |
203 | speed_time = int(end_time - start_time) | 431 | speed_time = int(end_time - start_time) |
204 | self.cronjob_log.info('{0} [doc process complete] [business_type={1}] [doc_id={2}] ' | 432 | self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] ' |
205 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) | 433 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) |
206 | 434 | ||
207 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | 435 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | ... | ... |
... | @@ -11,6 +11,8 @@ class DocHandler: | ... | @@ -11,6 +11,8 @@ class DocHandler: |
11 | return '/data/{1}/{0}/{0}.pdf'.format(doc_id, business_type) | 11 | return '/data/{1}/{0}/{0}.pdf'.format(doc_id, business_type) |
12 | elif file == 'img': | 12 | elif file == 'img': |
13 | return '/data/{1}/{0}/{0}_img.zip'.format(doc_id, business_type) | 13 | return '/data/{1}/{0}/{0}_img.zip'.format(doc_id, business_type) |
14 | elif file == 'src_excel': | ||
15 | return '/data/{1}/{0}/src.xlsx'.format(doc_id, business_type) | ||
14 | else: | 16 | else: |
15 | return '/data/{1}/{0}/{0}.xlsx'.format(doc_id, business_type) | 17 | return '/data/{1}/{0}/{0}.xlsx'.format(doc_id, business_type) |
16 | 18 | ||
... | @@ -22,6 +24,7 @@ class DocHandler: | ... | @@ -22,6 +24,7 @@ class DocHandler: |
22 | doc_dict['pdf_link'] = self.get_link(doc_id, business_type) | 24 | doc_dict['pdf_link'] = self.get_link(doc_id, business_type) |
23 | doc_dict['img_link'] = self.get_link(doc_id, business_type, file='img') | 25 | doc_dict['img_link'] = self.get_link(doc_id, business_type, file='img') |
24 | doc_dict['excel_link'] = self.get_link(doc_id, business_type, file='excel') | 26 | doc_dict['excel_link'] = self.get_link(doc_id, business_type, file='excel') |
27 | doc_dict['src_excel_link'] = self.get_link(doc_id, business_type, file='src_excel') | ||
25 | return list(doc_queryset) | 28 | return list(doc_queryset) |
26 | 29 | ||
27 | @staticmethod | 30 | @staticmethod | ... | ... |
... | @@ -13,6 +13,7 @@ class BSWorkbook(Workbook): | ... | @@ -13,6 +13,7 @@ class BSWorkbook(Workbook): |
13 | 13 | ||
14 | def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs): | 14 | def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs): |
15 | super().__init__(*args, **kwargs) | 15 | super().__init__(*args, **kwargs) |
16 | locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8') | ||
16 | self.meta_sheet_title = '关键信息提取和展示' | 17 | self.meta_sheet_title = '关键信息提取和展示' |
17 | self.blank_row = (None,) | 18 | self.blank_row = (None,) |
18 | self.code_header = ('页数', '电子回单验证码') | 19 | self.code_header = ('页数', '电子回单验证码') |
... | @@ -24,26 +25,59 @@ class BSWorkbook(Workbook): | ... | @@ -24,26 +25,59 @@ class BSWorkbook(Workbook): |
24 | self.proof_res = ('对', '错') | 25 | self.proof_res = ('对', '错') |
25 | self.loan_fill = PatternFill("solid", fgColor="00FFCC00") | 26 | self.loan_fill = PatternFill("solid", fgColor="00FFCC00") |
26 | self.amount_fill = PatternFill("solid", fgColor="00FFFF00") | 27 | self.amount_fill = PatternFill("solid", fgColor="00FFFF00") |
27 | self.bd = Side(style='thin', color="000000") | 28 | # self.bd = Side(style='thin', color="000000") |
28 | self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd) | 29 | # self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd) |
29 | self.MAX_MEAN = 31 | 30 | self.MAX_MEAN = 31 |
30 | 31 | ||
31 | @staticmethod | 32 | @staticmethod |
32 | def sheet_prune(ws): | 33 | def sheet_prune(ws, classify): |
33 | ws.insert_cols(1, amount=consts.FIXED_COL_AMOUNT) | 34 | ws.insert_cols(1, amount=consts.FIXED_COL_AMOUNT) |
35 | moved_col_set = set() | ||
36 | header_col_set = set() | ||
37 | # 根据第一行关键词排列 | ||
34 | for col in range(consts.FIXED_COL_AMOUNT + 1, ws.max_column + 1): | 38 | for col in range(consts.FIXED_COL_AMOUNT + 1, ws.max_column + 1): |
35 | header_value = ws.cell(1, col).value | 39 | header_value = ws.cell(1, col).value |
36 | header_idx = consts.HEADERS_MAPPING.get(header_value) | 40 | header_col = consts.HEADERS_MAPPING.get(header_value) |
37 | # TODO 关键字段再次查找 | 41 | if header_col is not None: |
38 | # TODO 支付宝、微信流水第一行非表头,怎么处理 | 42 | letter = get_column_letter(col) |
39 | if header_idx is None: | 43 | ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_col - col) |
44 | moved_col_set.add(col) | ||
45 | header_col_set.add(header_col) | ||
46 | elif header_value in consts.BORROW_HEADERS_SET: | ||
47 | letter = get_column_letter(col) | ||
48 | ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.BORROW_HEADER_COL - col) | ||
49 | moved_col_set.add(col) | ||
50 | header_col_set.add(consts.BORROW_HEADER_COL) | ||
51 | elif header_value in consts.INCOME_HEADERS_SET: | ||
52 | letter = get_column_letter(col) | ||
53 | ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.INCOME_HEADER_COL - col) | ||
54 | moved_col_set.add(col) | ||
55 | header_col_set.add(consts.INCOME_HEADER_COL) | ||
56 | elif header_value in consts.OUTLAY_HEADERS_SET: | ||
57 | letter = get_column_letter(col) | ||
58 | ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=consts.OUTLAY_HEADER_COL - col) | ||
59 | moved_col_set.add(col) | ||
60 | header_col_set.add(consts.OUTLAY_HEADER_COL) | ||
61 | |||
62 | # 缺失表头再次查找 | ||
63 | for header_col in range(1, consts.FIXED_COL_AMOUNT + 1): | ||
64 | if header_col in header_col_set or header_col == consts.RESULT_HEADER_COL: | ||
65 | continue | ||
66 | fix_col = consts.CLASSIFY_LIST[classify][1][header_col - 1] # TODO 合并分类情况 | ||
67 | if fix_col is None: | ||
40 | continue | 68 | continue |
41 | letter = get_column_letter(col) | 69 | fix_col = fix_col + consts.FIXED_COL_AMOUNT |
42 | ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col) | 70 | if fix_col in moved_col_set: |
71 | break | ||
72 | letter = get_column_letter(fix_col) | ||
73 | ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_col - fix_col) | ||
74 | |||
43 | ws.delete_cols(consts.FIXED_COL_AMOUNT + 1, amount=ws.max_column) | 75 | ws.delete_cols(consts.FIXED_COL_AMOUNT + 1, amount=ws.max_column) |
76 | min_row = 1 if len(moved_col_set) == 0 else 2 | ||
77 | return min_row | ||
44 | 78 | ||
45 | @staticmethod | 79 | @staticmethod |
46 | def month_split(dti, date_list): | 80 | def month_split(dti, date_list, date_statistics): |
47 | month_list = [] | 81 | month_list = [] |
48 | idx_list = [] | 82 | idx_list = [] |
49 | month_pre = None | 83 | month_pre = None |
... | @@ -53,15 +87,17 @@ class BSWorkbook(Workbook): | ... | @@ -53,15 +87,17 @@ class BSWorkbook(Workbook): |
53 | if month_str != month_pre: | 87 | if month_str != month_pre: |
54 | month_list.append(month_str) | 88 | month_list.append(month_str) |
55 | if month_pre is None: | 89 | if month_pre is None: |
56 | date_list.append(dti[idx].date()) | 90 | if date_statistics: |
91 | date_list.append(dti[idx].date()) | ||
57 | idx = 0 | 92 | idx = 0 |
58 | idx_list.append(idx) | 93 | idx_list.append(idx) |
59 | month_pre = month_str | 94 | month_pre = month_str |
60 | for idx in range(len(dti)-1, -1, -1): | 95 | if date_statistics: |
61 | if isinstance(dti[idx], NaTType): | 96 | for idx in range(len(dti) - 1, -1, -1): |
62 | continue | 97 | if isinstance(dti[idx], NaTType): |
63 | date_list.append(dti[idx].date()) | 98 | continue |
64 | break | 99 | date_list.append(dti[idx].date()) |
100 | break | ||
65 | return month_list, idx_list | 101 | return month_list, idx_list |
66 | 102 | ||
67 | @staticmethod | 103 | @staticmethod |
... | @@ -86,8 +122,8 @@ class BSWorkbook(Workbook): | ... | @@ -86,8 +122,8 @@ class BSWorkbook(Workbook): |
86 | reverse_trend = -1 | 122 | reverse_trend = -1 |
87 | return reverse_trend | 123 | return reverse_trend |
88 | 124 | ||
89 | def sheet_split(self, ws, month_mapping, date_list, reverse_trend_list): | 125 | def sheet_split(self, ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics): |
90 | for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True): | 126 | for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=min_row, values_only=True): |
91 | date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src] | 127 | date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src] |
92 | dt_array, tz_parsed = tslib.array_to_datetime( | 128 | dt_array, tz_parsed = tslib.array_to_datetime( |
93 | np.array(date_tuple, copy=False, dtype=np.object_), | 129 | np.array(date_tuple, copy=False, dtype=np.object_), |
... | @@ -95,16 +131,16 @@ class BSWorkbook(Workbook): | ... | @@ -95,16 +131,16 @@ class BSWorkbook(Workbook): |
95 | utc=False, | 131 | utc=False, |
96 | dayfirst=False, | 132 | dayfirst=False, |
97 | yearfirst=False, | 133 | yearfirst=False, |
98 | require_iso8601=False, | 134 | require_iso8601=True, |
99 | ) | 135 | ) |
100 | dti = DatetimeIndex(dt_array, tz=None, name=None) | 136 | dti = DatetimeIndex(dt_array, tz=None, name=None) |
101 | 137 | ||
102 | month_list, idx_list = self.month_split(dti, date_list) | 138 | month_list, idx_list = self.month_split(dti, date_list, date_statistics) |
103 | 139 | ||
104 | if len(month_list) == 0: | 140 | if len(month_list) == 0: |
105 | # month_info process | 141 | # month_info process |
106 | month_info = month_mapping.setdefault('xxxx-xx', []) | 142 | month_info = month_mapping.setdefault('xxxx-xx', []) |
107 | month_info.append((ws.title, 2, ws.max_row, 0)) | 143 | month_info.append((ws.title, min_row, ws.max_row, 0)) |
108 | elif len(month_list) == 1: | 144 | elif len(month_list) == 1: |
109 | # reverse_trend_list process | 145 | # reverse_trend_list process |
110 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | 146 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) |
... | @@ -113,14 +149,14 @@ class BSWorkbook(Workbook): | ... | @@ -113,14 +149,14 @@ class BSWorkbook(Workbook): |
113 | month_info = month_mapping.setdefault(month_list[0], []) | 149 | month_info = month_mapping.setdefault(month_list[0], []) |
114 | day_mean = np.mean(dti.day.dropna()) | 150 | day_mean = np.mean(dti.day.dropna()) |
115 | if len(month_info) == 0: | 151 | if len(month_info) == 0: |
116 | month_info.append((ws.title, 2, ws.max_row, day_mean)) | 152 | month_info.append((ws.title, min_row, ws.max_row, day_mean)) |
117 | else: | 153 | else: |
118 | for i, item in enumerate(month_info): | 154 | for i, item in enumerate(month_info): |
119 | if day_mean <= item[-1]: | 155 | if day_mean <= item[-1]: |
120 | month_info.insert(i, (ws.title, 2, ws.max_row, day_mean)) | 156 | month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean)) |
121 | break | 157 | break |
122 | else: | 158 | else: |
123 | month_info.append((ws.title, 2, ws.max_row, day_mean)) | 159 | month_info.append((ws.title, min_row, ws.max_row, day_mean)) |
124 | else: | 160 | else: |
125 | # reverse_trend_list process | 161 | # reverse_trend_list process |
126 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | 162 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) |
... | @@ -128,34 +164,41 @@ class BSWorkbook(Workbook): | ... | @@ -128,34 +164,41 @@ class BSWorkbook(Workbook): |
128 | # month_info process | 164 | # month_info process |
129 | for i, item in enumerate(month_list[:-1]): | 165 | for i, item in enumerate(month_list[:-1]): |
130 | month_mapping.setdefault(item, []).append( | 166 | month_mapping.setdefault(item, []).append( |
131 | (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN)) | 167 | (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN)) |
132 | month_mapping.setdefault(month_list[-1], []).insert( | 168 | month_mapping.setdefault(month_list[-1], []).insert( |
133 | 0, (ws.title, idx_list[-1] + 2, ws.max_row, 0)) | 169 | 0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0)) |
134 | 170 | ||
135 | def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval): | 171 | def build_metadata_rows(self, classify, confidence, role, code, print_time, start_date, end_date): |
136 | metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header] | 172 | metadata_rows = [ |
137 | metadata_rows.extend(code_list) | 173 | ('流水识别置信度', confidence), |
174 | self.blank_row, | ||
175 | ('分类结果', classify), | ||
176 | self.blank_row, | ||
177 | ('户名', role), | ||
178 | self.blank_row, | ||
179 | self.code_header, | ||
180 | ] | ||
181 | metadata_rows.extend(code) | ||
138 | metadata_rows.extend( | 182 | metadata_rows.extend( |
139 | [self.blank_row, | 183 | [self.blank_row, |
140 | self.date_header, | 184 | self.date_header, |
141 | (print_time, start_date, end_date, date_interval), | 185 | (print_time, start_date, end_date, (end_date - start_date).days), |
142 | self.blank_row, | 186 | self.blank_row, |
143 | self.keyword_header] | 187 | self.keyword_header] |
144 | ) | 188 | ) |
145 | return metadata_rows | 189 | return metadata_rows |
146 | 190 | ||
147 | def create_meta_sheet(self, role): | 191 | def create_meta_sheet(self, card): |
148 | if self.worksheets[0].title == 'Sheet': | 192 | if self.worksheets[0].title == 'Sheet': |
149 | ms = self.worksheets[0] | 193 | ms = self.worksheets[0] |
150 | ms.title = '{0}({1})'.format(self.meta_sheet_title, role) | 194 | ms.title = '{0}({1})'.format(self.meta_sheet_title, card) |
151 | else: | 195 | else: |
152 | ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, role)) | 196 | ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, card)) |
153 | return ms | 197 | return ms |
154 | 198 | ||
155 | def build_meta_sheet(self, role, confidence_max, code_list, print_time, start_date, end_date, date_interval): | 199 | def build_meta_sheet(self, card, classify, confidence, role, code, print_time, start_date, end_date): |
156 | metadata_rows = self.build_metadata_rows(confidence_max, code_list, print_time, | 200 | metadata_rows = self.build_metadata_rows(classify, confidence, role, code, print_time, start_date, end_date) |
157 | start_date, end_date, date_interval) | 201 | ms = self.create_meta_sheet(card) |
158 | ms = self.create_meta_sheet(role) | ||
159 | for row in metadata_rows: | 202 | for row in metadata_rows: |
160 | ms.append(row) | 203 | ms.append(row) |
161 | return ms | 204 | return ms |
... | @@ -169,55 +212,84 @@ class BSWorkbook(Workbook): | ... | @@ -169,55 +212,84 @@ class BSWorkbook(Workbook): |
169 | new_ws.append(consts.FIXED_HEADERS) | 212 | new_ws.append(consts.FIXED_HEADERS) |
170 | for part in parts: | 213 | for part in parts: |
171 | ws = self.get_sheet_by_name(part[0]) | 214 | ws = self.get_sheet_by_name(part[0]) |
172 | for row in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True): | 215 | for row_value in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True): |
173 | new_ws.append(row) | 216 | new_ws.append(row_value) |
174 | # 3.2.提取信息、高亮 | 217 | # 3.2.提取信息、高亮 |
175 | amount_mapping = {} | 218 | amount_mapping = {} |
176 | amount_fill_row = set() | 219 | amount_fill_row = set() |
177 | for rows in new_ws.iter_rows(): | 220 | for rows in new_ws.iter_rows(min_row=2): |
178 | summary_cell = rows[5] | 221 | summary_cell = rows[consts.SUMMARY_IDX] |
179 | date_cell = rows[0] | 222 | date_cell = rows[consts.DATE_IDX] |
223 | amount_cell = rows[consts.AMOUNT_IDX] | ||
224 | row = summary_cell.row | ||
180 | # 关键词1提取 | 225 | # 关键词1提取 |
181 | if summary_cell.value in self.interest_keyword: | 226 | if summary_cell.value in self.interest_keyword: |
182 | ms.append((summary_cell.value, date_cell.value, rows[2].value)) | 227 | ms.append((summary_cell.value, date_cell.value, amount_cell.value)) |
183 | # 关键词2提取至临时表 | 228 | # 关键词2提取至临时表 |
184 | elif summary_cell.value in self.salary_keyword: | 229 | elif summary_cell.value in self.salary_keyword: |
185 | tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value)) | 230 | tmp_ws.append((summary_cell.value, date_cell.value, amount_cell.value)) |
186 | # 贷款关键词高亮 | 231 | # 贷款关键词高亮 |
187 | elif summary_cell.value in self.loan_keyword: | 232 | elif summary_cell.value in self.loan_keyword: |
188 | summary_cell.fill = self.loan_fill | 233 | summary_cell.fill = self.loan_fill |
189 | for i, cell in enumerate(rows): | 234 | |
190 | cell.border = self.border | 235 | # 3.3.余额转数值 |
191 | if (i == 2 or i == 3) and cell.row > 1: | 236 | over_cell = rows[consts.OVER_IDX] |
237 | try: | ||
238 | if isinstance(over_cell.value, str): | ||
239 | over_cell.value = over_cell.value.translate(consts.TRANS) | ||
240 | over_cell.value = locale.atof(over_cell.value) | ||
241 | except Exception as e: | ||
242 | continue | ||
243 | else: | ||
244 | over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 | ||
245 | |||
246 | # 3.4.余额转数值 | ||
247 | try: | ||
248 | try: | ||
249 | if isinstance(amount_cell.value, str): | ||
250 | amount_cell.value = amount_cell.value.translate(consts.TRANS) | ||
251 | amount_cell.value = locale.atof(amount_cell.value) | ||
252 | except Exception as e: | ||
192 | try: | 253 | try: |
193 | # 3.3.金额、余额转数值 | 254 | if isinstance(rows[consts.INCOME_IDX].value, str): |
194 | cell.value = locale.atof(cell.value) | 255 | rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS) |
195 | except Exception: | 256 | amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) |
196 | continue | 257 | except Exception as e: |
197 | else: | 258 | if isinstance(rows[consts.OUTLAY_IDX].value, str): |
198 | cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 | 259 | rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS) |
199 | if i == 2: | 260 | amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) |
200 | same_amount_mapping = amount_mapping.get(date_cell.value, {}) | 261 | if amount_cell.value > 0: |
201 | fill_rows = same_amount_mapping.get(-cell.value) | 262 | amount_cell.value = -amount_cell.value |
202 | if fill_rows: | 263 | except Exception as e: |
203 | amount_fill_row.add(cell.row) | 264 | continue |
204 | amount_fill_row.update(fill_rows) | 265 | else: |
205 | amount_mapping.setdefault(date_cell.value, {}).setdefault( | 266 | if rows[consts.BORROW_IDX].value in consts.BORROW_OUTLAY_SET: |
206 | cell.value, []).append(cell.row) | 267 | amount_cell.value = -amount_cell.value |
207 | # 3.4.核对结果 | 268 | amount_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 |
208 | # TODO 借贷、开支类型银行流水,需要手动添加+-号 | 269 | same_amount_mapping = amount_mapping.get(date_cell.value, {}) |
209 | if i == 9 and cell.row > 2: | 270 | fill_rows = same_amount_mapping.get(-amount_cell.value) |
210 | if is_reverse: | 271 | if fill_rows: |
211 | cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( | 272 | amount_fill_row.add(row) |
212 | cell.row - 1, cell.row, *self.proof_res) | 273 | amount_fill_row.update(fill_rows) |
213 | else: | 274 | amount_mapping.setdefault(date_cell.value, {}).setdefault( |
214 | cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( | 275 | amount_cell.value, []).append(row) |
215 | cell.row, cell.row - 1, *self.proof_res) | ||
216 | 276 | ||
217 | # 3.5.同一天相同进出账高亮 | 277 | # 3.5.核对结果 |
278 | if row > 2: | ||
279 | if is_reverse: | ||
280 | rows[consts.RESULT_IDX].value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( | ||
281 | row - 1, row, *self.proof_res) | ||
282 | else: | ||
283 | rows[consts.RESULT_IDX].value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( | ||
284 | row, row - 1, *self.proof_res) | ||
285 | |||
286 | # 删除金额辅助列 | ||
287 | new_ws.delete_cols(consts.BORROW_HEADER_COL, amount=new_ws.max_column) | ||
288 | |||
289 | # 3.6.同一天相同进出账高亮 | ||
218 | del amount_mapping | 290 | del amount_mapping |
219 | for row in amount_fill_row: | 291 | for row in amount_fill_row: |
220 | new_ws[row][2].fill = self.amount_fill | 292 | new_ws[row][consts.AMOUNT_IDX].fill = self.amount_fill |
221 | 293 | ||
222 | # 关键词2信息提取 | 294 | # 关键词2信息提取 |
223 | ms.append(self.blank_row) | 295 | ms.append(self.blank_row) |
... | @@ -226,34 +298,51 @@ class BSWorkbook(Workbook): | ... | @@ -226,34 +298,51 @@ class BSWorkbook(Workbook): |
226 | ms.append(row) | 298 | ms.append(row) |
227 | self.remove(tmp_ws) | 299 | self.remove(tmp_ws) |
228 | 300 | ||
229 | def rebuild(self, role_summary): | 301 | def bs_rebuild(self, bs_summary): |
230 | # (sheet_name, confidence, page, code, print_time, start_date, end_date) # TODO 表名简化,+卡号 | 302 | # bs_summary = { |
231 | for role, summary_list in role_summary.items(): | 303 | # '卡号': { |
304 | # 'classify': 0, | ||
305 | # 'confidence': 0.9, | ||
306 | # 'role': '柳雪', | ||
307 | # 'code': [('page', 'code')], | ||
308 | # 'print_time': 'datetime', | ||
309 | # 'start_date': 'datetime', | ||
310 | # 'end_date': 'datetime', | ||
311 | # 'sheet': ['sheet_name'] | ||
312 | # } | ||
313 | # } | ||
314 | for card, summary in bs_summary.items(): | ||
232 | # 1.原表修剪、排列、按照月份分割 | 315 | # 1.原表修剪、排列、按照月份分割 |
233 | reverse_trend_list = [] | 316 | start_date = summary['start_date'] |
234 | confidence_max = 0 | 317 | end_date = summary['end_date'] |
235 | code_list = [] | 318 | date_statistics = False |
236 | month_mapping = {} | 319 | if start_date is None or end_date is None: |
320 | date_statistics = True | ||
237 | date_list = [] | 321 | date_list = [] |
238 | start_date = end_date = date_interval = print_time = None | 322 | month_mapping = {} |
239 | for summary in summary_list: | 323 | reverse_trend_list = [] |
240 | sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary | 324 | for sheet in summary['sheet']: |
241 | ws = self.get_sheet_by_name(sheet_name) | 325 | ws = self.get_sheet_by_name(sheet) |
242 | # 1.1.删除多余列、排列 | 326 | # 1.1.删除多余列、排列 |
243 | self.sheet_prune(ws) | 327 | min_row = self.sheet_prune(ws, summary['classify']) |
244 | # 1.2.按月份分割 | 328 | # 1.2.按月份分割 |
245 | self.sheet_split(ws, month_mapping, date_list, reverse_trend_list) | 329 | self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics) |
246 | # 1.3.元数据处理 TODO 时间与日期处理 | 330 | |
247 | confidence_max = max(confidence, confidence_max) | 331 | if date_statistics is True and len(date_list) > 1: |
248 | if code is not None: | 332 | start_date = min(date_list) if start_date is None else start_date |
249 | code_list.append((page, code)) | 333 | end_date = max(date_list) if end_date is None else end_date |
250 | 334 | ||
251 | if len(date_list) > 1: | ||
252 | start_date = min(date_list) | ||
253 | end_date = max(date_list) | ||
254 | date_interval = (end_date - start_date).days | ||
255 | # 2.元信息提取表 | 335 | # 2.元信息提取表 |
256 | ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval) | 336 | bank_name = consts.CLASSIFY_LIST[summary['classify']][0] |
337 | base_sheet_name = '{0}_{1}'.format(bank_name, summary['role']) | ||
338 | ms = self.build_meta_sheet(card, | ||
339 | summary['classify'], | ||
340 | summary['confidence'], | ||
341 | summary['role'], | ||
342 | summary['code'], | ||
343 | summary['print_time'], | ||
344 | start_date, | ||
345 | end_date) | ||
257 | 346 | ||
258 | # 3.创建月份表、提取/高亮关键行 | 347 | # 3.创建月份表、提取/高亮关键行 |
259 | is_reverse = False | 348 | is_reverse = False |
... | @@ -261,8 +350,11 @@ class BSWorkbook(Workbook): | ... | @@ -261,8 +350,11 @@ class BSWorkbook(Workbook): |
261 | is_reverse = True | 350 | is_reverse = True |
262 | for month_list in month_mapping.values(): | 351 | for month_list in month_mapping.values(): |
263 | month_list.sort(key=lambda x: x[-1], reverse=True) | 352 | month_list.sort(key=lambda x: x[-1], reverse=True) |
264 | self.build_month_sheet(role, month_mapping, ms, is_reverse) | 353 | self.build_month_sheet(base_sheet_name, month_mapping, ms, is_reverse) |
354 | |||
355 | # 4.删除原表 | ||
356 | for sheet in summary['sheet']: | ||
357 | self.remove(self.get_sheet_by_name(sheet)) | ||
265 | 358 | ||
266 | # 删除原表 | 359 | def rebuild(self, bs_summary, license_summary): |
267 | for summary in summary_list: | 360 | self.bs_rebuild(bs_summary) |
268 | self.remove(self.get_sheet_by_name(summary[0])) | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
... | @@ -25,7 +25,7 @@ class PDFHandler: | ... | @@ -25,7 +25,7 @@ class PDFHandler: |
25 | def __init__(self, path, img_dir_path): | 25 | def __init__(self, path, img_dir_path): |
26 | self.path = path | 26 | self.path = path |
27 | self.img_dir_path = img_dir_path | 27 | self.img_dir_path = img_dir_path |
28 | self.img_path_list = [] | 28 | self.img_info_list = [] |
29 | self.xref_set = set() | 29 | self.xref_set = set() |
30 | 30 | ||
31 | def get_img_save_path(self, pno, img_index=0, ext='png'): | 31 | def get_img_save_path(self, pno, img_index=0, ext='png'): |
... | @@ -38,7 +38,7 @@ class PDFHandler: | ... | @@ -38,7 +38,7 @@ class PDFHandler: |
38 | pm = page.getPixmap(matrix=trans_2, alpha=False) | 38 | pm = page.getPixmap(matrix=trans_2, alpha=False) |
39 | img_save_path = self.get_img_save_path(page.number) | 39 | img_save_path = self.get_img_save_path(page.number) |
40 | pm.writePNG(img_save_path) | 40 | pm.writePNG(img_save_path) |
41 | self.img_path_list.append(img_save_path) | 41 | self.img_info_list.append((img_save_path, page.number, 0)) |
42 | 42 | ||
43 | @staticmethod | 43 | @staticmethod |
44 | def getimage(pix): | 44 | def getimage(pix): |
... | @@ -88,7 +88,7 @@ class PDFHandler: | ... | @@ -88,7 +88,7 @@ class PDFHandler: |
88 | with open(img_save_path, "wb") as f: | 88 | with open(img_save_path, "wb") as f: |
89 | f.write(img_data) | 89 | f.write(img_data) |
90 | self.xref_set.add(xref) | 90 | self.xref_set.add(xref) |
91 | self.img_path_list.append(img_save_path) | 91 | self.img_info_list.append((img_save_path, pno, img_index)) |
92 | 92 | ||
93 | @staticmethod | 93 | @staticmethod |
94 | def split_il(il): | 94 | def split_il(il): |
... | @@ -179,7 +179,7 @@ class PDFHandler: | ... | @@ -179,7 +179,7 @@ class PDFHandler: |
179 | img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) | 179 | img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) |
180 | new_img.save(img_save_path) | 180 | new_img.save(img_save_path) |
181 | page_to_png = False | 181 | page_to_png = False |
182 | self.img_path_list.append(img_save_path) | 182 | self.img_info_list.append((img_save_path, pno, img_index)) |
183 | 183 | ||
184 | # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 | 184 | # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 |
185 | if page_to_png: | 185 | if page_to_png: | ... | ... |
-
Please register or sign in to post a comment