merge license
Showing
6 changed files
with
583 additions
and
222 deletions
... | @@ -35,9 +35,35 @@ DEALER_CODE_META_FIELD_id = 13 | ... | @@ -35,9 +35,35 @@ DEALER_CODE_META_FIELD_id = 13 |
35 | BUSINESS_TYPE_META_FIELD_id = 93 | 35 | BUSINESS_TYPE_META_FIELD_id = 93 |
36 | DEALER_CODE = 'ocr_situ_group' | 36 | DEALER_CODE = 'ocr_situ_group' |
37 | 37 | ||
38 | RETRY_TIMES = 3 | ||
39 | |||
38 | # ---------银行流水模板相关-------------------------------------------------------------------------------------------- | 40 | # ---------银行流水模板相关-------------------------------------------------------------------------------------------- |
39 | 41 | ||
40 | TRANS = str.maketrans('Cc((oODlLmAsSbg', '000000011345569') | 42 | TRANS_MAP = { |
43 | 'C': "0", | ||
44 | 'c': "0", | ||
45 | '(': "0", | ||
46 | 'o': "0", | ||
47 | 'O': "0", | ||
48 | 'D': "0", | ||
49 | |||
50 | '[': "1", | ||
51 | ']': "1", | ||
52 | 'l': "1", | ||
53 | 'L': "1", | ||
54 | |||
55 | 'A': "4", | ||
56 | 's': "5", | ||
57 | 'S': "5", | ||
58 | 'b': "6", | ||
59 | 'g': "9", | ||
60 | 'E': "9", | ||
61 | 'B': "13", | ||
62 | } | ||
63 | TRANS = str.maketrans(TRANS_MAP) | ||
64 | ERROR_CHARS = {'.', ':', ':', '•', '·'} | ||
65 | SKIP_IMG_SHEET_NAME = '未处理图片' | ||
66 | SKIP_IMG_SHEET_HEADER = ('页码', '序号') | ||
41 | 67 | ||
42 | CARD_RATIO = 0.9 | 68 | CARD_RATIO = 0.9 |
43 | UNKNOWN_CARD = '未知卡号' | 69 | UNKNOWN_CARD = '未知卡号' |
... | @@ -95,7 +121,7 @@ HEADERS_MAPPING.update( | ... | @@ -95,7 +121,7 @@ HEADERS_MAPPING.update( |
95 | HEADERS_MAPPING.update( | 121 | HEADERS_MAPPING.update( |
96 | { | 122 | { |
97 | '交易日期': BASE_HEADERS_MAPPING['记账日期'], | 123 | '交易日期': BASE_HEADERS_MAPPING['记账日期'], |
98 | '存入': BASE_HEADERS_MAPPING['金额'], | 124 | # '存入': BASE_HEADERS_MAPPING['金额'], |
99 | '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], | 125 | '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], |
100 | '对方名称': BASE_HEADERS_MAPPING['对方账户名'], | 126 | '对方名称': BASE_HEADERS_MAPPING['对方账户名'], |
101 | '摘要': BASE_HEADERS_MAPPING['附言'], | 127 | '摘要': BASE_HEADERS_MAPPING['附言'], |
... | @@ -160,6 +186,12 @@ HEADERS_MAPPING.update( | ... | @@ -160,6 +186,12 @@ HEADERS_MAPPING.update( |
160 | '交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | 186 | '交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'], |
161 | } | 187 | } |
162 | ) | 188 | ) |
189 | # 农业银行-窄页 | ||
190 | HEADERS_MAPPING.update( | ||
191 | { | ||
192 | '交易对手账号': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
193 | } | ||
194 | ) | ||
163 | # 竖版-特殊-农商行 | 195 | # 竖版-特殊-农商行 |
164 | HEADERS_MAPPING.update( | 196 | HEADERS_MAPPING.update( |
165 | { | 197 | { |
... | @@ -299,17 +331,27 @@ HEADERS_MAPPING.update( | ... | @@ -299,17 +331,27 @@ HEADERS_MAPPING.update( |
299 | # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则) | 331 | # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则) |
300 | # | 332 | # |
301 | # # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言 | 333 | # # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言 |
302 | # ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细 | 334 | # ('农业银行-10', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)), # 横版-表格-农业银行-中国农业银行个人账户明细 |
335 | # | ||
336 | # # 农业银行:序号 日期 摘要 交易金额 余额 对方账号 对方名称 交易地点 渠道 附言 | ||
337 | # ('农业银行-10-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)), | ||
338 | # | ||
339 | # # 农业银行:交易日期 摘要 交易金额 余额 交易渠道 交易网点 对方账号 对方名称 附言 | ||
340 | # ('农业银行-9', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)), | ||
303 | # | 341 | # |
304 | # # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道 | 342 | # # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道 |
305 | # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行 | 343 | # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行 |
306 | # | 344 | # |
307 | # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 | 345 | # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 渠道 |
308 | # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | 346 | # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), |
309 | # | 347 | # |
348 | # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 对方户名 对方账号 渠道 | ||
349 | # ('工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)), | ||
350 | # | ||
310 | # # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行 | 351 | # # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行 |
311 | # # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行 | 352 | # # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行 |
312 | # ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | 353 | # ('建设银行-竖版', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)), |
354 | # ('建设银行-横版', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)), | ||
313 | # | 355 | # |
314 | # # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号 | 356 | # # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号 |
315 | # ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), | 357 | # ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), |
... | @@ -320,7 +362,13 @@ HEADERS_MAPPING.update( | ... | @@ -320,7 +362,13 @@ HEADERS_MAPPING.update( |
320 | # # -----------------普通打印:部分格线-------------------------------- | 362 | # # -----------------普通打印:部分格线-------------------------------- |
321 | # | 363 | # |
322 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 | 364 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 |
323 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 365 | # ('农业银行-5', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), |
366 | # | ||
367 | # # 农业银行:日期 地点 摘要 存入 支出 余额 对方账号 对方户名 | ||
368 | # ('农业银行-8', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)), | ||
369 | |||
370 | # # 农业银行:日期 摘要 交易金额 余额 地点 交易对手账号 对方户名 | ||
371 | # ('农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)), | ||
324 | # | 372 | # |
325 | # # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注 | 373 | # # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注 |
326 | # ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)), | 374 | # ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)), |
... | @@ -330,6 +378,9 @@ HEADERS_MAPPING.update( | ... | @@ -330,6 +378,9 @@ HEADERS_MAPPING.update( |
330 | # | 378 | # |
331 | # # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额 | 379 | # # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额 |
332 | # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | 380 | # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), |
381 | |||
382 | # # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 | ||
383 | # ('建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)), | ||
333 | # | 384 | # |
334 | # # -----------------普通打印:无格线------------------------------------- | 385 | # # -----------------普通打印:无格线------------------------------------- |
335 | # | 386 | # |
... | @@ -338,7 +389,8 @@ HEADERS_MAPPING.update( | ... | @@ -338,7 +389,8 @@ HEADERS_MAPPING.update( |
338 | # | 389 | # |
339 | # # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 | 390 | # # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 |
340 | # # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单 | 391 | # # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单 |
341 | # ('邮储银行', (1, None, None, None, None, 2, None, None, None, None, None, None, None)), | 392 | # ('邮储银行-8', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)), |
393 | # ('邮储银行-5', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)), | ||
342 | # | 394 | # |
343 | # # 工商银行电子版:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 | 395 | # # 工商银行电子版:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 |
344 | # ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | 396 | # ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), |
... | @@ -351,13 +403,15 @@ HEADERS_MAPPING.update( | ... | @@ -351,13 +403,15 @@ HEADERS_MAPPING.update( |
351 | # ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), | 403 | # ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), |
352 | # | 404 | # |
353 | # # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名 | 405 | # # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名 |
354 | # ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), | 406 | # ('农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), |
355 | # | 407 | # |
356 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 | 408 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 |
357 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 409 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), |
358 | # | 410 | # |
359 | # # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言 | 411 | # # 农业银行:日期、时间、短摘要、交易金额、本次余额、交易网点、渠道、附言 |
360 | # ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)), | 412 | # # 农业银行:日期、时间、日志号、短摘要、交易金额、本次余额、交易网点、渠道、附言 |
413 | # ('农业银行', (1, 2, 4, 5, None, 3, None, None, None, None, None, None, None)), | ||
414 | # ('农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
361 | # | 415 | # |
362 | # # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额 | 416 | # # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额 |
363 | # ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), | 417 | # ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), |
... | @@ -374,11 +428,10 @@ HEADERS_MAPPING.update( | ... | @@ -374,11 +428,10 @@ HEADERS_MAPPING.update( |
374 | # | 428 | # |
375 | # # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称 | 429 | # # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称 |
376 | # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | 430 | # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), |
377 | # | ||
378 | # # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 | ||
379 | # ('建设银行', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)), | ||
380 | # ] | 431 | # ] |
381 | 432 | ||
433 | OTHER_TUPLE = (None, None, None, None, None, None, None, None, None, None, None, None, None) | ||
434 | |||
382 | # { | 435 | # { |
383 | # "0":"其他", | 436 | # "0":"其他", |
384 | # "1":"普通打印-全表格-中国农业银行", | 437 | # "1":"普通打印-全表格-中国农业银行", |
... | @@ -408,67 +461,163 @@ HEADERS_MAPPING.update( | ... | @@ -408,67 +461,163 @@ HEADERS_MAPPING.update( |
408 | # "22":"针式打印-部分格线-邮储银行一本通绿卡" | 461 | # "22":"针式打印-部分格线-邮储银行一本通绿卡" |
409 | # } | 462 | # } |
410 | 463 | ||
464 | # CLASSIFY_LIST = [ | ||
465 | # ('其他', OTHER_TUPLE), | ||
466 | # ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), | ||
467 | # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), | ||
468 | # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), | ||
469 | # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | ||
470 | # ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | ||
471 | # ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), | ||
472 | # ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)), | ||
473 | # | ||
474 | # ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), | ||
475 | # ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), | ||
476 | # ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)), | ||
477 | # ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)), | ||
478 | # ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)), | ||
479 | # ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), | ||
480 | # | ||
481 | # ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | ||
482 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
483 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
484 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
485 | # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
486 | # | ||
487 | # ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)), | ||
488 | # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | ||
489 | # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | ||
490 | # ] | ||
491 | |||
492 | # "4":"普通打印-全表格-中国银行", | ||
493 | # "5":"普通打印-全表格-农业银行-10列", | ||
494 | # "6":"普通打印-全表格-农业银行-10列-1", | ||
495 | # "7":"普通打印-全表格-农业银行-9列", | ||
496 | # "8":"普通打印-全表格-北京银行", | ||
497 | # "9":"普通打印-全表格-工商银行", | ||
498 | # "10":"普通打印-全表格-工商银行-电子账单", | ||
499 | # "11":"普通打印-全表格-建设银行", | ||
500 | # "12":"普通打印-全表格-微信账单", | ||
501 | # "13":"普通打印-全表格-支付宝账单", | ||
502 | |||
503 | # "14":"普通打印-无格线-交通银行", | ||
504 | # "15":"普通打印-无格线-储蓄银行-5列", | ||
505 | # "16":"普通打印-无格线-储蓄银行-8列", | ||
506 | # "17":"普通打印-无格线-农业银行-扩张缩进", | ||
507 | # "18":"普通打印-无格线-农业银行-整数", | ||
508 | # "19":"普通打印-无格线-招商银行", | ||
509 | # "20":"普通打印-无格线-招商银行-电子账单", | ||
510 | # "21":"普通打印-无格线-民生银行", | ||
511 | |||
512 | # "22":"普通打印-部分格线-横版-中信银行", | ||
513 | # "23":"普通打印-部分格线-竖版-农业银行-5列", | ||
514 | # "24":"普通打印-部分格线-竖版-农业银行-8列", | ||
515 | # "25":"普通打印-部分格线-竖版-农业银行-窄页", | ||
516 | # "26":"普通打印-部分格线-竖版-平安电子账单", | ||
517 | # "27":"普通打印-部分格线-竖版-建设银行-电子账单", | ||
518 | |||
519 | # "34":"针式打印-全格线-建设银行", | ||
520 | # "35":"针式打印-部分格线-竖版-邮储银行", | ||
521 | # "36":"针式打印-部分格线-竖版-邮储银行-绿卡", | ||
522 | |||
411 | CLASSIFY_LIST = [ | 523 | CLASSIFY_LIST = [ |
412 | ('其他', (None, None, None, None, None, None, None, None, None, None, None, None, None)), | 524 | ('其他', OTHER_TUPLE), |
413 | ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), | 525 | ('其他', OTHER_TUPLE), |
414 | ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), | 526 | ('其他', OTHER_TUPLE), |
415 | ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), | 527 | ('其他', OTHER_TUPLE), |
416 | ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | 528 | ('普通打印-全表格-中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), |
417 | ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | 529 | ('普通打印-全表格-农业银行-10列', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)), |
418 | ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), | 530 | ('普通打印-全表格-农业银行-10列-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)), |
419 | ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)), | 531 | ('普通打印-全表格-农业银行-9列', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)), |
420 | 532 | ('普通打印-全表格-北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), | |
421 | ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), | 533 | ('普通打印-全表格-工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), |
422 | ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), | 534 | ('普通打印-全表格-工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)), |
423 | ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)), | 535 | ('普通打印-全表格-建设银行', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)), |
424 | ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)), | 536 | ('普通打印-全表格-微信账单', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), |
425 | ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)), | 537 | ('普通打印-全表格-支付宝账单', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)), |
426 | ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), | 538 | |
427 | 539 | ('普通打印-无格线-交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), | |
428 | ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | 540 | ('普通打印-无格线-储蓄银行-5列', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)), |
429 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 541 | ('普通打印-无格线-储蓄银行-8列', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)), |
430 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 542 | ('普通打印-无格线-农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)), |
431 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 543 | ('普通打印-无格线-农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), |
432 | ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | 544 | ('普通打印-无格线-招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)), |
433 | 545 | ('普通打印-无格线-招商银行-电子账单', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)), | |
434 | ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)), | 546 | ('普通打印-无格线-民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), |
435 | ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | 547 | |
436 | ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | 548 | ('普通打印-部分格线-横版-中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), |
549 | ('普通打印-部分格线-竖版-农业银行-5列', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
550 | ('普通打印-部分格线-竖版-农业银行-8列', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)), | ||
551 | ('普通打印-部分格线-竖版-农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)), | ||
552 | ('普通打印-部分格线-竖版-平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
553 | ('普通打印-部分格线-竖版-建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)), | ||
554 | ('其他', OTHER_TUPLE), | ||
555 | ('其他', OTHER_TUPLE), | ||
556 | ('其他', OTHER_TUPLE), | ||
557 | ('其他', OTHER_TUPLE), | ||
558 | ('其他', OTHER_TUPLE), | ||
559 | ('其他', OTHER_TUPLE), | ||
560 | ('针式打印-全格线-建设银行', OTHER_TUPLE), | ||
561 | ('针式打印-部分格线-竖版-邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | ||
562 | ('针式打印-部分格线-竖版-邮储银行-绿卡', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | ||
563 | ('其他', OTHER_TUPLE), | ||
437 | ] | 564 | ] |
438 | 565 | ||
439 | # ----------license相关------------------------------------------------------------------------------------------------ | 566 | # ----------license相关------------------------------------------------------------------------------------------------ |
567 | |||
568 | # "0":"AVT Invioce", | ||
569 | # "1":"二手车发票", | ||
570 | # "2":"其他", | ||
571 | # "3":"护照", | ||
572 | # "28":"机动车登记证", | ||
573 | # "29":"机动车销售统一发票", | ||
574 | # "30":"港澳通行证", | ||
575 | # "31":"营业执照", | ||
576 | # "32":"行驶证", | ||
577 | # "33":"身份证", | ||
578 | # "37":"银行卡" | ||
579 | |||
580 | # 其他 | ||
581 | OTHER_CLASSIFY = 2 | ||
582 | |||
583 | # 身份证 | ||
584 | IC_CN_NAME = '身份证' | ||
585 | IC_CLASSIFY = 33 | ||
586 | # 增值税发票 | ||
587 | VAT_CN_NAME = '增值税发票' | ||
588 | VAT_CLASSIFY = 0 | ||
589 | # 机动车登记证书 | ||
590 | MVC_CN_NAME = '机动车登记证书' | ||
591 | MVC_CLASSIFY = 28 | ||
592 | # 机动车销售统一发票 | ||
593 | MVI_CN_NAME = '机动车销售统一发票' | ||
594 | MVI_CLASSIFY = 29 | ||
595 | IC_PID = VAT_PID = MVC_PID = MVI_PID = None | ||
596 | |||
440 | # 营业执照 | 597 | # 营业执照 |
441 | BL_KEY = 'bl' | 598 | BL_CN_NAME = '营业执照' |
599 | BL_CLASSIFY = 31 | ||
600 | BL_PID = 41 | ||
442 | # 二手车发票 | 601 | # 二手车发票 |
443 | UCI_KEY = 'uci' | 602 | UCI_CN_NAME = '二手车发票' |
603 | UCI_CLASSIFY = 1 | ||
604 | UCI_PID = 60 | ||
444 | # 港澳台通行证 | 605 | # 港澳台通行证 |
445 | EEP_KEY = 'eep' | 606 | EEP_CN_NAME = '港澳台通行证' |
607 | EEP_CLASSIFY = 30 | ||
608 | EEP_PID = 1018 | ||
446 | # 行驶证 | 609 | # 行驶证 |
447 | DL_KEY = 'dl' | 610 | DL_CN_NAME = '行驶证' |
611 | DL_CLASSIFY = 32 | ||
612 | DL_PID = 5 | ||
448 | # 护照 | 613 | # 护照 |
449 | PP_KEY = 'pp' | 614 | PP_CN_NAME = '护照' |
615 | PP_CLASSIFY = 3 | ||
616 | PP_PID = 8 | ||
450 | # 银行卡 | 617 | # 银行卡 |
451 | BC_KEY = 'bc' | 618 | BC_CN_NAME = '银行卡' |
452 | # 身份证 | 619 | BC_CLASSIFY = 37 |
453 | IC_KEY = 'ic' | 620 | BC_PID = 4 |
454 | # 机动车登记证书 | ||
455 | MVC_KEY = 'mvc' | ||
456 | # 机动车销售统一发票 | ||
457 | MVI_KEY = 'mvi' | ||
458 | # 增值税发票 | ||
459 | VAT_KEY = 'vat' | ||
460 | |||
461 | LICENSE_ORDER = ((MVI_KEY, '机动车销售统一发票'), | ||
462 | (IC_KEY, '身份证'), | ||
463 | (BC_KEY, '银行卡'), | ||
464 | (BL_KEY, '营业执照'), | ||
465 | (UCI_KEY, '二手车发票'), | ||
466 | (EEP_KEY, '港澳台通行证'), | ||
467 | (DL_KEY, '行驶证'), | ||
468 | (PP_KEY, '护照'), | ||
469 | (MVC_KEY, '机动车登记证书'), | ||
470 | (VAT_KEY, '增值税发票')) | ||
471 | |||
472 | BC_FIELD = (('CardNum', '银行卡号'), | 621 | BC_FIELD = (('CardNum', '银行卡号'), |
473 | ('BankName', '发卡行名称'), | 622 | ('BankName', '发卡行名称'), |
474 | ('CardName', '银行卡名称'), | 623 | ('CardName', '银行卡名称'), |
... | @@ -478,3 +627,19 @@ BC_FIELD = (('CardNum', '银行卡号'), | ... | @@ -478,3 +627,19 @@ BC_FIELD = (('CardNum', '银行卡号'), |
478 | 627 | ||
479 | SUCCESS_CODE_SET = {'0', 0} | 628 | SUCCESS_CODE_SET = {'0', 0} |
480 | 629 | ||
630 | LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME)), | ||
631 | (IC_CLASSIFY, (IC_PID, IC_CN_NAME)), | ||
632 | (BC_CLASSIFY, (BC_PID, BC_CN_NAME)), | ||
633 | (BL_CLASSIFY, (BL_PID, BL_CN_NAME)), | ||
634 | (UCI_CLASSIFY, (UCI_PID, UCI_CN_NAME)), | ||
635 | (EEP_CLASSIFY, (EEP_PID, EEP_CN_NAME)), | ||
636 | (DL_CLASSIFY, (DL_PID, DL_CN_NAME)), | ||
637 | (PP_CLASSIFY, (PP_PID, PP_CN_NAME)), | ||
638 | (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME)), | ||
639 | (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME))) | ||
640 | |||
641 | LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER) | ||
642 | |||
643 | OTHER_CLASSIFY_SET = {OTHER_CLASSIFY} | ||
644 | LICENSE_CLASSIFY_SET_1 = {IC_CLASSIFY, VAT_CLASSIFY, MVC_CLASSIFY, MVI_CLASSIFY} | ||
645 | LICENSE_CLASSIFY_SET_2 = {BL_CLASSIFY, UCI_CLASSIFY, EEP_CLASSIFY, DL_CLASSIFY, PP_CLASSIFY, BC_CLASSIFY} | ... | ... |
src/apps/doc/exceptions.py
0 → 100644
... | @@ -4,6 +4,7 @@ import signal | ... | @@ -4,6 +4,7 @@ import signal |
4 | import asyncio | 4 | import asyncio |
5 | import aiohttp | 5 | import aiohttp |
6 | import difflib | 6 | import difflib |
7 | import base64 | ||
7 | import requests | 8 | import requests |
8 | from datetime import datetime, date | 9 | from datetime import datetime, date |
9 | from collections import Counter | 10 | from collections import Counter |
... | @@ -18,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords | ... | @@ -18,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords |
18 | from apps.doc.named_enum import KeywordsType | 19 | from apps.doc.named_enum import KeywordsType |
19 | from apps.doc import consts | 20 | from apps.doc import consts |
20 | from apps.doc.ocr.edms import EDMS, rh | 21 | from apps.doc.ocr.edms import EDMS, rh |
22 | from apps.doc.exceptions import EDMSException | ||
21 | 23 | ||
22 | 24 | ||
23 | class Command(BaseCommand, LoggerMixin): | 25 | class Command(BaseCommand, LoggerMixin): |
... | @@ -30,7 +32,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -30,7 +32,8 @@ class Command(BaseCommand, LoggerMixin): |
30 | # 数据目录 | 32 | # 数据目录 |
31 | self.data_dir = conf.DATA_DIR | 33 | self.data_dir = conf.DATA_DIR |
32 | # ocr相关 | 34 | # ocr相关 |
33 | self.ocr_url = conf.OCR_URL | 35 | self.ocr_url_1 = conf.OCR_URL_1 |
36 | self.ocr_url_2 = conf.OCR_URL_2 | ||
34 | # EDMS web_service_api | 37 | # EDMS web_service_api |
35 | self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD) | 38 | self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD) |
36 | # 优雅退出信号:15 | 39 | # 优雅退出信号:15 |
... | @@ -70,18 +73,44 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -70,18 +73,44 @@ class Command(BaseCommand, LoggerMixin): |
70 | os.makedirs(doc_data_path, exist_ok=True) | 73 | os.makedirs(doc_data_path, exist_ok=True) |
71 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 74 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
72 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 75 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
73 | self.edms.download(pdf_path, doc.metadata_version_id) | 76 | for times in range(consts.RETRY_TIMES): |
77 | try: | ||
78 | self.edms.download(pdf_path, doc.metadata_version_id) | ||
79 | except Exception as e: | ||
80 | self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
81 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
82 | edms_exc = str(e) | ||
83 | else: | ||
84 | break | ||
85 | else: | ||
86 | raise EDMSException(edms_exc) | ||
74 | 87 | ||
75 | excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) | 88 | excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) |
76 | src_excel_path = os.path.join(doc_data_path, 'src.xlsx') | 89 | src_excel_path = os.path.join(doc_data_path, 'src.xlsx') |
77 | self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( | 90 | self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( |
78 | self.log_base, business_type, doc.id, pdf_path)) | 91 | self.log_base, business_type, doc.id, pdf_path)) |
79 | return doc_data_path, excel_path, src_excel_path, pdf_path | 92 | return doc_data_path, excel_path, src_excel_path, pdf_path |
80 | 93 | ||
81 | @staticmethod | 94 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img): |
82 | def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence): | 95 | sheets = ocr_data.get('data', []) |
96 | if not sheets: | ||
97 | skip_img.append(self.parse_img_path(img_path)) | ||
98 | return | ||
99 | confidence = ocr_data.get('confidence', 1) | ||
100 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | ||
83 | for i, sheet in enumerate(sheets): | 101 | for i, sheet in enumerate(sheets): |
84 | sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, i) | 102 | cells = sheet.get('cells') |
103 | if not cells: | ||
104 | skip_img.append(self.parse_img_path(img_path)) | ||
105 | continue | ||
106 | sheet_name = '{0}_{1}'.format(img_name, i) | ||
107 | ws = wb.create_sheet(sheet_name) | ||
108 | for cell in cells: | ||
109 | c1 = cell.get('start_column') | ||
110 | r1 = cell.get('start_row') | ||
111 | words = cell.get('words') | ||
112 | ws.cell(row=r1 + 1, column=c1 + 1, value=words) | ||
113 | |||
85 | # ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'] | 114 | # ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'] |
86 | summary = sheet.get('summary') | 115 | summary = sheet.get('summary') |
87 | card = summary[1] | 116 | card = summary[1] |
... | @@ -129,74 +158,199 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -129,74 +158,199 @@ class Command(BaseCommand, LoggerMixin): |
129 | if summary[6] is not None: | 158 | if summary[6] is not None: |
130 | ed_list.append(summary[6]) | 159 | ed_list.append(summary[6]) |
131 | 160 | ||
132 | ws = wb.create_sheet(sheet_name) | 161 | def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path): |
133 | cells = sheet.get('cells') | 162 | license_data = ocr_data.get('data', []) |
134 | for cell in cells: | 163 | if not license_data: |
135 | c1 = cell.get('start_column') | 164 | skip_img.append(self.parse_img_path(img_path)) |
136 | r1 = cell.get('start_row') | ||
137 | words = cell.get('words') | ||
138 | ws.cell(row=r1+1, column=c1+1, value=words) | ||
139 | |||
140 | def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary): | ||
141 | # res = { | ||
142 | # 'code': 1, | ||
143 | # 'msg': 'success', | ||
144 | # 'data': { | ||
145 | # 'classify': 0, | ||
146 | # 'confidence': 0.999, | ||
147 | # 'sheets': [ | ||
148 | # { | ||
149 | # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
150 | # 'cells': [] | ||
151 | # }, | ||
152 | # { | ||
153 | # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
154 | # 'cells': [] | ||
155 | # } | ||
156 | # ] | ||
157 | # } | ||
158 | # } | ||
159 | data = res.get('data', {}) | ||
160 | classify = data.get('classify') | ||
161 | if classify is None: | ||
162 | return | 165 | return |
163 | # if classify in | 166 | for license_dict in license_data: |
164 | sheets = data.get('sheets', []) | 167 | res_list = [] |
165 | if not sheets: | 168 | for field, value in license_dict.items(): |
166 | return | 169 | res_list.append((field, value)) |
167 | confidence = data.get('confidence', 1) | 170 | license_summary.setdefault(classify, []).append(res_list) |
168 | self.append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence) | 171 | |
169 | # else: | 172 | def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path): |
170 | # pass | 173 | if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: |
171 | 174 | if pid == consts.BC_PID: | |
172 | # async def fetch_ocr_result(self, img_path): | 175 | # 银行卡 |
173 | # async with aiohttp.ClientSession( | 176 | res_list = [] |
174 | # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False) | 177 | for en_key, chn_key in consts.BC_FIELD: |
175 | # ) as session: | 178 | res_list.append((chn_key, ocr_res_2.get(en_key, ''))) |
176 | # json_data = self.get_ocr_json(img_path) | 179 | license_summary.setdefault(classify, []).append(res_list) |
177 | # async with session.post(self.ocr_url, json=json_data) as response: | 180 | else: |
178 | # return await response.json() | 181 | # 营业执照、行驶证等 |
182 | for result_dict in ocr_res_2.get('ResultList', []): | ||
183 | res_list = [] | ||
184 | for field_dict in result_dict.get('FieldList', []): | ||
185 | res_list.append( | ||
186 | (field_dict.get('chn_key', ''), field_dict.get('value', ''))) | ||
187 | license_summary.setdefault(classify, []).append(res_list) | ||
188 | else: | ||
189 | skip_img.append(self.parse_img_path(img_path)) | ||
190 | |||
191 | @staticmethod | ||
192 | async def fetch_ocr_1_result(url, json_data): | ||
193 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: | ||
194 | async with session.post(url, json=json_data) as response: | ||
195 | if response.status == 200: | ||
196 | return await response.json() | ||
197 | |||
198 | @staticmethod | ||
199 | async def fetch_ocr_2_result(url, json_data): | ||
200 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: | ||
201 | async with session.post(url, data=json_data) as response: | ||
202 | if response.status == 200: | ||
203 | return await response.json() | ||
204 | |||
async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
    """Send one image through the OCR services and dispatch on its class.

    The image is base64-encoded and posted to the first OCR service. Its
    ``classify`` value then selects the handling:

    * missing, or in ``consts.OTHER_CLASSIFY_SET``: the image is recorded in
      ``skip_img``;
    * in ``consts.LICENSE_CLASSIFY_SET_1``: ``license1_process`` is called on
      the first-stage data;
    * in ``consts.LICENSE_CLASSIFY_SET_2``: the image is posted to the second
      OCR service and the reply is passed to ``license2_process``;
    * anything else: ``bs_process`` is called (bank-statement handling).

    A first-stage reply whose ``code`` is not 1 also records the image in
    ``skip_img``.

    :raises Exception: when either OCR request yields no result (``None``).
    """
    # Base64-encode the raw image bytes and decode to text for the payloads.
    with open(img_path, 'rb') as img_file:
        encoded = base64.b64encode(img_file.read())
        file_data = encoded.decode()

    ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, {"file": file_data})
    if ocr_res_1 is None:
        raise Exception('ocr 1 error, img_path={0}'.format(img_path))
    self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
        self.log_base, img_path, ocr_res_1))

    # Anything other than code == 1 is treated as an unprocessed image.
    if ocr_res_1.get('code') != 1:
        skip_img.append(self.parse_img_path(img_path))
        return

    ocr_data = ocr_res_1.get('data', {})
    classify = ocr_data.get('classify')

    # No class at all, or the "other" category: nothing to extract.
    if classify is None or classify in consts.OTHER_CLASSIFY_SET:
        skip_img.append(self.parse_img_path(img_path))
        return

    # License type 1: the first-stage data is processed directly.
    if classify in consts.LICENSE_CLASSIFY_SET_1:
        self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
        return

    # Neither "other" nor a license class: bank-statement processing.
    if classify not in consts.LICENSE_CLASSIFY_SET_2:
        self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
        return

    # License type 2: a second OCR request keyed by the mapped pid is needed.
    pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
    json_data_2 = {
        "pid": str(pid),
        "key": conf.OCR_KEY,
        "secret": conf.OCR_SECRET,
        "file": file_data
    }
    ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
    if ocr_res_2 is None:
        raise Exception('ocr 2 error, img_path={0}'.format(img_path))
    # Recognition result of the second stage.
    self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
        self.log_base, img_path, ocr_res_2))
    self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
251 | |||
252 | # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): | ||
253 | # # # 流水 | ||
254 | # # res = { | ||
255 | # # 'code': 1, | ||
256 | # # 'msg': 'success', | ||
257 | # # 'data': { | ||
258 | # # 'classify': 0, | ||
259 | # # 'confidence': 0.999, | ||
260 | # # 'data': [ | ||
261 | # # { | ||
262 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
263 | # # 'cells': [] | ||
264 | # # }, | ||
265 | # # { | ||
266 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
267 | # # 'cells': [] | ||
268 | # # } | ||
269 | # # ] | ||
270 | # # } | ||
271 | # # } | ||
272 | # # | ||
273 | # # # 证件-1 | ||
274 | # # res = { | ||
275 | # # 'code': 1, | ||
276 | # # 'msg': 'success', | ||
277 | # # 'data': { | ||
278 | # # 'classify': 0, | ||
279 | # # 'confidence': 0.999, | ||
280 | # # 'data': [ | ||
281 | # # { | ||
282 | # # 'cn_key': 'value', | ||
283 | # # 'cn_key': 'value', | ||
284 | # # }, | ||
285 | # # { | ||
286 | # # 'cn_key': 'value', | ||
287 | # # 'cn_key': 'value', | ||
288 | # # }, | ||
289 | # # ] | ||
290 | # # } | ||
291 | # # } | ||
292 | # # | ||
293 | # # # 证件-2 or 其他类 | ||
294 | # # res = { | ||
295 | # # 'code': 1, | ||
296 | # # 'msg': 'success', | ||
297 | # # 'data': { | ||
298 | # # 'classify': 0, | ||
299 | # # 'confidence': 0.999, | ||
300 | # # } | ||
301 | # # } | ||
302 | # with open(img_path, 'rb') as f: | ||
303 | # base64_data = base64.b64encode(f.read()) | ||
304 | # # 获取解码后的base64值 | ||
305 | # file_data = base64_data.decode() | ||
306 | # json_data_1 = { | ||
307 | # "file": file_data | ||
308 | # } | ||
309 | # response_1 = requests.post(self.ocr_url_1, json=json_data_1) | ||
310 | # if response_1.status_code == 200: | ||
311 | # ocr_res_1 = response_1.json() | ||
312 | # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | ||
313 | # self.log_base, img_path, ocr_res_1)) | ||
179 | # | 314 | # |
180 | # async def img_2_ocr_2_wb(self, wb, img_path, summary): | 315 | # if ocr_res_1.get('code') == 1: |
181 | # res = await self.fetch_ocr_result(img_path) | 316 | # ocr_data = ocr_res_1.get('data', {}) |
182 | # self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) | 317 | # classify = ocr_data.get('classify') |
183 | # sheets_list = res.get('result').get('res') | 318 | # if classify is None: |
184 | # img_name = os.path.basename(img_path) | 319 | # skip_img.append(self.parse_img_path(img_path)) |
185 | # self.append_sheet(wb, sheets_list, img_name, summary) | 320 | # return |
186 | 321 | # elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 | |
187 | def fetch_ocr_result(self, img_path): | 322 | # skip_img.append(self.parse_img_path(img_path)) |
188 | files = [ | 323 | # return |
189 | ('img', open(img_path, 'rb')) | 324 | # elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 |
190 | ] | 325 | # self.license1_process(ocr_data, license_summary, classify, skip_img, img_path) |
191 | response = requests.request("POST", self.ocr_url, files=files) | 326 | # elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 |
192 | return response.json() | 327 | # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) |
193 | 328 | # json_data_2 = { | |
194 | def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary): | 329 | # "pid": str(pid), |
195 | res = self.fetch_ocr_result(img_info[0]) | 330 | # "key": conf.OCR_KEY, |
196 | self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format( | 331 | # "secret": conf.OCR_SECRET, |
197 | self.log_base, img_info[0], res)) | 332 | # "file": file_data |
198 | if res.get('code') == 1: | 333 | # } |
199 | self.ocr_2_wb(res, wb, img_info[1], img_info[2], bs_summary, unknown_summary, license_summary) | 334 | # response_2 = requests.post(self.ocr_url_2, data=json_data_2) |
335 | # if response_2.status_code == 200: | ||
336 | # # 识别结果 | ||
337 | # ocr_res_2 = response_2.json() | ||
338 | # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | ||
339 | # self.log_base, img_path, ocr_res_2)) | ||
340 | # self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) | ||
341 | # else: | ||
342 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
343 | # else: # 流水处理 | ||
344 | # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) | ||
345 | # else: | ||
346 | # skip_img.append(self.parse_img_path(img_path)) | ||
347 | # else: | ||
348 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
349 | |||
350 | @staticmethod | ||
351 | def parse_img_path(img_path): | ||
352 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | ||
353 | return int(img_name[5])+1, int(img_name[11])+1 | ||
200 | 354 | ||
201 | @staticmethod | 355 | @staticmethod |
202 | def get_most(value_list): | 356 | def get_most(value_list): |
... | @@ -255,7 +409,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -255,7 +409,6 @@ class Command(BaseCommand, LoggerMixin): |
255 | summary['role'] = self.get_most(summary['role']) | 409 | summary['role'] = self.get_most(summary['role']) |
256 | return bs_summary | 410 | return bs_summary |
257 | 411 | ||
258 | |||
259 | def rebuild_bs_summary(self, bs_summary, unknown_summary): | 412 | def rebuild_bs_summary(self, bs_summary, unknown_summary): |
260 | # bs_summary = { | 413 | # bs_summary = { |
261 | # '卡号': { | 414 | # '卡号': { |
... | @@ -297,8 +450,10 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -297,8 +450,10 @@ class Command(BaseCommand, LoggerMixin): |
297 | merged_bs_summary[card] = summary | 450 | merged_bs_summary[card] = summary |
298 | else: | 451 | else: |
299 | # 1卡号 | 452 | # 1卡号 |
453 | one_card = False | ||
300 | if len(bs_summary) == 1: | 454 | if len(bs_summary) == 1: |
301 | merged_bs_summary = self.prune_bs_summary(bs_summary) | 455 | merged_bs_summary = self.prune_bs_summary(bs_summary) |
456 | one_card = True | ||
302 | # 多卡号 | 457 | # 多卡号 |
303 | else: | 458 | else: |
304 | merged_bs_summary = self.merge_card(bs_summary) | 459 | merged_bs_summary = self.merge_card(bs_summary) |
... | @@ -307,7 +462,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -307,7 +462,7 @@ class Command(BaseCommand, LoggerMixin): |
307 | merge_role = [] | 462 | merge_role = [] |
308 | classify_summary = unknown_summary.get(card_summary['classify'], {}) | 463 | classify_summary = unknown_summary.get(card_summary['classify'], {}) |
309 | for role, summary in classify_summary.items(): | 464 | for role, summary in classify_summary.items(): |
310 | if role in card_summary['role_set']: | 465 | if one_card or role in card_summary['role_set']: |
311 | merge_role.append(role) | 466 | merge_role.append(role) |
312 | card_summary['sheet'].extend(summary['sheet']) | 467 | card_summary['sheet'].extend(summary['sheet']) |
313 | card_summary['code'].extend(summary['code']) | 468 | card_summary['code'].extend(summary['code']) |
... | @@ -336,12 +491,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -336,12 +491,13 @@ class Command(BaseCommand, LoggerMixin): |
336 | return merged_bs_summary | 491 | return merged_bs_summary |
337 | 492 | ||
338 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 | 493 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 |
339 | # TODO 调用接口重试 | ||
340 | # TODO 协程异步发送OCR请求 | ||
341 | # TODO 异常邮件通知 | 494 | # TODO 异常邮件通知 |
495 | # 识别失败:普通异常,如PDF异常、构建过程异常 | ||
496 | # EDMS异常:下载异常-->回队列-->邮件;上传异常-->重新上传队列-->邮件 | ||
497 | # 算法异常:第一道异常-->识别失败-->邮件;第二道异常-->识别失败-->邮件 | ||
498 | # TODO 协程异步发送OCR请求 | ||
499 | # TODO 调用接口重试 | ||
342 | # TODO 数据库断联问题 | 500 | # TODO 数据库断联问题 |
343 | # TODO 非流水证件处理 | ||
344 | # TODO EDMS API GATEWAY | ||
345 | def handle(self, *args, **kwargs): | 501 | def handle(self, *args, **kwargs): |
346 | sleep_second = int(conf.SLEEP_SECOND) | 502 | sleep_second = int(conf.SLEEP_SECOND) |
347 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) | 503 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) |
... | @@ -369,61 +525,82 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -369,61 +525,82 @@ class Command(BaseCommand, LoggerMixin): |
369 | pdf_handler.extract_image() | 525 | pdf_handler.extract_image() |
370 | self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format( | 526 | self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format( |
371 | self.log_base, business_type, doc.id)) | 527 | self.log_base, business_type, doc.id)) |
372 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
373 | 528 | ||
374 | # 4.获取OCR结果并且构建excel文件 | 529 | # 4.获取OCR结果并且构建excel文件 |
375 | bs_summary = {} | 530 | bs_summary = {} |
376 | license_summary = {} | 531 | license_summary = {} |
377 | unknown_summary = {} | 532 | unknown_summary = {} |
533 | skip_img = [] | ||
378 | interest_keyword = Keywords.objects.filter( | 534 | interest_keyword = Keywords.objects.filter( |
379 | type=KeywordsType.INTEREST.value).values_list('keyword', flat=True) | 535 | type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True) |
380 | salary_keyword = Keywords.objects.filter( | 536 | salary_keyword = Keywords.objects.filter( |
381 | type=KeywordsType.SALARY.value).values_list('keyword', flat=True) | 537 | type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True) |
382 | loan_keyword = Keywords.objects.filter( | 538 | loan_keyword = Keywords.objects.filter( |
383 | type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value]).values_list( | 539 | type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list( |
384 | 'keyword', flat=True) | 540 | 'keyword', flat=True) |
385 | wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) | 541 | wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) |
386 | 542 | ||
387 | # wb = Workbook() | 543 | # wb = Workbook() |
388 | 544 | ||
389 | # 4.1 获取OCR结果 | 545 | # 4.1 获取OCR结果 |
390 | # loop = asyncio.get_event_loop() | 546 | loop = asyncio.get_event_loop() |
391 | # tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list] | 547 | tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) |
392 | # loop.run_until_complete(asyncio.wait(tasks)) | 548 | for img_path in pdf_handler.img_path_list] |
549 | loop.run_until_complete(asyncio.wait(tasks)) | ||
393 | # loop.close() | 550 | # loop.close() |
394 | 551 | ||
395 | for img_info in pdf_handler.img_info_list: | 552 | # for img_path in pdf_handler.img_path_list: |
396 | self.img_2_ocr_2_wb(wb, img_info, bs_summary, unknown_summary, license_summary) | 553 | # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) |
397 | 554 | ||
398 | self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format( | 555 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' |
399 | self.log_base, bs_summary, unknown_summary, license_summary)) | 556 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, |
557 | unknown_summary, license_summary)) | ||
400 | 558 | ||
401 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | 559 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) |
402 | 560 | ||
403 | self.cronjob_log.info('{0} [merged_bs_summary={1}] [unknown_summary={2}]'.format( | 561 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] ' |
404 | self.log_base, merged_bs_summary, unknown_summary)) | 562 | '[unknown_summary={4}] [skip_img={5}]'.format(self.log_base, business_type, |
563 | doc.id, merged_bs_summary, | ||
564 | unknown_summary, skip_img)) | ||
405 | del unknown_summary | 565 | del unknown_summary |
406 | 566 | ||
407 | # 4.2 重构Excel文件 | 567 | # 4.2 重构Excel文件 |
408 | wb.save(src_excel_path) | 568 | wb.save(src_excel_path) |
409 | wb.rebuild(merged_bs_summary, license_summary) | 569 | wb.rebuild(merged_bs_summary, license_summary, skip_img) |
410 | wb.save(excel_path) | 570 | wb.save(excel_path) |
571 | except EDMSException as e: | ||
572 | self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] ' | ||
573 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) | ||
411 | except Exception as e: | 574 | except Exception as e: |
412 | doc.status = DocStatus.PROCESS_FAILED.value | 575 | doc.status = DocStatus.PROCESS_FAILED.value |
413 | doc.save() | 576 | doc.save() |
414 | self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( | 577 | self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] ' |
415 | self.log_base, business_type, doc.id, e)) | 578 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) |
416 | else: | 579 | else: |
417 | try: | 580 | try: |
418 | # 5.上传至EDMS | 581 | # 5.上传至EDMS |
419 | self.edms.upload(excel_path, doc, business_type) | 582 | for times in range(consts.RETRY_TIMES): |
583 | try: | ||
584 | self.edms.upload(excel_path, doc, business_type) | ||
585 | except Exception as e: | ||
586 | self.cronjob_log.warn( | ||
587 | '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
588 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
589 | edms_exc = str(e) | ||
590 | else: | ||
591 | break | ||
592 | else: | ||
593 | raise EDMSException(edms_exc) | ||
420 | except Exception as e: | 594 | except Exception as e: |
421 | doc.status = DocStatus.UPLOAD_FAILED.value | 595 | doc.status = DocStatus.UPLOAD_FAILED.value |
422 | doc.save() | 596 | doc.save() |
423 | end_time = time.time() | 597 | end_time = time.time() |
424 | speed_time = int(end_time - start_time) | 598 | speed_time = int(end_time - start_time) |
425 | self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] ' | 599 | self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] ' |
426 | '[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e)) | 600 | '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id, |
601 | speed_time, e)) | ||
602 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
603 | |||
427 | else: | 604 | else: |
428 | doc.status = DocStatus.COMPLETE.value | 605 | doc.status = DocStatus.COMPLETE.value |
429 | doc.save() | 606 | doc.save() |
... | @@ -431,5 +608,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -431,5 +608,6 @@ class Command(BaseCommand, LoggerMixin): |
431 | speed_time = int(end_time - start_time) | 608 | speed_time = int(end_time - start_time) |
432 | self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] ' | 609 | self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] ' |
433 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) | 610 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) |
611 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
434 | 612 | ||
435 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | 613 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | ... | ... |
... | @@ -141,32 +141,22 @@ class BSWorkbook(Workbook): | ... | @@ -141,32 +141,22 @@ class BSWorkbook(Workbook): |
141 | # month_info process | 141 | # month_info process |
142 | month_info = month_mapping.setdefault('xxxx-xx', []) | 142 | month_info = month_mapping.setdefault('xxxx-xx', []) |
143 | month_info.append((ws.title, min_row, ws.max_row, 0)) | 143 | month_info.append((ws.title, min_row, ws.max_row, 0)) |
144 | elif len(month_list) == 1: | ||
145 | # reverse_trend_list process | ||
146 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | ||
147 | reverse_trend_list.append(reverse_trend) | ||
148 | # month_info process | ||
149 | month_info = month_mapping.setdefault(month_list[0], []) | ||
150 | day_mean = np.mean(dti.day.dropna()) | ||
151 | if len(month_info) == 0: | ||
152 | month_info.append((ws.title, min_row, ws.max_row, day_mean)) | ||
153 | else: | ||
154 | for i, item in enumerate(month_info): | ||
155 | if day_mean <= item[-1]: | ||
156 | month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean)) | ||
157 | break | ||
158 | else: | ||
159 | month_info.append((ws.title, min_row, ws.max_row, day_mean)) | ||
160 | else: | 144 | else: |
161 | # reverse_trend_list process | 145 | # reverse_trend_list process |
162 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | 146 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) |
163 | reverse_trend_list.append(reverse_trend) | 147 | reverse_trend_list.append(reverse_trend) |
164 | # month_info process | 148 | # month_info process |
165 | for i, item in enumerate(month_list[:-1]): | 149 | day_idx = dti.day |
166 | month_mapping.setdefault(item, []).append( | 150 | idx_list_max_idx = len(idx_list) - 1 |
167 | (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN)) | 151 | for i, item in enumerate(month_list): |
168 | month_mapping.setdefault(month_list[-1], []).insert( | 152 | if i == idx_list_max_idx: |
169 | 0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0)) | 153 | day_mean = np.mean(day_idx[idx_list[i]:].dropna()) |
154 | month_mapping.setdefault(item, []).append( | ||
155 | (ws.title, idx_list[i] + min_row, ws.max_row, day_mean)) | ||
156 | else: | ||
157 | day_mean = np.mean(day_idx[idx_list[i]: idx_list[i + 1]].dropna()) | ||
158 | month_mapping.setdefault(item, []).append( | ||
159 | (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean)) | ||
170 | 160 | ||
171 | def build_metadata_rows(self, confidence, code, print_time, start_date, end_date): | 161 | def build_metadata_rows(self, confidence, code, print_time, start_date, end_date): |
172 | if start_date is None or end_date is None: | 162 | if start_date is None or end_date is None: |
... | @@ -191,9 +181,9 @@ class BSWorkbook(Workbook): | ... | @@ -191,9 +181,9 @@ class BSWorkbook(Workbook): |
def create_meta_sheet(self, card):
    """Return the worksheet that will hold the meta information for ``card``.

    The sheet title is ``"<meta_sheet_title>(<last six characters of card>)"``.
    If the workbook's first worksheet still carries the title ``'Sheet'`` it
    is renamed and reused; otherwise a new sheet is created.

    :param card: card number; only ``card[-6:]`` appears in the title.
    :return: the renamed or newly created worksheet.
    """
    first_sheet = self.worksheets[0]
    reuse_first = first_sheet.title == 'Sheet'
    title = '{0}({1})'.format(self.meta_sheet_title, card[-6:])
    if not reuse_first:
        return self.create_sheet(title)
    first_sheet.title = title
    return first_sheet
198 | 188 | ||
199 | def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date): | 189 | def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date): |
... | @@ -203,6 +193,26 @@ class BSWorkbook(Workbook): | ... | @@ -203,6 +193,26 @@ class BSWorkbook(Workbook): |
203 | ms.append(row) | 193 | ms.append(row) |
204 | return ms | 194 | return ms |
205 | 195 | ||
@staticmethod
def amount_format(amount_str):
    """Clean up an amount string before it is converted to a number.

    Non-string or empty input is returned unchanged. Otherwise:

    1. characters are substituted through ``consts.TRANS``;
    2. every ``-`` after the first character is removed;
    3. a first character found in ``consts.ERROR_CHARS`` is replaced by ``-``;
    4. for strings of four or more characters, the third character from the
       end is inspected: a ``.`` directly preceded by ``,`` loses that comma,
       and a ``,`` in that position is turned into ``.``.

    :param amount_str: raw cell value.
    :return: the cleaned string, or the input itself when it is not a
        non-empty string.
    """
    if not isinstance(amount_str, str) or amount_str == '':
        return amount_str

    # 1. Character substitution.
    cleaned = amount_str.translate(consts.TRANS)

    # 2. Drop surplus minus signs, keeping whatever is in first position.
    head, tail = cleaned[0], cleaned[1:]
    cleaned = head + tail.replace('-', '')

    # 3. First-character handling.
    if cleaned[0] in consts.ERROR_CHARS:
        cleaned = '-{0}'.format(cleaned[1:])

    # 4. Comma / period handling around the third character from the end.
    if len(cleaned) < 4:
        return cleaned
    sep_idx = len(cleaned) - 3
    sep_char = cleaned[sep_idx]
    if sep_char == '.' and cleaned[sep_idx - 1] == ',':
        cleaned = cleaned[:sep_idx - 1] + cleaned[sep_idx:]
    elif sep_char == ',':
        cleaned = cleaned[:sep_idx] + '.' + cleaned[sep_idx + 1:]
    return cleaned
215 | |||
206 | def build_month_sheet(self, card, month_mapping, ms, is_reverse): | 216 | def build_month_sheet(self, card, month_mapping, ms, is_reverse): |
207 | tmp_ws = self.create_sheet('tmp_ws') | 217 | tmp_ws = self.create_sheet('tmp_ws') |
208 | for month in sorted(month_mapping.keys()): | 218 | for month in sorted(month_mapping.keys()): |
... | @@ -235,29 +245,25 @@ class BSWorkbook(Workbook): | ... | @@ -235,29 +245,25 @@ class BSWorkbook(Workbook): |
235 | # 3.3.余额转数值 | 245 | # 3.3.余额转数值 |
236 | over_cell = rows[consts.OVER_IDX] | 246 | over_cell = rows[consts.OVER_IDX] |
237 | try: | 247 | try: |
238 | if isinstance(over_cell.value, str): | 248 | over_cell.value = locale.atof(self.amount_format(over_cell.value)) |
239 | over_cell.value = over_cell.value.translate(consts.TRANS) | ||
240 | over_cell.value = locale.atof(over_cell.value) | ||
241 | except Exception as e: | 249 | except Exception as e: |
242 | continue | 250 | continue |
243 | else: | 251 | else: |
244 | over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 | 252 | over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 |
245 | 253 | ||
246 | # 3.4.余额转数值 | 254 | # 3.4.金额转数值 |
247 | try: | 255 | try: |
248 | try: | 256 | try: |
249 | if isinstance(amount_cell.value, str): # TODO 可在转化数字失败后,再替换 | 257 | amount_cell.value = locale.atof(self.amount_format(amount_cell.value)) |
250 | amount_cell.value = amount_cell.value.translate(consts.TRANS) | ||
251 | amount_cell.value = locale.atof(amount_cell.value) | ||
252 | except Exception as e: | 258 | except Exception as e: |
253 | try: | 259 | try: |
254 | if isinstance(rows[consts.INCOME_IDX].value, str): | 260 | amount_cell.value = locale.atof(self.amount_format(rows[consts.INCOME_IDX].value)) |
255 | rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS) | 261 | if amount_cell.value == 0: |
256 | amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) | 262 | raise |
263 | elif amount_cell.value < 0: | ||
264 | amount_cell.value = -amount_cell.value | ||
257 | except Exception as e: | 265 | except Exception as e: |
258 | if isinstance(rows[consts.OUTLAY_IDX].value, str): | 266 | amount_cell.value = locale.atof(self.amount_format(rows[consts.OUTLAY_IDX].value)) |
259 | rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS) | ||
260 | amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) | ||
261 | if amount_cell.value > 0: | 267 | if amount_cell.value > 0: |
262 | amount_cell.value = -amount_cell.value | 268 | amount_cell.value = -amount_cell.value |
263 | except Exception as e: | 269 | except Exception as e: |
... | @@ -313,18 +319,18 @@ class BSWorkbook(Workbook): | ... | @@ -313,18 +319,18 @@ class BSWorkbook(Workbook): |
313 | # } | 319 | # } |
314 | for card, summary in bs_summary.items(): | 320 | for card, summary in bs_summary.items(): |
315 | # 1.原表修剪、排列、按照月份分割 | 321 | # 1.原表修剪、排列、按照月份分割 |
316 | start_date = summary['start_date'] | 322 | start_date = summary.get('start_date') |
317 | end_date = summary['end_date'] | 323 | end_date = summary.get('end_date') |
318 | date_statistics = False | 324 | date_statistics = False |
319 | if start_date is None or end_date is None: | 325 | if start_date is None or end_date is None: |
320 | date_statistics = True | 326 | date_statistics = True |
321 | date_list = [] | 327 | date_list = [] |
322 | month_mapping = {} | 328 | month_mapping = {} |
323 | reverse_trend_list = [] | 329 | reverse_trend_list = [] |
324 | for sheet in summary['sheet']: | 330 | for sheet in summary.get('sheet', []): |
325 | ws = self.get_sheet_by_name(sheet) | 331 | ws = self.get_sheet_by_name(sheet) |
326 | # 1.1.删除多余列、排列 | 332 | # 1.1.删除多余列、排列 |
327 | min_row = self.sheet_prune(ws, summary['classify']) | 333 | min_row = self.sheet_prune(ws, summary.get('classify', 0)) |
328 | # 1.2.按月份分割 | 334 | # 1.2.按月份分割 |
329 | self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics) | 335 | self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics) |
330 | 336 | ||
... | @@ -334,32 +340,43 @@ class BSWorkbook(Workbook): | ... | @@ -334,32 +340,43 @@ class BSWorkbook(Workbook): |
334 | 340 | ||
335 | # 2.元信息提取表 | 341 | # 2.元信息提取表 |
336 | ms = self.build_meta_sheet(card, | 342 | ms = self.build_meta_sheet(card, |
337 | summary['confidence'], | 343 | summary.get('confidence', 1), |
338 | summary['code'], | 344 | summary.get('code'), |
339 | summary['print_time'], | 345 | summary.get('print_time'), |
340 | start_date, | 346 | start_date, |
341 | end_date) | 347 | end_date) |
342 | 348 | ||
343 | # 3.创建月份表、提取/高亮关键行 | 349 | # 3.创建月份表、提取/高亮关键行 |
344 | is_reverse = False | 350 | # 倒序处理 |
345 | if sum(reverse_trend_list) > 0: # 倒序处理 | 351 | is_reverse = True if sum(reverse_trend_list) > 0 else False |
346 | is_reverse = True | 352 | for month_list in month_mapping.values(): |
347 | for month_list in month_mapping.values(): | 353 | month_list.sort(key=lambda x: x[-1], reverse=is_reverse) |
348 | month_list.sort(key=lambda x: x[-1], reverse=True) | 354 | |
349 | self.build_month_sheet(card, month_mapping, ms, is_reverse) | 355 | self.build_month_sheet(card, month_mapping, ms, is_reverse) |
350 | 356 | ||
351 | # 4.删除原表 | 357 | # 4.删除原表 |
352 | for sheet in summary['sheet']: | 358 | for sheet in summary.get('sheet'): |
353 | self.remove(self.get_sheet_by_name(sheet)) | 359 | self.remove(self.get_sheet_by_name(sheet)) |
354 | 360 | ||
def license_rebuild(self, license_summary):
    """Write one worksheet per license class present in ``license_summary``.

    ``consts.LICENSE_ORDER`` is iterated as ``(classify, (_, name))`` pairs.
    Classes with no entry in ``license_summary`` are skipped; for the others
    a sheet called ``name`` is created, every field row of every record is
    appended, and a blank row follows each record.

    :param license_summary: mapping of classify value to a list of records,
        each record being an iterable of rows to append.
    """
    for classify, (_, name) in consts.LICENSE_ORDER:
        records = license_summary.get(classify)
        if records is not None:
            sheet = self.create_sheet(name)
            for record in records:
                for field_row in record:
                    sheet.append(field_row)
                # Blank row separating consecutive records.
                sheet.append((None, ))
362 | 371 | ||
def skip_img_sheet(self, skip_img):
    """Add a worksheet listing the images that were not processed.

    Nothing is created when ``skip_img`` is empty. Otherwise a sheet named
    ``consts.SKIP_IMG_SHEET_NAME`` receives ``consts.SKIP_IMG_SHEET_HEADER``
    as its first row, followed by one row per entry of ``skip_img``.

    :param skip_img: iterable of row tuples to append.
    """
    if not skip_img:
        return
    sheet = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
    sheet.append(consts.SKIP_IMG_SHEET_HEADER)
    for skipped_row in skip_img:
        sheet.append(skipped_row)
378 | |||
def rebuild(self, bs_summary, license_summary, skip_img):
    """Rebuild the workbook from the collected summaries.

    Runs, in this order: ``bs_rebuild(bs_summary)``,
    ``license_rebuild(license_summary)`` and ``skip_img_sheet(skip_img)``.

    :param bs_summary: passed to ``bs_rebuild``.
    :param license_summary: passed to ``license_rebuild``.
    :param skip_img: passed to ``skip_img_sheet``.
    """
    # Statement sheets first, then license sheets, then the skipped-image list.
    self.bs_rebuild(bs_summary)
    self.license_rebuild(license_summary)
    self.skip_img_sheet(skip_img)
... | @@ -25,7 +25,7 @@ class PDFHandler: | ... | @@ -25,7 +25,7 @@ class PDFHandler: |
def __init__(self, path, img_dir_path):
    """Store the two paths and start with empty bookkeeping containers.

    :param path: stored as ``self.path``.
    :param img_dir_path: stored as ``self.img_dir_path``.
    """
    # Both arguments are kept exactly as given.
    self.path, self.img_dir_path = path, img_dir_path
    # Save paths of written images; the extraction methods append to it.
    self.img_path_list = []
    # xref values recorded by the extraction code once an image is written.
    self.xref_set = set()
30 | 30 | ||
31 | def get_img_save_path(self, pno, img_index=0, ext='png'): | 31 | def get_img_save_path(self, pno, img_index=0, ext='png'): |
... | @@ -38,7 +38,7 @@ class PDFHandler: | ... | @@ -38,7 +38,7 @@ class PDFHandler: |
38 | pm = page.getPixmap(matrix=trans_2, alpha=False) | 38 | pm = page.getPixmap(matrix=trans_2, alpha=False) |
39 | img_save_path = self.get_img_save_path(page.number) | 39 | img_save_path = self.get_img_save_path(page.number) |
40 | pm.writePNG(img_save_path) | 40 | pm.writePNG(img_save_path) |
41 | self.img_info_list.append((img_save_path, page.number, 0)) | 41 | self.img_path_list.append(img_save_path) |
42 | 42 | ||
43 | @staticmethod | 43 | @staticmethod |
44 | def getimage(pix): | 44 | def getimage(pix): |
... | @@ -88,7 +88,7 @@ class PDFHandler: | ... | @@ -88,7 +88,7 @@ class PDFHandler: |
88 | with open(img_save_path, "wb") as f: | 88 | with open(img_save_path, "wb") as f: |
89 | f.write(img_data) | 89 | f.write(img_data) |
90 | self.xref_set.add(xref) | 90 | self.xref_set.add(xref) |
91 | self.img_info_list.append((img_save_path, pno, img_index)) | 91 | self.img_path_list.append(img_save_path) |
92 | 92 | ||
93 | @staticmethod | 93 | @staticmethod |
94 | def split_il(il): | 94 | def split_il(il): |
... | @@ -179,7 +179,7 @@ class PDFHandler: | ... | @@ -179,7 +179,7 @@ class PDFHandler: |
179 | img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) | 179 | img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) |
180 | new_img.save(img_save_path) | 180 | new_img.save(img_save_path) |
181 | page_to_png = False | 181 | page_to_png = False |
182 | self.img_info_list.append((img_save_path, pno, img_index)) | 182 | self.img_path_list.append(img_save_path) |
183 | 183 | ||
184 | # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 | 184 | # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 |
185 | if page_to_png: | 185 | if page_to_png: | ... | ... |
-
Please register or sign in to post a comment