merge license
Showing
6 changed files
with
575 additions
and
214 deletions
| ... | @@ -35,9 +35,35 @@ DEALER_CODE_META_FIELD_id = 13 | ... | @@ -35,9 +35,35 @@ DEALER_CODE_META_FIELD_id = 13 |
| 35 | BUSINESS_TYPE_META_FIELD_id = 93 | 35 | BUSINESS_TYPE_META_FIELD_id = 93 |
| 36 | DEALER_CODE = 'ocr_situ_group' | 36 | DEALER_CODE = 'ocr_situ_group' |
| 37 | 37 | ||
| 38 | RETRY_TIMES = 3 | ||
| 39 | |||
| 38 | # ---------银行流水模板相关-------------------------------------------------------------------------------------------- | 40 | # ---------银行流水模板相关-------------------------------------------------------------------------------------------- |
| 39 | 41 | ||
| 40 | TRANS = str.maketrans('Cc((oODlLmAsSbg', '000000011345569') | 42 | TRANS_MAP = { |
| 43 | 'C': "0", | ||
| 44 | 'c': "0", | ||
| 45 | '(': "0", | ||
| 46 | 'o': "0", | ||
| 47 | 'O': "0", | ||
| 48 | 'D': "0", | ||
| 49 | |||
| 50 | '[': "1", | ||
| 51 | ']': "1", | ||
| 52 | 'l': "1", | ||
| 53 | 'L': "1", | ||
| 54 | |||
| 55 | 'A': "4", | ||
| 56 | 's': "5", | ||
| 57 | 'S': "5", | ||
| 58 | 'b': "6", | ||
| 59 | 'g': "9", | ||
| 60 | 'E': "9", | ||
| 61 | 'B': "13", | ||
| 62 | } | ||
| 63 | TRANS = str.maketrans(TRANS_MAP) | ||
| 64 | ERROR_CHARS = {'.', ':', ':', '•', '·'} | ||
| 65 | SKIP_IMG_SHEET_NAME = '未处理图片' | ||
| 66 | SKIP_IMG_SHEET_HEADER = ('页码', '序号') | ||
| 41 | 67 | ||
| 42 | CARD_RATIO = 0.9 | 68 | CARD_RATIO = 0.9 |
| 43 | UNKNOWN_CARD = '未知卡号' | 69 | UNKNOWN_CARD = '未知卡号' |
| ... | @@ -95,7 +121,7 @@ HEADERS_MAPPING.update( | ... | @@ -95,7 +121,7 @@ HEADERS_MAPPING.update( |
| 95 | HEADERS_MAPPING.update( | 121 | HEADERS_MAPPING.update( |
| 96 | { | 122 | { |
| 97 | '交易日期': BASE_HEADERS_MAPPING['记账日期'], | 123 | '交易日期': BASE_HEADERS_MAPPING['记账日期'], |
| 98 | '存入': BASE_HEADERS_MAPPING['金额'], | 124 | # '存入': BASE_HEADERS_MAPPING['金额'], |
| 99 | '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], | 125 | '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], |
| 100 | '对方名称': BASE_HEADERS_MAPPING['对方账户名'], | 126 | '对方名称': BASE_HEADERS_MAPPING['对方账户名'], |
| 101 | '摘要': BASE_HEADERS_MAPPING['附言'], | 127 | '摘要': BASE_HEADERS_MAPPING['附言'], |
| ... | @@ -160,6 +186,12 @@ HEADERS_MAPPING.update( | ... | @@ -160,6 +186,12 @@ HEADERS_MAPPING.update( |
| 160 | '交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | 186 | '交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'], |
| 161 | } | 187 | } |
| 162 | ) | 188 | ) |
| 189 | # 农业银行-窄页 | ||
| 190 | HEADERS_MAPPING.update( | ||
| 191 | { | ||
| 192 | '交易对手账号': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 193 | } | ||
| 194 | ) | ||
| 163 | # 竖版-特殊-农商行 | 195 | # 竖版-特殊-农商行 |
| 164 | HEADERS_MAPPING.update( | 196 | HEADERS_MAPPING.update( |
| 165 | { | 197 | { |
| ... | @@ -299,17 +331,27 @@ HEADERS_MAPPING.update( | ... | @@ -299,17 +331,27 @@ HEADERS_MAPPING.update( |
| 299 | # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则) | 331 | # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则) |
| 300 | # | 332 | # |
| 301 | # # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言 | 333 | # # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言 |
| 302 | # ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细 | 334 | # ('农业银行-10', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)), # 横版-表格-农业银行-中国农业银行个人账户明细 |
| 335 | # | ||
| 336 | # # 农业银行:序号 日期 摘要 交易金额 余额 对方账号 对方名称 交易地点 渠道 附言 | ||
| 337 | # ('农业银行-10-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)), | ||
| 338 | # | ||
| 339 | # # 农业银行:交易日期 摘要 交易金额 余额 交易渠道 交易网点 对方账号 对方名称 附言 | ||
| 340 | # ('农业银行-9', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)), | ||
| 303 | # | 341 | # |
| 304 | # # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道 | 342 | # # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道 |
| 305 | # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行 | 343 | # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行 |
| 306 | # | 344 | # |
| 307 | # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 | 345 | # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 渠道 |
| 308 | # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | 346 | # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), |
| 309 | # | 347 | # |
| 348 | # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 对方户名 对方账号 渠道 | ||
| 349 | # ('工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)), | ||
| 350 | # | ||
| 310 | # # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行 | 351 | # # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行 |
| 311 | # # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行 | 352 | # # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行 |
| 312 | # ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | 353 | # ('建设银行-竖版', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)), |
| 354 | # ('建设银行-横版', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)), | ||
| 313 | # | 355 | # |
| 314 | # # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号 | 356 | # # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号 |
| 315 | # ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), | 357 | # ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), |
| ... | @@ -320,7 +362,13 @@ HEADERS_MAPPING.update( | ... | @@ -320,7 +362,13 @@ HEADERS_MAPPING.update( |
| 320 | # # -----------------普通打印:部分格线-------------------------------- | 362 | # # -----------------普通打印:部分格线-------------------------------- |
| 321 | # | 363 | # |
| 322 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 | 364 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 |
| 323 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 365 | # ('农业银行-5', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), |
| 366 | # | ||
| 367 | # # 农业银行:日期 地点 摘要 存入 支出 余额 对方账号 对方户名 | ||
| 368 | # ('农业银行-8', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)), | ||
| 369 | |||
| 370 | # # 农业银行:日期 摘要 交易金额 余额 地点 交易对手账号 对方户名 | ||
| 371 | # ('农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)), | ||
| 324 | # | 372 | # |
| 325 | # # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注 | 373 | # # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注 |
| 326 | # ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)), | 374 | # ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)), |
| ... | @@ -330,6 +378,9 @@ HEADERS_MAPPING.update( | ... | @@ -330,6 +378,9 @@ HEADERS_MAPPING.update( |
| 330 | # | 378 | # |
| 331 | # # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额 | 379 | # # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额 |
| 332 | # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | 380 | # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), |
| 381 | |||
| 382 | # # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 | ||
| 383 | # ('建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)), | ||
| 333 | # | 384 | # |
| 334 | # # -----------------普通打印:无格线------------------------------------- | 385 | # # -----------------普通打印:无格线------------------------------------- |
| 335 | # | 386 | # |
| ... | @@ -338,7 +389,8 @@ HEADERS_MAPPING.update( | ... | @@ -338,7 +389,8 @@ HEADERS_MAPPING.update( |
| 338 | # | 389 | # |
| 339 | # # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 | 390 | # # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 |
| 340 | # # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单 | 391 | # # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单 |
| 341 | # ('邮储银行', (1, None, None, None, None, 2, None, None, None, None, None, None, None)), | 392 | # ('邮储银行-8', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)), |
| 393 | # ('邮储银行-5', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)), | ||
| 342 | # | 394 | # |
| 343 | # # 工商银行电子版:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 | 395 | # # 工商银行电子版:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 |
| 344 | # ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | 396 | # ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), |
| ... | @@ -351,13 +403,15 @@ HEADERS_MAPPING.update( | ... | @@ -351,13 +403,15 @@ HEADERS_MAPPING.update( |
| 351 | # ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), | 403 | # ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), |
| 352 | # | 404 | # |
| 353 | # # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名 | 405 | # # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名 |
| 354 | # ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), | 406 | # ('农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), |
| 355 | # | 407 | # |
| 356 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 | 408 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 |
| 357 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 409 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), |
| 358 | # | 410 | # |
| 359 | # # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言 | 411 | # # 农业银行:日期、时间、短摘要、交易金额、本次余额、交易网点、渠道、附言 |
| 360 | # ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)), | 412 | # # 农业银行:日期、时间、日志号、短摘要、交易金额、本次余额、交易网点、渠道、附言 |
| 413 | # ('农业银行', (1, 2, 4, 5, None, 3, None, None, None, None, None, None, None)), | ||
| 414 | # ('农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
| 361 | # | 415 | # |
| 362 | # # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额 | 416 | # # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额 |
| 363 | # ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), | 417 | # ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), |
| ... | @@ -374,11 +428,10 @@ HEADERS_MAPPING.update( | ... | @@ -374,11 +428,10 @@ HEADERS_MAPPING.update( |
| 374 | # | 428 | # |
| 375 | # # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称 | 429 | # # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称 |
| 376 | # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | 430 | # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), |
| 377 | # | ||
| 378 | # # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 | ||
| 379 | # ('建设银行', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)), | ||
| 380 | # ] | 431 | # ] |
| 381 | 432 | ||
| 433 | OTHER_TUPLE = (None, None, None, None, None, None, None, None, None, None, None, None, None) | ||
| 434 | |||
| 382 | # { | 435 | # { |
| 383 | # "0":"其他", | 436 | # "0":"其他", |
| 384 | # "1":"普通打印-全表格-中国农业银行", | 437 | # "1":"普通打印-全表格-中国农业银行", |
| ... | @@ -408,67 +461,163 @@ HEADERS_MAPPING.update( | ... | @@ -408,67 +461,163 @@ HEADERS_MAPPING.update( |
| 408 | # "22":"针式打印-部分格线-邮储银行一本通绿卡" | 461 | # "22":"针式打印-部分格线-邮储银行一本通绿卡" |
| 409 | # } | 462 | # } |
| 410 | 463 | ||
| 464 | # CLASSIFY_LIST = [ | ||
| 465 | # ('其他', OTHER_TUPLE), | ||
| 466 | # ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), | ||
| 467 | # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), | ||
| 468 | # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), | ||
| 469 | # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | ||
| 470 | # ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | ||
| 471 | # ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), | ||
| 472 | # ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)), | ||
| 473 | # | ||
| 474 | # ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), | ||
| 475 | # ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), | ||
| 476 | # ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)), | ||
| 477 | # ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)), | ||
| 478 | # ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)), | ||
| 479 | # ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), | ||
| 480 | # | ||
| 481 | # ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | ||
| 482 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
| 483 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
| 484 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
| 485 | # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
| 486 | # | ||
| 487 | # ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)), | ||
| 488 | # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | ||
| 489 | # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | ||
| 490 | # ] | ||
| 491 | |||
| 492 | # "4":"普通打印-全表格-中国银行", | ||
| 493 | # "5":"普通打印-全表格-农业银行-10列", | ||
| 494 | # "6":"普通打印-全表格-农业银行-10列-1", | ||
| 495 | # "7":"普通打印-全表格-农业银行-9列", | ||
| 496 | # "8":"普通打印-全表格-北京银行", | ||
| 497 | # "9":"普通打印-全表格-工商银行", | ||
| 498 | # "10":"普通打印-全表格-工商银行-电子账单", | ||
| 499 | # "11":"普通打印-全表格-建设银行", | ||
| 500 | # "12":"普通打印-全表格-微信账单", | ||
| 501 | # "13":"普通打印-全表格-支付宝账单", | ||
| 502 | |||
| 503 | # "14":"普通打印-无格线-交通银行", | ||
| 504 | # "15":"普通打印-无格线-储蓄银行-5列", | ||
| 505 | # "16":"普通打印-无格线-储蓄银行-8列", | ||
| 506 | # "17":"普通打印-无格线-农业银行-扩张缩进", | ||
| 507 | # "18":"普通打印-无格线-农业银行-整数", | ||
| 508 | # "19":"普通打印-无格线-招商银行", | ||
| 509 | # "20":"普通打印-无格线-招商银行-电子账单", | ||
| 510 | # "21":"普通打印-无格线-民生银行", | ||
| 511 | |||
| 512 | # "22":"普通打印-部分格线-横版-中信银行", | ||
| 513 | # "23":"普通打印-部分格线-竖版-农业银行-5列", | ||
| 514 | # "24":"普通打印-部分格线-竖版-农业银行-8列", | ||
| 515 | # "25":"普通打印-部分格线-竖版-农业银行-窄页", | ||
| 516 | # "26":"普通打印-部分格线-竖版-平安电子账单", | ||
| 517 | # "27":"普通打印-部分格线-竖版-建设银行-电子账单", | ||
| 518 | |||
| 519 | # "34":"针式打印-全格线-建设银行", | ||
| 520 | # "35":"针式打印-部分格线-竖版-邮储银行", | ||
| 521 | # "36":"针式打印-部分格线-竖版-邮储银行-绿卡", | ||
| 522 | |||
| 411 | CLASSIFY_LIST = [ | 523 | CLASSIFY_LIST = [ |
| 412 | ('其他', (None, None, None, None, None, None, None, None, None, None, None, None, None)), | 524 | ('其他', OTHER_TUPLE), |
| 413 | ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), | 525 | ('其他', OTHER_TUPLE), |
| 414 | ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), | 526 | ('其他', OTHER_TUPLE), |
| 415 | ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), | 527 | ('其他', OTHER_TUPLE), |
| 416 | ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | 528 | ('普通打印-全表格-中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), |
| 417 | ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | 529 | ('普通打印-全表格-农业银行-10列', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)), |
| 418 | ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), | 530 | ('普通打印-全表格-农业银行-10列-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)), |
| 419 | ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)), | 531 | ('普通打印-全表格-农业银行-9列', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)), |
| 420 | 532 | ('普通打印-全表格-北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), | |
| 421 | ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), | 533 | ('普通打印-全表格-工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), |
| 422 | ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), | 534 | ('普通打印-全表格-工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)), |
| 423 | ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)), | 535 | ('普通打印-全表格-建设银行', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)), |
| 424 | ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)), | 536 | ('普通打印-全表格-微信账单', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), |
| 425 | ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)), | 537 | ('普通打印-全表格-支付宝账单', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)), |
| 426 | ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), | 538 | |
| 427 | 539 | ('普通打印-无格线-交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), | |
| 428 | ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | 540 | ('普通打印-无格线-储蓄银行-5列', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)), |
| 429 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 541 | ('普通打印-无格线-储蓄银行-8列', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)), |
| 430 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 542 | ('普通打印-无格线-农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)), |
| 431 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | 543 | ('普通打印-无格线-农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), |
| 432 | ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | 544 | ('普通打印-无格线-招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)), |
| 433 | 545 | ('普通打印-无格线-招商银行-电子账单', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)), | |
| 434 | ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)), | 546 | ('普通打印-无格线-民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), |
| 435 | ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | 547 | |
| 436 | ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | 548 | ('普通打印-部分格线-横版-中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), |
| 549 | ('普通打印-部分格线-竖版-农业银行-5列', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
| 550 | ('普通打印-部分格线-竖版-农业银行-8列', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)), | ||
| 551 | ('普通打印-部分格线-竖版-农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)), | ||
| 552 | ('普通打印-部分格线-竖版-平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
| 553 | ('普通打印-部分格线-竖版-建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)), | ||
| 554 | ('其他', OTHER_TUPLE), | ||
| 555 | ('其他', OTHER_TUPLE), | ||
| 556 | ('其他', OTHER_TUPLE), | ||
| 557 | ('其他', OTHER_TUPLE), | ||
| 558 | ('其他', OTHER_TUPLE), | ||
| 559 | ('其他', OTHER_TUPLE), | ||
| 560 | ('针式打印-全格线-建设银行', OTHER_TUPLE), | ||
| 561 | ('针式打印-部分格线-竖版-邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | ||
| 562 | ('针式打印-部分格线-竖版-邮储银行-绿卡', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), | ||
| 563 | ('其他', OTHER_TUPLE), | ||
| 437 | ] | 564 | ] |
| 438 | 565 | ||
| 439 | # ----------license相关------------------------------------------------------------------------------------------------ | 566 | # ----------license相关------------------------------------------------------------------------------------------------ |
| 567 | |||
| 568 | # "0":"AVT Invioce", | ||
| 569 | # "1":"二手车发票", | ||
| 570 | # "2":"其他", | ||
| 571 | # "3":"护照", | ||
| 572 | # "28":"机动车登记证", | ||
| 573 | # "29":"机动车销售统一发票", | ||
| 574 | # "30":"港澳通行证", | ||
| 575 | # "31":"营业执照", | ||
| 576 | # "32":"行驶证", | ||
| 577 | # "33":"身份证", | ||
| 578 | # "37":"银行卡" | ||
| 579 | |||
| 580 | # 其他 | ||
| 581 | OTHER_CLASSIFY = 2 | ||
| 582 | |||
| 583 | # 身份证 | ||
| 584 | IC_CN_NAME = '身份证' | ||
| 585 | IC_CLASSIFY = 33 | ||
| 586 | # 增值税发票 | ||
| 587 | VAT_CN_NAME = '增值税发票' | ||
| 588 | VAT_CLASSIFY = 0 | ||
| 589 | # 机动车登记证书 | ||
| 590 | MVC_CN_NAME = '机动车登记证书' | ||
| 591 | MVC_CLASSIFY = 28 | ||
| 592 | # 机动车销售统一发票 | ||
| 593 | MVI_CN_NAME = '机动车销售统一发票' | ||
| 594 | MVI_CLASSIFY = 29 | ||
| 595 | IC_PID = VAT_PID = MVC_PID = MVI_PID = None | ||
| 596 | |||
| 440 | # 营业执照 | 597 | # 营业执照 |
| 441 | BL_KEY = 'bl' | 598 | BL_CN_NAME = '营业执照' |
| 599 | BL_CLASSIFY = 31 | ||
| 600 | BL_PID = 41 | ||
| 442 | # 二手车发票 | 601 | # 二手车发票 |
| 443 | UCI_KEY = 'uci' | 602 | UCI_CN_NAME = '二手车发票' |
| 603 | UCI_CLASSIFY = 1 | ||
| 604 | UCI_PID = 60 | ||
| 444 | # 港澳台通行证 | 605 | # 港澳台通行证 |
| 445 | EEP_KEY = 'eep' | 606 | EEP_CN_NAME = '港澳台通行证' |
| 607 | EEP_CLASSIFY = 30 | ||
| 608 | EEP_PID = 1018 | ||
| 446 | # 行驶证 | 609 | # 行驶证 |
| 447 | DL_KEY = 'dl' | 610 | DL_CN_NAME = '行驶证' |
| 611 | DL_CLASSIFY = 32 | ||
| 612 | DL_PID = 5 | ||
| 448 | # 护照 | 613 | # 护照 |
| 449 | PP_KEY = 'pp' | 614 | PP_CN_NAME = '护照' |
| 615 | PP_CLASSIFY = 3 | ||
| 616 | PP_PID = 8 | ||
| 450 | # 银行卡 | 617 | # 银行卡 |
| 451 | BC_KEY = 'bc' | 618 | BC_CN_NAME = '银行卡' |
| 452 | # 身份证 | 619 | BC_CLASSIFY = 37 |
| 453 | IC_KEY = 'ic' | 620 | BC_PID = 4 |
| 454 | # 机动车登记证书 | ||
| 455 | MVC_KEY = 'mvc' | ||
| 456 | # 机动车销售统一发票 | ||
| 457 | MVI_KEY = 'mvi' | ||
| 458 | # 增值税发票 | ||
| 459 | VAT_KEY = 'vat' | ||
| 460 | |||
| 461 | LICENSE_ORDER = ((MVI_KEY, '机动车销售统一发票'), | ||
| 462 | (IC_KEY, '身份证'), | ||
| 463 | (BC_KEY, '银行卡'), | ||
| 464 | (BL_KEY, '营业执照'), | ||
| 465 | (UCI_KEY, '二手车发票'), | ||
| 466 | (EEP_KEY, '港澳台通行证'), | ||
| 467 | (DL_KEY, '行驶证'), | ||
| 468 | (PP_KEY, '护照'), | ||
| 469 | (MVC_KEY, '机动车登记证书'), | ||
| 470 | (VAT_KEY, '增值税发票')) | ||
| 471 | |||
| 472 | BC_FIELD = (('CardNum', '银行卡号'), | 621 | BC_FIELD = (('CardNum', '银行卡号'), |
| 473 | ('BankName', '发卡行名称'), | 622 | ('BankName', '发卡行名称'), |
| 474 | ('CardName', '银行卡名称'), | 623 | ('CardName', '银行卡名称'), |
| ... | @@ -478,3 +627,19 @@ BC_FIELD = (('CardNum', '银行卡号'), | ... | @@ -478,3 +627,19 @@ BC_FIELD = (('CardNum', '银行卡号'), |
| 478 | 627 | ||
| 479 | SUCCESS_CODE_SET = {'0', 0} | 628 | SUCCESS_CODE_SET = {'0', 0} |
| 480 | 629 | ||
| 630 | LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME)), | ||
| 631 | (IC_CLASSIFY, (IC_PID, IC_CN_NAME)), | ||
| 632 | (BC_CLASSIFY, (BC_PID, BC_CN_NAME)), | ||
| 633 | (BL_CLASSIFY, (BL_PID, BL_CN_NAME)), | ||
| 634 | (UCI_CLASSIFY, (UCI_PID, UCI_CN_NAME)), | ||
| 635 | (EEP_CLASSIFY, (EEP_PID, EEP_CN_NAME)), | ||
| 636 | (DL_CLASSIFY, (DL_PID, DL_CN_NAME)), | ||
| 637 | (PP_CLASSIFY, (PP_PID, PP_CN_NAME)), | ||
| 638 | (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME)), | ||
| 639 | (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME))) | ||
| 640 | |||
| 641 | LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER) | ||
| 642 | |||
| 643 | OTHER_CLASSIFY_SET = {OTHER_CLASSIFY} | ||
| 644 | LICENSE_CLASSIFY_SET_1 = {IC_CLASSIFY, VAT_CLASSIFY, MVC_CLASSIFY, MVI_CLASSIFY} | ||
| 645 | LICENSE_CLASSIFY_SET_2 = {BL_CLASSIFY, UCI_CLASSIFY, EEP_CLASSIFY, DL_CLASSIFY, PP_CLASSIFY, BC_CLASSIFY} | ... | ... |
src/apps/doc/exceptions.py
0 → 100644
| ... | @@ -4,6 +4,7 @@ import signal | ... | @@ -4,6 +4,7 @@ import signal |
| 4 | import asyncio | 4 | import asyncio |
| 5 | import aiohttp | 5 | import aiohttp |
| 6 | import difflib | 6 | import difflib |
| 7 | import base64 | ||
| 7 | import requests | 8 | import requests |
| 8 | from datetime import datetime, date | 9 | from datetime import datetime, date |
| 9 | from collections import Counter | 10 | from collections import Counter |
| ... | @@ -18,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords | ... | @@ -18,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords |
| 18 | from apps.doc.named_enum import KeywordsType | 19 | from apps.doc.named_enum import KeywordsType |
| 19 | from apps.doc import consts | 20 | from apps.doc import consts |
| 20 | from apps.doc.ocr.edms import EDMS, rh | 21 | from apps.doc.ocr.edms import EDMS, rh |
| 22 | from apps.doc.exceptions import EDMSException | ||
| 21 | 23 | ||
| 22 | 24 | ||
| 23 | class Command(BaseCommand, LoggerMixin): | 25 | class Command(BaseCommand, LoggerMixin): |
| ... | @@ -30,7 +32,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -30,7 +32,8 @@ class Command(BaseCommand, LoggerMixin): |
| 30 | # 数据目录 | 32 | # 数据目录 |
| 31 | self.data_dir = conf.DATA_DIR | 33 | self.data_dir = conf.DATA_DIR |
| 32 | # ocr相关 | 34 | # ocr相关 |
| 33 | self.ocr_url = conf.OCR_URL | 35 | self.ocr_url_1 = conf.OCR_URL_1 |
| 36 | self.ocr_url_2 = conf.OCR_URL_2 | ||
| 34 | # EDMS web_service_api | 37 | # EDMS web_service_api |
| 35 | self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD) | 38 | self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD) |
| 36 | # 优雅退出信号:15 | 39 | # 优雅退出信号:15 |
| ... | @@ -70,18 +73,44 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -70,18 +73,44 @@ class Command(BaseCommand, LoggerMixin): |
| 70 | os.makedirs(doc_data_path, exist_ok=True) | 73 | os.makedirs(doc_data_path, exist_ok=True) |
| 71 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 74 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
| 72 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 75 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
| 76 | for times in range(consts.RETRY_TIMES): | ||
| 77 | try: | ||
| 73 | self.edms.download(pdf_path, doc.metadata_version_id) | 78 | self.edms.download(pdf_path, doc.metadata_version_id) |
| 79 | except Exception as e: | ||
| 80 | self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
| 81 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
| 82 | edms_exc = str(e) | ||
| 83 | else: | ||
| 84 | break | ||
| 85 | else: | ||
| 86 | raise EDMSException(edms_exc) | ||
| 74 | 87 | ||
| 75 | excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) | 88 | excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) |
| 76 | src_excel_path = os.path.join(doc_data_path, 'src.xlsx') | 89 | src_excel_path = os.path.join(doc_data_path, 'src.xlsx') |
| 77 | self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( | 90 | self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( |
| 78 | self.log_base, business_type, doc.id, pdf_path)) | 91 | self.log_base, business_type, doc.id, pdf_path)) |
| 79 | return doc_data_path, excel_path, src_excel_path, pdf_path | 92 | return doc_data_path, excel_path, src_excel_path, pdf_path |
| 80 | 93 | ||
| 81 | @staticmethod | 94 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img): |
| 82 | def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence): | 95 | sheets = ocr_data.get('data', []) |
| 96 | if not sheets: | ||
| 97 | skip_img.append(self.parse_img_path(img_path)) | ||
| 98 | return | ||
| 99 | confidence = ocr_data.get('confidence', 1) | ||
| 100 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | ||
| 83 | for i, sheet in enumerate(sheets): | 101 | for i, sheet in enumerate(sheets): |
| 84 | sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, i) | 102 | cells = sheet.get('cells') |
| 103 | if not cells: | ||
| 104 | skip_img.append(self.parse_img_path(img_path)) | ||
| 105 | continue | ||
| 106 | sheet_name = '{0}_{1}'.format(img_name, i) | ||
| 107 | ws = wb.create_sheet(sheet_name) | ||
| 108 | for cell in cells: | ||
| 109 | c1 = cell.get('start_column') | ||
| 110 | r1 = cell.get('start_row') | ||
| 111 | words = cell.get('words') | ||
| 112 | ws.cell(row=r1 + 1, column=c1 + 1, value=words) | ||
| 113 | |||
| 85 | # ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'] | 114 | # ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'] |
| 86 | summary = sheet.get('summary') | 115 | summary = sheet.get('summary') |
| 87 | card = summary[1] | 116 | card = summary[1] |
| ... | @@ -129,74 +158,199 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -129,74 +158,199 @@ class Command(BaseCommand, LoggerMixin): |
| 129 | if summary[6] is not None: | 158 | if summary[6] is not None: |
| 130 | ed_list.append(summary[6]) | 159 | ed_list.append(summary[6]) |
| 131 | 160 | ||
| 132 | ws = wb.create_sheet(sheet_name) | 161 | def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path): |
| 133 | cells = sheet.get('cells') | 162 | license_data = ocr_data.get('data', []) |
| 134 | for cell in cells: | 163 | if not license_data: |
| 135 | c1 = cell.get('start_column') | 164 | skip_img.append(self.parse_img_path(img_path)) |
| 136 | r1 = cell.get('start_row') | 165 | return |
| 137 | words = cell.get('words') | 166 | for license_dict in license_data: |
| 138 | ws.cell(row=r1+1, column=c1+1, value=words) | 167 | res_list = [] |
| 168 | for field, value in license_dict.items(): | ||
| 169 | res_list.append((field, value)) | ||
| 170 | license_summary.setdefault(classify, []).append(res_list) | ||
| 171 | |||
| 172 | def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path): | ||
| 173 | if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: | ||
| 174 | if pid == consts.BC_PID: | ||
| 175 | # 银行卡 | ||
| 176 | res_list = [] | ||
| 177 | for en_key, chn_key in consts.BC_FIELD: | ||
| 178 | res_list.append((chn_key, ocr_res_2.get(en_key, ''))) | ||
| 179 | license_summary.setdefault(classify, []).append(res_list) | ||
| 180 | else: | ||
| 181 | # 营业执照、行驶证等 | ||
| 182 | for result_dict in ocr_res_2.get('ResultList', []): | ||
| 183 | res_list = [] | ||
| 184 | for field_dict in result_dict.get('FieldList', []): | ||
| 185 | res_list.append( | ||
| 186 | (field_dict.get('chn_key', ''), field_dict.get('value', ''))) | ||
| 187 | license_summary.setdefault(classify, []).append(res_list) | ||
| 188 | else: | ||
| 189 | skip_img.append(self.parse_img_path(img_path)) | ||
| 139 | 190 | ||
| 140 | def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary): | 191 | @staticmethod |
| 141 | # res = { | 192 | async def fetch_ocr_1_result(url, json_data): |
| 142 | # 'code': 1, | 193 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: |
| 143 | # 'msg': 'success', | 194 | async with session.post(url, json=json_data) as response: |
| 144 | # 'data': { | 195 | if response.status == 200: |
| 145 | # 'classify': 0, | 196 | return await response.json() |
| 146 | # 'confidence': 0.999, | 197 | |
| 147 | # 'sheets': [ | 198 | @staticmethod |
| 148 | # { | 199 | async def fetch_ocr_2_result(url, json_data): |
| 149 | # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | 200 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: |
| 150 | # 'cells': [] | 201 | async with session.post(url, data=json_data) as response: |
| 151 | # }, | 202 | if response.status == 200: |
| 152 | # { | 203 | return await response.json() |
| 153 | # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | 204 | |
| 154 | # 'cells': [] | 205 | async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): |
| 155 | # } | 206 | with open(img_path, 'rb') as f: |
| 156 | # ] | 207 | base64_data = base64.b64encode(f.read()) |
| 157 | # } | 208 | # 获取解码后的base64值 |
| 158 | # } | 209 | file_data = base64_data.decode() |
| 159 | data = res.get('data', {}) | 210 | json_data_1 = { |
| 160 | classify = data.get('classify') | 211 | "file": file_data |
| 212 | } | ||
| 213 | ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1) | ||
| 214 | if ocr_res_1 is None: | ||
| 215 | raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
| 216 | else: | ||
| 217 | self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | ||
| 218 | self.log_base, img_path, ocr_res_1)) | ||
| 219 | |||
| 220 | if ocr_res_1.get('code') == 1: | ||
| 221 | ocr_data = ocr_res_1.get('data', {}) | ||
| 222 | classify = ocr_data.get('classify') | ||
| 161 | if classify is None: | 223 | if classify is None: |
| 224 | skip_img.append(self.parse_img_path(img_path)) | ||
| 162 | return | 225 | return |
| 163 | # if classify in | 226 | elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 |
| 164 | sheets = data.get('sheets', []) | 227 | skip_img.append(self.parse_img_path(img_path)) |
| 165 | if not sheets: | ||
| 166 | return | 228 | return |
| 167 | confidence = data.get('confidence', 1) | 229 | elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 |
| 168 | self.append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence) | 230 | self.license1_process(ocr_data, license_summary, classify, skip_img, img_path) |
| 169 | # else: | 231 | elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 |
| 170 | # pass | 232 | pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) |
| 171 | 233 | json_data_2 = { | |
| 172 | # async def fetch_ocr_result(self, img_path): | 234 | "pid": str(pid), |
| 173 | # async with aiohttp.ClientSession( | 235 | "key": conf.OCR_KEY, |
| 174 | # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False) | 236 | "secret": conf.OCR_SECRET, |
| 175 | # ) as session: | 237 | "file": file_data |
| 176 | # json_data = self.get_ocr_json(img_path) | 238 | } |
| 177 | # async with session.post(self.ocr_url, json=json_data) as response: | 239 | ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2) |
| 178 | # return await response.json() | 240 | if ocr_res_2 is None: |
| 241 | raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
| 242 | else: | ||
| 243 | # 识别结果 | ||
| 244 | self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | ||
| 245 | self.log_base, img_path, ocr_res_2)) | ||
| 246 | self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) | ||
| 247 | else: # 流水处理 | ||
| 248 | self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) | ||
| 249 | else: | ||
| 250 | skip_img.append(self.parse_img_path(img_path)) | ||
| 251 | |||
| 252 | # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): | ||
| 253 | # # # 流水 | ||
| 254 | # # res = { | ||
| 255 | # # 'code': 1, | ||
| 256 | # # 'msg': 'success', | ||
| 257 | # # 'data': { | ||
| 258 | # # 'classify': 0, | ||
| 259 | # # 'confidence': 0.999, | ||
| 260 | # # 'data': [ | ||
| 261 | # # { | ||
| 262 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
| 263 | # # 'cells': [] | ||
| 264 | # # }, | ||
| 265 | # # { | ||
| 266 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
| 267 | # # 'cells': [] | ||
| 268 | # # } | ||
| 269 | # # ] | ||
| 270 | # # } | ||
| 271 | # # } | ||
| 272 | # # | ||
| 273 | # # # 证件-1 | ||
| 274 | # # res = { | ||
| 275 | # # 'code': 1, | ||
| 276 | # # 'msg': 'success', | ||
| 277 | # # 'data': { | ||
| 278 | # # 'classify': 0, | ||
| 279 | # # 'confidence': 0.999, | ||
| 280 | # # 'data': [ | ||
| 281 | # # { | ||
| 282 | # # 'cn_key': 'value', | ||
| 283 | # # 'cn_key': 'value', | ||
| 284 | # # }, | ||
| 285 | # # { | ||
| 286 | # # 'cn_key': 'value', | ||
| 287 | # # 'cn_key': 'value', | ||
| 288 | # # }, | ||
| 289 | # # ] | ||
| 290 | # # } | ||
| 291 | # # } | ||
| 292 | # # | ||
| 293 | # # # 证件-2 or 其他类 | ||
| 294 | # # res = { | ||
| 295 | # # 'code': 1, | ||
| 296 | # # 'msg': 'success', | ||
| 297 | # # 'data': { | ||
| 298 | # # 'classify': 0, | ||
| 299 | # # 'confidence': 0.999, | ||
| 300 | # # } | ||
| 301 | # # } | ||
| 302 | # with open(img_path, 'rb') as f: | ||
| 303 | # base64_data = base64.b64encode(f.read()) | ||
| 304 | # # 获取解码后的base64值 | ||
| 305 | # file_data = base64_data.decode() | ||
| 306 | # json_data_1 = { | ||
| 307 | # "file": file_data | ||
| 308 | # } | ||
| 309 | # response_1 = requests.post(self.ocr_url_1, json=json_data_1) | ||
| 310 | # if response_1.status_code == 200: | ||
| 311 | # ocr_res_1 = response_1.json() | ||
| 312 | # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | ||
| 313 | # self.log_base, img_path, ocr_res_1)) | ||
| 179 | # | 314 | # |
| 180 | # async def img_2_ocr_2_wb(self, wb, img_path, summary): | 315 | # if ocr_res_1.get('code') == 1: |
| 181 | # res = await self.fetch_ocr_result(img_path) | 316 | # ocr_data = ocr_res_1.get('data', {}) |
| 182 | # self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) | 317 | # classify = ocr_data.get('classify') |
| 183 | # sheets_list = res.get('result').get('res') | 318 | # if classify is None: |
| 184 | # img_name = os.path.basename(img_path) | 319 | # skip_img.append(self.parse_img_path(img_path)) |
| 185 | # self.append_sheet(wb, sheets_list, img_name, summary) | 320 | # return |
| 186 | 321 | # elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 | |
| 187 | def fetch_ocr_result(self, img_path): | 322 | # skip_img.append(self.parse_img_path(img_path)) |
| 188 | files = [ | 323 | # return |
| 189 | ('img', open(img_path, 'rb')) | 324 | # elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 |
| 190 | ] | 325 | # self.license1_process(ocr_data, license_summary, classify, skip_img, img_path) |
| 191 | response = requests.request("POST", self.ocr_url, files=files) | 326 | # elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 |
| 192 | return response.json() | 327 | # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) |
| 193 | 328 | # json_data_2 = { | |
| 194 | def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary): | 329 | # "pid": str(pid), |
| 195 | res = self.fetch_ocr_result(img_info[0]) | 330 | # "key": conf.OCR_KEY, |
| 196 | self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format( | 331 | # "secret": conf.OCR_SECRET, |
| 197 | self.log_base, img_info[0], res)) | 332 | # "file": file_data |
| 198 | if res.get('code') == 1: | 333 | # } |
| 199 | self.ocr_2_wb(res, wb, img_info[1], img_info[2], bs_summary, unknown_summary, license_summary) | 334 | # response_2 = requests.post(self.ocr_url_2, data=json_data_2) |
| 335 | # if response_2.status_code == 200: | ||
| 336 | # # 识别结果 | ||
| 337 | # ocr_res_2 = response_2.json() | ||
| 338 | # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | ||
| 339 | # self.log_base, img_path, ocr_res_2)) | ||
| 340 | # self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) | ||
| 341 | # else: | ||
| 342 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
| 343 | # else: # 流水处理 | ||
| 344 | # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) | ||
| 345 | # else: | ||
| 346 | # skip_img.append(self.parse_img_path(img_path)) | ||
| 347 | # else: | ||
| 348 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
| 349 | |||
| 350 | @staticmethod | ||
| 351 | def parse_img_path(img_path): | ||
| 352 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | ||
| 353 | return int(img_name[5])+1, int(img_name[11])+1 | ||
| 200 | 354 | ||
| 201 | @staticmethod | 355 | @staticmethod |
| 202 | def get_most(value_list): | 356 | def get_most(value_list): |
| ... | @@ -255,7 +409,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -255,7 +409,6 @@ class Command(BaseCommand, LoggerMixin): |
| 255 | summary['role'] = self.get_most(summary['role']) | 409 | summary['role'] = self.get_most(summary['role']) |
| 256 | return bs_summary | 410 | return bs_summary |
| 257 | 411 | ||
| 258 | |||
| 259 | def rebuild_bs_summary(self, bs_summary, unknown_summary): | 412 | def rebuild_bs_summary(self, bs_summary, unknown_summary): |
| 260 | # bs_summary = { | 413 | # bs_summary = { |
| 261 | # '卡号': { | 414 | # '卡号': { |
| ... | @@ -297,8 +450,10 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -297,8 +450,10 @@ class Command(BaseCommand, LoggerMixin): |
| 297 | merged_bs_summary[card] = summary | 450 | merged_bs_summary[card] = summary |
| 298 | else: | 451 | else: |
| 299 | # 1卡号 | 452 | # 1卡号 |
| 453 | one_card = False | ||
| 300 | if len(bs_summary) == 1: | 454 | if len(bs_summary) == 1: |
| 301 | merged_bs_summary = self.prune_bs_summary(bs_summary) | 455 | merged_bs_summary = self.prune_bs_summary(bs_summary) |
| 456 | one_card = True | ||
| 302 | # 多卡号 | 457 | # 多卡号 |
| 303 | else: | 458 | else: |
| 304 | merged_bs_summary = self.merge_card(bs_summary) | 459 | merged_bs_summary = self.merge_card(bs_summary) |
| ... | @@ -307,7 +462,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -307,7 +462,7 @@ class Command(BaseCommand, LoggerMixin): |
| 307 | merge_role = [] | 462 | merge_role = [] |
| 308 | classify_summary = unknown_summary.get(card_summary['classify'], {}) | 463 | classify_summary = unknown_summary.get(card_summary['classify'], {}) |
| 309 | for role, summary in classify_summary.items(): | 464 | for role, summary in classify_summary.items(): |
| 310 | if role in card_summary['role_set']: | 465 | if one_card or role in card_summary['role_set']: |
| 311 | merge_role.append(role) | 466 | merge_role.append(role) |
| 312 | card_summary['sheet'].extend(summary['sheet']) | 467 | card_summary['sheet'].extend(summary['sheet']) |
| 313 | card_summary['code'].extend(summary['code']) | 468 | card_summary['code'].extend(summary['code']) |
| ... | @@ -336,12 +491,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -336,12 +491,13 @@ class Command(BaseCommand, LoggerMixin): |
| 336 | return merged_bs_summary | 491 | return merged_bs_summary |
| 337 | 492 | ||
| 338 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 | 493 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 |
| 339 | # TODO 调用接口重试 | ||
| 340 | # TODO 协程异步发送OCR请求 | ||
| 341 | # TODO 异常邮件通知 | 494 | # TODO 异常邮件通知 |
| 495 | # 识别失败:普通异常,如PDF异常、构建过程异常 | ||
| 496 | # EDMS异常:下载异常-->回队列-->邮件;上传异常-->重新上传队列-->邮件 | ||
| 497 | # 算法异常:第一道异常-->识别失败-->邮件;第二道异常-->识别失败-->邮件 | ||
| 498 | # TODO 协程异步发送OCR请求 | ||
| 499 | # TODO 调用接口重试 | ||
| 342 | # TODO 数据库断联问题 | 500 | # TODO 数据库断联问题 |
| 343 | # TODO 非流水证件处理 | ||
| 344 | # TODO EDMS API GATEWAY | ||
| 345 | def handle(self, *args, **kwargs): | 501 | def handle(self, *args, **kwargs): |
| 346 | sleep_second = int(conf.SLEEP_SECOND) | 502 | sleep_second = int(conf.SLEEP_SECOND) |
| 347 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) | 503 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) |
| ... | @@ -369,61 +525,82 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -369,61 +525,82 @@ class Command(BaseCommand, LoggerMixin): |
| 369 | pdf_handler.extract_image() | 525 | pdf_handler.extract_image() |
| 370 | self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format( | 526 | self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format( |
| 371 | self.log_base, business_type, doc.id)) | 527 | self.log_base, business_type, doc.id)) |
| 372 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
| 373 | 528 | ||
| 374 | # 4.获取OCR结果并且构建excel文件 | 529 | # 4.获取OCR结果并且构建excel文件 |
| 375 | bs_summary = {} | 530 | bs_summary = {} |
| 376 | license_summary = {} | 531 | license_summary = {} |
| 377 | unknown_summary = {} | 532 | unknown_summary = {} |
| 533 | skip_img = [] | ||
| 378 | interest_keyword = Keywords.objects.filter( | 534 | interest_keyword = Keywords.objects.filter( |
| 379 | type=KeywordsType.INTEREST.value).values_list('keyword', flat=True) | 535 | type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True) |
| 380 | salary_keyword = Keywords.objects.filter( | 536 | salary_keyword = Keywords.objects.filter( |
| 381 | type=KeywordsType.SALARY.value).values_list('keyword', flat=True) | 537 | type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True) |
| 382 | loan_keyword = Keywords.objects.filter( | 538 | loan_keyword = Keywords.objects.filter( |
| 383 | type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value]).values_list( | 539 | type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list( |
| 384 | 'keyword', flat=True) | 540 | 'keyword', flat=True) |
| 385 | wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) | 541 | wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) |
| 386 | 542 | ||
| 387 | # wb = Workbook() | 543 | # wb = Workbook() |
| 388 | 544 | ||
| 389 | # 4.1 获取OCR结果 | 545 | # 4.1 获取OCR结果 |
| 390 | # loop = asyncio.get_event_loop() | 546 | loop = asyncio.get_event_loop() |
| 391 | # tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list] | 547 | tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) |
| 392 | # loop.run_until_complete(asyncio.wait(tasks)) | 548 | for img_path in pdf_handler.img_path_list] |
| 549 | loop.run_until_complete(asyncio.wait(tasks)) | ||
| 393 | # loop.close() | 550 | # loop.close() |
| 394 | 551 | ||
| 395 | for img_info in pdf_handler.img_info_list: | 552 | # for img_path in pdf_handler.img_path_list: |
| 396 | self.img_2_ocr_2_wb(wb, img_info, bs_summary, unknown_summary, license_summary) | 553 | # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) |
| 397 | 554 | ||
| 398 | self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format( | 555 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' |
| 399 | self.log_base, bs_summary, unknown_summary, license_summary)) | 556 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, |
| 557 | unknown_summary, license_summary)) | ||
| 400 | 558 | ||
| 401 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | 559 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) |
| 402 | 560 | ||
| 403 | self.cronjob_log.info('{0} [merged_bs_summary={1}] [unknown_summary={2}]'.format( | 561 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] ' |
| 404 | self.log_base, merged_bs_summary, unknown_summary)) | 562 | '[unknown_summary={4}] [skip_img={5}]'.format(self.log_base, business_type, |
| 563 | doc.id, merged_bs_summary, | ||
| 564 | unknown_summary, skip_img)) | ||
| 405 | del unknown_summary | 565 | del unknown_summary |
| 406 | 566 | ||
| 407 | # 4.2 重构Excel文件 | 567 | # 4.2 重构Excel文件 |
| 408 | wb.save(src_excel_path) | 568 | wb.save(src_excel_path) |
| 409 | wb.rebuild(merged_bs_summary, license_summary) | 569 | wb.rebuild(merged_bs_summary, license_summary, skip_img) |
| 410 | wb.save(excel_path) | 570 | wb.save(excel_path) |
| 571 | except EDMSException as e: | ||
| 572 | self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] ' | ||
| 573 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) | ||
| 411 | except Exception as e: | 574 | except Exception as e: |
| 412 | doc.status = DocStatus.PROCESS_FAILED.value | 575 | doc.status = DocStatus.PROCESS_FAILED.value |
| 413 | doc.save() | 576 | doc.save() |
| 414 | self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( | 577 | self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] ' |
| 415 | self.log_base, business_type, doc.id, e)) | 578 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) |
| 416 | else: | 579 | else: |
| 417 | try: | 580 | try: |
| 418 | # 5.上传至EDMS | 581 | # 5.上传至EDMS |
| 582 | for times in range(consts.RETRY_TIMES): | ||
| 583 | try: | ||
| 419 | self.edms.upload(excel_path, doc, business_type) | 584 | self.edms.upload(excel_path, doc, business_type) |
| 420 | except Exception as e: | 585 | except Exception as e: |
| 586 | self.cronjob_log.warn( | ||
| 587 | '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
| 588 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
| 589 | edms_exc = str(e) | ||
| 590 | else: | ||
| 591 | break | ||
| 592 | else: | ||
| 593 | raise EDMSException(edms_exc) | ||
| 594 | except Exception as e: | ||
| 421 | doc.status = DocStatus.UPLOAD_FAILED.value | 595 | doc.status = DocStatus.UPLOAD_FAILED.value |
| 422 | doc.save() | 596 | doc.save() |
| 423 | end_time = time.time() | 597 | end_time = time.time() |
| 424 | speed_time = int(end_time - start_time) | 598 | speed_time = int(end_time - start_time) |
| 425 | self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] ' | 599 | self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] ' |
| 426 | '[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e)) | 600 | '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id, |
| 601 | speed_time, e)) | ||
| 602 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
| 603 | |||
| 427 | else: | 604 | else: |
| 428 | doc.status = DocStatus.COMPLETE.value | 605 | doc.status = DocStatus.COMPLETE.value |
| 429 | doc.save() | 606 | doc.save() |
| ... | @@ -431,5 +608,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -431,5 +608,6 @@ class Command(BaseCommand, LoggerMixin): |
| 431 | speed_time = int(end_time - start_time) | 608 | speed_time = int(end_time - start_time) |
| 432 | self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] ' | 609 | self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] ' |
| 433 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) | 610 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) |
| 611 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
| 434 | 612 | ||
| 435 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | 613 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | ... | ... |
| ... | @@ -141,32 +141,22 @@ class BSWorkbook(Workbook): | ... | @@ -141,32 +141,22 @@ class BSWorkbook(Workbook): |
| 141 | # month_info process | 141 | # month_info process |
| 142 | month_info = month_mapping.setdefault('xxxx-xx', []) | 142 | month_info = month_mapping.setdefault('xxxx-xx', []) |
| 143 | month_info.append((ws.title, min_row, ws.max_row, 0)) | 143 | month_info.append((ws.title, min_row, ws.max_row, 0)) |
| 144 | elif len(month_list) == 1: | ||
| 145 | # reverse_trend_list process | ||
| 146 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | ||
| 147 | reverse_trend_list.append(reverse_trend) | ||
| 148 | # month_info process | ||
| 149 | month_info = month_mapping.setdefault(month_list[0], []) | ||
| 150 | day_mean = np.mean(dti.day.dropna()) | ||
| 151 | if len(month_info) == 0: | ||
| 152 | month_info.append((ws.title, min_row, ws.max_row, day_mean)) | ||
| 153 | else: | ||
| 154 | for i, item in enumerate(month_info): | ||
| 155 | if day_mean <= item[-1]: | ||
| 156 | month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean)) | ||
| 157 | break | ||
| 158 | else: | ||
| 159 | month_info.append((ws.title, min_row, ws.max_row, day_mean)) | ||
| 160 | else: | 144 | else: |
| 161 | # reverse_trend_list process | 145 | # reverse_trend_list process |
| 162 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | 146 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) |
| 163 | reverse_trend_list.append(reverse_trend) | 147 | reverse_trend_list.append(reverse_trend) |
| 164 | # month_info process | 148 | # month_info process |
| 165 | for i, item in enumerate(month_list[:-1]): | 149 | day_idx = dti.day |
| 150 | idx_list_max_idx = len(idx_list) - 1 | ||
| 151 | for i, item in enumerate(month_list): | ||
| 152 | if i == idx_list_max_idx: | ||
| 153 | day_mean = np.mean(day_idx[idx_list[i]:].dropna()) | ||
| 166 | month_mapping.setdefault(item, []).append( | 154 | month_mapping.setdefault(item, []).append( |
| 167 | (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN)) | 155 | (ws.title, idx_list[i] + min_row, ws.max_row, day_mean)) |
| 168 | month_mapping.setdefault(month_list[-1], []).insert( | 156 | else: |
| 169 | 0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0)) | 157 | day_mean = np.mean(day_idx[idx_list[i]: idx_list[i + 1]].dropna()) |
| 158 | month_mapping.setdefault(item, []).append( | ||
| 159 | (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean)) | ||
| 170 | 160 | ||
| 171 | def build_metadata_rows(self, confidence, code, print_time, start_date, end_date): | 161 | def build_metadata_rows(self, confidence, code, print_time, start_date, end_date): |
| 172 | if start_date is None or end_date is None: | 162 | if start_date is None or end_date is None: |
| ... | @@ -191,9 +181,9 @@ class BSWorkbook(Workbook): | ... | @@ -191,9 +181,9 @@ class BSWorkbook(Workbook): |
| 191 | def create_meta_sheet(self, card): | 181 | def create_meta_sheet(self, card): |
| 192 | if self.worksheets[0].title == 'Sheet': | 182 | if self.worksheets[0].title == 'Sheet': |
| 193 | ms = self.worksheets[0] | 183 | ms = self.worksheets[0] |
| 194 | ms.title = '{0}({1})'.format(self.meta_sheet_title, card) | 184 | ms.title = '{0}({1})'.format(self.meta_sheet_title, card[-6:]) |
| 195 | else: | 185 | else: |
| 196 | ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, card)) | 186 | ms = self.create_sheet('{0}({1})'.format(self.meta_sheet_title, card[-6:])) |
| 197 | return ms | 187 | return ms |
| 198 | 188 | ||
| 199 | def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date): | 189 | def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date): |
| ... | @@ -203,6 +193,26 @@ class BSWorkbook(Workbook): | ... | @@ -203,6 +193,26 @@ class BSWorkbook(Workbook): |
| 203 | ms.append(row) | 193 | ms.append(row) |
| 204 | return ms | 194 | return ms |
| 205 | 195 | ||
| 196 | @staticmethod | ||
| 197 | def amount_format(amount_str): | ||
| 198 | if not isinstance(amount_str, str) or amount_str == '': | ||
| 199 | return amount_str | ||
| 200 | # 1.替换 | ||
| 201 | res_str = amount_str.translate(consts.TRANS) | ||
| 202 | # 2.删除多余的- | ||
| 203 | res_str = res_str[0] + res_str[1:].replace('-', '') | ||
| 204 | # 3.首字符处理 | ||
| 205 | if res_str[0] in consts.ERROR_CHARS: | ||
| 206 | res_str = '-{0}'.format(res_str[1:]) | ||
| 207 | # 4.逗号与句号处理 | ||
| 208 | if len(res_str) >= 4: | ||
| 209 | period_idx = len(res_str) - 3 | ||
| 210 | if res_str[period_idx] == '.' and res_str[period_idx - 1] == ',': | ||
| 211 | res_str = '{0}{1}'.format(res_str[:period_idx - 1], res_str[period_idx:]) | ||
| 212 | elif res_str[period_idx] == ',': | ||
| 213 | res_str = '{0}.{1}'.format(res_str[:period_idx], res_str[period_idx + 1:]) | ||
| 214 | return res_str | ||
| 215 | |||
| 206 | def build_month_sheet(self, card, month_mapping, ms, is_reverse): | 216 | def build_month_sheet(self, card, month_mapping, ms, is_reverse): |
| 207 | tmp_ws = self.create_sheet('tmp_ws') | 217 | tmp_ws = self.create_sheet('tmp_ws') |
| 208 | for month in sorted(month_mapping.keys()): | 218 | for month in sorted(month_mapping.keys()): |
| ... | @@ -235,29 +245,25 @@ class BSWorkbook(Workbook): | ... | @@ -235,29 +245,25 @@ class BSWorkbook(Workbook): |
| 235 | # 3.3.余额转数值 | 245 | # 3.3.余额转数值 |
| 236 | over_cell = rows[consts.OVER_IDX] | 246 | over_cell = rows[consts.OVER_IDX] |
| 237 | try: | 247 | try: |
| 238 | if isinstance(over_cell.value, str): | 248 | over_cell.value = locale.atof(self.amount_format(over_cell.value)) |
| 239 | over_cell.value = over_cell.value.translate(consts.TRANS) | ||
| 240 | over_cell.value = locale.atof(over_cell.value) | ||
| 241 | except Exception as e: | 249 | except Exception as e: |
| 242 | continue | 250 | continue |
| 243 | else: | 251 | else: |
| 244 | over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 | 252 | over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 |
| 245 | 253 | ||
| 246 | # 3.4.余额转数值 | 254 | # 3.4.金额转数值 |
| 247 | try: | 255 | try: |
| 248 | try: | 256 | try: |
| 249 | if isinstance(amount_cell.value, str): # TODO 可在转化数字失败后,再替换 | 257 | amount_cell.value = locale.atof(self.amount_format(amount_cell.value)) |
| 250 | amount_cell.value = amount_cell.value.translate(consts.TRANS) | ||
| 251 | amount_cell.value = locale.atof(amount_cell.value) | ||
| 252 | except Exception as e: | 258 | except Exception as e: |
| 253 | try: | 259 | try: |
| 254 | if isinstance(rows[consts.INCOME_IDX].value, str): | 260 | amount_cell.value = locale.atof(self.amount_format(rows[consts.INCOME_IDX].value)) |
| 255 | rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS) | 261 | if amount_cell.value == 0: |
| 256 | amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) | 262 | raise |
| 263 | elif amount_cell.value < 0: | ||
| 264 | amount_cell.value = -amount_cell.value | ||
| 257 | except Exception as e: | 265 | except Exception as e: |
| 258 | if isinstance(rows[consts.OUTLAY_IDX].value, str): | 266 | amount_cell.value = locale.atof(self.amount_format(rows[consts.OUTLAY_IDX].value)) |
| 259 | rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS) | ||
| 260 | amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) | ||
| 261 | if amount_cell.value > 0: | 267 | if amount_cell.value > 0: |
| 262 | amount_cell.value = -amount_cell.value | 268 | amount_cell.value = -amount_cell.value |
| 263 | except Exception as e: | 269 | except Exception as e: |
| ... | @@ -313,18 +319,18 @@ class BSWorkbook(Workbook): | ... | @@ -313,18 +319,18 @@ class BSWorkbook(Workbook): |
| 313 | # } | 319 | # } |
| 314 | for card, summary in bs_summary.items(): | 320 | for card, summary in bs_summary.items(): |
| 315 | # 1.原表修剪、排列、按照月份分割 | 321 | # 1.原表修剪、排列、按照月份分割 |
| 316 | start_date = summary['start_date'] | 322 | start_date = summary.get('start_date') |
| 317 | end_date = summary['end_date'] | 323 | end_date = summary.get('end_date') |
| 318 | date_statistics = False | 324 | date_statistics = False |
| 319 | if start_date is None or end_date is None: | 325 | if start_date is None or end_date is None: |
| 320 | date_statistics = True | 326 | date_statistics = True |
| 321 | date_list = [] | 327 | date_list = [] |
| 322 | month_mapping = {} | 328 | month_mapping = {} |
| 323 | reverse_trend_list = [] | 329 | reverse_trend_list = [] |
| 324 | for sheet in summary['sheet']: | 330 | for sheet in summary.get('sheet', []): |
| 325 | ws = self.get_sheet_by_name(sheet) | 331 | ws = self.get_sheet_by_name(sheet) |
| 326 | # 1.1.删除多余列、排列 | 332 | # 1.1.删除多余列、排列 |
| 327 | min_row = self.sheet_prune(ws, summary['classify']) | 333 | min_row = self.sheet_prune(ws, summary.get('classify', 0)) |
| 328 | # 1.2.按月份分割 | 334 | # 1.2.按月份分割 |
| 329 | self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics) | 335 | self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics) |
| 330 | 336 | ||
| ... | @@ -334,32 +340,43 @@ class BSWorkbook(Workbook): | ... | @@ -334,32 +340,43 @@ class BSWorkbook(Workbook): |
| 334 | 340 | ||
| 335 | # 2.元信息提取表 | 341 | # 2.元信息提取表 |
| 336 | ms = self.build_meta_sheet(card, | 342 | ms = self.build_meta_sheet(card, |
| 337 | summary['confidence'], | 343 | summary.get('confidence', 1), |
| 338 | summary['code'], | 344 | summary.get('code'), |
| 339 | summary['print_time'], | 345 | summary.get('print_time'), |
| 340 | start_date, | 346 | start_date, |
| 341 | end_date) | 347 | end_date) |
| 342 | 348 | ||
| 343 | # 3.创建月份表、提取/高亮关键行 | 349 | # 3.创建月份表、提取/高亮关键行 |
| 344 | is_reverse = False | 350 | # 倒序处理 |
| 345 | if sum(reverse_trend_list) > 0: # 倒序处理 | 351 | is_reverse = True if sum(reverse_trend_list) > 0 else False |
| 346 | is_reverse = True | ||
| 347 | for month_list in month_mapping.values(): | 352 | for month_list in month_mapping.values(): |
| 348 | month_list.sort(key=lambda x: x[-1], reverse=True) | 353 | month_list.sort(key=lambda x: x[-1], reverse=is_reverse) |
| 354 | |||
| 349 | self.build_month_sheet(card, month_mapping, ms, is_reverse) | 355 | self.build_month_sheet(card, month_mapping, ms, is_reverse) |
| 350 | 356 | ||
| 351 | # 4.删除原表 | 357 | # 4.删除原表 |
| 352 | for sheet in summary['sheet']: | 358 | for sheet in summary.get('sheet'): |
| 353 | self.remove(self.get_sheet_by_name(sheet)) | 359 | self.remove(self.get_sheet_by_name(sheet)) |
| 354 | 360 | ||
| 355 | def license_rebuild(self, license_summary): | 361 | def license_rebuild(self, license_summary): |
| 356 | for en_key, cn_key in consts.LICENSE_ORDER: | 362 | for classify, (_, name) in consts.LICENSE_ORDER: |
| 357 | ws = self.create_sheet(cn_key) | 363 | res = license_summary.get(classify) |
| 358 | for bl in license_summary.get(en_key, []): | 364 | if res is None: |
| 365 | continue | ||
| 366 | ws = self.create_sheet(name) | ||
| 367 | for bl in res: | ||
| 359 | for bl_field in bl: | 368 | for bl_field in bl: |
| 360 | ws.append(bl_field) | 369 | ws.append(bl_field) |
| 361 | ws.append((None, )) | 370 | ws.append((None, )) |
| 362 | 371 | ||
| 363 | def rebuild(self, bs_summary, license_summary): | 372 | def skip_img_sheet(self, skip_img): |
| 373 | if skip_img: | ||
| 374 | ws = self.create_sheet(consts.SKIP_IMG_SHEET_NAME) | ||
| 375 | ws.append(consts.SKIP_IMG_SHEET_HEADER) | ||
| 376 | for img_tuple in skip_img: | ||
| 377 | ws.append(img_tuple) | ||
| 378 | |||
| 379 | def rebuild(self, bs_summary, license_summary, skip_img): | ||
| 364 | self.bs_rebuild(bs_summary) | 380 | self.bs_rebuild(bs_summary) |
| 365 | # self.license_rebuild(license_summary) | 381 | self.license_rebuild(license_summary) |
| 382 | self.skip_img_sheet(skip_img) | ... | ... |
| ... | @@ -25,7 +25,7 @@ class PDFHandler: | ... | @@ -25,7 +25,7 @@ class PDFHandler: |
| 25 | def __init__(self, path, img_dir_path): | 25 | def __init__(self, path, img_dir_path): |
| 26 | self.path = path | 26 | self.path = path |
| 27 | self.img_dir_path = img_dir_path | 27 | self.img_dir_path = img_dir_path |
| 28 | self.img_info_list = [] | 28 | self.img_path_list = [] |
| 29 | self.xref_set = set() | 29 | self.xref_set = set() |
| 30 | 30 | ||
| 31 | def get_img_save_path(self, pno, img_index=0, ext='png'): | 31 | def get_img_save_path(self, pno, img_index=0, ext='png'): |
| ... | @@ -38,7 +38,7 @@ class PDFHandler: | ... | @@ -38,7 +38,7 @@ class PDFHandler: |
| 38 | pm = page.getPixmap(matrix=trans_2, alpha=False) | 38 | pm = page.getPixmap(matrix=trans_2, alpha=False) |
| 39 | img_save_path = self.get_img_save_path(page.number) | 39 | img_save_path = self.get_img_save_path(page.number) |
| 40 | pm.writePNG(img_save_path) | 40 | pm.writePNG(img_save_path) |
| 41 | self.img_info_list.append((img_save_path, page.number, 0)) | 41 | self.img_path_list.append(img_save_path) |
| 42 | 42 | ||
| 43 | @staticmethod | 43 | @staticmethod |
| 44 | def getimage(pix): | 44 | def getimage(pix): |
| ... | @@ -88,7 +88,7 @@ class PDFHandler: | ... | @@ -88,7 +88,7 @@ class PDFHandler: |
| 88 | with open(img_save_path, "wb") as f: | 88 | with open(img_save_path, "wb") as f: |
| 89 | f.write(img_data) | 89 | f.write(img_data) |
| 90 | self.xref_set.add(xref) | 90 | self.xref_set.add(xref) |
| 91 | self.img_info_list.append((img_save_path, pno, img_index)) | 91 | self.img_path_list.append(img_save_path) |
| 92 | 92 | ||
| 93 | @staticmethod | 93 | @staticmethod |
| 94 | def split_il(il): | 94 | def split_il(il): |
| ... | @@ -179,7 +179,7 @@ class PDFHandler: | ... | @@ -179,7 +179,7 @@ class PDFHandler: |
| 179 | img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) | 179 | img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) |
| 180 | new_img.save(img_save_path) | 180 | new_img.save(img_save_path) |
| 181 | page_to_png = False | 181 | page_to_png = False |
| 182 | self.img_info_list.append((img_save_path, pno, img_index)) | 182 | self.img_path_list.append(img_save_path) |
| 183 | 183 | ||
| 184 | # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 | 184 | # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 |
| 185 | if page_to_png: | 185 | if page_to_png: | ... | ... |
-
Please register or sign in to post a comment