7dfc2ee8 by 周伟奇

merge license

2 parents 1242adb8 e570371a
...@@ -33,6 +33,5 @@ data/* ...@@ -33,6 +33,5 @@ data/*
33 # 脚本 33 # 脚本
34 src/*.sh 34 src/*.sh
35 35
36 test.py
37 ocr_test.py
38 ocr_test_2.py
...\ No newline at end of file ...\ No newline at end of file
36 test*
37 ocr_test.py
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -35,9 +35,35 @@ DEALER_CODE_META_FIELD_id = 13 ...@@ -35,9 +35,35 @@ DEALER_CODE_META_FIELD_id = 13
35 BUSINESS_TYPE_META_FIELD_id = 93 35 BUSINESS_TYPE_META_FIELD_id = 93
36 DEALER_CODE = 'ocr_situ_group' 36 DEALER_CODE = 'ocr_situ_group'
37 37
38 RETRY_TIMES = 3
39
38 # ---------银行流水模板相关-------------------------------------------------------------------------------------------- 40 # ---------银行流水模板相关--------------------------------------------------------------------------------------------
39 41
40 TRANS = str.maketrans('Cc((oODlLmAsSbg', '000000011345569') 42 TRANS_MAP = {
43 'C': "0",
44 'c': "0",
45 '(': "0",
46 'o': "0",
47 'O': "0",
48 'D': "0",
49
50 '[': "1",
51 ']': "1",
52 'l': "1",
53 'L': "1",
54
55 'A': "4",
56 's': "5",
57 'S': "5",
58 'b': "6",
59 'g': "9",
60 'E': "9",
61 'B': "13",
62 }
63 TRANS = str.maketrans(TRANS_MAP)
64 ERROR_CHARS = {'.', ':', ':', '•', '·'}
65 SKIP_IMG_SHEET_NAME = '未处理图片'
66 SKIP_IMG_SHEET_HEADER = ('页码', '序号')
41 67
42 CARD_RATIO = 0.9 68 CARD_RATIO = 0.9
43 UNKNOWN_CARD = '未知卡号' 69 UNKNOWN_CARD = '未知卡号'
...@@ -95,7 +121,7 @@ HEADERS_MAPPING.update( ...@@ -95,7 +121,7 @@ HEADERS_MAPPING.update(
95 HEADERS_MAPPING.update( 121 HEADERS_MAPPING.update(
96 { 122 {
97 '交易日期': BASE_HEADERS_MAPPING['记账日期'], 123 '交易日期': BASE_HEADERS_MAPPING['记账日期'],
98 '存入': BASE_HEADERS_MAPPING['金额'], 124 # '存入': BASE_HEADERS_MAPPING['金额'],
99 '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], 125 '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'],
100 '对方名称': BASE_HEADERS_MAPPING['对方账户名'], 126 '对方名称': BASE_HEADERS_MAPPING['对方账户名'],
101 '摘要': BASE_HEADERS_MAPPING['附言'], 127 '摘要': BASE_HEADERS_MAPPING['附言'],
...@@ -160,6 +186,12 @@ HEADERS_MAPPING.update( ...@@ -160,6 +186,12 @@ HEADERS_MAPPING.update(
160 '交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'], 186 '交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'],
161 } 187 }
162 ) 188 )
189 # 农业银行-窄页
190 HEADERS_MAPPING.update(
191 {
192 '交易对手账号': BASE_HEADERS_MAPPING['对方卡号/账号'],
193 }
194 )
163 # 竖版-特殊-农商行 195 # 竖版-特殊-农商行
164 HEADERS_MAPPING.update( 196 HEADERS_MAPPING.update(
165 { 197 {
...@@ -299,17 +331,27 @@ HEADERS_MAPPING.update( ...@@ -299,17 +331,27 @@ HEADERS_MAPPING.update(
299 # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则) 331 # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则)
300 # 332 #
301 # # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言 333 # # 农业银行:交易日期 交易网点 存入 支出 余额 对方账号 对方名称 摘要 渠道 附言
302 # ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细 334 # ('农业银行-10', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)), # 横版-表格-农业银行-中国农业银行个人账户明细
335 #
336 # # 农业银行:序号 日期 摘要 交易金额 余额 对方账号 对方名称 交易地点 渠道 附言
337 # ('农业银行-10-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)),
338 #
339 # # 农业银行:交易日期 摘要 交易金额 余额 交易渠道 交易网点 对方账号 对方名称 附言
340 # ('农业银行-9', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)),
303 # 341 #
304 # # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道 342 # # 北京银行:交易日期 业务摘要 收/支 发生额 余额 对方户名 对方账号 交易渠道
305 # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行 343 # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行
306 # 344 #
307 # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 345 # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 渠道
308 # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), 346 # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
309 # 347 #
348 # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 对方户名 对方账号 渠道
349 # ('工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)),
350 #
310 # # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行 351 # # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行
311 # # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行 352 # # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行
312 # ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), 353 # ('建设银行-竖版', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)),
354 # ('建设银行-横版', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
313 # 355 #
314 # # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号 356 # # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号
315 # ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), 357 # ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
...@@ -320,7 +362,13 @@ HEADERS_MAPPING.update( ...@@ -320,7 +362,13 @@ HEADERS_MAPPING.update(
320 # # -----------------普通打印:部分格线-------------------------------- 362 # # -----------------普通打印:部分格线--------------------------------
321 # 363 #
322 # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 364 # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
323 # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), 365 # ('农业银行-5', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
366 #
367 # # 农业银行:日期 地点 摘要 存入 支出 余额 对方账号 对方户名
368 # ('农业银行-8', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)),
369
370 # # 农业银行:日期 摘要 交易金额 余额 地点 交易对手账号 对方户名
371 # ('农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)),
324 # 372 #
325 # # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注 373 # # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注
326 # ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)), 374 # ('农商行', (1, None, 2, 3, None, 6, 5, 4, None, None, None, None, None)),
...@@ -330,6 +378,9 @@ HEADERS_MAPPING.update( ...@@ -330,6 +378,9 @@ HEADERS_MAPPING.update(
330 # 378 #
331 # # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额 379 # # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额
332 # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), 380 # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
381
382 # # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
383 # ('建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
333 # 384 #
334 # # -----------------普通打印:无格线------------------------------------- 385 # # -----------------普通打印:无格线-------------------------------------
335 # 386 #
...@@ -338,7 +389,8 @@ HEADERS_MAPPING.update( ...@@ -338,7 +389,8 @@ HEADERS_MAPPING.update(
338 # 389 #
339 # # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 390 # # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
340 # # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单 391 # # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单
341 # ('邮储银行', (1, None, None, None, None, 2, None, None, None, None, None, None, None)), 392 # ('邮储银行-8', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)),
393 # ('邮储银行-5', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)),
342 # 394 #
343 # # 工商银行电子版:交易日期 账号 储种 序号 币种 妙汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 395 # # 工商银行电子版:交易日期 账号 储种 序号 币种 妙汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
344 # ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), 396 # ('工商银行电子版', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
...@@ -351,13 +403,15 @@ HEADERS_MAPPING.update( ...@@ -351,13 +403,15 @@ HEADERS_MAPPING.update(
351 # ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), 403 # ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
352 # 404 #
353 # # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名 405 # # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名
354 # ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), 406 # ('农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
355 # 407 #
356 # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 408 # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
357 # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), 409 # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
358 # 410 #
359 # # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言 411 # # 农业银行:日期、时间、短摘要、交易金额、本次余额、交易网点、渠道、附言
360 # ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)), 412 # # 农业银行:日期、时间、日志号、短摘要、交易金额、本次余额、交易网点、渠道、附言
413 # ('农业银行', (1, 2, 4, 5, None, 3, None, None, None, None, None, None, None)),
414 # ('农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)),
361 # 415 #
362 # # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额 416 # # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额
363 # ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), 417 # ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
...@@ -374,11 +428,10 @@ HEADERS_MAPPING.update( ...@@ -374,11 +428,10 @@ HEADERS_MAPPING.update(
374 # 428 #
375 # # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称 429 # # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称
376 # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), 430 # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
377 #
378 # # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
379 # ('建设银行', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
380 # ] 431 # ]
381 432
433 OTHER_TUPLE = (None, None, None, None, None, None, None, None, None, None, None, None, None)
434
382 # { 435 # {
383 # "0":"其他", 436 # "0":"其他",
384 # "1":"普通打印-全表格-中国农业银行", 437 # "1":"普通打印-全表格-中国农业银行",
...@@ -408,67 +461,163 @@ HEADERS_MAPPING.update( ...@@ -408,67 +461,163 @@ HEADERS_MAPPING.update(
408 # "22":"针式打印-部分格线-邮储银行一本通绿卡" 461 # "22":"针式打印-部分格线-邮储银行一本通绿卡"
409 # } 462 # }
410 463
464 # CLASSIFY_LIST = [
465 # ('其他', OTHER_TUPLE),
466 # ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)),
467 # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),
468 # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),
469 # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
470 # ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
471 # ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
472 # ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)),
473 #
474 # ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
475 # ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
476 # ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)),
477 # ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)),
478 # ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)),
479 # ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
480 #
481 # ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
482 # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
483 # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
484 # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
485 # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
486 #
487 # ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)),
488 # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
489 # ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
490 # ]
491
492 # "4":"普通打印-全表格-中国银行",
493 # "5":"普通打印-全表格-农业银行-10列",
494 # "6":"普通打印-全表格-农业银行-10列-1",
495 # "7":"普通打印-全表格-农业银行-9列",
496 # "8":"普通打印-全表格-北京银行",
497 # "9":"普通打印-全表格-工商银行",
498 # "10":"普通打印-全表格-工商银行-电子账单",
499 # "11":"普通打印-全表格-建设银行",
500 # "12":"普通打印-全表格-微信账单",
501 # "13":"普通打印-全表格-支付宝账单",
502
503 # "14":"普通打印-无格线-交通银行",
504 # "15":"普通打印-无格线-储蓄银行-5列",
505 # "16":"普通打印-无格线-储蓄银行-8列",
506 # "17":"普通打印-无格线-农业银行-扩张缩进",
507 # "18":"普通打印-无格线-农业银行-整数",
508 # "19":"普通打印-无格线-招商银行",
509 # "20":"普通打印-无格线-招商银行-电子账单",
510 # "21":"普通打印-无格线-民生银行",
511
512 # "22":"普通打印-部分格线-横版-中信银行",
513 # "23":"普通打印-部分格线-竖版-农业银行-5列",
514 # "24":"普通打印-部分格线-竖版-农业银行-8列",
515 # "25":"普通打印-部分格线-竖版-农业银行-窄页",
516 # "26":"普通打印-部分格线-竖版-平安电子账单",
517 # "27":"普通打印-部分格线-竖版-建设银行-电子账单",
518
519 # "34":"针式打印-全格线-建设银行",
520 # "35":"针式打印-部分格线-竖版-邮储银行",
521 # "36":"针式打印-部分格线-竖版-邮储银行-绿卡",
522
411 CLASSIFY_LIST = [ 523 CLASSIFY_LIST = [
412 ('其他', (None, None, None, None, None, None, None, None, None, None, None, None, None)), 524 ('其他', OTHER_TUPLE),
413 ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), 525 ('其他', OTHER_TUPLE),
414 ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), 526 ('其他', OTHER_TUPLE),
415 ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), 527 ('其他', OTHER_TUPLE),
416 ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), 528 ('普通打印-全表格-中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)),
417 ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), 529 ('普通打印-全表格-农业银行-10列', (1, None, None, 5, None, 8, 7, 6, None, None, None, 3, 4)),
418 ('微信', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)), 530 ('普通打印-全表格-农业银行-10列-1', (2, None, 4, 5, None, 3, 7, 6, None, None, None, None, None)),
419 ('支付宝', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)), 531 ('普通打印-全表格-农业银行-9列', (1, None, 3, 4, None, 2, 8, 7, None, None, None, None, None)),
420 532 ('普通打印-全表格-北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)),
421 ('交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)), 533 ('普通打印-全表格-工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
422 ('农业银行', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)), 534 ('普通打印-全表格-工商银行-电子账单', (1, None, 9, 10, None, 7, 11, 12, None, None, None, None, None)),
423 ('农业银行', (1, 2, None, None, None, None, None, None, None, None, None, None, None)), 535 ('普通打印-全表格-建设银行', (3, None, 4, 5, None, 2, None, 7, None, None, None, None, None)),
424 ('招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)), 536 ('普通打印-全表格-微信账单', (2, None, 6, None, None, 3, 7, None, None, None, None, None, None)),
425 ('招商银行电子版', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)), 537 ('普通打印-全表格-支付宝账单', (2, None, None, 6, None, 3, None, None, None, None, None, 4, 5)),
426 ('民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)), 538
427 539 ('普通打印-无格线-交通银行', (1, None, 5, 6, None, 3, None, None, None, None, 4, None, None)),
428 ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), 540 ('普通打印-无格线-储蓄银行-5列', (1, None, 3, 4, None, 2, None, None, None, None, None, None, None)),
429 ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), 541 ('普通打印-无格线-储蓄银行-8列', (1, None, 4, 5, None, 2, 6, 7, None, None, None, None, None)),
430 ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), 542 ('普通打印-无格线-农业银行-扩张缩进', (1, 2, 5, 6, None, 4, None, None, None, None, None, None, None)),
431 ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), 543 ('普通打印-无格线-农业银行-整数', (1, None, 3, None, None, 2, None, 4, None, None, None, None, None)),
432 ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), 544 ('普通打印-无格线-招商银行', (1, None, 3, 4, None, 6, None, None, None, None, None, None, None)),
433 545 ('普通打印-无格线-招商银行-电子账单', (1, None, 3, 4, None, 5, 6, None, None, None, None, None, None)),
434 ('建设银行', (None, None, None, None, None, None, None, None, None, None, None, None, None)), 546 ('普通打印-无格线-民生银行', (None, None, 5, 6, None, None, 7, None, 8, None, None, None, None)),
435 ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), 547
436 ('邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)), 548 ('普通打印-部分格线-横版-中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
549 ('普通打印-部分格线-竖版-农业银行-5列', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
550 ('普通打印-部分格线-竖版-农业银行-8列', (1, None, None, 6, None, 3, 8, 7, None, None, None, 4, 5)),
551 ('普通打印-部分格线-竖版-农业银行-窄页', (1, None, 3, 4, None, 2, 7, 6, None, None, None, None, None)),
552 ('普通打印-部分格线-竖版-平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
553 ('普通打印-部分格线-竖版-建设银行-电子账单', (5, None, 6, 7, None, 2, None, 9, None, None, None, None, None)),
554 ('其他', OTHER_TUPLE),
555 ('其他', OTHER_TUPLE),
556 ('其他', OTHER_TUPLE),
557 ('其他', OTHER_TUPLE),
558 ('其他', OTHER_TUPLE),
559 ('其他', OTHER_TUPLE),
560 ('针式打印-全格线-建设银行', OTHER_TUPLE),
561 ('针式打印-部分格线-竖版-邮储银行', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
562 ('针式打印-部分格线-竖版-邮储银行-绿卡', (2, None, 5, 6, None, 4, None, 7, None, None, None, None, None)),
563 ('其他', OTHER_TUPLE),
437 ] 564 ]
438 565
439 # ----------license相关------------------------------------------------------------------------------------------------ 566 # ----------license相关------------------------------------------------------------------------------------------------
567
568 # "0":"AVT Invioce",
569 # "1":"二手车发票",
570 # "2":"其他",
571 # "3":"护照",
572 # "28":"机动车登记证",
573 # "29":"机动车销售统一发票",
574 # "30":"港澳通行证",
575 # "31":"营业执照",
576 # "32":"行驶证",
577 # "33":"身份证",
578 # "37":"银行卡"
579
580 # 其他
581 OTHER_CLASSIFY = 2
582
583 # 身份证
584 IC_CN_NAME = '身份证'
585 IC_CLASSIFY = 33
586 # 增值税发票
587 VAT_CN_NAME = '增值税发票'
588 VAT_CLASSIFY = 0
589 # 机动车登记证书
590 MVC_CN_NAME = '机动车登记证书'
591 MVC_CLASSIFY = 28
592 # 机动车销售统一发票
593 MVI_CN_NAME = '机动车销售统一发票'
594 MVI_CLASSIFY = 29
595 IC_PID = VAT_PID = MVC_PID = MVI_PID = None
596
440 # 营业执照 597 # 营业执照
441 BL_KEY = 'bl' 598 BL_CN_NAME = '营业执照'
599 BL_CLASSIFY = 31
600 BL_PID = 41
442 # 二手车发票 601 # 二手车发票
443 UCI_KEY = 'uci' 602 UCI_CN_NAME = '二手车发票'
603 UCI_CLASSIFY = 1
604 UCI_PID = 60
444 # 港澳台通行证 605 # 港澳台通行证
445 EEP_KEY = 'eep' 606 EEP_CN_NAME = '港澳台通行证'
607 EEP_CLASSIFY = 30
608 EEP_PID = 1018
446 # 行驶证 609 # 行驶证
447 DL_KEY = 'dl' 610 DL_CN_NAME = '行驶证'
611 DL_CLASSIFY = 32
612 DL_PID = 5
448 # 护照 613 # 护照
449 PP_KEY = 'pp' 614 PP_CN_NAME = '护照'
615 PP_CLASSIFY = 3
616 PP_PID = 8
450 # 银行卡 617 # 银行卡
451 BC_KEY = 'bc' 618 BC_CN_NAME = '银行卡'
452 # 身份证 619 BC_CLASSIFY = 37
453 IC_KEY = 'ic' 620 BC_PID = 4
454 # 机动车登记证书
455 MVC_KEY = 'mvc'
456 # 机动车销售统一发票
457 MVI_KEY = 'mvi'
458 # 增值税发票
459 VAT_KEY = 'vat'
460
461 LICENSE_ORDER = ((MVI_KEY, '机动车销售统一发票'),
462 (IC_KEY, '身份证'),
463 (BC_KEY, '银行卡'),
464 (BL_KEY, '营业执照'),
465 (UCI_KEY, '二手车发票'),
466 (EEP_KEY, '港澳台通行证'),
467 (DL_KEY, '行驶证'),
468 (PP_KEY, '护照'),
469 (MVC_KEY, '机动车登记证书'),
470 (VAT_KEY, '增值税发票'))
471
472 BC_FIELD = (('CardNum', '银行卡号'), 621 BC_FIELD = (('CardNum', '银行卡号'),
473 ('BankName', '发卡行名称'), 622 ('BankName', '发卡行名称'),
474 ('CardName', '银行卡名称'), 623 ('CardName', '银行卡名称'),
...@@ -478,3 +627,19 @@ BC_FIELD = (('CardNum', '银行卡号'), ...@@ -478,3 +627,19 @@ BC_FIELD = (('CardNum', '银行卡号'),
478 627
479 SUCCESS_CODE_SET = {'0', 0} 628 SUCCESS_CODE_SET = {'0', 0}
480 629
# Ordered license types: each entry is (classify label, (pid, Chinese name)).
LICENSE_ORDER = (
    (MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME)),
    (IC_CLASSIFY, (IC_PID, IC_CN_NAME)),
    (BC_CLASSIFY, (BC_PID, BC_CN_NAME)),
    (BL_CLASSIFY, (BL_PID, BL_CN_NAME)),
    (UCI_CLASSIFY, (UCI_PID, UCI_CN_NAME)),
    (EEP_CLASSIFY, (EEP_PID, EEP_CN_NAME)),
    (DL_CLASSIFY, (DL_PID, DL_CN_NAME)),
    (PP_CLASSIFY, (PP_PID, PP_CN_NAME)),
    (MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME)),
    (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME)),
)

# Lookup table: classify label -> (pid, Chinese name).
LICENSE_CLASSIFY_MAPPING = {
    classify: license_info for classify, license_info in LICENSE_ORDER
}

# Label groups used to route a classified image.
OTHER_CLASSIFY_SET = {OTHER_CLASSIFY}
LICENSE_CLASSIFY_SET_1 = {
    IC_CLASSIFY,
    VAT_CLASSIFY,
    MVC_CLASSIFY,
    MVI_CLASSIFY,
}
LICENSE_CLASSIFY_SET_2 = {
    BL_CLASSIFY,
    UCI_CLASSIFY,
    EEP_CLASSIFY,
    DL_CLASSIFY,
    PP_CLASSIFY,
    BC_CLASSIFY,
}
......
class EDMSException(Exception):
    """Error type for failures while talking to EDMS."""
...@@ -4,6 +4,7 @@ import signal ...@@ -4,6 +4,7 @@ import signal
4 import asyncio 4 import asyncio
5 import aiohttp 5 import aiohttp
6 import difflib 6 import difflib
7 import base64
7 import requests 8 import requests
8 from datetime import datetime, date 9 from datetime import datetime, date
9 from collections import Counter 10 from collections import Counter
...@@ -18,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords ...@@ -18,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
18 from apps.doc.named_enum import KeywordsType 19 from apps.doc.named_enum import KeywordsType
19 from apps.doc import consts 20 from apps.doc import consts
20 from apps.doc.ocr.edms import EDMS, rh 21 from apps.doc.ocr.edms import EDMS, rh
22 from apps.doc.exceptions import EDMSException
21 23
22 24
23 class Command(BaseCommand, LoggerMixin): 25 class Command(BaseCommand, LoggerMixin):
...@@ -30,7 +32,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -30,7 +32,8 @@ class Command(BaseCommand, LoggerMixin):
30 # 数据目录 32 # 数据目录
31 self.data_dir = conf.DATA_DIR 33 self.data_dir = conf.DATA_DIR
32 # ocr相关 34 # ocr相关
33 self.ocr_url = conf.OCR_URL 35 self.ocr_url_1 = conf.OCR_URL_1
36 self.ocr_url_2 = conf.OCR_URL_2
34 # EDMS web_service_api 37 # EDMS web_service_api
35 self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD) 38 self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD)
36 # 优雅退出信号:15 39 # 优雅退出信号:15
...@@ -70,18 +73,44 @@ class Command(BaseCommand, LoggerMixin): ...@@ -70,18 +73,44 @@ class Command(BaseCommand, LoggerMixin):
70 os.makedirs(doc_data_path, exist_ok=True) 73 os.makedirs(doc_data_path, exist_ok=True)
71 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 74 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
72 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): 75 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
73 self.edms.download(pdf_path, doc.metadata_version_id) 76 for times in range(consts.RETRY_TIMES):
77 try:
78 self.edms.download(pdf_path, doc.metadata_version_id)
79 except Exception as e:
80 self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
81 '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
82 edms_exc = str(e)
83 else:
84 break
85 else:
86 raise EDMSException(edms_exc)
74 87
75 excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) 88 excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
76 src_excel_path = os.path.join(doc_data_path, 'src.xlsx') 89 src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
77 self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( 90 self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
78 self.log_base, business_type, doc.id, pdf_path)) 91 self.log_base, business_type, doc.id, pdf_path))
79 return doc_data_path, excel_path, src_excel_path, pdf_path 92 return doc_data_path, excel_path, src_excel_path, pdf_path
80 93
81 @staticmethod 94 def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img):
82 def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence): 95 sheets = ocr_data.get('data', [])
96 if not sheets:
97 skip_img.append(self.parse_img_path(img_path))
98 return
99 confidence = ocr_data.get('confidence', 1)
100 img_name, _ = os.path.splitext(os.path.basename(img_path))
83 for i, sheet in enumerate(sheets): 101 for i, sheet in enumerate(sheets):
84 sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, i) 102 cells = sheet.get('cells')
103 if not cells:
104 skip_img.append(self.parse_img_path(img_path))
105 continue
106 sheet_name = '{0}_{1}'.format(img_name, i)
107 ws = wb.create_sheet(sheet_name)
108 for cell in cells:
109 c1 = cell.get('start_column')
110 r1 = cell.get('start_row')
111 words = cell.get('words')
112 ws.cell(row=r1 + 1, column=c1 + 1, value=words)
113
85 # ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'] 114 # ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间']
86 summary = sheet.get('summary') 115 summary = sheet.get('summary')
87 card = summary[1] 116 card = summary[1]
...@@ -129,74 +158,199 @@ class Command(BaseCommand, LoggerMixin): ...@@ -129,74 +158,199 @@ class Command(BaseCommand, LoggerMixin):
129 if summary[6] is not None: 158 if summary[6] is not None:
130 ed_list.append(summary[6]) 159 ed_list.append(summary[6])
131 160
132 ws = wb.create_sheet(sheet_name) 161 def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path):
133 cells = sheet.get('cells') 162 license_data = ocr_data.get('data', [])
134 for cell in cells: 163 if not license_data:
135 c1 = cell.get('start_column') 164 skip_img.append(self.parse_img_path(img_path))
136 r1 = cell.get('start_row')
137 words = cell.get('words')
138 ws.cell(row=r1+1, column=c1+1, value=words)
139
140 def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary):
141 # res = {
142 # 'code': 1,
143 # 'msg': 'success',
144 # 'data': {
145 # 'classify': 0,
146 # 'confidence': 0.999,
147 # 'sheets': [
148 # {
149 # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
150 # 'cells': []
151 # },
152 # {
153 # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
154 # 'cells': []
155 # }
156 # ]
157 # }
158 # }
159 data = res.get('data', {})
160 classify = data.get('classify')
161 if classify is None:
162 return 165 return
163 # if classify in 166 for license_dict in license_data:
164 sheets = data.get('sheets', []) 167 res_list = []
165 if not sheets: 168 for field, value in license_dict.items():
166 return 169 res_list.append((field, value))
167 confidence = data.get('confidence', 1) 170 license_summary.setdefault(classify, []).append(res_list)
168 self.append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence) 171
169 # else: 172 def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path):
170 # pass 173 if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
171 174 if pid == consts.BC_PID:
172 # async def fetch_ocr_result(self, img_path): 175 # 银行卡
173 # async with aiohttp.ClientSession( 176 res_list = []
174 # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False) 177 for en_key, chn_key in consts.BC_FIELD:
175 # ) as session: 178 res_list.append((chn_key, ocr_res_2.get(en_key, '')))
176 # json_data = self.get_ocr_json(img_path) 179 license_summary.setdefault(classify, []).append(res_list)
177 # async with session.post(self.ocr_url, json=json_data) as response: 180 else:
178 # return await response.json() 181 # 营业执照、行驶证等
182 for result_dict in ocr_res_2.get('ResultList', []):
183 res_list = []
184 for field_dict in result_dict.get('FieldList', []):
185 res_list.append(
186 (field_dict.get('chn_key', ''), field_dict.get('value', '')))
187 license_summary.setdefault(classify, []).append(res_list)
188 else:
189 skip_img.append(self.parse_img_path(img_path))
190
@staticmethod
async def fetch_ocr_1_result(url, json_data):
    """POST ``json_data`` as a JSON body to ``url`` and return the decoded reply.

    Returns the parsed JSON payload when the HTTP status is 200, otherwise
    ``None``.
    """
    # ssl=False is passed through unchanged from the original connector setup.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.post(url, json=json_data) as response:
            if response.status != 200:
                return None
            return await response.json()
197
@staticmethod
async def fetch_ocr_2_result(url, json_data):
    """POST ``json_data`` via the ``data=`` argument to ``url`` and return the decoded reply.

    Returns the parsed JSON payload when the HTTP status is 200, otherwise
    ``None``.
    """
    # ssl=False is passed through unchanged from the original connector setup.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.post(url, data=json_data) as response:
            if response.status != 200:
                return None
            return await response.json()
204
async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
    """Run one image through OCR service 1 and dispatch on its classification.

    The image is base64-encoded and sent to ``self.ocr_url_1``. Depending on
    the returned ``classify`` label the result is:

    * appended to ``skip_img`` (non-success code, missing label, or a label
      in ``consts.OTHER_CLASSIFY_SET``);
    * handed to ``license1_process`` (labels in ``LICENSE_CLASSIFY_SET_1``);
    * sent to OCR service 2 and handed to ``license2_process`` (labels in
      ``LICENSE_CLASSIFY_SET_2``);
    * handed to ``bs_process`` (every other label, i.e. bank statements).

    Raises:
        Exception: when either OCR service yields no result (``None``).
    """
    # Encode once; the same payload is reused for the second OCR call.
    with open(img_path, 'rb') as f:
        file_data = base64.b64encode(f.read()).decode()

    ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, {"file": file_data})
    if ocr_res_1 is None:
        raise Exception('ocr 1 error, img_path={0}'.format(img_path))
    self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
        self.log_base, img_path, ocr_res_1))

    # Non-success response code: record the image as skipped.
    if ocr_res_1.get('code') != 1:
        skip_img.append(self.parse_img_path(img_path))
        return

    ocr_data = ocr_res_1.get('data', {})
    classify = ocr_data.get('classify')

    # No label, or the "other" category: nothing to extract.
    if classify is None or classify in consts.OTHER_CLASSIFY_SET:
        skip_img.append(self.parse_img_path(img_path))
        return

    # License group 1: fields come back with the first OCR response.
    if classify in consts.LICENSE_CLASSIFY_SET_1:
        self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
        return

    # Anything that is not license group 2 is treated as a bank statement.
    if classify not in consts.LICENSE_CLASSIFY_SET_2:
        self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
        return

    # License group 2: a second OCR request keyed by pid is required.
    pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
    json_data_2 = {
        "pid": str(pid),
        "key": conf.OCR_KEY,
        "secret": conf.OCR_SECRET,
        "file": file_data
    }
    ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
    if ocr_res_2 is None:
        raise Exception('ocr 2 error, img_path={0}'.format(img_path))
    # Recognition result
    self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
        self.log_base, img_path, ocr_res_2))
    self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
251
252 # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
253 # # # 流水
254 # # res = {
255 # # 'code': 1,
256 # # 'msg': 'success',
257 # # 'data': {
258 # # 'classify': 0,
259 # # 'confidence': 0.999,
260 # # 'data': [
261 # # {
262 # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
263 # # 'cells': []
264 # # },
265 # # {
266 # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
267 # # 'cells': []
268 # # }
269 # # ]
270 # # }
271 # # }
272 # #
273 # # # 证件-1
274 # # res = {
275 # # 'code': 1,
276 # # 'msg': 'success',
277 # # 'data': {
278 # # 'classify': 0,
279 # # 'confidence': 0.999,
280 # # 'data': [
281 # # {
282 # # 'cn_key': 'value',
283 # # 'cn_key': 'value',
284 # # },
285 # # {
286 # # 'cn_key': 'value',
287 # # 'cn_key': 'value',
288 # # },
289 # # ]
290 # # }
291 # # }
292 # #
293 # # # 证件-2 or 其他类
294 # # res = {
295 # # 'code': 1,
296 # # 'msg': 'success',
297 # # 'data': {
298 # # 'classify': 0,
299 # # 'confidence': 0.999,
300 # # }
301 # # }
302 # with open(img_path, 'rb') as f:
303 # base64_data = base64.b64encode(f.read())
304 # # 获取解码后的base64值
305 # file_data = base64_data.decode()
306 # json_data_1 = {
307 # "file": file_data
308 # }
309 # response_1 = requests.post(self.ocr_url_1, json=json_data_1)
310 # if response_1.status_code == 200:
311 # ocr_res_1 = response_1.json()
312 # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
313 # self.log_base, img_path, ocr_res_1))
179 # 314 #
180 # async def img_2_ocr_2_wb(self, wb, img_path, summary): 315 # if ocr_res_1.get('code') == 1:
181 # res = await self.fetch_ocr_result(img_path) 316 # ocr_data = ocr_res_1.get('data', {})
182 # self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) 317 # classify = ocr_data.get('classify')
183 # sheets_list = res.get('result').get('res') 318 # if classify is None:
184 # img_name = os.path.basename(img_path) 319 # skip_img.append(self.parse_img_path(img_path))
185 # self.append_sheet(wb, sheets_list, img_name, summary) 320 # return
186 321 # elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
187 def fetch_ocr_result(self, img_path): 322 # skip_img.append(self.parse_img_path(img_path))
188 files = [ 323 # return
189 ('img', open(img_path, 'rb')) 324 # elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
190 ] 325 # self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
191 response = requests.request("POST", self.ocr_url, files=files) 326 # elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
192 return response.json() 327 # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
193 328 # json_data_2 = {
194 def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary): 329 # "pid": str(pid),
195 res = self.fetch_ocr_result(img_info[0]) 330 # "key": conf.OCR_KEY,
196 self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format( 331 # "secret": conf.OCR_SECRET,
197 self.log_base, img_info[0], res)) 332 # "file": file_data
198 if res.get('code') == 1: 333 # }
199 self.ocr_2_wb(res, wb, img_info[1], img_info[2], bs_summary, unknown_summary, license_summary) 334 # response_2 = requests.post(self.ocr_url_2, data=json_data_2)
335 # if response_2.status_code == 200:
336 # # 识别结果
337 # ocr_res_2 = response_2.json()
338 # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
339 # self.log_base, img_path, ocr_res_2))
340 # self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
341 # else:
342 # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
343 # else: # 流水处理
344 # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
345 # else:
346 # skip_img.append(self.parse_img_path(img_path))
347 # else:
348 # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
349
@staticmethod
def parse_img_path(img_path):
    """Return the 1-based (page, image) position encoded in an image file name.

    The file name (directory and extension stripped) is read positionally:
    a 5-character prefix, the page index, a 5-character separator, then the
    image index. This is presumably the ``page_<pno>_img_<idx>`` layout
    written by the PDF extraction step - TODO confirm against
    ``PDFHandler.get_img_save_path``.

    Args:
        img_path: Path of an extracted image file.

    Returns:
        tuple: ``(page index + 1, image index + 1)`` as two ints.

    Raises:
        ValueError: If the file name does not carry two integers at the
            expected positions.
    """
    import re  # local import: the module-level import block is not part of this block

    img_name, _ = os.path.splitext(os.path.basename(img_path))
    # Read whole digit runs rather than one character per index, so pages or
    # images numbered 10 and above are parsed in full instead of being cut to
    # their first digit (or failing on the separator character).
    match = re.match(r'.{5}(\d+).{5}(\d+)', img_name)
    if match is None:
        raise ValueError('unexpected image file name: {0!r}'.format(img_path))
    page_idx, img_idx = (int(group) for group in match.groups())
    return page_idx + 1, img_idx + 1
200 354
201 @staticmethod 355 @staticmethod
202 def get_most(value_list): 356 def get_most(value_list):
...@@ -255,7 +409,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -255,7 +409,6 @@ class Command(BaseCommand, LoggerMixin):
255 summary['role'] = self.get_most(summary['role']) 409 summary['role'] = self.get_most(summary['role'])
256 return bs_summary 410 return bs_summary
257 411
258
259 def rebuild_bs_summary(self, bs_summary, unknown_summary): 412 def rebuild_bs_summary(self, bs_summary, unknown_summary):
260 # bs_summary = { 413 # bs_summary = {
261 # '卡号': { 414 # '卡号': {
...@@ -297,8 +450,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -297,8 +450,10 @@ class Command(BaseCommand, LoggerMixin):
297 merged_bs_summary[card] = summary 450 merged_bs_summary[card] = summary
298 else: 451 else:
299 # 1卡号 452 # 1卡号
453 one_card = False
300 if len(bs_summary) == 1: 454 if len(bs_summary) == 1:
301 merged_bs_summary = self.prune_bs_summary(bs_summary) 455 merged_bs_summary = self.prune_bs_summary(bs_summary)
456 one_card = True
302 # 多卡号 457 # 多卡号
303 else: 458 else:
304 merged_bs_summary = self.merge_card(bs_summary) 459 merged_bs_summary = self.merge_card(bs_summary)
...@@ -307,7 +462,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -307,7 +462,7 @@ class Command(BaseCommand, LoggerMixin):
307 merge_role = [] 462 merge_role = []
308 classify_summary = unknown_summary.get(card_summary['classify'], {}) 463 classify_summary = unknown_summary.get(card_summary['classify'], {})
309 for role, summary in classify_summary.items(): 464 for role, summary in classify_summary.items():
310 if role in card_summary['role_set']: 465 if one_card or role in card_summary['role_set']:
311 merge_role.append(role) 466 merge_role.append(role)
312 card_summary['sheet'].extend(summary['sheet']) 467 card_summary['sheet'].extend(summary['sheet'])
313 card_summary['code'].extend(summary['code']) 468 card_summary['code'].extend(summary['code'])
...@@ -336,12 +491,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -336,12 +491,13 @@ class Command(BaseCommand, LoggerMixin):
336 return merged_bs_summary 491 return merged_bs_summary
337 492
338 # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 493 # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理
339 # TODO 调用接口重试
340 # TODO 协程异步发送OCR请求
341 # TODO 异常邮件通知 494 # TODO 异常邮件通知
495 # 识别失败:普通异常,如PDF异常、构建过程异常
496 # EDMS异常:下载异常-->回队列-->邮件;上传异常-->重新上传队列-->邮件
497 # 算法异常:第一道异常-->识别失败-->邮件;第二道异常-->识别失败-->邮件
498 # TODO 协程异步发送OCR请求
499 # TODO 调用接口重试
342 # TODO 数据库断联问题 500 # TODO 数据库断联问题
343 # TODO 非流水证件处理
344 # TODO EDMS API GATEWAY
345 def handle(self, *args, **kwargs): 501 def handle(self, *args, **kwargs):
346 sleep_second = int(conf.SLEEP_SECOND) 502 sleep_second = int(conf.SLEEP_SECOND)
347 max_sleep_second = int(conf.MAX_SLEEP_SECOND) 503 max_sleep_second = int(conf.MAX_SLEEP_SECOND)
...@@ -369,61 +525,82 @@ class Command(BaseCommand, LoggerMixin): ...@@ -369,61 +525,82 @@ class Command(BaseCommand, LoggerMixin):
369 pdf_handler.extract_image() 525 pdf_handler.extract_image()
370 self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format( 526 self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format(
371 self.log_base, business_type, doc.id)) 527 self.log_base, business_type, doc.id))
372 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
373 528
374 # 4.获取OCR结果并且构建excel文件 529 # 4.获取OCR结果并且构建excel文件
375 bs_summary = {} 530 bs_summary = {}
376 license_summary = {} 531 license_summary = {}
377 unknown_summary = {} 532 unknown_summary = {}
533 skip_img = []
378 interest_keyword = Keywords.objects.filter( 534 interest_keyword = Keywords.objects.filter(
379 type=KeywordsType.INTEREST.value).values_list('keyword', flat=True) 535 type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
380 salary_keyword = Keywords.objects.filter( 536 salary_keyword = Keywords.objects.filter(
381 type=KeywordsType.SALARY.value).values_list('keyword', flat=True) 537 type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
382 loan_keyword = Keywords.objects.filter( 538 loan_keyword = Keywords.objects.filter(
383 type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value]).values_list( 539 type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list(
384 'keyword', flat=True) 540 'keyword', flat=True)
385 wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) 541 wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
386 542
387 # wb = Workbook() 543 # wb = Workbook()
388 544
389 # 4.1 获取OCR结果 545 # 4.1 获取OCR结果
390 # loop = asyncio.get_event_loop() 546 loop = asyncio.get_event_loop()
391 # tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list] 547 tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
392 # loop.run_until_complete(asyncio.wait(tasks)) 548 for img_path in pdf_handler.img_path_list]
549 loop.run_until_complete(asyncio.wait(tasks))
393 # loop.close() 550 # loop.close()
394 551
395 for img_info in pdf_handler.img_info_list: 552 # for img_path in pdf_handler.img_path_list:
396 self.img_2_ocr_2_wb(wb, img_info, bs_summary, unknown_summary, license_summary) 553 # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
397 554
398 self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format( 555 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
399 self.log_base, bs_summary, unknown_summary, license_summary)) 556 '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
557 unknown_summary, license_summary))
400 558
401 merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) 559 merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
402 560
403 self.cronjob_log.info('{0} [merged_bs_summary={1}] [unknown_summary={2}]'.format( 561 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
404 self.log_base, merged_bs_summary, unknown_summary)) 562 '[unknown_summary={4}] [skip_img={5}]'.format(self.log_base, business_type,
563 doc.id, merged_bs_summary,
564 unknown_summary, skip_img))
405 del unknown_summary 565 del unknown_summary
406 566
407 # 4.2 重构Excel文件 567 # 4.2 重构Excel文件
408 wb.save(src_excel_path) 568 wb.save(src_excel_path)
409 wb.rebuild(merged_bs_summary, license_summary) 569 wb.rebuild(merged_bs_summary, license_summary, skip_img)
410 wb.save(excel_path) 570 wb.save(excel_path)
571 except EDMSException as e:
572 self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
573 '[err={3}]'.format(self.log_base, business_type, doc.id, e))
411 except Exception as e: 574 except Exception as e:
412 doc.status = DocStatus.PROCESS_FAILED.value 575 doc.status = DocStatus.PROCESS_FAILED.value
413 doc.save() 576 doc.save()
414 self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( 577 self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
415 self.log_base, business_type, doc.id, e)) 578 '[err={3}]'.format(self.log_base, business_type, doc.id, e))
416 else: 579 else:
417 try: 580 try:
418 # 5.上传至EDMS 581 # 5.上传至EDMS
419 self.edms.upload(excel_path, doc, business_type) 582 for times in range(consts.RETRY_TIMES):
583 try:
584 self.edms.upload(excel_path, doc, business_type)
585 except Exception as e:
586 self.cronjob_log.warn(
587 '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
588 '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
589 edms_exc = str(e)
590 else:
591 break
592 else:
593 raise EDMSException(edms_exc)
420 except Exception as e: 594 except Exception as e:
421 doc.status = DocStatus.UPLOAD_FAILED.value 595 doc.status = DocStatus.UPLOAD_FAILED.value
422 doc.save() 596 doc.save()
423 end_time = time.time() 597 end_time = time.time()
424 speed_time = int(end_time - start_time) 598 speed_time = int(end_time - start_time)
425 self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] ' 599 self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
426 '[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e)) 600 '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id,
601 speed_time, e))
602 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
603
427 else: 604 else:
428 doc.status = DocStatus.COMPLETE.value 605 doc.status = DocStatus.COMPLETE.value
429 doc.save() 606 doc.save()
...@@ -431,5 +608,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -431,5 +608,6 @@ class Command(BaseCommand, LoggerMixin):
431 speed_time = int(end_time - start_time) 608 speed_time = int(end_time - start_time)
432 self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] ' 609 self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] '
433 '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) 610 '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time))
611 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
434 612
435 self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) 613 self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
......
...@@ -141,32 +141,22 @@ class BSWorkbook(Workbook): ...@@ -141,32 +141,22 @@ class BSWorkbook(Workbook):
141 # month_info process 141 # month_info process
142 month_info = month_mapping.setdefault('xxxx-xx', []) 142 month_info = month_mapping.setdefault('xxxx-xx', [])
143 month_info.append((ws.title, min_row, ws.max_row, 0)) 143 month_info.append((ws.title, min_row, ws.max_row, 0))
144 elif len(month_list) == 1:
145 # reverse_trend_list process
146 reverse_trend = self.get_reverse_trend(dti.day, idx_list)
147 reverse_trend_list.append(reverse_trend)
148 # month_info process
149 month_info = month_mapping.setdefault(month_list[0], [])
150 day_mean = np.mean(dti.day.dropna())
151 if len(month_info) == 0:
152 month_info.append((ws.title, min_row, ws.max_row, day_mean))
153 else:
154 for i, item in enumerate(month_info):
155 if day_mean <= item[-1]:
156 month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean))
157 break
158 else:
159 month_info.append((ws.title, min_row, ws.max_row, day_mean))
160 else: 144 else:
161 # reverse_trend_list process 145 # reverse_trend_list process
162 reverse_trend = self.get_reverse_trend(dti.day, idx_list) 146 reverse_trend = self.get_reverse_trend(dti.day, idx_list)
163 reverse_trend_list.append(reverse_trend) 147 reverse_trend_list.append(reverse_trend)
164 # month_info process 148 # month_info process
165 for i, item in enumerate(month_list[:-1]): 149 day_idx = dti.day
166 month_mapping.setdefault(item, []).append( 150 idx_list_max_idx = len(idx_list) - 1
167 (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN)) 151 for i, item in enumerate(month_list):
168 month_mapping.setdefault(month_list[-1], []).insert( 152 if i == idx_list_max_idx:
169 0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0)) 153 day_mean = np.mean(day_idx[idx_list[i]:].dropna())
154 month_mapping.setdefault(item, []).append(
155 (ws.title, idx_list[i] + min_row, ws.max_row, day_mean))
156 else:
157 day_mean = np.mean(day_idx[idx_list[i]: idx_list[i + 1]].dropna())
158 month_mapping.setdefault(item, []).append(
159 (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean))
170 160
171 def build_metadata_rows(self, confidence, code, print_time, start_date, end_date): 161 def build_metadata_rows(self, confidence, code, print_time, start_date, end_date):
172 if start_date is None or end_date is None: 162 if start_date is None or end_date is None:
...@@ -191,9 +181,9 @@ class BSWorkbook(Workbook): ...@@ -191,9 +181,9 @@ class BSWorkbook(Workbook):
def create_meta_sheet(self, card):
    """Return the worksheet that will hold the meta information of ``card``.

    The sheet title is ``self.meta_sheet_title`` followed by the last six
    characters of ``card`` in parentheses. If the first worksheet still has
    the title 'Sheet' it is renamed and reused; otherwise a new sheet is
    created.

    Args:
        card: Card identifier; it is sliced with ``card[-6:]``.

    Returns:
        The renamed first worksheet or the newly created worksheet.
    """
    first_sheet = self.worksheets[0]
    reuse_first = first_sheet.title == 'Sheet'
    title = '{0}({1})'.format(self.meta_sheet_title, card[-6:])
    if not reuse_first:
        return self.create_sheet(title)
    first_sheet.title = title
    return first_sheet
198 188
199 def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date): 189 def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date):
...@@ -203,6 +193,26 @@ class BSWorkbook(Workbook): ...@@ -203,6 +193,26 @@ class BSWorkbook(Workbook):
203 ms.append(row) 193 ms.append(row)
204 return ms 194 return ms
205 195
@staticmethod
def amount_format(amount_str):
    """Normalise an amount string before it is parsed as a number.

    Non-string values and the empty string are returned unchanged.
    Otherwise the text goes through four steps:

    1. characters are substituted through ``consts.TRANS``;
    2. every '-' after the first character is removed;
    3. a first character found in ``consts.ERROR_CHARS`` is replaced by '-';
    4. for results of at least four characters, the character third from
       the end is inspected: a ',' directly before a '.' there is dropped,
       and a ',' at that position is turned into '.'.

    Args:
        amount_str: Raw cell value.

    Returns:
        The cleaned string, or ``amount_str`` itself when it is not a
        non-empty string.
    """
    if not (isinstance(amount_str, str) and amount_str):
        return amount_str

    # 1. character substitution
    translated = amount_str.translate(consts.TRANS)
    head, tail = translated[0], translated[1:]

    # 2. drop every '-' that is not the leading character
    cleaned = head + tail.replace('-', '')

    # 3. a leading error character is replaced by a minus sign
    if head in consts.ERROR_CHARS:
        cleaned = '-' + cleaned[1:]

    # 4. comma / period handling at the third character from the end
    if len(cleaned) < 4:
        return cleaned
    pos = len(cleaned) - 3
    marker = cleaned[pos]
    if marker == '.':
        if cleaned[pos - 1] == ',':
            cleaned = cleaned[:pos - 1] + cleaned[pos:]
    elif marker == ',':
        cleaned = cleaned[:pos] + '.' + cleaned[pos + 1:]
    return cleaned
215
206 def build_month_sheet(self, card, month_mapping, ms, is_reverse): 216 def build_month_sheet(self, card, month_mapping, ms, is_reverse):
207 tmp_ws = self.create_sheet('tmp_ws') 217 tmp_ws = self.create_sheet('tmp_ws')
208 for month in sorted(month_mapping.keys()): 218 for month in sorted(month_mapping.keys()):
...@@ -235,29 +245,25 @@ class BSWorkbook(Workbook): ...@@ -235,29 +245,25 @@ class BSWorkbook(Workbook):
235 # 3.3.余额转数值 245 # 3.3.余额转数值
236 over_cell = rows[consts.OVER_IDX] 246 over_cell = rows[consts.OVER_IDX]
237 try: 247 try:
238 if isinstance(over_cell.value, str): 248 over_cell.value = locale.atof(self.amount_format(over_cell.value))
239 over_cell.value = over_cell.value.translate(consts.TRANS)
240 over_cell.value = locale.atof(over_cell.value)
241 except Exception as e: 249 except Exception as e:
242 continue 250 continue
243 else: 251 else:
244 over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 252 over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
245 253
246 # 3.4.额转数值 254 # 3.4.额转数值
247 try: 255 try:
248 try: 256 try:
249 if isinstance(amount_cell.value, str): # TODO 可在转化数字失败后,再替换 257 amount_cell.value = locale.atof(self.amount_format(amount_cell.value))
250 amount_cell.value = amount_cell.value.translate(consts.TRANS)
251 amount_cell.value = locale.atof(amount_cell.value)
252 except Exception as e: 258 except Exception as e:
253 try: 259 try:
254 if isinstance(rows[consts.INCOME_IDX].value, str): 260 amount_cell.value = locale.atof(self.amount_format(rows[consts.INCOME_IDX].value))
255 rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS) 261 if amount_cell.value == 0:
256 amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) 262 raise
263 elif amount_cell.value < 0:
264 amount_cell.value = -amount_cell.value
257 except Exception as e: 265 except Exception as e:
258 if isinstance(rows[consts.OUTLAY_IDX].value, str): 266 amount_cell.value = locale.atof(self.amount_format(rows[consts.OUTLAY_IDX].value))
259 rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS)
260 amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
261 if amount_cell.value > 0: 267 if amount_cell.value > 0:
262 amount_cell.value = -amount_cell.value 268 amount_cell.value = -amount_cell.value
263 except Exception as e: 269 except Exception as e:
...@@ -313,18 +319,18 @@ class BSWorkbook(Workbook): ...@@ -313,18 +319,18 @@ class BSWorkbook(Workbook):
313 # } 319 # }
314 for card, summary in bs_summary.items(): 320 for card, summary in bs_summary.items():
315 # 1.原表修剪、排列、按照月份分割 321 # 1.原表修剪、排列、按照月份分割
316 start_date = summary['start_date'] 322 start_date = summary.get('start_date')
317 end_date = summary['end_date'] 323 end_date = summary.get('end_date')
318 date_statistics = False 324 date_statistics = False
319 if start_date is None or end_date is None: 325 if start_date is None or end_date is None:
320 date_statistics = True 326 date_statistics = True
321 date_list = [] 327 date_list = []
322 month_mapping = {} 328 month_mapping = {}
323 reverse_trend_list = [] 329 reverse_trend_list = []
324 for sheet in summary['sheet']: 330 for sheet in summary.get('sheet', []):
325 ws = self.get_sheet_by_name(sheet) 331 ws = self.get_sheet_by_name(sheet)
326 # 1.1.删除多余列、排列 332 # 1.1.删除多余列、排列
327 min_row = self.sheet_prune(ws, summary['classify']) 333 min_row = self.sheet_prune(ws, summary.get('classify', 0))
328 # 1.2.按月份分割 334 # 1.2.按月份分割
329 self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics) 335 self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics)
330 336
...@@ -334,32 +340,43 @@ class BSWorkbook(Workbook): ...@@ -334,32 +340,43 @@ class BSWorkbook(Workbook):
334 340
335 # 2.元信息提取表 341 # 2.元信息提取表
336 ms = self.build_meta_sheet(card, 342 ms = self.build_meta_sheet(card,
337 summary['confidence'], 343 summary.get('confidence', 1),
338 summary['code'], 344 summary.get('code'),
339 summary['print_time'], 345 summary.get('print_time'),
340 start_date, 346 start_date,
341 end_date) 347 end_date)
342 348
343 # 3.创建月份表、提取/高亮关键行 349 # 3.创建月份表、提取/高亮关键行
344 is_reverse = False 350 # 倒序处理
345 if sum(reverse_trend_list) > 0: # 倒序处理 351 is_reverse = True if sum(reverse_trend_list) > 0 else False
346 is_reverse = True 352 for month_list in month_mapping.values():
347 for month_list in month_mapping.values(): 353 month_list.sort(key=lambda x: x[-1], reverse=is_reverse)
348 month_list.sort(key=lambda x: x[-1], reverse=True) 354
349 self.build_month_sheet(card, month_mapping, ms, is_reverse) 355 self.build_month_sheet(card, month_mapping, ms, is_reverse)
350 356
351 # 4.删除原表 357 # 4.删除原表
352 for sheet in summary['sheet']: 358 for sheet in summary.get('sheet'):
353 self.remove(self.get_sheet_by_name(sheet)) 359 self.remove(self.get_sheet_by_name(sheet))
354 360
355 def license_rebuild(self, license_summary): 361 def license_rebuild(self, license_summary):
356 for en_key, cn_key in consts.LICENSE_ORDER: 362 for classify, (_, name) in consts.LICENSE_ORDER:
357 ws = self.create_sheet(cn_key) 363 res = license_summary.get(classify)
358 for bl in license_summary.get(en_key, []): 364 if res is None:
365 continue
366 ws = self.create_sheet(name)
367 for bl in res:
359 for bl_field in bl: 368 for bl_field in bl:
360 ws.append(bl_field) 369 ws.append(bl_field)
361 ws.append((None, )) 370 ws.append((None, ))
362 371
def skip_img_sheet(self, skip_img):
    """Write the skipped images to their own worksheet.

    Nothing is created when ``skip_img`` is empty or None. Otherwise a sheet
    named ``consts.SKIP_IMG_SHEET_NAME`` is created, the header row
    ``consts.SKIP_IMG_SHEET_HEADER`` is appended, and then one row per entry
    of ``skip_img``.

    Args:
        skip_img: Iterable of row tuples, one per skipped image.
    """
    if not skip_img:
        return
    sheet = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
    sheet.append(consts.SKIP_IMG_SHEET_HEADER)
    for row in skip_img:
        sheet.append(row)
def rebuild(self, bs_summary, license_summary, skip_img):
    """Run the three rebuild stages of the workbook in order.

    Args:
        bs_summary: Passed to ``bs_rebuild``.
        license_summary: Passed to ``license_rebuild``.
        skip_img: Passed to ``skip_img_sheet``.
    """
    # Stage 1: bank statement sheets.
    self.bs_rebuild(bs_summary)
    # Stage 2: license sheets.
    self.license_rebuild(license_summary)
    # Stage 3: sheet listing the skipped images.
    self.skip_img_sheet(skip_img)
......
...@@ -25,7 +25,7 @@ class PDFHandler: ...@@ -25,7 +25,7 @@ class PDFHandler:
def __init__(self, path, img_dir_path):
    """Store the given paths and start with empty image bookkeeping.

    Args:
        path: Stored as ``self.path``.
        img_dir_path: Stored as ``self.img_dir_path``.
    """
    # Collections filled later by the extraction methods.
    self.xref_set = set()
    self.img_path_list = []
    # Locations supplied by the caller.
    self.img_dir_path = img_dir_path
    self.path = path
31 def get_img_save_path(self, pno, img_index=0, ext='png'): 31 def get_img_save_path(self, pno, img_index=0, ext='png'):
...@@ -38,7 +38,7 @@ class PDFHandler: ...@@ -38,7 +38,7 @@ class PDFHandler:
38 pm = page.getPixmap(matrix=trans_2, alpha=False) 38 pm = page.getPixmap(matrix=trans_2, alpha=False)
39 img_save_path = self.get_img_save_path(page.number) 39 img_save_path = self.get_img_save_path(page.number)
40 pm.writePNG(img_save_path) 40 pm.writePNG(img_save_path)
41 self.img_info_list.append((img_save_path, page.number, 0)) 41 self.img_path_list.append(img_save_path)
42 42
43 @staticmethod 43 @staticmethod
44 def getimage(pix): 44 def getimage(pix):
...@@ -88,7 +88,7 @@ class PDFHandler: ...@@ -88,7 +88,7 @@ class PDFHandler:
88 with open(img_save_path, "wb") as f: 88 with open(img_save_path, "wb") as f:
89 f.write(img_data) 89 f.write(img_data)
90 self.xref_set.add(xref) 90 self.xref_set.add(xref)
91 self.img_info_list.append((img_save_path, pno, img_index)) 91 self.img_path_list.append(img_save_path)
92 92
93 @staticmethod 93 @staticmethod
94 def split_il(il): 94 def split_il(il):
...@@ -179,7 +179,7 @@ class PDFHandler: ...@@ -179,7 +179,7 @@ class PDFHandler:
179 img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) 179 img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
180 new_img.save(img_save_path) 180 new_img.save(img_save_path)
181 page_to_png = False 181 page_to_png = False
182 self.img_info_list.append((img_save_path, pno, img_index)) 182 self.img_path_list.append(img_save_path)
183 183
184 # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 184 # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
185 if page_to_png: 185 if page_to_png:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!