Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
793920a0
authored
2020-09-17 14:19:26 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
update wb build
1 parent
f3d6e429
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
776 additions
and
166 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/mixins.py
src/apps/doc/named_enum.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
src/apps/doc/consts.py
View file @
793920a
PAGE_DEFAULT
=
1
PAGE_SIZE_DEFAULT
=
10
TRANS
=
str
.
maketrans
(
'Cc((oODlLmAsSbg'
,
'000000011345569'
)
CARD_RATIO
=
0.9
UNKNOWN_CARD
=
'未知卡号'
UNKNOWN_ROLE
=
'未知户名'
DATE_FORMAT
=
[
'
%
Y年
%
m月
%
d日'
,
'
%
Y/
%
m/
%
d'
,
'
%
Y-
%
m-
%
d'
,
'
%
Y
%
m
%
d'
]
FIXED_APPLICATION_ID_PREFIX
=
'CH-S'
DOC_SCHEME_LIST
=
[
'ACCEPTANCE'
,
'SETTLEMENT'
,
'CONTRACT MANAGEMENT'
]
DATA_SOURCE_LIST
=
[
'POS'
,
'EAPP'
,
'ECONTRACT'
]
DATA_SOURCE_LIST
=
[
'POS'
,
'E
-
APP'
,
'ECONTRACT'
]
HIL_PREFIX
=
'HIL'
AFC_PREFIX
=
'AFC'
...
...
@@ -39,11 +46,33 @@ PROOF_COL_TITLE = '核对结果'
PROOF_RES
=
(
'对'
,
'错'
)
META_SHEET_TITLE
=
'关键信息提取和展示'
FIXED_HEADERS
=
(
'记账日期'
,
'记账时间'
,
'金额'
,
'余额'
,
'交易名称'
,
'附言'
,
'对方账户名'
,
'对方卡号/账号'
,
'对方开户行'
,
'核对结果'
)
FIXED_HEADERS
=
(
'记账日期'
,
'记账时间'
,
'金额'
,
'余额'
,
'交易名称'
,
'附言'
,
'对方账户名'
,
'对方卡号/账号'
,
'对方开户行'
,
'核对结果'
,
'借贷'
,
'收入'
,
'支出'
)
FIXED_COL_AMOUNT
=
len
(
FIXED_HEADERS
)
BASE_HEADERS_MAPPING
=
{
label
:
idx
+
1
for
idx
,
label
in
enumerate
(
FIXED_HEADERS
)}
BORROW_HEADER_COL
=
BASE_HEADERS_MAPPING
[
'借贷'
]
INCOME_HEADER_COL
=
BASE_HEADERS_MAPPING
[
'收入'
]
OUTLAY_HEADER_COL
=
BASE_HEADERS_MAPPING
[
'支出'
]
RESULT_HEADER_COL
=
BASE_HEADERS_MAPPING
[
'核对结果'
]
BORROW_IDX
=
BORROW_HEADER_COL
-
1
INCOME_IDX
=
INCOME_HEADER_COL
-
1
OUTLAY_IDX
=
OUTLAY_HEADER_COL
-
1
SUMMARY_IDX
=
FIXED_HEADERS
.
index
(
'附言'
)
DATE_IDX
=
FIXED_HEADERS
.
index
(
'记账日期'
)
AMOUNT_IDX
=
FIXED_HEADERS
.
index
(
'金额'
)
OVER_IDX
=
FIXED_HEADERS
.
index
(
'余额'
)
RESULT_IDX
=
FIXED_HEADERS
.
index
(
'核对结果'
)
# '借贷': ('贷', '借'), # 竖版-无表格-广发银行
# '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行
# '收/支': ('收入', '支出'), # 横版-表格-北京银行
BORROW_HEADERS_SET
=
{
'借贷'
,
'借贷状态'
,
'收/支'
}
BORROW_INCOME_SET
=
{
'贷'
,
'收入'
}
BORROW_OUTLAY_SET
=
{
'借'
,
'支出'
}
INCOME_HEADERS_SET
=
{
'收入金额'
,
'收入'
,
'存入'
,
'存入金额(贷)'
,
'存入金额(贷)'
}
OUTLAY_HEADERS_SET
=
{
'支出金额'
,
'支出'
,
'支取金额(借)'
,
'支取金额(借)'
}
# ------------------普通打印-全格线--------------------------------------------------------------------------------------
HEADERS_MAPPING
=
{}
#
中国银行
#
横版-表格-中国银行(不规则)
HEADERS_MAPPING
.
update
(
{
'记账日期'
:
BASE_HEADERS_MAPPING
[
'记账日期'
],
...
...
@@ -57,37 +86,294 @@ HEADERS_MAPPING.update(
'对方开户行'
:
BASE_HEADERS_MAPPING
[
'对方开户行'
],
}
)
#
竖版-表格-建设银行
#
横版-表格-农业银行-中国农业银行个人账户明细
HEADERS_MAPPING
.
update
(
{
'交易日期'
:
BASE_HEADERS_MAPPING
[
'记账日期'
],
'交易金额'
:
BASE_HEADERS_MAPPING
[
'金额'
],
'账户余额'
:
BASE_HEADERS_MAPPING
[
'余额'
],
'存入'
:
BASE_HEADERS_MAPPING
[
'金额'
],
'对方账号'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
'对方名称'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
'摘要'
:
BASE_HEADERS_MAPPING
[
'附言'
],
'对方账号与户名'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 横版-表格-
农业
银行
# 横版-表格-
北京
银行
HEADERS_MAPPING
.
update
(
{
'
存入'
:
BASE_HEADERS_MAPPING
[
'金额
'
],
'
对方账号'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号
'
],
'对方
名称
'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
'
业务摘要'
:
BASE_HEADERS_MAPPING
[
'附言
'
],
'
发生额'
:
BASE_HEADERS_MAPPING
[
'金额
'
],
'对方
户名
'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
}
)
# 横版-表格-工商银行
# 横版-表格-工商银行 借记卡账户历史明细清单
# 横版-表格-工商银行-机打验证码 借记卡账户历史明细清单
# 横版-表格-工商银行CH-B008802400
# 横版-表格-工商银行 工资明细清单
# 工商银行历史明细(申请单号:20042501303039397888)
HEADERS_MAPPING
.
update
(
{
'对方户名'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
'收入/支出金额'
:
BASE_HEADERS_MAPPING
[
'金额'
],
'工作日期'
:
BASE_HEADERS_MAPPING
[
'记账日期'
],
}
)
# 横版-表格-北京银行
# 横版-表格-建设银行-个人活期账户交易明细
# 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604
# 竖版-表格-建设银行-工资账单CH-B008786812
# 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604 (2)
HEADERS_MAPPING
.
update
(
{
'业务摘要'
:
BASE_HEADERS_MAPPING
[
'附言'
],
'发生额'
:
BASE_HEADERS_MAPPING
[
'金额'
],
'交易金额'
:
BASE_HEADERS_MAPPING
[
'金额'
],
'账户余额'
:
BASE_HEADERS_MAPPING
[
'余额'
],
'对方账号与户名'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 微信
HEADERS_MAPPING
.
update
(
{
'交易时间'
:
BASE_HEADERS_MAPPING
[
'记账时间'
],
'交易类型'
:
BASE_HEADERS_MAPPING
[
'附言'
],
'金额(元)'
:
BASE_HEADERS_MAPPING
[
'金额'
],
'金额(元)'
:
BASE_HEADERS_MAPPING
[
'金额'
],
'交易对方'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
}
)
# 支付宝
HEADERS_MAPPING
.
update
(
{
'时间'
:
BASE_HEADERS_MAPPING
[
'记账日期'
],
'名称/备注'
:
BASE_HEADERS_MAPPING
[
'附言'
],
}
)
# ------------普通打印-部分格线-------------------------------------------------------------------------------------------
# 竖版-无表格-农业银行
# 竖版-无表格-农业银行CH-B008805428
HEADERS_MAPPING
.
update
(
{
'摘要/附言'
:
BASE_HEADERS_MAPPING
[
'附言'
],
'交易地点/对方账号和户名'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 竖版-特殊-农商行
HEADERS_MAPPING
.
update
(
{
'交易发生额'
:
BASE_HEADERS_MAPPING
[
'金额'
],
}
)
# 横版-特殊-中信银行-账户交易明细
HEADERS_MAPPING
.
update
(
{
'对方银行'
:
BASE_HEADERS_MAPPING
[
'对方开户行'
],
'交易摘要'
:
BASE_HEADERS_MAPPING
[
'附言'
],
}
)
# 平安电子账单
HEADERS_MAPPING
.
update
(
{
'借贷发生额(借:-贷:+)'
:
BASE_HEADERS_MAPPING
[
'金额'
],
}
)
# ------------普通打印-无格线--------------------------------------------------------------------------------------------
# 竖版-无表格-招商银行(略歪)
# 竖版-无表格-招商银行账户历史交易明细表
HEADERS_MAPPING
.
update
(
{
'联机余额'
:
BASE_HEADERS_MAPPING
[
'余额'
],
}
)
# 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
# 竖版-无表格-邮储银行 账户对账单
# 竖版-无表格-邮储银行-电子章 邮储银行 账户对账单
HEADERS_MAPPING
.
update
(
{
'交易金额(元)'
:
BASE_HEADERS_MAPPING
[
'金额'
],
'交易金额(元)'
:
BASE_HEADERS_MAPPING
[
'金额'
],
'账户余额(元)'
:
BASE_HEADERS_MAPPING
[
'余额'
],
'账户余额(元)'
:
BASE_HEADERS_MAPPING
[
'余额'
],
'对手方户名'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
'对手方账户'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 横版-无表格-广发银行-账户交易历史 --> 已废弃
# 竖版-无表格-广发银行-账户交易历史 --> 已废弃
HEADERS_MAPPING
.
update
(
{
'会计日期'
:
BASE_HEADERS_MAPPING
[
'记账日期'
],
'对手户名'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
'对手账号'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 招行电子账单 TODO 有英文,需测试
HEADERS_MAPPING
.
update
(
{
'对手信息'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
'摘要代码'
:
BASE_HEADERS_MAPPING
[
'附言'
],
}
)
# 横版-无表格-民生银行-中国民生银行个人账户对账单(客户卡号)
# 横版-无表格-民生银行-无标题(客户账户)
# 横版-无表格-民生银行
HEADERS_MAPPING
.
update
(
{
'摘要信息'
:
BASE_HEADERS_MAPPING
[
'附言'
],
'对方行名'
:
BASE_HEADERS_MAPPING
[
'对方开户行'
],
}
)
# 竖版-无表格-农业银行整数
# 竖版-无表格-农业银行-中国农业银行银行卡交易明细清单
HEADERS_MAPPING
.
update
(
{
'对方账号和户名'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 竖版-无表格-农业银行-中国农业银行银行卡活期存折交易明细清单.pdf
# 竖版-无表格-农业银行-扩张.pdf
# 竖版-无表格-农业银行-缩进.pdf
HEADERS_MAPPING
.
update
(
{
'日期'
:
BASE_HEADERS_MAPPING
[
'记账日期'
],
'短摘要'
:
BASE_HEADERS_MAPPING
[
'附言'
],
'本次余额'
:
BASE_HEADERS_MAPPING
[
'余额'
],
}
)
# 竖版-无表格-农业银行-无标题(对手帐号)
HEADERS_MAPPING
.
update
(
{
'交易后余额'
:
BASE_HEADERS_MAPPING
[
'余额'
],
'对手帐号'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# 竖版-无表格-农商行(非常规)
HEADERS_MAPPING
.
update
(
{
'交易说明'
:
BASE_HEADERS_MAPPING
[
'附言'
],
}
)
# 竖版-无表格-工商银行 抬头三行 活期历史明细清单
HEADERS_MAPPING
.
update
(
{
'对方账户'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# -----------针式打印-全格线--------------------------------------------------------------------------------------------
# 竖版-表格-建设银行-中国建设银行活期账户交易明细
# 竖版-表格-建设银行-中国建设银行活期账户明细清单
# 竖版-表格-建设银行-对私活期账户明细- (1).pdf
HEADERS_MAPPING
.
update
(
{
'帐户余额'
:
BASE_HEADERS_MAPPING
[
'余额'
],
'对方帐户名称'
:
BASE_HEADERS_MAPPING
[
'对方账户名'
],
}
)
# 竖版-特殊-交通银行 零售客户交易清单 5000以上交易记录
HEADERS_MAPPING
.
update
(
{
'交易日期 记账日期'
:
BASE_HEADERS_MAPPING
[
'记账日期'
],
}
)
# ----------针式打印-部分格线------------------------------------------------------------------------------------------
# 竖版-特殊-邮储银行-一本通绿卡通交易明细(客户)
# 竖版-特殊-邮储银行-账户交易明细(客户)
HEADERS_MAPPING
.
update
(
{
'对方账号/卡号/汇票号'
:
BASE_HEADERS_MAPPING
[
'对方卡号/账号'
],
}
)
# --------------------------------------------------------------------------------------------------------------------
# ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出')
# CLASSIFY_LIST = [
# ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则)
# ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细
# ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行
#
# # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
# ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)),
#
# # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行
# # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行
# ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)),
#
# # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号
#
# # 支付宝:流水号 时间 名称/备注 收入 支出 账户余额 资金渠道
#
# # -----------------
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
# ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)),
#
# # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注
#
# # 中信银行:交易日期 交易摘要 收入金额 支出金额 账户余额 对方户名 对方账号 对方银行 交易流水号
# ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)),
#
# # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额
# ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)),
#
# # -------------------------
#
# # 招商银行:记账日期 货币 交易金额 联机余额 冲补账 交易摘要
#
# # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
# # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单
#
# # 工商银行电子版:交易日期 账号 储种 序号 币种 妙汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道
#
# # 招商银行电子版:记账日期 货币 交易金额 联机余额 交易摘要 对手信息
#
# # 民生银行:凭证类型 凭证号码 摘要信息 交易时间 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名
# # 凭证类型 凭证号码 交易时间 摘要 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名
#
# # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名
#
# # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名
#
# # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言
#
# # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额
#
# # ===================================
#
# # 建设银行:摘要、交易日期、交易金额、账户余额、商户/网点号及其名称、对方账号、对方户名
# # 交易日期、摘要、币种、钞汇、交易金额、帐户余额、对方账号、对方帐户名称
#
#
# # ===================================
#
# # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称
#
# # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名
# ]
# {
# "0": "全表格-中国农业银行个人账户明细",
# "1": "全表格-中国银行",
# "2": "全表格-北京银行",
# "3": "全表格-工商银行",
# "4": "全表格-建设银行",
# "5": "部分格线-横版-中信银行账户交易明细",
# "6": "部分格线-横版-中信银行账户交易明细特殊",
# "7": "部分格线-竖版-中国农业银行",
# "8": "部分格线-竖版-中国农业银行分账户(窄页)",
# "9": "部分格线-竖版-平安电子账单"
# }
CLASSIFY_LIST
=
[
(
'农业银行'
,
(
1
,
None
,
3
,
5
,
None
,
8
,
7
,
6
,
None
,
None
,
None
,
None
,
None
)),
(
'中国银行'
,
(
1
,
2
,
4
,
5
,
6
,
9
,
10
,
11
,
12
,
None
,
None
,
None
,
None
)),
(
'北京银行'
,
(
1
,
None
,
4
,
5
,
None
,
2
,
6
,
7
,
None
,
None
,
3
,
None
,
None
)),
(
'工商银行'
,
(
1
,
None
,
9
,
10
,
None
,
7
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'建设银行'
,
(
None
,
None
,
None
,
None
,
None
,
2
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
(
'中信银行'
,
(
1
,
None
,
None
,
5
,
None
,
2
,
6
,
7
,
8
,
None
,
None
,
3
,
4
)),
(
'中信银行'
,
(
1
,
None
,
None
,
5
,
None
,
2
,
6
,
7
,
8
,
None
,
None
,
3
,
4
)),
(
'农业银行'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'农业银行'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
(
'平安电子账单'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
None
,
None
,
None
,
None
,
None
,
None
)),
]
...
...
src/apps/doc/management/commands/doc_ocr_process.py
View file @
793920a
import
os
import
time
import
fitz
import
signal
import
base64
import
asyncio
import
aiohttp
import
difflib
import
requests
from
datetime
import
datetime
from
collections
import
Counter
from
apps.doc.ocr.wb
import
BSWorkbook
,
Workbook
from
django.core.management
import
BaseCommand
...
...
@@ -65,8 +66,6 @@ class Command(BaseCommand, LoggerMixin):
return
doc
,
business_type
def
pdf_download
(
self
,
doc
,
business_type
):
if
doc
is
None
:
return
None
,
None
,
None
,
None
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
str
(
doc
.
id
))
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
...
...
@@ -80,20 +79,96 @@ class Command(BaseCommand, LoggerMixin):
return
doc_data_path
,
excel_path
,
src_excel_path
,
pdf_path
@staticmethod
def
append_sheet
(
wb
,
sheets_list
,
img_name
,
role_summary
):
for
i
,
sheet
in
enumerate
(
sheets_list
):
sheet_name
=
'{0}_{1}'
.
format
(
img_name
,
i
)
role_summary
[
'银行-户名'
]
.
append
((
sheet_name
,
1
,
None
,
None
,
None
,
None
,
None
))
def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence):
    """Record each OCR sheet in the summaries and copy its cells into ``wb``.

    Every sheet is named ``page_{pno}_img_{img_idx}_{i}``. A sheet whose
    summary carries a card number (``summary[1]``) is accumulated in
    ``bs_summary`` under that card; a sheet without one is accumulated in
    ``unknown_summary`` under ``classify`` and then the account name
    (``consts.UNKNOWN_ROLE`` when the name is missing too). Both summary
    dicts are mutated in place.

    :param wb: workbook exposing ``create_sheet(name)``; the returned
        worksheet must expose ``cell(row=, column=, value=)``.
    :param sheets: list of dicts with a ``summary`` list and a ``cells`` list.
    :param bs_summary: per-card accumulator, mutated in place.
    :param unknown_summary: per-classify / per-name accumulator for sheets
        without a card number, mutated in place.
    :param pno: page number used in the sheet name.
    :param img_idx: image index used in the sheet name.
    :param classify: classification value reported for the image.
    :param confidence: classification confidence reported for the image.
    """

    def _collect_meta(target, summary):
        # Shared by both branches: the lists are always created (even when
        # they stay empty) and only non-None summary fields are appended.
        code_list = target.setdefault('code', [])
        pt_list = target.setdefault('print_time', [])
        sd_list = target.setdefault('start_date', [])
        ed_list = target.setdefault('end_date', [])
        if summary[3] is not None:
            code_list.append((summary[2], summary[3]))
        if summary[4] is not None:
            pt_list.append(summary[4])
        if summary[5] is not None:
            sd_list.append(summary[5])
        if summary[6] is not None:
            ed_list.append(summary[6])

    for i, sheet in enumerate(sheets):
        sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, i)
        # summary layout: [account name, card number, page, receipt
        # verification code, print time, start date, end date]
        summary = sheet.get('summary')
        card = summary[1]
        if card is None:
            classify_dict = unknown_summary.setdefault(classify, {})
            role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0]
            role_dict = classify_dict.setdefault(role, {})
            role_dict['classify'] = classify
            role_dict['role'] = role
            role_dict.setdefault('sheet', []).append(sheet_name)
            role_dict.setdefault('confidence', []).append(confidence)
            _collect_meta(role_dict, summary)
        else:
            card_dict = bs_summary.setdefault(card, {})
            card_dict['count'] = card_dict.get('count', 0) + 1
            card_dict.setdefault('classify', []).append(classify)
            card_dict.setdefault('confidence', []).append(confidence)
            card_dict.setdefault('sheet', []).append(sheet_name)
            role_list = card_dict.setdefault('role', [])
            role_set = card_dict.setdefault('role_set', set())
            if summary[0] is not None:
                role_list.append(summary[0])
                role_set.add(summary[0])
            _collect_meta(card_dict, summary)

        ws = wb.create_sheet(sheet_name)
        # A sheet without a 'cells' entry still gets an (empty) worksheet.
        for cell in sheet.get('cells') or []:
            c1 = cell.get('start_column')
            r1 = cell.get('start_row')
            words = cell.get('words')
            # OCR coordinates are 0-based; worksheet cells are 1-based.
            ws.cell(row=r1 + 1, column=c1 + 1, value=words)
def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary):
    """Feed one OCR response into the workbook and the summary accumulators.

    Expected response shape::

        {
            'code': 1,
            'msg': 'success',
            'data': {
                'classify': 0,
                'confidence': 0.999,
                'sheets': [
                    {
                        'summary': [account name, card number, page,
                                    receipt verification code, print time,
                                    start date, end date],
                        'cells': []
                    },
                ]
            }
        }

    Nothing happens when the response has no ``classify`` value or no sheets.
    ``license_summary`` is accepted but not used here.
    """
    payload = res.get('data', {})
    classify = payload.get('classify')
    if classify is not None:
        sheets = payload.get('sheets', [])
        if sheets:
            # Confidence defaults to 1 when the service omits it.
            confidence = payload.get('confidence', 1)
            self.append_bs_sheet(
                wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence)
# else:
# pass
# async def fetch_ocr_result(self, img_path):
# async with aiohttp.ClientSession(
# headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
...
...
@@ -102,35 +177,170 @@ class Command(BaseCommand, LoggerMixin):
# async with session.post(self.ocr_url, json=json_data) as response:
# return await response.json()
#
# async def img_
ocr_excel(self, wb, img_path, role_
summary):
# async def img_
2_ocr_2_wb(self, wb, img_path,
summary):
# res = await self.fetch_ocr_result(img_path)
# self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
# sheets_list = res.get('result').get('res')
# img_name = os.path.basename(img_path)
# self.append_sheet(wb, sheets_list, img_name,
role_
summary)
# self.append_sheet(wb, sheets_list, img_name, summary)
def fetch_ocr_result(self, img_path):
    """POST the image at ``img_path`` to ``self.ocr_url`` and return the JSON reply.

    The image is sent as the multipart file field ``img``. The file is opened
    in a ``with`` block so the handle is closed once the request finishes,
    including when the request raises.

    :param img_path: path of the image file to upload.
    :return: the decoded JSON body of the response.
    """
    with open(img_path, 'rb') as img_file:
        files = [
            ('img', img_file)
        ]
        response = requests.request("POST", self.ocr_url, files=files)
    return response.json()
def
img_ocr_excel
(
self
,
wb
,
img_path
,
role_summary
):
res
=
self
.
fetch_ocr_result
(
img_path
)
self
.
cronjob_log
.
info
(
'{0} [fetch ocr result success] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
res
))
def
img_2_ocr_2_wb
(
self
,
wb
,
img_info
,
bs_summary
,
unknown_summary
,
license_summary
):
res
=
self
.
fetch_ocr_result
(
img_info
[
0
])
self
.
cronjob_log
.
info
(
'{0} [fetch ocr result success] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_info
[
0
],
res
))
if
res
.
get
(
'code'
)
==
1
:
sheets_list
=
res
.
get
(
'data'
)
if
not
sheets_list
:
return
img_name
=
os
.
path
.
basename
(
img_path
)
self
.
append_sheet
(
wb
,
sheets_list
,
img_name
,
role_summary
)
self
.
ocr_2_wb
(
res
,
wb
,
img_info
[
1
],
img_info
[
2
],
bs_summary
,
unknown_summary
,
license_summary
)
@staticmethod
def
get_most
(
value_list
):
if
value_list
:
most_common
=
Counter
(
value_list
)
.
most_common
(
1
)
return
most_common
[
0
][
0
]
if
most_common
else
None
@staticmethod
def
date_format
(
date_str
,
format_str
):
try
:
date
=
datetime
.
strptime
(
date_str
,
format_str
)
except
Exception
as
e
:
return
else
:
return
date
def get_validate_date(self, date_list):
    """Return the first value in ``date_list`` that parses with a known format.

    Each candidate is tried against every entry of ``consts.DATE_FORMAT`` in
    order; the first successful parse wins. Returns None if nothing parses.
    """
    attempts = (
        (candidate, pattern)
        for candidate in date_list
        for pattern in consts.DATE_FORMAT
    )
    for candidate, pattern in attempts:
        parsed = self.date_format(candidate, pattern)
        if isinstance(parsed, datetime):
            return parsed
    return None
def merge_card(self, bs_summary, ratio=None):
    """Merge summaries of near-identical card numbers into the dominant card.

    Cards are visited from the highest ``count`` to the lowest. Each card
    still present becomes a "main" card and absorbs every remaining card
    whose ``difflib.SequenceMatcher(...).quick_ratio()`` against it exceeds
    ``ratio``. After merging, ``classify`` and ``role`` are reduced to their
    most common value via ``self.get_most``; ``count`` is dropped.

    ``bs_summary`` is consumed: every entry is popped from it.

    :param bs_summary: per-card accumulator (card -> dict with ``count``,
        ``classify``, ``confidence``, ``sheet``, ``role``, ``role_set``,
        ``code``, ``print_time``, ``start_date``, ``end_date``).
    :param ratio: similarity threshold; defaults to ``consts.CARD_RATIO``.
    :return: dict of main card -> merged summary.
    """
    if ratio is None:
        ratio = consts.CARD_RATIO

    # Every list-valued field is extended from the field of the same name.
    # (Previously 'code' was extended with the absorbed card's 'sheet' list.)
    list_fields = (
        'classify', 'confidence', 'sheet', 'role',
        'code', 'print_time', 'start_date', 'end_date',
    )

    merged_bs_summary = {}
    sorted_card = sorted(bs_summary, key=lambda card: bs_summary[card]['count'], reverse=True)
    for main_card in sorted_card:
        if main_card not in bs_summary:
            # Already absorbed into an earlier, more frequent card.
            continue
        main_summary = bs_summary.pop(main_card)
        del main_summary['count']

        # Collect first, then pop, so the dict is not mutated while iterated.
        similar_cards = [
            card for card in bs_summary
            if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > ratio
        ]
        for card in similar_cards:
            absorbed = bs_summary.pop(card)
            for field in list_fields:
                main_summary[field].extend(absorbed[field])
            main_summary['role_set'].update(absorbed['role_set'])

        main_summary['classify'] = self.get_most(main_summary['classify'])
        main_summary['role'] = self.get_most(main_summary['role'])
        merged_bs_summary[main_card] = main_summary
    return merged_bs_summary
def prune_bs_summary(self, bs_summary):
    """Finalize per-card summaries in place when no card merging is needed.

    Drops ``count`` from every entry and replaces the ``classify`` and
    ``role`` lists with their most common value (``self.get_most``).
    Returns the same ``bs_summary`` object.
    """
    for entry in bs_summary.values():
        entry.pop('count')
        top_classify = self.get_most(entry['classify'])
        top_role = self.get_most(entry['role'])
        entry['classify'], entry['role'] = top_classify, top_role
    return bs_summary
def rebuild_bs_summary(self, bs_summary, unknown_summary):
    """Combine card-keyed and card-less sheet summaries into one final dict.

    Input shapes::

        bs_summary = {
            card_number: {
                'count': 100,
                'classify': [],
                'confidence': [],
                'role': [],
                'code': [('page', 'code')],
                'print_time': [],
                'start_date': [],
                'end_date': [],
                'sheet': ['sheet_name']
            }
        }

        unknown_summary = {
            classify: {
                account_name: {
                    'classify': 0,
                    'confidence': [],
                    'role': account_name,
                    'code': [('page', 'code')],
                    'print_time': [],
                    'start_date': [],
                    'end_date': [],
                    'sheet': ['sheet_name']
                }
            }
        }

    Steps:

    1. No card: start from an empty result. One card: ``prune_bs_summary``.
       Several cards: ``merge_card``.
    2. A card-less entry whose classify equals the card's classify and whose
       account name is in the card's ``role_set`` is folded into that card
       and removed from ``unknown_summary``.
    3. Every card-less entry left over is added under a generated key
       ``'{consts.UNKNOWN_CARD}_{n}'``, numbered from 1.
    4. Each entry is finalized: ``role_set`` is dropped, the three date lists
       are reduced with ``get_validate_date`` and ``confidence`` to its max.

    Both arguments are mutated.

    :return: dict of card (or generated key) -> finalized summary.
    """
    if not bs_summary:
        # No card number recognised at all.
        merged_bs_summary = {}
    elif len(bs_summary) == 1:
        # Exactly one card number.
        merged_bs_summary = self.prune_bs_summary(bs_summary)
    else:
        # Several card numbers.
        merged_bs_summary = self.merge_card(bs_summary)

    # Each field is extended from the field of the same name.
    # (Previously 'code' was extended with the card-less entry's 'sheet' list.)
    folded_fields = ('sheet', 'code', 'print_time', 'start_date', 'end_date')
    for card_summary in merged_bs_summary.values():
        classify_summary = unknown_summary.get(card_summary['classify'], {})
        matched_roles = [
            role for role in classify_summary if role in card_summary['role_set']
        ]
        for role in matched_roles:
            summary = classify_summary.pop(role)
            for field in folded_fields:
                card_summary[field].extend(summary[field])

    # Whatever could not be attached to a card gets a placeholder card key.
    leftovers = (
        summary
        for role_dict in unknown_summary.values()
        for summary in role_dict.values()
    )
    for card_num, summary in enumerate(leftovers, start=1):
        card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
        merged_bs_summary[card] = summary

    for summary in merged_bs_summary.values():
        if summary.get('role_set') is not None:
            del summary['role_set']
        summary['print_time'] = self.get_validate_date(summary['print_time'])
        summary['start_date'] = self.get_validate_date(summary['start_date'])
        summary['end_date'] = self.get_validate_date(summary['end_date'])
        summary['confidence'] = max(summary['confidence'])
    return merged_bs_summary
# TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理
# TODO 调用接口重试
# TODO 协程异步发送OCR请求
# TODO 异常邮件通知
# TODO 数据库断联问题
# TODO 非流水证件处理,Excel模板
def
handle
(
self
,
*
args
,
**
kwargs
):
sleep_second
=
int
(
conf
.
SLEEP_SECOND
)
max_sleep_second
=
int
(
conf
.
MAX_SLEEP_SECOND
)
...
...
@@ -138,17 +348,19 @@ class Command(BaseCommand, LoggerMixin):
while
self
.
switch
:
# 1. 从队列获取文件信息
doc
,
business_type
=
self
.
get_doc_info
()
# 队列为空时的处理
if
doc
is
None
:
time
.
sleep
(
sleep_second
)
sleep_second
=
min
(
max_sleep_second
,
sleep_second
+
5
)
continue
sleep_second
=
int
(
conf
.
SLEEP_SECOND
)
try
:
start_time
=
time
.
time
()
# 2. 从EDMS获取PDF文件
doc_data_path
,
excel_path
,
src_excel_path
,
pdf_path
=
self
.
pdf_download
(
doc
,
business_type
)
# 队列为空时的处理
if
pdf_path
is
None
:
time
.
sleep
(
sleep_second
)
sleep_second
=
min
(
max_sleep_second
,
sleep_second
+
5
)
continue
sleep_second
=
int
(
conf
.
SLEEP_SECOND
)
# 3.PDF文件提取图片
start_time
=
time
.
time
()
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
self
.
cronjob_log
.
info
(
'{0} [pdf to img start] [business_type={1}] [doc_id={2}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
))
...
...
@@ -158,28 +370,42 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
business_type
,
doc
.
id
))
write_zip_file
(
img_save_path
,
os
.
path
.
join
(
doc_data_path
,
'{0}_img.zip'
.
format
(
doc
.
id
)))
# 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件
role_summary
=
{
'银行-户名'
:
[]
}
# interest_keyword = Keywords.objects.filter(
# type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
# salary_keyword = Keywords.objects.filter(
# type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
# loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True)
# wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
wb
=
Workbook
()
# 4.获取OCR结果并且构建excel文件
bs_summary
=
{}
license_summary
=
{}
unknown_summary
=
[]
interest_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
INTEREST
.
value
)
.
values_list
(
'keyword'
,
flat
=
True
)
salary_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
SALARY
.
value
)
.
values_list
(
'keyword'
,
flat
=
True
)
loan_keyword
=
Keywords
.
objects
.
filter
(
type__in
=
[
KeywordsType
.
LOAN
.
value
,
KeywordsType
.
ALI_WECHART
.
value
])
.
values_list
(
'keyword'
,
flat
=
True
)
wb
=
BSWorkbook
(
interest_keyword
,
salary_keyword
,
loan_keyword
)
# wb = Workbook()
# 4.1 获取OCR结果
# loop = asyncio.get_event_loop()
# tasks = [self.img_
ocr_excel(wb, img_path, role_
summary) for img_path in pdf_handler.img_path_list]
# tasks = [self.img_
2_ocr_2_wb(wb, img_path,
summary) for img_path in pdf_handler.img_path_list]
# loop.run_until_complete(asyncio.wait(tasks))
# loop.close()
for
img_path
in
pdf_handler
.
img_path_list
:
self
.
img_ocr_excel
(
wb
,
img_path
,
role_summary
)
for
img_info
in
pdf_handler
.
img_info_list
:
self
.
img_2_ocr_2_wb
(
wb
,
img_info
,
bs_summary
,
unknown_summary
,
license_summary
)
self
.
cronjob_log
.
info
(
'{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'
.
format
(
self
.
log_base
,
bs_summary
,
unknown_summary
,
license_summary
))
merged_bs_summary
=
self
.
rebuild_bs_summary
(
bs_summary
,
unknown_summary
)
# 整合excel文件
# wb.save(src_excel_path)
# wb.rebuild(role_summary)
self
.
cronjob_log
.
info
(
'{0} [merged_bs_summary={1}] [unknown_summary={2}]'
.
format
(
self
.
log_base
,
merged_bs_summary
,
unknown_summary
))
del
unknown_summary
# 4.2 重构Excel文件
wb
.
save
(
src_excel_path
)
wb
.
rebuild
(
merged_bs_summary
,
license_summary
)
wb
.
save
(
excel_path
)
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
...
...
@@ -194,14 +420,16 @@ class Command(BaseCommand, LoggerMixin):
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
UPLOAD_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [upload failed] [business_type={1}] [doc_id={2}] [err={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
e
))
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
error
(
'{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] '
'[err={4}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
speed_time
,
e
))
else
:
doc
.
status
=
DocStatus
.
COMPLETE
.
value
doc
.
save
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
info
(
'{0} [
doc
process complete] [business_type={1}] [doc_id={2}] '
self
.
cronjob_log
.
info
(
'{0} [process complete] [business_type={1}] [doc_id={2}] '
'[speed_time={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
speed_time
))
self
.
cronjob_log
.
info
(
'{0} [stop safely]'
.
format
(
self
.
log_base
))
...
...
src/apps/doc/mixins.py
View file @
793920a
...
...
@@ -11,6 +11,8 @@ class DocHandler:
return
'/data/{1}/{0}/{0}.pdf'
.
format
(
doc_id
,
business_type
)
elif
file
==
'img'
:
return
'/data/{1}/{0}/{0}_img.zip'
.
format
(
doc_id
,
business_type
)
elif
file
==
'src_excel'
:
return
'/data/{1}/{0}/src.xlsx'
.
format
(
doc_id
,
business_type
)
else
:
return
'/data/{1}/{0}/{0}.xlsx'
.
format
(
doc_id
,
business_type
)
...
...
@@ -22,6 +24,7 @@ class DocHandler:
doc_dict
[
'pdf_link'
]
=
self
.
get_link
(
doc_id
,
business_type
)
doc_dict
[
'img_link'
]
=
self
.
get_link
(
doc_id
,
business_type
,
file
=
'img'
)
doc_dict
[
'excel_link'
]
=
self
.
get_link
(
doc_id
,
business_type
,
file
=
'excel'
)
doc_dict
[
'src_excel_link'
]
=
self
.
get_link
(
doc_id
,
business_type
,
file
=
'src_excel'
)
return
list
(
doc_queryset
)
@staticmethod
...
...
src/apps/doc/named_enum.py
View file @
793920a
...
...
@@ -13,3 +13,4 @@ class KeywordsType(NamedEnum):
INTEREST
=
(
0
,
"利息"
)
SALARY
=
(
1
,
'薪资'
)
LOAN
=
(
2
,
'贷款'
)
ALI_WECHART
=
(
3
,
'微信/支付宝'
)
...
...
src/apps/doc/ocr/wb.py
View file @
793920a
...
...
@@ -13,6 +13,7 @@ class BSWorkbook(Workbook):
def
__init__
(
self
,
interest_keyword
,
salary_keyword
,
loan_keyword
,
*
args
,
**
kwargs
):
super
()
.
__init__
(
*
args
,
**
kwargs
)
locale
.
setlocale
(
locale
.
LC_NUMERIC
,
'en_US.UTF-8'
)
self
.
meta_sheet_title
=
'关键信息提取和展示'
self
.
blank_row
=
(
None
,)
self
.
code_header
=
(
'页数'
,
'电子回单验证码'
)
...
...
@@ -24,26 +25,59 @@ class BSWorkbook(Workbook):
self
.
proof_res
=
(
'对'
,
'错'
)
self
.
loan_fill
=
PatternFill
(
"solid"
,
fgColor
=
"00FFCC00"
)
self
.
amount_fill
=
PatternFill
(
"solid"
,
fgColor
=
"00FFFF00"
)
self
.
bd
=
Side
(
style
=
'thin'
,
color
=
"000000"
)
self
.
border
=
Border
(
left
=
self
.
bd
,
top
=
self
.
bd
,
right
=
self
.
bd
,
bottom
=
self
.
bd
)
#
self.bd = Side(style='thin', color="000000")
#
self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)
self
.
MAX_MEAN
=
31
@staticmethod
def
sheet_prune
(
ws
):
def
sheet_prune
(
ws
,
classify
):
ws
.
insert_cols
(
1
,
amount
=
consts
.
FIXED_COL_AMOUNT
)
moved_col_set
=
set
()
header_col_set
=
set
()
# 根据第一行关键词排列
for
col
in
range
(
consts
.
FIXED_COL_AMOUNT
+
1
,
ws
.
max_column
+
1
):
header_value
=
ws
.
cell
(
1
,
col
)
.
value
header_idx
=
consts
.
HEADERS_MAPPING
.
get
(
header_value
)
# TODO 关键字段再次查找
# TODO 支付宝、微信流水第一行非表头,怎么处理
if
header_idx
is
None
:
header_col
=
consts
.
HEADERS_MAPPING
.
get
(
header_value
)
if
header_col
is
not
None
:
letter
=
get_column_letter
(
col
)
ws
.
move_range
(
"{0}1:{0}{1}"
.
format
(
letter
,
ws
.
max_row
),
cols
=
header_col
-
col
)
moved_col_set
.
add
(
col
)
header_col_set
.
add
(
header_col
)
elif
header_value
in
consts
.
BORROW_HEADERS_SET
:
letter
=
get_column_letter
(
col
)
ws
.
move_range
(
"{0}1:{0}{1}"
.
format
(
letter
,
ws
.
max_row
),
cols
=
consts
.
BORROW_HEADER_COL
-
col
)
moved_col_set
.
add
(
col
)
header_col_set
.
add
(
consts
.
BORROW_HEADER_COL
)
elif
header_value
in
consts
.
INCOME_HEADERS_SET
:
letter
=
get_column_letter
(
col
)
ws
.
move_range
(
"{0}1:{0}{1}"
.
format
(
letter
,
ws
.
max_row
),
cols
=
consts
.
INCOME_HEADER_COL
-
col
)
moved_col_set
.
add
(
col
)
header_col_set
.
add
(
consts
.
INCOME_HEADER_COL
)
elif
header_value
in
consts
.
OUTLAY_HEADERS_SET
:
letter
=
get_column_letter
(
col
)
ws
.
move_range
(
"{0}1:{0}{1}"
.
format
(
letter
,
ws
.
max_row
),
cols
=
consts
.
OUTLAY_HEADER_COL
-
col
)
moved_col_set
.
add
(
col
)
header_col_set
.
add
(
consts
.
OUTLAY_HEADER_COL
)
# 缺失表头再次查找
for
header_col
in
range
(
1
,
consts
.
FIXED_COL_AMOUNT
+
1
):
if
header_col
in
header_col_set
or
header_col
==
consts
.
RESULT_HEADER_COL
:
continue
fix_col
=
consts
.
CLASSIFY_LIST
[
classify
][
1
][
header_col
-
1
]
# TODO 合并分类情况
if
fix_col
is
None
:
continue
letter
=
get_column_letter
(
col
)
ws
.
move_range
(
"{0}1:{0}{1}"
.
format
(
letter
,
ws
.
max_row
),
cols
=
header_idx
-
col
)
fix_col
=
fix_col
+
consts
.
FIXED_COL_AMOUNT
if
fix_col
in
moved_col_set
:
break
letter
=
get_column_letter
(
fix_col
)
ws
.
move_range
(
"{0}1:{0}{1}"
.
format
(
letter
,
ws
.
max_row
),
cols
=
header_col
-
fix_col
)
ws
.
delete_cols
(
consts
.
FIXED_COL_AMOUNT
+
1
,
amount
=
ws
.
max_column
)
min_row
=
1
if
len
(
moved_col_set
)
==
0
else
2
return
min_row
@staticmethod
def
month_split
(
dti
,
date_list
):
def
month_split
(
dti
,
date_list
,
date_statistics
):
month_list
=
[]
idx_list
=
[]
month_pre
=
None
...
...
@@ -53,15 +87,17 @@ class BSWorkbook(Workbook):
if
month_str
!=
month_pre
:
month_list
.
append
(
month_str
)
if
month_pre
is
None
:
date_list
.
append
(
dti
[
idx
]
.
date
())
if
date_statistics
:
date_list
.
append
(
dti
[
idx
]
.
date
())
idx
=
0
idx_list
.
append
(
idx
)
month_pre
=
month_str
for
idx
in
range
(
len
(
dti
)
-
1
,
-
1
,
-
1
):
if
isinstance
(
dti
[
idx
],
NaTType
):
continue
date_list
.
append
(
dti
[
idx
]
.
date
())
break
if
date_statistics
:
for
idx
in
range
(
len
(
dti
)
-
1
,
-
1
,
-
1
):
if
isinstance
(
dti
[
idx
],
NaTType
):
continue
date_list
.
append
(
dti
[
idx
]
.
date
())
break
return
month_list
,
idx_list
@staticmethod
...
...
@@ -86,8 +122,8 @@ class BSWorkbook(Workbook):
reverse_trend
=
-
1
return
reverse_trend
def
sheet_split
(
self
,
ws
,
month_mapping
,
date_list
,
reverse_trend_list
):
for
date_tuple_src
in
ws
.
iter_cols
(
min_col
=
1
,
max_col
=
1
,
min_row
=
2
,
values_only
=
True
):
def
sheet_split
(
self
,
ws
,
month_mapping
,
reverse_trend_list
,
min_row
,
date_list
,
date_statistics
):
for
date_tuple_src
in
ws
.
iter_cols
(
min_col
=
1
,
max_col
=
1
,
min_row
=
min_row
,
values_only
=
True
):
date_tuple
=
[
date
[:
10
]
if
isinstance
(
date
,
str
)
else
date
for
date
in
date_tuple_src
]
dt_array
,
tz_parsed
=
tslib
.
array_to_datetime
(
np
.
array
(
date_tuple
,
copy
=
False
,
dtype
=
np
.
object_
),
...
...
@@ -95,16 +131,16 @@ class BSWorkbook(Workbook):
utc
=
False
,
dayfirst
=
False
,
yearfirst
=
False
,
require_iso8601
=
Fals
e
,
require_iso8601
=
Tru
e
,
)
dti
=
DatetimeIndex
(
dt_array
,
tz
=
None
,
name
=
None
)
month_list
,
idx_list
=
self
.
month_split
(
dti
,
date_list
)
month_list
,
idx_list
=
self
.
month_split
(
dti
,
date_list
,
date_statistics
)
if
len
(
month_list
)
==
0
:
# month_info process
month_info
=
month_mapping
.
setdefault
(
'xxxx-xx'
,
[])
month_info
.
append
((
ws
.
title
,
2
,
ws
.
max_row
,
0
))
month_info
.
append
((
ws
.
title
,
min_row
,
ws
.
max_row
,
0
))
elif
len
(
month_list
)
==
1
:
# reverse_trend_list process
reverse_trend
=
self
.
get_reverse_trend
(
dti
.
day
,
idx_list
)
...
...
@@ -113,14 +149,14 @@ class BSWorkbook(Workbook):
month_info
=
month_mapping
.
setdefault
(
month_list
[
0
],
[])
day_mean
=
np
.
mean
(
dti
.
day
.
dropna
())
if
len
(
month_info
)
==
0
:
month_info
.
append
((
ws
.
title
,
2
,
ws
.
max_row
,
day_mean
))
month_info
.
append
((
ws
.
title
,
min_row
,
ws
.
max_row
,
day_mean
))
else
:
for
i
,
item
in
enumerate
(
month_info
):
if
day_mean
<=
item
[
-
1
]:
month_info
.
insert
(
i
,
(
ws
.
title
,
2
,
ws
.
max_row
,
day_mean
))
month_info
.
insert
(
i
,
(
ws
.
title
,
min_row
,
ws
.
max_row
,
day_mean
))
break
else
:
month_info
.
append
((
ws
.
title
,
2
,
ws
.
max_row
,
day_mean
))
month_info
.
append
((
ws
.
title
,
min_row
,
ws
.
max_row
,
day_mean
))
else
:
# reverse_trend_list process
reverse_trend
=
self
.
get_reverse_trend
(
dti
.
day
,
idx_list
)
...
...
@@ -128,34 +164,41 @@ class BSWorkbook(Workbook):
# month_info process
for
i
,
item
in
enumerate
(
month_list
[:
-
1
]):
month_mapping
.
setdefault
(
item
,
[])
.
append
(
(
ws
.
title
,
idx_list
[
i
]
+
2
,
idx_list
[
i
+
1
]
+
1
,
self
.
MAX_MEAN
))
(
ws
.
title
,
idx_list
[
i
]
+
min_row
,
idx_list
[
i
+
1
]
+
min_row
-
1
,
self
.
MAX_MEAN
))
month_mapping
.
setdefault
(
month_list
[
-
1
],
[])
.
insert
(
0
,
(
ws
.
title
,
idx_list
[
-
1
]
+
2
,
ws
.
max_row
,
0
))
0
,
(
ws
.
title
,
idx_list
[
-
1
]
+
min_row
,
ws
.
max_row
,
0
))
def
build_metadata_rows
(
self
,
confidence_max
,
code_list
,
print_time
,
start_date
,
end_date
,
date_interval
):
metadata_rows
=
[(
'流水识别置信度'
,
confidence_max
),
self
.
blank_row
,
self
.
code_header
]
metadata_rows
.
extend
(
code_list
)
def
build_metadata_rows
(
self
,
classify
,
confidence
,
role
,
code
,
print_time
,
start_date
,
end_date
):
metadata_rows
=
[
(
'流水识别置信度'
,
confidence
),
self
.
blank_row
,
(
'分类结果'
,
classify
),
self
.
blank_row
,
(
'户名'
,
role
),
self
.
blank_row
,
self
.
code_header
,
]
metadata_rows
.
extend
(
code
)
metadata_rows
.
extend
(
[
self
.
blank_row
,
self
.
date_header
,
(
print_time
,
start_date
,
end_date
,
date_interval
),
(
print_time
,
start_date
,
end_date
,
(
end_date
-
start_date
)
.
days
),
self
.
blank_row
,
self
.
keyword_header
]
)
return
metadata_rows
def
create_meta_sheet
(
self
,
role
):
def
create_meta_sheet
(
self
,
card
):
if
self
.
worksheets
[
0
]
.
title
==
'Sheet'
:
ms
=
self
.
worksheets
[
0
]
ms
.
title
=
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
role
)
ms
.
title
=
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
card
)
else
:
ms
=
self
.
create_sheet
(
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
role
))
ms
=
self
.
create_sheet
(
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
card
))
return
ms
def
build_meta_sheet
(
self
,
role
,
confidence_max
,
code_list
,
print_time
,
start_date
,
end_date
,
date_interval
):
metadata_rows
=
self
.
build_metadata_rows
(
confidence_max
,
code_list
,
print_time
,
start_date
,
end_date
,
date_interval
)
ms
=
self
.
create_meta_sheet
(
role
)
def
build_meta_sheet
(
self
,
card
,
classify
,
confidence
,
role
,
code
,
print_time
,
start_date
,
end_date
):
metadata_rows
=
self
.
build_metadata_rows
(
classify
,
confidence
,
role
,
code
,
print_time
,
start_date
,
end_date
)
ms
=
self
.
create_meta_sheet
(
card
)
for
row
in
metadata_rows
:
ms
.
append
(
row
)
return
ms
...
...
@@ -169,55 +212,84 @@ class BSWorkbook(Workbook):
new_ws
.
append
(
consts
.
FIXED_HEADERS
)
for
part
in
parts
:
ws
=
self
.
get_sheet_by_name
(
part
[
0
])
for
row
in
ws
.
iter_rows
(
min_row
=
part
[
1
],
max_row
=
part
[
2
],
values_only
=
True
):
new_ws
.
append
(
row
)
for
row
_value
in
ws
.
iter_rows
(
min_row
=
part
[
1
],
max_row
=
part
[
2
],
values_only
=
True
):
new_ws
.
append
(
row
_value
)
# 3.2.提取信息、高亮
amount_mapping
=
{}
amount_fill_row
=
set
()
for
rows
in
new_ws
.
iter_rows
():
summary_cell
=
rows
[
5
]
date_cell
=
rows
[
0
]
for
rows
in
new_ws
.
iter_rows
(
min_row
=
2
):
summary_cell
=
rows
[
consts
.
SUMMARY_IDX
]
date_cell
=
rows
[
consts
.
DATE_IDX
]
amount_cell
=
rows
[
consts
.
AMOUNT_IDX
]
row
=
summary_cell
.
row
# 关键词1提取
if
summary_cell
.
value
in
self
.
interest_keyword
:
ms
.
append
((
summary_cell
.
value
,
date_cell
.
value
,
rows
[
2
]
.
value
))
ms
.
append
((
summary_cell
.
value
,
date_cell
.
value
,
amount_cell
.
value
))
# 关键词2提取至临时表
elif
summary_cell
.
value
in
self
.
salary_keyword
:
tmp_ws
.
append
((
summary_cell
.
value
,
date_cell
.
value
,
rows
[
2
]
.
value
))
tmp_ws
.
append
((
summary_cell
.
value
,
date_cell
.
value
,
amount_cell
.
value
))
# 贷款关键词高亮
elif
summary_cell
.
value
in
self
.
loan_keyword
:
summary_cell
.
fill
=
self
.
loan_fill
for
i
,
cell
in
enumerate
(
rows
):
cell
.
border
=
self
.
border
if
(
i
==
2
or
i
==
3
)
and
cell
.
row
>
1
:
# 3.3.余额转数值
over_cell
=
rows
[
consts
.
OVER_IDX
]
try
:
if
isinstance
(
over_cell
.
value
,
str
):
over_cell
.
value
=
over_cell
.
value
.
translate
(
consts
.
TRANS
)
over_cell
.
value
=
locale
.
atof
(
over_cell
.
value
)
except
Exception
as
e
:
continue
else
:
over_cell
.
number_format
=
numbers
.
FORMAT_NUMBER_COMMA_SEPARATED1
# 3.4.余额转数值
try
:
try
:
if
isinstance
(
amount_cell
.
value
,
str
):
amount_cell
.
value
=
amount_cell
.
value
.
translate
(
consts
.
TRANS
)
amount_cell
.
value
=
locale
.
atof
(
amount_cell
.
value
)
except
Exception
as
e
:
try
:
# 3.3.金额、余额转数值
cell
.
value
=
locale
.
atof
(
cell
.
value
)
except
Exception
:
continue
else
:
cell
.
number_format
=
numbers
.
FORMAT_NUMBER_COMMA_SEPARATED1
if
i
==
2
:
same_amount_mapping
=
amount_mapping
.
get
(
date_cell
.
value
,
{})
fill_rows
=
same_amount_mapping
.
get
(
-
cell
.
value
)
if
fill_rows
:
amount_fill_row
.
add
(
cell
.
row
)
amount_fill_row
.
update
(
fill_rows
)
amount_mapping
.
setdefault
(
date_cell
.
value
,
{})
.
setdefault
(
cell
.
value
,
[])
.
append
(
cell
.
row
)
# 3.4.核对结果
# TODO 借贷、开支类型银行流水,需要手动添加+-号
if
i
==
9
and
cell
.
row
>
2
:
if
is_reverse
:
cell
.
value
=
'=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'
.
format
(
cell
.
row
-
1
,
cell
.
row
,
*
self
.
proof_res
)
else
:
cell
.
value
=
'=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'
.
format
(
cell
.
row
,
cell
.
row
-
1
,
*
self
.
proof_res
)
if
isinstance
(
rows
[
consts
.
INCOME_IDX
]
.
value
,
str
):
rows
[
consts
.
OUTLAY_IDX
]
.
value
=
rows
[
consts
.
INCOME_IDX
]
.
value
.
translate
(
consts
.
TRANS
)
amount_cell
.
value
=
locale
.
atof
(
rows
[
consts
.
OUTLAY_IDX
]
.
value
)
except
Exception
as
e
:
if
isinstance
(
rows
[
consts
.
OUTLAY_IDX
]
.
value
,
str
):
rows
[
consts
.
OUTLAY_IDX
]
.
value
=
rows
[
consts
.
OUTLAY_IDX
]
.
value
.
translate
(
consts
.
TRANS
)
amount_cell
.
value
=
locale
.
atof
(
rows
[
consts
.
OUTLAY_IDX
]
.
value
)
if
amount_cell
.
value
>
0
:
amount_cell
.
value
=
-
amount_cell
.
value
except
Exception
as
e
:
continue
else
:
if
rows
[
consts
.
BORROW_IDX
]
.
value
in
consts
.
BORROW_OUTLAY_SET
:
amount_cell
.
value
=
-
amount_cell
.
value
amount_cell
.
number_format
=
numbers
.
FORMAT_NUMBER_COMMA_SEPARATED1
same_amount_mapping
=
amount_mapping
.
get
(
date_cell
.
value
,
{})
fill_rows
=
same_amount_mapping
.
get
(
-
amount_cell
.
value
)
if
fill_rows
:
amount_fill_row
.
add
(
row
)
amount_fill_row
.
update
(
fill_rows
)
amount_mapping
.
setdefault
(
date_cell
.
value
,
{})
.
setdefault
(
amount_cell
.
value
,
[])
.
append
(
row
)
# 3.5.同一天相同进出账高亮
# 3.5.核对结果
if
row
>
2
:
if
is_reverse
:
rows
[
consts
.
RESULT_IDX
]
.
value
=
'=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'
.
format
(
row
-
1
,
row
,
*
self
.
proof_res
)
else
:
rows
[
consts
.
RESULT_IDX
]
.
value
=
'=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'
.
format
(
row
,
row
-
1
,
*
self
.
proof_res
)
# 删除金额辅助列
new_ws
.
delete_cols
(
consts
.
BORROW_HEADER_COL
,
amount
=
new_ws
.
max_column
)
# 3.6.同一天相同进出账高亮
del
amount_mapping
for
row
in
amount_fill_row
:
new_ws
[
row
][
2
]
.
fill
=
self
.
amount_fill
new_ws
[
row
][
consts
.
AMOUNT_IDX
]
.
fill
=
self
.
amount_fill
# 关键词2信息提取
ms
.
append
(
self
.
blank_row
)
...
...
@@ -226,34 +298,51 @@ class BSWorkbook(Workbook):
ms
.
append
(
row
)
self
.
remove
(
tmp_ws
)
def
rebuild
(
self
,
role_summary
):
# (sheet_name, confidence, page, code, print_time, start_date, end_date) # TODO 表名简化,+卡号
for
role
,
summary_list
in
role_summary
.
items
():
def
bs_rebuild
(
self
,
bs_summary
):
# bs_summary = {
# '卡号': {
# 'classify': 0,
# 'confidence': 0.9,
# 'role': '柳雪',
# 'code': [('page', 'code')],
# 'print_time': 'datetime',
# 'start_date': 'datetime',
# 'end_date': 'datetime',
# 'sheet': ['sheet_name']
# }
# }
for
card
,
summary
in
bs_summary
.
items
():
# 1.原表修剪、排列、按照月份分割
reverse_trend_list
=
[]
confidence_max
=
0
code_list
=
[]
month_mapping
=
{}
start_date
=
summary
[
'start_date'
]
end_date
=
summary
[
'end_date'
]
date_statistics
=
False
if
start_date
is
None
or
end_date
is
None
:
date_statistics
=
True
date_list
=
[]
start_date
=
end_date
=
date_interval
=
print_time
=
None
for
summary
in
summary_list
:
sheet_name
,
confidence
,
page
,
code
,
print_time_local
,
start_date_local
,
end_date_local
=
summary
ws
=
self
.
get_sheet_by_name
(
sheet
_name
)
month_mapping
=
{}
reverse_trend_list
=
[]
for
sheet
in
summary
[
'sheet'
]:
ws
=
self
.
get_sheet_by_name
(
sheet
)
# 1.1.删除多余列、排列
self
.
sheet_prune
(
ws
)
min_row
=
self
.
sheet_prune
(
ws
,
summary
[
'classify'
]
)
# 1.2.按月份分割
self
.
sheet_split
(
ws
,
month_mapping
,
date_list
,
reverse_trend_list
)
# 1.3.元数据处理 TODO 时间与日期处理
confidence_max
=
max
(
confidence
,
confidence_max
)
if
code
is
not
None
:
code_list
.
append
((
page
,
code
))
self
.
sheet_split
(
ws
,
month_mapping
,
reverse_trend_list
,
min_row
,
date_list
,
date_statistics
)
if
date_statistics
is
True
and
len
(
date_list
)
>
1
:
start_date
=
min
(
date_list
)
if
start_date
is
None
else
start_date
end_date
=
max
(
date_list
)
if
end_date
is
None
else
end_date
if
len
(
date_list
)
>
1
:
start_date
=
min
(
date_list
)
end_date
=
max
(
date_list
)
date_interval
=
(
end_date
-
start_date
)
.
days
# 2.元信息提取表
ms
=
self
.
build_meta_sheet
(
role
,
confidence_max
,
code_list
,
print_time
,
start_date
,
end_date
,
date_interval
)
bank_name
=
consts
.
CLASSIFY_LIST
[
summary
[
'classify'
]][
0
]
base_sheet_name
=
'{0}_{1}'
.
format
(
bank_name
,
summary
[
'role'
])
ms
=
self
.
build_meta_sheet
(
card
,
summary
[
'classify'
],
summary
[
'confidence'
],
summary
[
'role'
],
summary
[
'code'
],
summary
[
'print_time'
],
start_date
,
end_date
)
# 3.创建月份表、提取/高亮关键行
is_reverse
=
False
...
...
@@ -261,8 +350,11 @@ class BSWorkbook(Workbook):
is_reverse
=
True
for
month_list
in
month_mapping
.
values
():
month_list
.
sort
(
key
=
lambda
x
:
x
[
-
1
],
reverse
=
True
)
self
.
build_month_sheet
(
role
,
month_mapping
,
ms
,
is_reverse
)
self
.
build_month_sheet
(
base_sheet_name
,
month_mapping
,
ms
,
is_reverse
)
# 4.删除原表
for
sheet
in
summary
[
'sheet'
]:
self
.
remove
(
self
.
get_sheet_by_name
(
sheet
))
# 删除原表
for
summary
in
summary_list
:
self
.
remove
(
self
.
get_sheet_by_name
(
summary
[
0
]))
def
rebuild
(
self
,
bs_summary
,
license_summary
):
self
.
bs_rebuild
(
bs_summary
)
\ No newline at end of file
...
...
src/common/tools/pdf_to_img.py
View file @
793920a
...
...
@@ -25,7 +25,7 @@ class PDFHandler:
def
__init__
(
self
,
path
,
img_dir_path
):
self
.
path
=
path
self
.
img_dir_path
=
img_dir_path
self
.
img_
path
_list
=
[]
self
.
img_
info
_list
=
[]
self
.
xref_set
=
set
()
def
get_img_save_path
(
self
,
pno
,
img_index
=
0
,
ext
=
'png'
):
...
...
@@ -38,7 +38,7 @@ class PDFHandler:
pm
=
page
.
getPixmap
(
matrix
=
trans_2
,
alpha
=
False
)
img_save_path
=
self
.
get_img_save_path
(
page
.
number
)
pm
.
writePNG
(
img_save_path
)
self
.
img_
path_list
.
append
(
img_save_path
)
self
.
img_
info_list
.
append
((
img_save_path
,
page
.
number
,
0
)
)
@staticmethod
def
getimage
(
pix
):
...
...
@@ -88,7 +88,7 @@ class PDFHandler:
with
open
(
img_save_path
,
"wb"
)
as
f
:
f
.
write
(
img_data
)
self
.
xref_set
.
add
(
xref
)
self
.
img_
path_list
.
append
(
img_save_path
)
self
.
img_
info_list
.
append
((
img_save_path
,
pno
,
img_index
)
)
@staticmethod
def
split_il
(
il
):
...
...
@@ -179,7 +179,7 @@ class PDFHandler:
img_save_path
=
self
.
get_img_save_path
(
pno
,
img_index
,
im_list
[
0
][
2
])
new_img
.
save
(
img_save_path
)
page_to_png
=
False
self
.
img_
path_list
.
append
(
img_save_path
)
self
.
img_
info_list
.
append
((
img_save_path
,
pno
,
img_index
)
)
# 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
if
page_to_png
:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment