Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
c40124d4
authored
2020-08-14 14:25:42 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
update excel header
1 parent
e975baa4
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
63 additions
and
26 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
src/apps/doc/consts.py
View file @
c40124d
...
...
@@ -38,3 +38,56 @@ OVERAGE_COL_TITLE_SET = {"账户余额", "余额"}
# Title of the column that holds the per-row verification result.
PROOF_COL_TITLE = '核对结果'
# The two values written into the verification column.
PROOF_RES = ('对', '错')
# Title of the summary (meta) sheet.
META_SHEET_TITLE = '关键信息提取和展示'

# Canonical column titles of the normalized sheet, in output order.
# The last column is the verification column, so its title is taken from
# PROOF_COL_TITLE instead of repeating the literal.
FIXED_HEADERS = (
    '记账日期',
    '记账时间',
    '金额',
    '余额',
    '交易名称',
    '附言',
    '对方账户名',
    '对方卡号/账号',
    '对方开户行',
    PROOF_COL_TITLE,
)
# Number of fixed (canonical) columns.
FIXED_COL_AMOUNT = len(FIXED_HEADERS)
# Canonical title -> 1-based column index in the normalized sheet.
BASE_HEADERS_MAPPING = {
    label: idx for idx, label in enumerate(FIXED_HEADERS, start=1)
}

# Bank-specific column titles and the canonical title each one stands for.
# Insertion order follows the per-bank grouping so HEADERS_MAPPING keeps a
# stable, readable key order.
_HEADER_ALIASES = {
    # Portrait table layout - China Construction Bank
    '交易日期': '记账日期',
    '交易金额': '金额',
    '账户余额': '余额',
    '摘要': '附言',
    '对方账号与户名': '对方卡号/账号',
    # Landscape table layout - Agricultural Bank of China
    '存入': '金额',
    '对方账号': '对方卡号/账号',
    '对方名称': '对方账户名',
    # Landscape table layout - Industrial and Commercial Bank of China
    '对方户名': '对方账户名',
    '收入/支出金额': '金额',
    '工作日期': '记账日期',
    # Landscape table layout - Bank of Beijing
    '业务摘要': '附言',
    '发生额': '金额',
}

# Source-sheet header title -> 1-based target column index.
HEADERS_MAPPING = {}
# Bank of China: statements already use the canonical titles, so every fixed
# header maps to itself. The verification column is deliberately left out,
# exactly as in the hand-written mapping this replaces.
HEADERS_MAPPING.update({
    label: idx
    for label, idx in BASE_HEADERS_MAPPING.items()
    if label != PROOF_COL_TITLE
})
# Other banks: resolve each alias through its canonical title. A typo in a
# canonical title raises KeyError at import time instead of being silently
# ignored later.
HEADERS_MAPPING.update({
    alias: BASE_HEADERS_MAPPING[canonical]
    for alias, canonical in _HEADER_ALIASES.items()
})
...
...
src/apps/doc/management/commands/doc_ocr_process.py
View file @
c40124d
...
...
@@ -83,9 +83,10 @@ class Command(BaseCommand, LoggerMixin):
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
)
excel_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.xlsx'
.
format
(
doc
.
id
))
src_excel_path
=
os
.
path
.
join
(
doc_data_path
,
'src.xlsx'
)
self
.
cronjob_log
.
info
(
'{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
pdf_path
))
return
doc_data_path
,
excel_path
,
pdf_path
return
doc_data_path
,
excel_path
,
src_excel_path
,
pdf_path
@staticmethod
def
append_sheet
(
wb
,
sheets_list
,
img_name
,
role_summary
):
...
...
@@ -134,7 +135,7 @@ class Command(BaseCommand, LoggerMixin):
doc
,
business_type
=
self
.
get_doc_info
()
try
:
# 2. 从EDMS获取PDF文件
doc_data_path
,
excel_path
,
pdf_path
=
self
.
pdf_download
(
doc
,
business_type
)
doc_data_path
,
excel_path
,
src_excel_path
,
pdf_path
=
self
.
pdf_download
(
doc
,
business_type
)
# 队列为空时的处理
if
pdf_path
is
None
:
time
.
sleep
(
sleep_second
)
...
...
@@ -167,6 +168,7 @@ class Command(BaseCommand, LoggerMixin):
# loop.close()
# 整合excel文件
wb
.
save
(
src_excel_path
)
wb
.
rebuild
(
role_summary
)
wb
.
save
(
excel_path
)
except
Exception
as
e
:
...
...
src/apps/doc/ocr/wb.py
View file @
c40124d
...
...
@@ -6,31 +6,13 @@ from pandas.core.indexes.datetimes import DatetimeIndex
from
openpyxl
import
Workbook
from
openpyxl.styles
import
Border
,
Side
,
PatternFill
,
numbers
from
openpyxl.utils
import
get_column_letter
from
apps.doc
import
consts
class
BSWorkbook
(
Workbook
):
def
__init__
(
self
,
interest_keyword
,
salary_keyword
,
loan_keyword
,
*
args
,
**
kwargs
):
super
()
.
__init__
(
*
args
,
**
kwargs
)
self
.
fixed_headers
=
(
'记账日期'
,
'记账时间'
,
'金额'
,
'余额'
,
'交易名称'
,
'附言'
,
'对方账户名'
,
'对方卡号/账号'
,
'对方开户行'
,
'核对结果'
)
self
.
fixed_col_amount
=
len
(
self
.
fixed_headers
)
self
.
headers_mapping
=
{
'记账日期'
:
1
,
'交易日期'
:
1
,
'记账时间'
:
2
,
'金额'
:
3
,
'交易金额'
:
3
,
'余额'
:
4
,
'账户余额'
:
4
,
'交易名称'
:
5
,
'附言'
:
6
,
'摘要'
:
6
,
'对方账户名'
:
7
,
'对方卡号/账号'
:
8
,
'对方账号与户名'
:
8
,
'对方开户行'
:
9
,
}
self
.
meta_sheet_title
=
'关键信息提取和展示'
self
.
blank_row
=
(
None
,)
self
.
code_header
=
(
'页数'
,
'电子回单验证码'
)
...
...
@@ -47,16 +29,16 @@ class BSWorkbook(Workbook):
self
.
MAX_MEAN
=
31
def
sheet_prune
(
self
,
ws
):
ws
.
insert_cols
(
1
,
amount
=
self
.
fixed_col_amount
)
for
col
in
range
(
self
.
fixed_col_amount
+
1
,
ws
.
max_column
+
1
):
ws
.
insert_cols
(
1
,
amount
=
consts
.
FIXED_COL_AMOUNT
)
for
col
in
range
(
consts
.
FIXED_COL_AMOUNT
+
1
,
ws
.
max_column
+
1
):
header_value
=
ws
.
cell
(
1
,
col
)
.
value
header_idx
=
self
.
headers_mapping
.
get
(
header_value
)
header_idx
=
consts
.
HEADERS_MAPPING
.
get
(
header_value
)
# TODO 关键字段再次查找
if
header_idx
is
None
:
continue
letter
=
get_column_letter
(
col
)
ws
.
move_range
(
"{0}1:{0}{1}"
.
format
(
letter
,
ws
.
max_row
),
cols
=
header_idx
-
col
)
ws
.
delete_cols
(
self
.
fixed_col_amount
+
1
,
amount
=
ws
.
max_column
)
ws
.
delete_cols
(
consts
.
FIXED_COL_AMOUNT
+
1
,
amount
=
ws
.
max_column
)
@staticmethod
def
month_split
(
dti
,
date_list
):
...
...
@@ -151,7 +133,7 @@ class BSWorkbook(Workbook):
# 3.1.拷贝数据
parts
=
month_mapping
.
get
(
month
)
new_ws
=
self
.
create_sheet
(
'{0}({1})'
.
format
(
month
,
role
))
new_ws
.
append
(
self
.
fixed_headers
)
new_ws
.
append
(
consts
.
FIXED_HEADERS
)
for
part
in
parts
:
ws
=
self
.
get_sheet_by_name
(
part
[
0
])
for
row
in
ws
.
iter_rows
(
min_row
=
part
[
1
],
max_row
=
part
[
2
],
values_only
=
True
):
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment