Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
1526125c
authored
2020-08-10 14:44:59 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
update excel process & add keywords admin
1 parent
b2945296
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
251 additions
and
8 deletions
src/apps/account/apps.py
src/apps/doc/__init__.py
src/apps/doc/admin.py
src/apps/doc/apps.py
src/apps/doc/management/commands/doc_process.py → src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/models.py
src/apps/doc/named_enum.py
src/apps/doc/edms.py → src/apps/doc/ocr/edms.py
src/apps/doc/ocr/wb.py
src/common/named_enum.py
src/apps/account/apps.py
View file @
1526125
...
...
@@ -2,4 +2,4 @@ from django.apps import AppConfig
class
AccountConfig
(
AppConfig
):
name
=
'account'
name
=
'a
pps.a
ccount'
...
...
src/apps/doc/__init__.py
View file @
1526125
default_app_config
=
'apps.doc.apps.DocConfig'
...
...
src/apps/doc/admin.py
View file @
1526125
from
django.contrib
import
admin
from
.models
import
Keywords
from
.named_enum
import
KeywordsType
# Register your models here.
class KeywordsAdmin(admin.ModelAdmin):
    """Admin options for the ``Keywords`` model."""

    # Change-list columns; ``type_verbose_name`` is the computed column
    # defined further down.
    list_display = ('keyword', 'type_verbose_name', 'on_off')
    search_fields = ('keyword',)
    list_filter = ('type', 'on_off',)

    def type_verbose_name(self, obj):
        """Return what ``KeywordsType.get_verbose_name`` gives for ``obj.type``."""
        keyword_type = obj.type
        return KeywordsType.get_verbose_name(keyword_type)

    # Column title used for the computed column.
    type_verbose_name.short_description = '类型'
# Register Keywords with its ModelAdmin.
admin.site.register(Keywords, KeywordsAdmin)

# Branding strings of the admin site.
admin.site.site_title = '宝马OCR'
admin.site.site_header = '宝马OCR'
...
...
src/apps/doc/apps.py
View file @
1526125
...
...
@@ -2,4 +2,5 @@ from django.apps import AppConfig
class
DocConfig
(
AppConfig
):
name
=
'doc'
name
=
'apps.doc'
verbose_name
=
'文件'
...
...
src/apps/doc/management/commands/doc_process.py
→
src/apps/doc/management/commands/doc_
ocr_
process.py
View file @
1526125
...
...
@@ -8,13 +8,13 @@ import aiohttp
from
openpyxl
import
Workbook
from
django.core.management
import
BaseCommand
from
settings
import
conf
from
common.mixins
import
LoggerMixin
from
common.tools.file_tools
import
write_zip_file
from
common.tools.pdf_to_img
import
PDFHandler
from
apps.doc.models
import
DocStatus
,
HILDoc
,
AFCDoc
from
apps.doc
import
consts
from
settings
import
conf
from
apps.doc.edms
import
EDMS
,
rh
from
apps.doc.ocr.edms
import
EDMS
,
rh
class
Command
(
BaseCommand
,
LoggerMixin
):
...
...
@@ -95,8 +95,8 @@ class Command(BaseCommand, LoggerMixin):
# c2 = cell.get('end_column')
r1
=
cell
.
get
(
'start_row'
)
# r2 = cell.get('end_row')
label
=
cell
.
get
(
'words'
)
ws
.
cell
(
row
=
r1
+
1
,
column
=
c1
+
1
,
value
=
label
)
words
=
cell
.
get
(
'words'
)
ws
.
cell
(
row
=
r1
+
1
,
column
=
c1
+
1
,
value
=
words
)
@staticmethod
def
get_ocr_json
(
img_path
):
...
...
src/apps/doc/models.py
View file @
1526125
from
django.db
import
models
from
.named_enum
import
DocStatus
from
.named_enum
import
DocStatus
,
KeywordsType
# Create your models here.
...
...
@@ -101,3 +101,19 @@ class PriorityApplication(models.Model):
situ_db_label
=
'afc'
db_table
=
'priority_application'
class Keywords(models.Model):
    """Keyword row of the ``keywords`` table.

    The model is unmanaged (``managed = False``), so Django does not create
    or migrate the table.
    """

    id = models.AutoField(primary_key=True, verbose_name="id")
    keyword = models.CharField(max_length=64, verbose_name="关键词")
    # Choices come from the KeywordsType enum.
    type = models.SmallIntegerField(
        choices=KeywordsType.get_choices_lst(),
        verbose_name="类型",
    )
    on_off = models.BooleanField(default=True, verbose_name="是否有效")
    # auto_now: refreshed on every save.
    update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
    # auto_now_add: set once on creation.
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        db_table = 'keywords'
        managed = False
        # NOTE(review): project-specific Meta option, presumably the
        # database alias this model is routed to - confirm.
        situ_db_label = 'afc'
        verbose_name = '银行流水关键词'
        verbose_name_plural = verbose_name
...
...
src/apps/doc/named_enum.py
View file @
1526125
...
...
@@ -7,3 +7,9 @@ class DocStatus(NamedEnum):
PROCESS_FAILED
=
(
2
,
'识别失败'
)
UPLOAD_FAILED
=
(
3
,
'同步失败'
)
COMPLETE
=
(
4
,
'已完成'
)
class KeywordsType(NamedEnum):
    """Keyword categories, each declared as a ``(value, verbose name)`` pair."""

    INTEREST = (0, "利息")
    SALARY = (1, '薪资')
    LOAN = (2, '贷款')
...
...
src/apps/doc/edms.py
→
src/apps/doc/
ocr/
edms.py
View file @
1526125
...
...
@@ -2,7 +2,7 @@ import os
import
requests
from
zeep
import
Client
,
xsd
from
settings
import
conf
from
.
import
consts
from
apps.doc
import
consts
from
common.redis_cache
import
redis_handler
as
rh
...
...
src/apps/doc/ocr/wb.py
0 → 100644
View file @
1526125
import
numpy
as
np
import
locale
from
pandas._libs
import
tslib
from
pandas.core.indexes.datetimes
import
DatetimeIndex
from
openpyxl
import
Workbook
from
openpyxl.styles
import
Border
,
Side
,
PatternFill
,
numbers
from
openpyxl.utils
import
get_column_letter
class BSWorkbook(Workbook):
    """Workbook that reshapes raw statement sheets into a fixed layout.

    ``rebuild`` is the entry point: for every role it normalises the columns
    of the source sheets, writes a metadata sheet, builds one sheet per month
    (highlighting keyword rows and matching in/out amounts) and finally
    removes the source sheets.

    NOTE: ``sheet_split`` is still a stub and never fills ``month_mapping``,
    so with the code as it stands no month sheets are produced.
    """

    def __init__(self, interest_keyword, salary_keyword, loan_keyword,
                 *args, **kwargs):
        """Set up layout constants, keyword collections and cell styles.

        Args:
            interest_keyword: container tested with ``in`` against the
                remark column; matching rows are copied to the meta sheet.
            salary_keyword: container tested with ``in``; matching rows are
                listed in a second section of the meta sheet.
            loan_keyword: container tested with ``in``; matching rows are
                filled with ``loan_fill``.
            *args, **kwargs: forwarded to ``Workbook.__init__``.
        """
        super().__init__(*args, **kwargs)

        # Header row of every month sheet, in final column order.
        self.fixed_headers = (
            '记账日期', '记账时间', '金额', '余额', '交易名称',
            '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果',
        )
        self.fixed_col_amount = len(self.fixed_headers)
        # Source header text -> 1-based target column. Several source
        # spellings may map to the same target column.
        self.headers_mapping = {
            '记账日期': 1,
            '交易日期': 1,
            '记账时间': 2,
            '金额': 3,
            '交易金额': 3,
            '余额': 4,
            '账户余额': 4,
            '交易名称': 5,
            '附言': 6,
            '摘要': 6,
            '对方账户名': 7,
            '对方卡号/账号': 8,
            '对方账号与户名': 8,
            '对方开户行': 9,
        }

        # Building blocks of the metadata sheet.
        self.meta_sheet_title = '关键信息提取和展示'
        self.blank_row = (None,)
        self.code_header = ('页数', '电子回单验证码')
        self.date_header = ('打印时间', '起始日期', '终止日期', '流水区间结果')
        self.keyword_header = ('关键词', '记账日期', '金额')

        self.interest_keyword = interest_keyword
        self.salary_keyword = salary_keyword
        self.loan_keyword = loan_keyword

        # Texts written by the balance-check formula (match, mismatch).
        self.proof_res = ('对', '错')

        # Cell styles.
        self.loan_fill = PatternFill("solid", fgColor="00FFCC00")
        self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
        self.bd = Side(style='thin', color="000000")
        self.border = Border(left=self.bd, top=self.bd,
                             right=self.bd, bottom=self.bd)

    def sheet_prune(self, ws):
        """Rearrange ``ws`` in place into the fixed column layout.

        Empty placeholder columns are inserted on the left, every source
        column whose row-1 header is known to ``headers_mapping`` is moved
        into its target column, and all remaining source columns are deleted.
        """
        fixed = self.fixed_col_amount
        ws.insert_cols(1, amount=fixed)
        for col in range(fixed + 1, ws.max_column + 1):
            header_idx = self.headers_mapping.get(ws.cell(1, col).value)
            # TODO: second-chance lookup for key headers that did not match.
            if header_idx is None:
                continue
            # The range to move is the *source* column ``col``; the offset
            # is negative because the target lies to its left.
            letter = get_column_letter(col)
            ws.move_range(
                "{0}1:{0}{1}".format(letter, ws.max_row),
                cols=header_idx - col,
            )
        # Drop whatever is left to the right of the fixed columns.
        ws.delete_cols(fixed + 1, amount=ws.max_column)

    def sheet_split(self, ws, month_mapping):
        """Parse the date column of ``ws`` (month splitting is unfinished).

        Column 1 from row 2 downwards is converted to datetimes, with
        unparsable values coerced. The result is not used yet and
        ``month_mapping`` is left untouched.
        """
        for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2,
                                       values_only=True):
            # NOTE(review): relies on private pandas internals whose
            # signature may differ between pandas versions - confirm.
            dt_array, _ = tslib.array_to_datetime(
                np.array(date_tuple, copy=False, dtype=np.object_),
                errors="coerce",
                utc=False,
                dayfirst=False,
                yearfirst=False,
                require_iso8601=False,
            )
            # TODO: group rows by month and record them in month_mapping.
            dti = DatetimeIndex(dt_array, tz=None, name=None)

    def build_metadata_rows(self, confidence_max, code_list, print_time,
                            start_date, end_date, date_interval):
        """Return the rows of the metadata sheet as a list of tuples.

        Layout: confidence row, blank, code header, the ``code_list`` rows,
        blank, date header, the date values, blank, keyword header.
        """
        metadata_rows = [
            ('流水识别置信度', confidence_max),
            self.blank_row,
            self.code_header,
        ]
        metadata_rows.extend(code_list)
        metadata_rows.extend([
            self.blank_row,
            self.date_header,
            (print_time, start_date, end_date, date_interval),
            self.blank_row,
            self.keyword_header,
        ])
        return metadata_rows

    def create_meta_sheet(self, role):
        """Return the metadata worksheet for ``role``.

        The first worksheet is renamed and reused while it still carries the
        title 'Sheet'; otherwise a new sheet is created.
        """
        title = '{0}({1})'.format(self.meta_sheet_title, role)
        first = self.worksheets[0]
        if first.title == 'Sheet':
            first.title = title
            return first
        return self.create_sheet(title)

    def build_meta_sheet(self, role, confidence_max, code_list, print_time,
                         start_date, end_date, date_interval):
        """Create the metadata sheet for ``role``, fill it and return it."""
        metadata_rows = self.build_metadata_rows(
            confidence_max, code_list, print_time,
            start_date, end_date, date_interval,
        )
        ms = self.create_meta_sheet(role)
        for row in metadata_rows:
            ms.append(row)
        return ms

    def build_month_sheet(self, role, month_mapping, ms):
        """Create one sheet per month and extract or highlight keyword rows.

        Args:
            role: label appended to every sheet title.
            month_mapping: maps a month label to its parts; each part is
                indexed as ``(sheet name, first row, last row)``.
            ms: metadata sheet that receives the extracted keyword rows.
        """
        # Salary rows are listed after all interest rows, so buffer them.
        salary_rows = []
        for month, parts in month_mapping.items():
            # 3.1 Copy the month's rows below the fixed header.
            new_ws = self.create_sheet('{0}({1})'.format(month, role))
            new_ws.append(self.fixed_headers)
            for part in parts:
                ws = self[part[0]]
                for row in ws.iter_rows(min_row=part[1], max_row=part[2],
                                        values_only=True):
                    new_ws.append(row)

            # 3.2 Extract keyword rows and apply highlighting.
            amount_mapping = {}  # date -> {amount: [row numbers]}
            amount_fill_row = set()
            for rows in new_ws.iter_rows():
                is_fill = False
                summary_cell = rows[5]
                date_cell = rows[0]
                summary = summary_cell.value
                if summary in self.interest_keyword:
                    # Interest keyword: straight into the meta sheet.
                    ms.append((summary, date_cell.value, rows[2].value))
                elif summary in self.salary_keyword:
                    # Salary keyword: buffered, written after the loop.
                    salary_rows.append(
                        (summary, date_cell.value, rows[2].value))
                elif summary in self.loan_keyword:
                    # Loan keyword: highlight the whole row.
                    is_fill = True

                for i, cell in enumerate(rows):
                    cell.border = self.border
                    if is_fill:
                        cell.fill = self.loan_fill
                    if (i == 2 or i == 3) and cell.row > 1:
                        try:
                            # 3.3 Amount and balance text -> number.
                            cell.value = locale.atof(cell.value)
                        except (TypeError, ValueError, AttributeError):
                            # Not a parsable string: leave the cell as is.
                            continue
                        else:
                            cell.number_format = \
                                numbers.FORMAT_NUMBER_COMMA_SEPARATED1
                            if i == 2:
                                # Rows of the same date that carry the
                                # opposite amount form a matching pair.
                                same_amount_mapping = amount_mapping.get(
                                    date_cell.value, {})
                                fill_rows = same_amount_mapping.get(
                                    -cell.value)
                                if fill_rows:
                                    amount_fill_row.add(cell.row)
                                    amount_fill_row.update(fill_rows)
                                amount_mapping.setdefault(
                                    date_cell.value, {}
                                ).setdefault(cell.value, []).append(cell.row)
                    # 3.4 Balance check formula.
                    # TODO: debit/credit style statements need +/- signs
                    #       added by hand.
                    # TODO: statements in reverse order need another formula.
                    if i == 9 and cell.row > 2:
                        cell.value = \
                            '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
                                cell.row, cell.row - 1, *self.proof_res)

            # 3.5 Highlight same-day rows with opposite amounts.
            for row in amount_fill_row:
                for cell in new_ws[row]:
                    cell.fill = self.amount_fill

        # Second keyword section of the meta sheet: the salary rows.
        ms.append(self.blank_row)
        ms.append(self.keyword_header)
        for row in salary_rows:
            ms.append(row)

    def rebuild(self, role_summary):
        """Rebuild the workbook role by role.

        Args:
            role_summary: maps a role to a list of 7-tuples
                ``(sheet_name, confidence, page, code, print_time,
                start_date, end_date)``.
        """
        for role, summary_list in role_summary.items():
            # 1. Prune and arrange the source sheets, split them by month.
            confidence_max = 0
            code_list = []
            month_mapping = {}
            print_time = start_date = end_date = date_interval = None
            for summary in summary_list:
                (sheet_name, confidence, page, code,
                 print_time, start_date, end_date) = summary
                ws = self[sheet_name]
                # 1.1 Drop surplus columns and arrange the rest.
                self.sheet_prune(ws)
                # 1.2 TODO: split by month.
                self.sheet_split(ws, month_mapping)
                # 1.3 TODO: metadata handling (times and dates); intended:
                #     confidence_max = max(confidence, confidence_max)
                #     if code is not None: code_list.append((page, code))

            # 2. Metadata sheet.
            ms = self.build_meta_sheet(role, confidence_max, code_list,
                                       print_time, start_date, end_date,
                                       date_interval)

            # 3. Month sheets, keyword extraction and highlighting.
            self.build_month_sheet(role, month_mapping, ms)

            # Remove the source sheets.
            for summary in summary_list:
                self.remove(self[summary[0]])
src/common/named_enum.py
View file @
1526125
...
...
@@ -132,6 +132,14 @@ class NamedEnum(enum.Enum):
def raw_value(self):
    """Return this member's ``(value, verbose_name)`` pair."""
    pair = (self.value, self.verbose_name)
    return pair
@classmethod
@lru_cache()
def get_choices_lst(cls):
    """Return ``(value, verbose_name)`` pairs for every entry of the member map.

    The result is memoised by ``lru_cache``, so repeated calls for the same
    class hand back the same list object.
    """
    choices = []
    for member in cls._member_map_.values():
        choices.append((member.value, member.verbose_name))
    return choices
def
extend
(
cls
,
sub_cls_name
,
names
,
unique
=
False
):
assert
issubclass
(
cls
,
NamedEnum
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment