Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
554d2f4f
authored
2021-06-14 02:55:37 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/bs_excel' into feature/0611
2 parents
174d2005
b17b3c65
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
90 additions
and
32 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/named_enum.py
src/apps/doc/ocr/wb.py
src/apps/doc/consts.py
View file @
554d2f4
...
...
@@ -99,6 +99,7 @@ RES_FAILED = '识别失败'
RES_FAILED_1
=
'识别失败(阶段1)'
RES_FAILED_2
=
'识别失败(阶段2)'
RES_FAILED_3
=
'识别失败(阶段1数据格式错误)'
RES_FAILED_SET
=
{
RES_FAILED
,
RES_FAILED_1
,
RES_FAILED_2
,
RES_FAILED_3
}
CARD_RATIO
=
0.9
UNKNOWN_CARD
=
'未知卡号'
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
554d2f4
...
...
@@ -768,7 +768,9 @@ class Command(BaseCommand, LoggerMixin):
type
=
KeywordsType
.
LOAN
.
value
,
on_off
=
True
)
.
values_list
(
'keyword'
,
flat
=
True
)
wechat_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
ALI_WECHART
.
value
,
on_off
=
True
)
.
values_list
(
'keyword'
,
flat
=
True
)
wb
=
BSWorkbook
(
interest_keyword
,
salary_keyword
,
loan_keyword
,
wechat_keyword
)
repayments_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
REPAYMENTS
.
value
,
on_off
=
True
)
.
values_list
(
'keyword'
,
flat
=
True
)
wb
=
BSWorkbook
(
interest_keyword
,
salary_keyword
,
loan_keyword
,
wechat_keyword
,
repayments_keyword
)
for
img_path
,
res
in
ocr_1_res
.
items
():
pno
,
ino
=
self
.
parse_img_path
(
img_path
)
part_idx
=
1
...
...
src/apps/doc/named_enum.py
View file @
554d2f4
...
...
@@ -19,6 +19,7 @@ class KeywordsType(NamedEnum):
SALARY
=
(
1
,
'薪资'
)
LOAN
=
(
2
,
'贷款'
)
ALI_WECHART
=
(
3
,
'微信/支付宝'
)
REPAYMENTS
=
(
4
,
'还款'
)
class
RequestTeam
(
NamedEnum
):
...
...
src/apps/doc/ocr/wb.py
View file @
554d2f4
...
...
@@ -7,24 +7,27 @@ from pandas._libs import tslib
from
pandas._libs.tslibs.nattype
import
NaTType
from
pandas.core.indexes.datetimes
import
DatetimeIndex
from
openpyxl
import
Workbook
from
openpyxl.styles
import
Border
,
Side
,
PatternFill
,
numbers
from
openpyxl.styles
import
PatternFill
,
numbers
from
openpyxl.utils
import
get_column_letter
from
apps.doc
import
consts
class
BSWorkbook
(
Workbook
):
def
__init__
(
self
,
interest_keyword
,
salary_keyword
,
loan_keyword
,
wechat_keyword
,
*
args
,
**
kwargs
):
def
__init__
(
self
,
interest_keyword
,
salary_keyword
,
loan_keyword
,
wechat_keyword
,
repayments_keyword
,
*
args
,
**
kwargs
):
super
()
.
__init__
(
*
args
,
**
kwargs
)
locale
.
setlocale
(
locale
.
LC_NUMERIC
,
'en_US.UTF-8'
)
self
.
meta_sheet_title
=
'
关键信息提取和展示
'
self
.
meta_sheet_title
=
'
Key info
'
self
.
blank_row
=
(
None
,)
self
.
code_header
=
(
'页数'
,
'电子回单验证码'
)
self
.
date_header
=
(
'打印时间'
,
'起始日期'
,
'终止日期'
,
'流水区间结果'
)
self
.
keyword_header
=
(
'关键词'
,
'记账日期'
,
'金额'
)
self
.
interest_keyword_header
=
(
'结息关键词'
,
'记账日期'
,
'金额'
)
self
.
salary_keyword_header
=
(
'收入关键词'
,
'记账日期'
,
'金额'
)
self
.
repayments_keyword_header
=
(
'还款关键词'
,
'记账日期'
,
'金额'
)
self
.
interest_keyword
=
self
.
replace_newline
(
interest_keyword
)
self
.
salary_keyword
=
self
.
replace_newline
(
salary_keyword
)
self
.
loan_keyword
=
self
.
replace_newline
(
loan_keyword
)
self
.
repayments_keyword
=
self
.
replace_newline
(
repayments_keyword
)
self
.
wechat_keyword
=
wechat_keyword
self
.
proof_res
=
(
'对'
,
'错'
)
self
.
loan_fill
=
PatternFill
(
"solid"
,
fgColor
=
"00FFCC00"
)
...
...
@@ -45,7 +48,7 @@ class BSWorkbook(Workbook):
if
not
isinstance
(
card
,
str
):
return
consts
.
ERROR_CARD
try
:
new_card
=
card
.
translate
(
consts
.
SHEET_TITLE_TRANS
)
.
strip
()[
-
6
:]
new_card
=
card
.
translate
(
consts
.
SHEET_TITLE_TRANS
)
.
strip
()[
-
4
:]
if
len
(
new_card
)
==
0
:
new_card
=
consts
.
ERROR_CARD
except
Exception
as
e
:
...
...
@@ -307,13 +310,15 @@ class BSWorkbook(Workbook):
month_mapping
.
setdefault
(
item
,
[])
.
append
(
(
ws
.
title
,
idx_list
[
i
]
+
min_row
,
idx_list
[
i
+
1
]
+
min_row
-
1
,
day_mean
))
def
build_metadata_rows
(
self
,
confidence
,
code
,
print_time
,
start_date
,
end_date
):
def
build_metadata_rows
(
self
,
confidence
,
code
,
print_time
,
start_date
,
end_date
,
res_count_tuple
):
if
start_date
is
None
or
end_date
is
None
:
timedelta
=
None
else
:
timedelta
=
(
end_date
-
start_date
)
.
days
metadata_rows
=
[
(
'流水识别置信度'
,
confidence
),
(
'图片总数'
,
res_count_tuple
[
0
]),
(
'识别成功'
,
res_count_tuple
[
1
]),
self
.
blank_row
,
self
.
code_header
,
]
...
...
@@ -323,13 +328,15 @@ class BSWorkbook(Workbook):
self
.
date_header
,
(
print_time
,
start_date
,
end_date
,
timedelta
),
self
.
blank_row
,
self
.
keyword_header
]
self
.
interest_
keyword_header
]
)
return
metadata_rows
def
build_meta_sheet
(
self
,
card
,
confidence
,
code
,
print_time
,
start_date
,
end_date
):
metadata_rows
=
self
.
build_metadata_rows
(
confidence
,
code
,
print_time
,
start_date
,
end_date
)
ms
=
self
.
create_sheet
(
'{0}({1})'
.
format
(
self
.
meta_sheet_title
,
card
))
def build_meta_sheet(self, role_name, card, confidence, code, print_time,
                     start_date, end_date, res_count_tuple):
    """Create the metadata sheet for one card and fill it row by row.

    The rows come from ``build_metadata_rows``; the sheet title is built
    from ``self.meta_sheet_title``, the role name and the card.

    Args:
        role_name: Account-holder name used in the sheet title. Any
            non-string value is replaced by ``consts.UNKNOWN_ROLE``.
        card: Card identifier placed in parentheses in the sheet title.
        confidence, code, print_time, start_date, end_date,
        res_count_tuple: Forwarded unchanged to ``build_metadata_rows``.

    Returns:
        The newly created worksheet.
    """
    rows = self.build_metadata_rows(
        confidence, code, print_time, start_date, end_date, res_count_tuple)

    # NOTE(review): role_name goes into the sheet title as-is, whereas
    # get_new_card passes the card through consts.SHEET_TITLE_TRANS first.
    # Confirm whether the role name needs the same translation.
    owner = role_name if isinstance(role_name, str) else consts.UNKNOWN_ROLE

    title = '{0}{1}({2})'.format(self.meta_sheet_title, owner, card)
    sheet = self.create_sheet(title)
    for meta_row in rows:
        sheet.append(meta_row)
    return sheet
...
...
@@ -398,7 +405,7 @@ class BSWorkbook(Workbook):
row_value
[
1
]
=
'
\n
'
.
join
(
append_list
)
return
row_value
def
build_month_sheet
(
self
,
ms
,
card
,
month_mapping
,
is_reverse
,
statistics_header_info
,
max_column
,
classify
):
def
build_month_sheet
(
self
,
ms
,
role_name
,
card
,
month_mapping
,
is_reverse
,
statistics_header_info
,
max_column
,
classify
):
summary_cell_idx
=
statistics_header_info
.
get
(
consts
.
SUMMARY_KEY
)
date_cell_idx
=
statistics_header_info
.
get
(
consts
.
DATE_KEY
)
amount_cell_idx
=
statistics_header_info
.
get
(
consts
.
AMOUNT_KEY
)
# None or src or append
...
...
@@ -412,15 +419,17 @@ class BSWorkbook(Workbook):
for
i
in
range
(
max_column
-
src_header_len
):
header
.
append
(
None
)
add_col
=
[
'核对结果'
]
add_col
=
[
'核对结果'
,
'合计'
]
if
amount_cell_idx
is
None
:
if
income_cell_idx
is
not
None
or
outlay_cell_idx
is
not
None
:
add_col
=
[
'金额'
,
'核对结果'
]
add_col
=
[
'金额'
,
'核对结果'
,
'合计'
]
amount_cell_idx
=
len
(
header
)
header
.
extend
(
add_col
)
result_idx
=
len
(
header
)
-
1
result_idx
=
len
(
header
)
-
2
amount_sum_idx
=
len
(
header
)
-
1
tmp_ws
=
self
.
create_sheet
(
'tmp_ws'
)
tmp2_ws
=
self
.
create_sheet
(
'tmp2_ws'
)
if
classify
in
consts
.
ALI_WECHART_CLASSIFY
:
high_light_keyword
=
self
.
wechat_keyword
else
:
...
...
@@ -444,7 +453,10 @@ class BSWorkbook(Workbook):
amount_mapping
=
{}
amount_fill_row
=
set
()
loan_fill_row
=
set
()
fill_row
=
set
()
# 添加筛选
new_ws
.
auto_filter
.
ref
=
'A1:{0}{1}'
.
format
(
get_column_letter
(
new_ws
.
max_column
),
new_ws
.
max_row
)
for
rows
in
new_ws
.
iter_rows
(
min_row
=
2
):
length
=
len
(
rows
)
...
...
@@ -466,7 +478,15 @@ class BSWorkbook(Workbook):
# 贷款关键词高亮
if
summary_cell
is
not
None
and
summary_cell_value
in
high_light_keyword
:
loan_fill_row
.
add
(
summary_cell
.
row
)
fill_row
.
add
(
summary_cell
.
row
)
# 户名高亮
row_num
=
2
for
cell
in
rows
:
row_num
=
cell
.
row
if
cell
.
value
==
role_name
:
fill_row
.
add
(
summary_cell
.
row
)
break
# 3.3.余额转数值
over_success
=
False
...
...
@@ -505,16 +525,17 @@ class BSWorkbook(Workbook):
amount_cell
.
number_format
=
numbers
.
FORMAT_NUMBER_00
if
date_cell
is
not
None
and
isinstance
(
date_cell_value
,
str
):
same_amount_mapping
=
amount_mapping
.
get
(
date_cell_value
[:
10
],
{})
fill_rows
=
same_amount_mapping
.
get
(
-
amount_cell
.
value
)
if
fill_rows
:
fill_rows
_set
=
same_amount_mapping
.
get
(
-
amount_cell
.
value
,
set
()
)
if
len
(
fill_rows_set
)
>
0
:
amount_fill_row
.
add
(
amount_cell
.
row
)
amount_fill_row
.
update
(
fill_rows
)
amount_mapping
.
setdefault
(
date_cell_value
[:
10
],
{})
.
setdefault
(
amount_cell
.
value
,
[])
.
append
(
amount_cell
.
row
)
amount_fill_row
.
add
(
fill_rows_set
.
pop
())
else
:
amount_mapping
.
setdefault
(
date_cell_value
[:
10
],
{})
.
setdefault
(
amount_cell
.
value
,
set
())
.
add
(
amount_cell
.
row
)
# 3.5.核对结果
amount_col_letter
=
get_column_letter
(
amount_cell_idx
+
1
)
if
amount_success
and
over_success
and
amount_cell
.
row
>
2
:
amount_col_letter
=
get_column_letter
(
amount_cell_idx
+
1
)
over_col_letter
=
get_column_letter
(
over_cell_idx
+
1
)
if
is_reverse
:
rows
[
result_idx
]
.
value
=
'=IF({2}{0}=ROUND(SUM({2}{1},{3}{0}),4), "{4}", "{5}")'
.
format
(
...
...
@@ -523,6 +544,11 @@ class BSWorkbook(Workbook):
rows
[
result_idx
]
.
value
=
'=IF({2}{0}=ROUND(SUM({2}{1},{3}{0}),4), "{4}", "{5}")'
.
format
(
amount_cell
.
row
,
amount_cell
.
row
-
1
,
over_col_letter
,
amount_col_letter
,
*
self
.
proof_res
)
# 3.6 金额合计列
amount_sum_letter
=
get_column_letter
(
amount_sum_idx
+
1
)
rows
[
amount_sum_idx
]
.
value
=
'=SUM({0}{1},{2}{3})'
.
format
(
amount_sum_letter
,
row_num
-
1
,
amount_col_letter
,
row_num
)
# 3.2.提取信息、高亮
# row = summary_cell.row
if
summary_cell
is
not
None
:
...
...
@@ -534,13 +560,17 @@ class BSWorkbook(Workbook):
elif
summary_cell_value
in
self
.
salary_keyword
:
new_amount_cell_value
=
None
if
amount_cell
is
None
else
amount_cell
.
value
tmp_ws
.
append
((
summary_cell_value
,
date_cell_value
,
new_amount_cell_value
))
# 关键词3提取至临时表
elif
summary_cell_value
in
self
.
repayments_keyword
:
new_amount_cell_value
=
None
if
amount_cell
is
None
else
amount_cell
.
value
tmp2_ws
.
append
((
summary_cell_value
,
date_cell_value
,
new_amount_cell_value
))
# 贷款关键词高亮
# elif summary_cell_value in high_light_keyword:
# summary_cell.fill = self.amount_fill
# if amount_cell is not None:
# amount_cell.fill = self.amount_fill
for
row
in
loan_
fill_row
:
for
row
in
fill_row
:
for
cell
in
new_ws
[
row
]:
cell
.
fill
=
self
.
amount_fill
...
...
@@ -555,12 +585,19 @@ class BSWorkbook(Workbook):
# 关键词2信息提取
ms
.
append
(
self
.
blank_row
)
ms
.
append
(
self
.
keyword_header
)
ms
.
append
(
self
.
salary_
keyword_header
)
for
row
in
tmp_ws
.
iter_rows
(
values_only
=
True
):
ms
.
append
(
row
)
self
.
remove
(
tmp_ws
)
def
bs_rebuild
(
self
,
bs_summary
):
# 关键词3信息提取
ms
.
append
(
self
.
blank_row
)
ms
.
append
(
self
.
repayments_keyword_header
)
for
row
in
tmp2_ws
.
iter_rows
(
values_only
=
True
):
ms
.
append
(
row
)
self
.
remove
(
tmp2_ws
)
def
bs_rebuild
(
self
,
bs_summary
,
res_count_tuple
):
# bs_summary = {
# '卡号': {
# 'classify': 0,
...
...
@@ -578,6 +615,7 @@ class BSWorkbook(Workbook):
new_card
=
self
.
get_new_card
(
card
)
# 1.原表表头收集、按照月份分割
# 1.1 总结首行信息
role_name
=
summary
.
get
(
'role'
,
consts
.
UNKNOWN_ROLE
)
classify
=
summary
.
get
(
'classify'
,
0
)
sheet_header_info
=
{}
header_info
=
{}
...
...
@@ -614,12 +652,14 @@ class BSWorkbook(Workbook):
# 2.元信息提取表
confidence
=
self
.
get_confidence
(
max_find_count
)
ms
=
self
.
build_meta_sheet
(
new_card
,
ms
=
self
.
build_meta_sheet
(
role_name
,
new_card
,
confidence
,
summary
.
get
(
'code'
),
summary
.
get
(
'print_time'
),
start_date
,
end_date
)
end_date
,
res_count_tuple
)
# 3.创建月份表、提取/高亮关键行
# 倒序处理
...
...
@@ -627,7 +667,7 @@ class BSWorkbook(Workbook):
for
month_list
in
month_mapping
.
values
():
month_list
.
sort
(
key
=
lambda
x
:
x
[
-
1
],
reverse
=
is_reverse
)
self
.
build_month_sheet
(
ms
,
new_card
,
month_mapping
,
is_reverse
,
statistics_header_info
,
max_column
,
classify
)
self
.
build_month_sheet
(
ms
,
role_name
,
new_card
,
month_mapping
,
is_reverse
,
statistics_header_info
,
max_column
,
classify
)
# 4.删除原表
for
sheet
in
sheets_list
:
...
...
@@ -701,21 +741,35 @@ class BSWorkbook(Workbook):
res_list
.
sort
(
key
=
lambda
x
:
(
x
[
0
],
x
[
1
],
x
[
2
]))
ws
=
self
.
create_sheet
(
consts
.
RES_SHEET_NAME
)
ws
.
append
(
consts
.
RES_SHEET_HEADER
)
success_count
=
0
for
res_tuple
in
res_list
:
if
res_tuple
[
-
1
]
not
in
consts
.
RES_FAILED_SET
:
success_count
+=
1
ws
.
append
(
res_tuple
)
return
len
(
res_list
),
success_count
else
:
return
0
,
0
def move_res_sheet(self):
    """Move the result sheet (``consts.RES_SHEET_NAME``) to the last position.

    Does nothing when the workbook has no sheet with that title, instead
    of raising ``KeyError``. ``res_sheet`` has a path that returns
    ``(0, 0)``, and ``rebuild`` calls this method unconditionally.
    """
    # presumably the (0, 0) path of res_sheet creates no sheet; verify
    # against the part of res_sheet above the visible hunk.
    if consts.RES_SHEET_NAME not in self.sheetnames:
        return

    # wb[title] replaces the deprecated get_sheet_by_name().
    sheet = self[consts.RES_SHEET_NAME]

    # Re-append the sheet so it becomes the last tab of the workbook.
    self._sheets.remove(sheet)
    self._sheets.append(sheet)
def remove_base_sheet(self):
    """Remove the sheet titled 'Sheet' when other sheets exist.

    The sheet is kept when it is the only one in the workbook. Nothing
    happens when no sheet titled 'Sheet' is present; the previous lookup
    raised ``KeyError`` in that case.
    """
    names = self.sheetnames
    if len(names) > 1 and 'Sheet' in names:
        # wb[title] replaces the deprecated get_sheet_by_name().
        self.remove(self['Sheet'])
def rebuild(self, bs_summary, license_summary, res_list, document_scheme):
    """Rebuild the workbook from the recognition summaries.

    The result sheet is built first because ``bs_rebuild`` needs the
    count tuple that ``res_sheet`` returns. The bank-statement and
    license sections are then rebuilt in an order chosen by
    ``document_scheme``. Finally the result sheet is moved to the end
    and the sheet titled 'Sheet' is removed.

    Args:
        bs_summary: Bank-statement summary; its length is recorded in
            the returned count list.
        license_summary: Passed to ``license_rebuild``.
        res_list: Per-image results passed to ``res_sheet``.
        document_scheme: When equal to ``consts.DOC_SCHEME_LIST[1]`` the
            license section is rebuilt before the bank-statement
            section; otherwise the order is reversed.

    Returns:
        ``count_list``, which starts as
        ``[(consts.MODEL_FIELD_BS, len(bs_summary))]`` and is handed to
        ``license_rebuild``.
    """
    # Built exactly once: res_sheet() calls create_sheet() and returns
    # the counts that bs_rebuild() requires.
    res_count_tuple = self.res_sheet(res_list)
    count_list = [(consts.MODEL_FIELD_BS, len(bs_summary))]

    if document_scheme == consts.DOC_SCHEME_LIST[1]:
        self.license_rebuild(license_summary, document_scheme, count_list)
        self.bs_rebuild(bs_summary, res_count_tuple)
    else:
        self.bs_rebuild(bs_summary, res_count_tuple)
        self.license_rebuild(license_summary, document_scheme, count_list)

    self.move_res_sheet()
    self.remove_base_sheet()
    return count_list
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment