Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
3591e645
authored
2021-02-08 10:42:47 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/main' into feature/mssql
2 parents
e24808bc
94c1d320
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
66 additions
and
17 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/idcard_monthly.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/ocr/wb.py
src/apps/doc/views.py
src/apps/doc/consts.py
View file @
3591e64
...
...
@@ -296,6 +296,7 @@ HEADERS_MAPPING.update(
HEADERS_MAPPING
.
update
(
{
'联机余额'
:
OVER_KEY
,
'联机金额'
:
OVER_KEY
,
}
)
# 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户
...
...
@@ -519,6 +520,8 @@ OTHER_TUPLE = (None, None, None, None, None, None, None, None, None, None, None,
# "35":"针式打印-部分格线-竖版-邮储银行",
# "36":"针式打印-部分格线-竖版-邮储银行-绿卡",
# "38":"普通打印-无格线-农业银行-整数-特殊",
CLASSIFY_LIST
=
[
(
'其他'
,
OTHER_TUPLE
),
(
'其他'
,
OTHER_TUPLE
),
...
...
@@ -560,6 +563,8 @@ CLASSIFY_LIST = [
(
'针式打印-部分格线-竖版-邮储银行'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'针式打印-部分格线-竖版-邮储银行-绿卡'
,
(
2
,
None
,
5
,
6
,
None
,
4
,
None
,
7
,
None
,
None
,
None
,
None
,
None
)),
(
'其他'
,
OTHER_TUPLE
),
(
'普通打印-无格线-农业银行-整数-特殊'
,
(
1
,
None
,
3
,
4
,
None
,
2
,
None
,
5
,
None
,
None
,
None
,
None
,
None
)),
]
CLASSIFY_HEADER_LIST
=
[
...
...
@@ -603,6 +608,8 @@ CLASSIFY_HEADER_LIST = [
(
'序号'
,
'交易日期'
,
'交易渠道'
,
'摘要'
,
'交易金额'
,
'账户余额'
,
'对方账号/卡号/汇票号'
,
'原子账号'
,
'交易机构名称'
),
(
'序号'
,
'交易日期'
,
'交易渠道'
,
'摘要'
,
'交易金额'
,
'账户余额'
,
'对方账号/卡号/汇票号'
,
'原子账号'
,
'交易机构名称'
),
OTHER_TUPLE
,
(
'交易日期'
,
'摘要/附言'
,
'交易金额'
,
'账户余额'
,
'对方账号和户名'
),
]
# ----------license相关------------------------------------------------------------------------------------------------
...
...
@@ -642,7 +649,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'),
(
'出生年月'
,
'出生年月'
),
(
'住址'
,
'住址'
),
(
'性别'
,
'性别'
),)
RP_FIELD_ORDER_1
=
IC_FIELD_ORDER_1
RP_FIELD_ORDER_1
=
((
'有效期限'
,
'有效期限'
),
(
'签发机关'
,
'签发机关'
),
(
'通行证号码'
,
'通行证号码'
))
# 增值税普票
VAT_CN_NAME
=
'VAT普票'
VAT_CLASSIFY
=
0
...
...
@@ -948,6 +955,8 @@ LICENSE_CLASSIFY_SET_1 = {IC_CLASSIFY, VAT_CLASSIFY, MVC_CLASSIFY, MVI_CLASSIFY,
LICENSE_CLASSIFY_SET_2
=
{
BL_CLASSIFY
,
EEP_CLASSIFY
,
DL_CLASSIFY
,
PP_CLASSIFY
,
BC_CLASSIFY
}
NYYH_CLASSIFY
=
{
17
,
18
}
NYZS_CLASSIFY
=
18
SPECIAL_NYZS_CLASSIFY
=
38
MS_CLASSIFY
=
21
MS_ERROR_COL
=
(
5
,
6
)
WECHART_CLASSIFY
=
12
...
...
@@ -960,12 +969,12 @@ WECHART_HEADERS_MAPPING.update(
}
)
PATTERN_LIST
=
[
'
交易名称'
,
'收入/支出金额'
,
'收入'
,
'存入'
,
'支出'
,
'支取'
,
'金额'
,
'余额'
,
'发生额'
,
'借贷'
,
'借贷状态'
,
'收支标志
'
,
'
收/支'
,
'收入金额'
,
'存入金额(贷)'
,
'存入金额(贷)'
,
'支出金额'
,
'支取金额(借)'
,
'支取金额(借)'
,
'记账日期
'
,
'
交易日期'
,
'摘要'
,
'业务摘要'
,
'工作日期'
,
'交易金额'
,
'账户余额'
,
'交易类型'
,
'金额(元)'
,
'金额(元)'
,
'时间
'
,
'
名称/备注'
,
'摘要/附言'
,
'交易发生额'
,
'交易摘要'
,
'借贷发生额(借:-贷:+)'
,
'借贷发生额(借:-贷:+)'
,
'联机余额
'
,
'
交易金额(元)'
,
'交易金额(元)'
,
'账户余额(元)'
,
'账户余额(元)'
,
'会计日期'
,
'摘要代码'
,
'摘要信息'
,
'
日期'
,
'短摘要'
,
'本次余额'
,
'交易后余额'
,
'交易说明'
,
'帐户余额'
,
'交易日期 记账日期'
]
PATTERN_LIST
=
[
'
联机金额'
,
'交易名称'
,
'收入/支出金额'
,
'收入'
,
'存入'
,
'支出'
,
'支取'
,
'金额'
,
'余额'
,
'发生额'
,
'借贷
'
,
'
借贷状态'
,
'收支标志'
,
'收/支'
,
'收入金额'
,
'存入金额(贷)'
,
'存入金额(贷)'
,
'支出金额'
,
'支取金额(借)
'
,
'
支取金额(借)'
,
'记账日期'
,
'交易日期'
,
'摘要'
,
'业务摘要'
,
'工作日期'
,
'交易金额'
,
'账户余额'
,
'交易类型
'
,
'
金额(元)'
,
'金额(元)'
,
'时间'
,
'名称/备注'
,
'摘要/附言'
,
'交易发生额'
,
'交易摘要'
,
'借贷发生额(借:-贷:+)
'
,
'
借贷发生额(借:-贷:+)'
,
'联机余额'
,
'交易金额(元)'
,
'交易金额(元)'
,
'账户余额(元)'
,
'账户余额(元)'
,
'会计
日期'
,
'
摘要代码'
,
'摘要信息'
,
'日期'
,
'
短摘要'
,
'本次余额'
,
'交易后余额'
,
'交易说明'
,
'帐户余额'
,
'交易日期 记账日期'
]
CN_RE
=
re
.
compile
(
u'[
\u4e00
-
\u9fa5
]'
)
...
...
src/apps/doc/management/commands/idcard_monthly.py
View file @
3591e64
...
...
@@ -28,6 +28,7 @@ class Command(BaseCommand, LoggerMixin):
return
monthly_wb
=
Workbook
()
monthly_ws
=
monthly_wb
.
get_sheet_by_name
(
'Sheet'
)
for
d
in
range
(
1
,
monthrange
(
pre_mouth
.
year
,
pre_mouth
.
month
)[
1
]
+
1
):
date_str
=
'{:04d}-{:02d}-{:02d}'
.
format
(
pre_mouth
.
year
,
pre_mouth
.
month
,
d
)
...
...
@@ -36,12 +37,13 @@ class Command(BaseCommand, LoggerMixin):
print
(
'daily excel path not exists: {0}'
.
format
(
daily_excel_path
))
continue
monthly_ws
=
monthly_wb
.
create_sheet
(
date_str
)
#
monthly_ws = monthly_wb.create_sheet(date_str)
daily_wb
=
load_workbook
(
daily_excel_path
)
daily_ws
=
daily_wb
.
get_sheet_by_name
(
'身份证'
)
for
row
in
daily_ws
.
iter_rows
(
min_row
=
1
,
values_only
=
True
):
monthly_ws
.
append
(
row
)
monthly_excel_path
=
os
.
path
.
join
(
excel_dir
,
'idcard_{0}.xlsx'
.
format
(
pre_mouth
.
strftime
(
'
%
Y-
%
m'
)))
monthly_wb
.
remove
(
monthly_wb
.
get_sheet_by_name
(
'Sheet'
))
# monthly_wb.remove(monthly_wb.get_sheet_by_name('Sheet'))
monthly_ws
.
title
=
'身份证'
monthly_wb
.
save
(
monthly_excel_path
)
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
3591e64
...
...
@@ -291,6 +291,7 @@ class Command(BaseCommand, LoggerMixin):
return
date_res
def
merge_card
(
self
,
bs_summary
):
classify_info
=
{}
merged_bs_summary
=
{}
sorted_card
=
sorted
(
bs_summary
.
keys
(),
key
=
lambda
x
:
bs_summary
[
x
][
'count'
],
reverse
=
True
)
for
main_card
in
sorted_card
:
...
...
@@ -313,10 +314,13 @@ class Command(BaseCommand, LoggerMixin):
merge_cards
.
append
(
card
)
for
card
in
merge_cards
:
del
bs_summary
[
card
]
merged_bs_summary
[
main_card
][
'classify'
]
=
self
.
get_most
(
merged_bs_summary
[
main_card
][
'classify'
])
most_classify
=
self
.
get_most
(
merged_bs_summary
[
main_card
][
'classify'
])
classify_count
=
classify_info
.
get
(
most_classify
,
0
)
classify_info
[
most_classify
]
=
classify_count
+
1
merged_bs_summary
[
main_card
][
'classify'
]
=
most_classify
merged_bs_summary
[
main_card
][
'role'
]
=
self
.
get_most
(
merged_bs_summary
[
main_card
][
'role'
])
del
bs_summary
return
merged_bs_summary
return
merged_bs_summary
,
classify_info
def
prune_bs_summary
(
self
,
bs_summary
):
for
summary
in
bs_summary
.
values
():
...
...
@@ -354,6 +358,11 @@ class Command(BaseCommand, LoggerMixin):
# }
# }
# }
# 归为同一份流水的逻辑
# 所有图片均无卡号:同一分类同一户名归为同一份流水(如果同一分类下只有一个已知户名,则此分类下其他未知户名归为此户名)
# 所有图片只已知1卡号:其他未知卡号流水归为此卡号
# 所有图片已知多卡号: 1.根据相似度和图片数目合并相似已知卡号,并整理多数分类和户名集合
# 2.遍历所有未知卡号,进行过滤:当未知卡号分类与某已知卡号一致,且此未知卡号户名在此已知卡号户名集合中时,将未知卡号归为已知卡号。剩余未知卡号同一分类同一户名归为同一流水
# 无卡号
if
len
(
bs_summary
)
==
0
:
del
bs_summary
...
...
@@ -383,15 +392,16 @@ class Command(BaseCommand, LoggerMixin):
if
len
(
bs_summary
)
==
1
:
merged_bs_summary
=
self
.
prune_bs_summary
(
bs_summary
)
one_card
=
True
classify_info
=
{}
# 多卡号
else
:
merged_bs_summary
=
self
.
merge_card
(
bs_summary
)
merged_bs_summary
,
classify_info
=
self
.
merge_card
(
bs_summary
)
for
card_summary
in
merged_bs_summary
.
values
():
merge_role
=
[]
classify_summary
=
unknown_summary
.
get
(
card_summary
[
'classify'
],
{})
for
role
,
summary
in
classify_summary
.
items
():
if
one_card
or
role
in
card_summary
[
'role_set'
]:
if
one_card
or
classify_info
.
get
(
card_summary
[
'classify'
],
0
)
==
1
or
role
in
card_summary
[
'role_set'
]:
merge_role
.
append
(
role
)
# card_summary['confidence'].extend(summary['confidence'])
card_summary
[
'sheet'
]
.
extend
(
summary
[
'sheet'
])
...
...
src/apps/doc/ocr/wb.py
View file @
3591e64
...
...
@@ -2,6 +2,7 @@ import re
import
random
import
locale
import
numpy
as
np
from
datetime
import
datetime
from
pandas._libs
import
tslib
from
pandas._libs.tslibs.nattype
import
NaTType
from
pandas.core.indexes.datetimes
import
DatetimeIndex
...
...
@@ -126,7 +127,7 @@ class BSWorkbook(Workbook):
max_column_list
.
append
(
ws
.
max_column
)
@staticmethod
def
header_statistics
(
sheet_header_info
,
header_info
,
classify
):
def
header_statistics
(
sheet_header_info
,
header_info
,
classify
,
special_nhzs
):
# statistics_header_info = {
# SUMMARY_KEY: 2,
# DATE_KEY: 3,
...
...
@@ -143,6 +144,8 @@ class BSWorkbook(Workbook):
best_sheet_info
=
sheet_header_info
.
get
(
sheet_order_list
[
0
])
max_find_count
=
best_sheet_info
.
get
(
consts
.
FIND_COUNT_KEY
,
0
)
if
max_find_count
==
0
:
if
special_nhzs
:
classify
=
consts
.
SPECIAL_NYZS_CLASSIFY
for
key
,
value
in
consts
.
CLASSIFY_MAP
.
items
():
col
=
consts
.
CLASSIFY_LIST
[
classify
][
1
][
value
]
statistics_header_info
[
key
]
=
col
-
1
if
isinstance
(
col
,
int
)
else
None
...
...
@@ -255,7 +258,7 @@ class BSWorkbook(Workbook):
date_col
=
date_col
+
1
for
date_tuple_src
in
ws
.
iter_cols
(
min_col
=
date_col
,
max_col
=
date_col
,
min_row
=
min_row
,
values_only
=
True
):
date_tuple
=
[
date
[:
10
]
if
isinstance
(
date
,
str
)
else
date
for
date
in
date_tuple_src
]
dt_array
,
tz_parsed
=
tslib
.
array_to_datetime
(
dt_array
,
_
=
tslib
.
array_to_datetime
(
np
.
array
(
date_tuple
,
copy
=
False
,
dtype
=
np
.
object_
),
errors
=
"coerce"
,
utc
=
False
,
...
...
@@ -265,6 +268,22 @@ class BSWorkbook(Workbook):
)
dti
=
DatetimeIndex
(
dt_array
,
tz
=
None
,
name
=
None
)
rebuid
=
False
for
idx
,
d
in
enumerate
(
dti
):
try
:
if
isinstance
(
d
,
NaTType
)
and
isinstance
(
date_tuple
[
idx
],
str
):
match_obj
=
re
.
match
(
r'(\d{4})[7/](\d{2})[7/](\d{2})'
,
date_tuple
[
idx
])
if
match_obj
:
dt_array
[
idx
]
=
np
.
datetime64
(
datetime
(
int
(
match_obj
.
group
(
1
)),
int
(
match_obj
.
group
(
2
)),
int
(
match_obj
.
group
(
3
))))
rebuid
=
True
except
Exception
as
e
:
continue
if
rebuid
:
dti
=
DatetimeIndex
(
dt_array
,
tz
=
None
,
name
=
None
)
month_list
,
idx_list
=
self
.
month_split
(
dti
,
date_list
,
date_statistics
)
if
len
(
month_list
)
==
0
:
...
...
@@ -555,6 +574,7 @@ class BSWorkbook(Workbook):
# }
# }
for
card
,
summary
in
bs_summary
.
items
():
special_nhzs
=
False
new_card
=
self
.
get_new_card
(
card
)
# 1.原表表头收集、按照月份分割
# 1.1 总结首行信息
...
...
@@ -563,10 +583,17 @@ class BSWorkbook(Workbook):
header_info
=
{}
max_column_list
=
[]
sheets_list
=
summary
.
get
(
'sheet'
,
[])
special_nhzs_max_col
=
0
for
sheet
in
sheets_list
:
ws
=
self
.
get_sheet_by_name
(
sheet
)
if
classify
==
consts
.
NYZS_CLASSIFY
:
special_nhzs_max_col
+=
ws
.
max_column
self
.
header_collect
(
ws
,
sheet_header_info
,
header_info
,
max_column_list
,
classify
)
statistics_header_info
,
max_find_count
=
self
.
header_statistics
(
sheet_header_info
,
header_info
,
classify
)
# 农业银行整数表头特殊处理
if
classify
==
consts
.
NYZS_CLASSIFY
and
round
(
special_nhzs_max_col
/
len
(
sheets_list
))
==
5
:
special_nhzs
=
True
statistics_header_info
,
max_find_count
=
self
.
header_statistics
(
sheet_header_info
,
header_info
,
classify
,
special_nhzs
)
max_column
=
max
(
max_column_list
)
# 1.2.按月份分割 min_row 正文第一行 date_col 日期行
...
...
src/apps/doc/views.py
View file @
3591e64
...
...
@@ -370,4 +370,5 @@ class DocView(GenericView, DocHandler):
self
.
running_log
.
info
(
'[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
'[is_priority={3}] [enqueue_res={4}]'
.
format
(
args
,
prefix
,
doc
.
id
,
is_priority
,
enqueue_res
))
return
response
.
ok
()
data
=
{
'excel_path'
:
os
.
path
.
join
(
save_dir_path
,
'{0}.xlsx'
.
format
(
doc
.
id
))}
return
response
.
ok
(
data
=
data
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment