Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
f3d6e429
authored
2020-08-31 15:36:36 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
update wb build
1 parent
a220590e
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
65 additions
and
21 deletions
.gitignore
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
.gitignore
View file @
f3d6e42
...
...
@@ -34,3 +34,4 @@ data/*
src/*.sh
test.py
ocr_test.py
\ No newline at end of file
...
...
src/apps/doc/management/commands/doc_ocr_process.py
View file @
f3d6e42
...
...
@@ -67,7 +67,6 @@ class Command(BaseCommand, LoggerMixin):
def
pdf_download
(
self
,
doc
,
business_type
):
if
doc
is
None
:
return
None
,
None
,
None
,
None
# TODO EDMS下载pdf
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
str
(
doc
.
id
))
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
...
...
@@ -128,8 +127,10 @@ class Command(BaseCommand, LoggerMixin):
img_name
=
os
.
path
.
basename
(
img_path
)
self
.
append_sheet
(
wb
,
sheets_list
,
img_name
,
role_summary
)
# TODO 细化文件状态,不同异常状态采取不同的处理
# TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理
# TODO 调用接口重试
# TODO 异常邮件通知
# TODO 数据库断联问题
def
handle
(
self
,
*
args
,
**
kwargs
):
sleep_second
=
int
(
conf
.
SLEEP_SECOND
)
max_sleep_second
=
int
(
conf
.
MAX_SLEEP_SECOND
)
...
...
src/apps/doc/ocr/wb.py
View file @
f3d6e42
...
...
@@ -35,6 +35,7 @@ class BSWorkbook(Workbook):
header_value
=
ws
.
cell
(
1
,
col
)
.
value
header_idx
=
consts
.
HEADERS_MAPPING
.
get
(
header_value
)
# TODO 关键字段再次查找
# TODO 支付宝、微信流水第一行非表头,怎么处理
if
header_idx
is
None
:
continue
letter
=
get_column_letter
(
col
)
...
...
@@ -63,8 +64,31 @@ class BSWorkbook(Workbook):
break
return
month_list
,
idx_list
def
sheet_split
(
self
,
ws
,
month_mapping
,
date_list
):
for
date_tuple
in
ws
.
iter_cols
(
min_col
=
1
,
max_col
=
1
,
min_row
=
2
,
values_only
=
True
):
@staticmethod
def get_reverse_trend(day_idx, idx_list):
    """Report whether a sequence of day values tends to run backwards.

    Each non-NaN value is compared with the previous non-NaN value: a
    decrease counts +1, an increase counts -1 and an equal value counts
    nothing. The total is then reduced to its sign.

    Args:
        day_idx: Iterable of numeric day values; NaN entries are skipped.
        idx_list: Positions in ``day_idx`` at which the comparison baseline
            is reset, so the value at such a position is never compared
            with its predecessor (presumably month boundaries - confirm
            against the caller).

    Returns:
        ``1`` if decreases outnumber increases, ``-1`` if increases
        outnumber decreases, ``0`` if they balance or nothing could be
        compared.
    """
    # Built once so the per-element membership test inside the loop is O(1)
    # instead of a linear scan of the list on every iteration.
    boundaries = set(idx_list)

    score = 0
    pre_day = None
    for idx, day in enumerate(day_idx):
        if np.isnan(day):
            # Missing values neither vote nor become the baseline.
            continue
        # The first usable value and every boundary position only
        # (re)establish the baseline; everything else casts a vote.
        if pre_day is not None and idx not in boundaries:
            if day < pre_day:
                score += 1
            elif day > pre_day:
                score -= 1
        pre_day = day

    # Collapse the tally to its sign: -1, 0 or 1.
    return (score > 0) - (score < 0)
def
sheet_split
(
self
,
ws
,
month_mapping
,
date_list
,
reverse_trend_list
):
for
date_tuple_src
in
ws
.
iter_cols
(
min_col
=
1
,
max_col
=
1
,
min_row
=
2
,
values_only
=
True
):
date_tuple
=
[
date
[:
10
]
if
isinstance
(
date
,
str
)
else
date
for
date
in
date_tuple_src
]
dt_array
,
tz_parsed
=
tslib
.
array_to_datetime
(
np
.
array
(
date_tuple
,
copy
=
False
,
dtype
=
np
.
object_
),
errors
=
"coerce"
,
...
...
@@ -78,22 +102,30 @@ class BSWorkbook(Workbook):
month_list
,
idx_list
=
self
.
month_split
(
dti
,
date_list
)
if
len
(
month_list
)
==
0
:
# month_info process
month_info
=
month_mapping
.
setdefault
(
'xxxx-xx'
,
[])
month_info
.
append
((
ws
.
title
,
2
,
ws
.
max_row
,
0
))
elif
len
(
month_list
)
==
1
:
# reverse_trend_list process
reverse_trend
=
self
.
get_reverse_trend
(
dti
.
day
,
idx_list
)
reverse_trend_list
.
append
(
reverse_trend
)
# month_info process
month_info
=
month_mapping
.
setdefault
(
month_list
[
0
],
[])
day_mean
=
np
.
mean
(
dti
.
day
.
dropna
())
if
len
(
month_info
)
==
0
:
month_info
.
append
((
ws
.
title
,
2
,
ws
.
max_row
,
day_mean
))
else
:
for
i
,
item
in
enumerate
(
month_info
):
# TODO 倒序处理
if
day_mean
<=
item
[
-
1
]:
month_info
.
insert
(
i
,
(
ws
.
title
,
2
,
ws
.
max_row
,
day_mean
))
break
else
:
month_info
.
append
((
ws
.
title
,
2
,
ws
.
max_row
,
day_mean
))
else
:
# reverse_trend_list process
reverse_trend
=
self
.
get_reverse_trend
(
dti
.
day
,
idx_list
)
reverse_trend_list
.
append
(
reverse_trend
)
# month_info process
for
i
,
item
in
enumerate
(
month_list
[:
-
1
]):
month_mapping
.
setdefault
(
item
,
[])
.
append
(
(
ws
.
title
,
idx_list
[
i
]
+
2
,
idx_list
[
i
+
1
]
+
1
,
self
.
MAX_MEAN
))
...
...
@@ -128,7 +160,7 @@ class BSWorkbook(Workbook):
ms
.
append
(
row
)
return
ms
def
build_month_sheet
(
self
,
role
,
month_mapping
,
ms
):
def
build_month_sheet
(
self
,
role
,
month_mapping
,
ms
,
is_reverse
):
tmp_ws
=
self
.
create_sheet
(
'tmp_ws'
)
for
month
in
sorted
(
month_mapping
.
keys
()):
# 3.1.拷贝数据
...
...
@@ -143,7 +175,6 @@ class BSWorkbook(Workbook):
amount_mapping
=
{}
amount_fill_row
=
set
()
for
rows
in
new_ws
.
iter_rows
():
is_fill
=
False
summary_cell
=
rows
[
5
]
date_cell
=
rows
[
0
]
# 关键词1提取
...
...
@@ -154,11 +185,9 @@ class BSWorkbook(Workbook):
tmp_ws
.
append
((
summary_cell
.
value
,
date_cell
.
value
,
rows
[
2
]
.
value
))
# 贷款关键词高亮
elif
summary_cell
.
value
in
self
.
loan_keyword
:
is_fill
=
True
summary_cell
.
fill
=
self
.
loan_fill
for
i
,
cell
in
enumerate
(
rows
):
cell
.
border
=
self
.
border
if
is_fill
:
cell
.
fill
=
self
.
loan_fill
if
(
i
==
2
or
i
==
3
)
and
cell
.
row
>
1
:
try
:
# 3.3.金额、余额转数值
...
...
@@ -177,16 +206,18 @@ class BSWorkbook(Workbook):
cell
.
value
,
[])
.
append
(
cell
.
row
)
# 3.4.核对结果
# TODO 借贷、开支类型银行流水,需要手动添加+-号
# TODO 倒序流水需要改变公式
if
i
==
9
and
cell
.
row
>
2
:
cell
.
value
=
'=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'
.
format
(
cell
.
row
,
cell
.
row
-
1
,
*
self
.
proof_res
)
if
is_reverse
:
cell
.
value
=
'=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'
.
format
(
cell
.
row
-
1
,
cell
.
row
,
*
self
.
proof_res
)
else
:
cell
.
value
=
'=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'
.
format
(
cell
.
row
,
cell
.
row
-
1
,
*
self
.
proof_res
)
# 3.5.同一天相同进出账高亮
del
amount_mapping
for
row
in
amount_fill_row
:
for
cell
in
new_ws
[
row
]:
cell
.
fill
=
self
.
amount_fill
new_ws
[
row
][
2
]
.
fill
=
self
.
amount_fill
# 关键词2信息提取
ms
.
append
(
self
.
blank_row
)
...
...
@@ -196,9 +227,10 @@ class BSWorkbook(Workbook):
self
.
remove
(
tmp_ws
)
def
rebuild
(
self
,
role_summary
):
# (sheet_name, confidence, page, code, print_time, start_date, end_date)
# (sheet_name, confidence, page, code, print_time, start_date, end_date)
# TODO 表名简化,+卡号
for
role
,
summary_list
in
role_summary
.
items
():
# 1.原表修剪、排列、按照月份分割
reverse_trend_list
=
[]
confidence_max
=
0
code_list
=
[]
month_mapping
=
{}
...
...
@@ -210,7 +242,7 @@ class BSWorkbook(Workbook):
# 1.1.删除多余列、排列
self
.
sheet_prune
(
ws
)
# 1.2.按月份分割
self
.
sheet_split
(
ws
,
month_mapping
,
date_list
)
self
.
sheet_split
(
ws
,
month_mapping
,
date_list
,
reverse_trend_list
)
# 1.3.元数据处理 TODO 时间与日期处理
confidence_max
=
max
(
confidence
,
confidence_max
)
if
code
is
not
None
:
...
...
@@ -224,7 +256,12 @@ class BSWorkbook(Workbook):
ms
=
self
.
build_meta_sheet
(
role
,
confidence_max
,
code_list
,
print_time
,
start_date
,
end_date
,
date_interval
)
# 3.创建月份表、提取/高亮关键行
self
.
build_month_sheet
(
role
,
month_mapping
,
ms
)
is_reverse
=
False
if
sum
(
reverse_trend_list
)
>
0
:
# 倒序处理
is_reverse
=
True
for
month_list
in
month_mapping
.
values
():
month_list
.
sort
(
key
=
lambda
x
:
x
[
-
1
],
reverse
=
True
)
self
.
build_month_sheet
(
role
,
month_mapping
,
ms
,
is_reverse
)
# 删除原表
for
summary
in
summary_list
:
...
...
src/common/tools/pdf_to_img.py
View file @
f3d6e42
...
...
@@ -4,8 +4,10 @@ from PIL import Image
from
io
import
BytesIO
# 页面保存为png图片参数
ZOOM_X
=
ZOOM_Y
=
2.0
trans
=
fitz
.
Matrix
(
ZOOM_X
,
ZOOM_X
)
.
preRotate
(
0
)
# zoom factor 2 in each dimension
ZOOM_X_1
=
ZOOM_Y_1
=
1.0
ZOOM_X_2
=
ZOOM_Y_2
=
2.0
trans_1
=
fitz
.
Matrix
(
ZOOM_X_1
,
ZOOM_X_1
)
.
preRotate
(
0
)
# zoom factor 1 in each dimension
trans_2
=
fitz
.
Matrix
(
ZOOM_X_2
,
ZOOM_X_2
)
.
preRotate
(
0
)
# zoom factor 2 in each dimension
# 特殊filter处理
ADOBE_FILTER_SET
=
{
'FlateDecode'
,
'JPXDecode'
,
'JBIG2Decode'
}
...
...
@@ -30,7 +32,10 @@ class PDFHandler:
return
os
.
path
.
join
(
self
.
img_dir_path
,
'page_{0}_img_{1}.{2}'
.
format
(
pno
,
img_index
,
ext
))
def
page_to_png
(
self
,
page
):
pm
=
page
.
getPixmap
(
matrix
=
trans
,
alpha
=
False
)
if
page
.
MediaBoxSize
.
x
>
1500
or
page
.
MediaBoxSize
.
y
>
1500
:
pm
=
page
.
getPixmap
(
matrix
=
trans_1
,
alpha
=
False
)
else
:
pm
=
page
.
getPixmap
(
matrix
=
trans_2
,
alpha
=
False
)
img_save_path
=
self
.
get_img_save_path
(
page
.
number
)
pm
.
writePNG
(
img_save_path
)
self
.
img_path_list
.
append
(
img_save_path
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment