Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
13e30ac5
authored
2020-08-14 10:25:10 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add wb rebuild
1 parent
1526125c
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
86 additions
and
18 deletions
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
src/apps/doc/management/commands/doc_ocr_process.py
View file @
13e30ac
...
...
@@ -5,14 +5,16 @@ import signal
import
base64
import
asyncio
import
aiohttp
from
openpyxl
import
Workbook
# from openpyxl import Workbook
from
apps.doc.ocr.wb
import
BSWorkbook
from
django.core.management
import
BaseCommand
from
settings
import
conf
from
common.mixins
import
LoggerMixin
from
common.tools.file_tools
import
write_zip_file
from
common.tools.pdf_to_img
import
PDFHandler
from
apps.doc.models
import
DocStatus
,
HILDoc
,
AFCDoc
from
apps.doc.models
import
DocStatus
,
HILDoc
,
AFCDoc
,
Keywords
from
apps.doc.named_enum
import
KeywordsType
from
apps.doc
import
consts
from
apps.doc.ocr.edms
import
EDMS
,
rh
...
...
@@ -86,9 +88,11 @@ class Command(BaseCommand, LoggerMixin):
return
doc_data_path
,
excel_path
,
pdf_path
@staticmethod
def
append_sheet
(
wb
,
sheets_list
,
img_name
):
def
append_sheet
(
wb
,
sheets_list
,
img_name
,
role_summary
):
for
i
,
sheet
in
enumerate
(
sheets_list
):
ws
=
wb
.
create_sheet
(
'{0}_{1}'
.
format
(
img_name
,
i
))
sheet_name
=
'{0}_{1}'
.
format
(
img_name
,
i
)
role_summary
[
'银行-户名'
]
.
append
((
sheet_name
,
1
,
None
,
None
,
None
,
None
,
None
))
ws
=
wb
.
create_sheet
(
sheet_name
)
cells
=
sheet
.
get
(
'cells'
)
for
cell
in
cells
:
c1
=
cell
.
get
(
'start_column'
)
...
...
@@ -112,12 +116,12 @@ class Command(BaseCommand, LoggerMixin):
async
with
session
.
post
(
self
.
ocr_url
,
json
=
json_data
)
as
response
:
return
await
response
.
json
()
async
def
img_ocr_excel
(
self
,
wb
,
img_path
):
async
def
img_ocr_excel
(
self
,
wb
,
img_path
,
role_summary
):
res
=
await
self
.
fetch_ocr_result
(
img_path
)
self
.
cronjob_log
.
info
(
'{0} [fetch ocr result success] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
res
))
sheets_list
=
res
.
get
(
'result'
)
.
get
(
'res'
)
img_name
=
os
.
path
.
basename
(
img_path
)
self
.
append_sheet
(
wb
,
sheets_list
,
img_name
)
self
.
append_sheet
(
wb
,
sheets_list
,
img_name
,
role_summary
)
# TODO 细化文件状态,不同异常状态采取不同的处理
# TODO 调用接口重试
...
...
@@ -148,13 +152,22 @@ class Command(BaseCommand, LoggerMixin):
write_zip_file
(
img_save_path
,
os
.
path
.
join
(
doc_data_path
,
'{0}_img.zip'
.
format
(
doc
.
id
)))
# 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件
wb
=
Workbook
()
role_summary
=
{
'银行-户名'
:
[]
}
interest_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
INTEREST
.
value
)
.
values_list
(
'keyword'
,
flat
=
True
)
salary_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
SALARY
.
value
)
.
values_list
(
'keyword'
,
flat
=
True
)
loan_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
LOAN
.
value
)
.
values_list
(
'keyword'
,
flat
=
True
)
wb
=
BSWorkbook
(
interest_keyword
,
salary_keyword
,
loan_keyword
)
loop
=
asyncio
.
get_event_loop
()
tasks
=
[
self
.
img_ocr_excel
(
wb
,
img_path
)
for
img_path
in
pdf_handler
.
img_path_list
]
tasks
=
[
self
.
img_ocr_excel
(
wb
,
img_path
,
role_summary
)
for
img_path
in
pdf_handler
.
img_path_list
]
loop
.
run_until_complete
(
asyncio
.
wait
(
tasks
))
# loop.close()
# 整合excel文件
wb
.
rebuild
(
role_summary
)
wb
.
save
(
excel_path
)
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
...
...
@@ -164,7 +177,8 @@ class Command(BaseCommand, LoggerMixin):
else
:
try
:
# 5.上传至EDMS
self
.
edms
.
upload
(
excel_path
,
doc
,
business_type
)
# self.edms.upload(excel_path, doc, business_type)
print
(
'upload pass'
)
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
UPLOAD_FAILED
.
value
doc
.
save
()
...
...
src/apps/doc/ocr/wb.py
View file @
13e30ac
import
numpy
as
np
import
locale
import
numpy
as
np
from
pandas._libs
import
tslib
from
pandas._libs.tslibs.nattype
import
NaTType
from
pandas.core.indexes.datetimes
import
DatetimeIndex
from
openpyxl
import
Workbook
from
openpyxl.styles
import
Border
,
Side
,
PatternFill
,
numbers
...
...
@@ -43,6 +44,7 @@ class BSWorkbook(Workbook):
self
.
amount_fill
=
PatternFill
(
"solid"
,
fgColor
=
"00FFFF00"
)
self
.
bd
=
Side
(
style
=
'thin'
,
color
=
"000000"
)
self
.
border
=
Border
(
left
=
self
.
bd
,
top
=
self
.
bd
,
right
=
self
.
bd
,
bottom
=
self
.
bd
)
self
.
MAX_MEAN
=
31
def
sheet_prune
(
self
,
ws
):
ws
.
insert_cols
(
1
,
amount
=
self
.
fixed_col_amount
)
...
...
@@ -56,7 +58,29 @@ class BSWorkbook(Workbook):
ws
.
move_range
(
"{0}1:{0}{1}"
.
format
(
letter
,
ws
.
max_row
),
cols
=
header_idx
-
col
)
ws
.
delete_cols
(
self
.
fixed_col_amount
+
1
,
amount
=
ws
.
max_column
)
def
sheet_split
(
self
,
ws
,
month_mapping
):
@staticmethod
def
month_split
(
dti
,
date_list
):
month_list
=
[]
idx_list
=
[]
month_pre
=
None
for
idx
,
month_str
in
enumerate
(
dti
.
strftime
(
'
%
Y-
%
m'
)):
if
isinstance
(
month_str
,
float
):
continue
if
month_str
!=
month_pre
:
month_list
.
append
(
month_str
)
if
month_pre
is
None
:
date_list
.
append
(
dti
[
idx
]
.
date
())
idx
=
0
idx_list
.
append
(
idx
)
month_pre
=
month_str
for
idx
in
range
(
len
(
dti
)
-
1
,
-
1
,
-
1
):
if
isinstance
(
dti
[
idx
],
NaTType
):
continue
date_list
.
append
(
dti
[
idx
]
.
date
())
break
return
month_list
,
idx_list
def
sheet_split
(
self
,
ws
,
month_mapping
,
date_list
):
for
date_tuple
in
ws
.
iter_cols
(
min_col
=
1
,
max_col
=
1
,
min_row
=
2
,
values_only
=
True
):
dt_array
,
tz_parsed
=
tslib
.
array_to_datetime
(
np
.
array
(
date_tuple
,
copy
=
False
,
dtype
=
np
.
object_
),
...
...
@@ -68,6 +92,31 @@ class BSWorkbook(Workbook):
)
dti
=
DatetimeIndex
(
dt_array
,
tz
=
None
,
name
=
None
)
month_list
,
idx_list
=
self
.
month_split
(
dti
,
date_list
)
if
len
(
month_list
)
==
0
:
month_info
=
month_mapping
.
setdefault
(
'xxxx-xx'
,
[])
month_info
.
append
((
ws
.
title
,
2
,
ws
.
max_row
,
0
))
elif
len
(
month_list
)
==
1
:
month_info
=
month_mapping
.
setdefault
(
month_list
[
0
],
[])
day_mean
=
np
.
mean
(
dti
.
day
.
dropna
())
if
len
(
month_info
)
==
0
:
month_info
.
append
((
ws
.
title
,
2
,
ws
.
max_row
,
day_mean
))
else
:
for
i
,
item
in
enumerate
(
month_info
):
# TODO 倒序处理
if
day_mean
<=
item
[
-
1
]:
month_info
.
insert
(
i
,
(
ws
.
title
,
2
,
ws
.
max_row
,
day_mean
))
break
else
:
month_info
.
append
((
ws
.
title
,
2
,
ws
.
max_row
,
day_mean
))
else
:
for
i
,
item
in
enumerate
(
month_list
[:
-
1
]):
month_mapping
.
setdefault
(
item
,
[])
.
append
(
(
ws
.
title
,
idx_list
[
i
]
+
2
,
idx_list
[
i
+
1
]
+
1
,
self
.
MAX_MEAN
))
month_mapping
.
setdefault
(
month_list
[
-
1
],
[])
.
insert
(
0
,
(
ws
.
title
,
idx_list
[
-
1
]
+
2
,
ws
.
max_row
,
0
))
def
build_metadata_rows
(
self
,
confidence_max
,
code_list
,
print_time
,
start_date
,
end_date
,
date_interval
):
metadata_rows
=
[(
'流水识别置信度'
,
confidence_max
),
self
.
blank_row
,
self
.
code_header
]
metadata_rows
.
extend
(
code_list
)
...
...
@@ -169,19 +218,24 @@ class BSWorkbook(Workbook):
confidence_max
=
0
code_list
=
[]
month_mapping
=
{}
print_time
=
start_date
=
end_date
=
date_interval
=
None
date_list
=
[]
start_date
=
end_date
=
date_interval
=
print_time
=
None
for
summary
in
summary_list
:
sheet_name
,
confidence
,
page
,
code
,
print_time
,
start_date
,
end_date
=
summary
sheet_name
,
confidence
,
page
,
code
,
print_time
_local
,
start_date_local
,
end_date_local
=
summary
ws
=
self
.
get_sheet_by_name
(
sheet_name
)
# 1.1.删除多余列、排列
self
.
sheet_prune
(
ws
)
# 1.2.
TODO
按月份分割
self
.
sheet_split
(
ws
,
month_mapping
)
# 1.2.按月份分割
self
.
sheet_split
(
ws
,
month_mapping
,
date_list
)
# 1.3.元数据处理 TODO 时间与日期处理
#
confidence_max = max(confidence, confidence_max)
#
if code is not None:
#
code_list.append((page, code))
confidence_max
=
max
(
confidence
,
confidence_max
)
if
code
is
not
None
:
code_list
.
append
((
page
,
code
))
if
len
(
date_list
)
>
1
:
start_date
=
min
(
date_list
)
end_date
=
max
(
date_list
)
date_interval
=
(
end_date
-
start_date
)
.
days
# 2.元信息提取表
ms
=
self
.
build_meta_sheet
(
role
,
confidence_max
,
code_list
,
print_time
,
start_date
,
end_date
,
date_interval
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment