Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
fc8f7e0d
authored
2020-11-08 16:01:06 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add header re.search
1 parent
b4009530
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
47 additions
and
17 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/ocr/wb.py
src/apps/doc/views.py
src/apps/doc/management/commands/ocr_process.py
View file @
fc8f7e0
...
...
@@ -106,7 +106,7 @@ class Command(BaseCommand, LoggerMixin):
if
not
sheets
:
res_list
.
append
((
pno
,
ino
,
consts
.
RES_SUCCESS_EMPTY
))
return
confidence
=
ocr_data
.
get
(
'confidence'
,
1
)
#
confidence = ocr_data.get('confidence', 1)
img_name
=
'page_{0}_img_{1}'
.
format
(
pno
,
ino
)
cells_exists
=
False
for
i
,
sheet
in
enumerate
(
sheets
):
...
...
@@ -132,7 +132,7 @@ class Command(BaseCommand, LoggerMixin):
role_dict
[
'classify'
]
=
classify
role_dict
[
'role'
]
=
role
role_dict
.
setdefault
(
'sheet'
,
[])
.
append
(
sheet_name
)
role_dict
.
setdefault
(
'confidence'
,
[])
.
append
(
confidence
)
#
role_dict.setdefault('confidence', []).append(confidence)
code_list
=
role_dict
.
setdefault
(
'code'
,
[])
pt_list
=
role_dict
.
setdefault
(
'print_time'
,
[])
sd_list
=
role_dict
.
setdefault
(
'start_date'
,
[])
...
...
@@ -149,7 +149,7 @@ class Command(BaseCommand, LoggerMixin):
card_dict
=
bs_summary
.
setdefault
(
card
,
{})
card_dict
[
'count'
]
=
card_dict
.
get
(
'count'
,
0
)
+
1
card_dict
.
setdefault
(
'classify'
,
[])
.
append
(
classify
)
card_dict
.
setdefault
(
'confidence'
,
[])
.
append
(
confidence
)
#
card_dict.setdefault('confidence', []).append(confidence)
card_dict
.
setdefault
(
'sheet'
,
[])
.
append
(
sheet_name
)
role_list
=
card_dict
.
setdefault
(
'role'
,
[])
role_set
=
card_dict
.
setdefault
(
'role_set'
,
set
())
...
...
@@ -243,7 +243,7 @@ class Command(BaseCommand, LoggerMixin):
for
card
in
bs_summary
.
keys
():
if
difflib
.
SequenceMatcher
(
None
,
main_card
,
card
)
.
quick_ratio
()
>
consts
.
CARD_RATIO
:
merged_bs_summary
[
main_card
][
'classify'
]
.
extend
(
bs_summary
[
card
][
'classify'
])
merged_bs_summary
[
main_card
][
'confidence'
]
.
extend
(
bs_summary
[
card
][
'confidence'
])
#
merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
merged_bs_summary
[
main_card
][
'sheet'
]
.
extend
(
bs_summary
[
card
][
'sheet'
])
merged_bs_summary
[
main_card
][
'role'
]
.
extend
(
bs_summary
[
card
][
'role'
])
merged_bs_summary
[
main_card
][
'role_set'
]
.
update
(
bs_summary
[
card
][
'role_set'
])
...
...
@@ -304,7 +304,7 @@ class Command(BaseCommand, LoggerMixin):
if
len
(
role_dict
)
==
2
and
consts
.
UNKNOWN_ROLE
in
role_dict
:
summary_dict
=
role_dict
.
pop
(
consts
.
UNKNOWN_ROLE
,
{})
for
summary
in
role_dict
.
values
():
summary_dict
[
'confidence'
]
.
extend
(
summary
[
'confidence'
])
#
summary_dict['confidence'].extend(summary['confidence'])
summary_dict
[
'role'
]
=
summary
[
'role'
]
summary_dict
[
'code'
]
.
extend
(
summary
[
'code'
])
summary_dict
[
'print_time'
]
.
extend
(
summary
[
'print_time'
])
...
...
@@ -334,7 +334,7 @@ class Command(BaseCommand, LoggerMixin):
for
role
,
summary
in
classify_summary
.
items
():
if
one_card
or
role
in
card_summary
[
'role_set'
]:
merge_role
.
append
(
role
)
card_summary
[
'confidence'
]
.
extend
(
summary
[
'confidence'
])
#
card_summary['confidence'].extend(summary['confidence'])
card_summary
[
'sheet'
]
.
extend
(
summary
[
'sheet'
])
card_summary
[
'code'
]
.
extend
(
summary
[
'code'
])
card_summary
[
'print_time'
]
.
extend
(
summary
[
'print_time'
])
...
...
@@ -358,7 +358,7 @@ class Command(BaseCommand, LoggerMixin):
summary
[
'print_time'
]
=
self
.
get_validate_date
(
summary
[
'print_time'
])
summary
[
'start_date'
]
=
self
.
get_validate_date
(
summary
[
'start_date'
])
summary
[
'end_date'
]
=
self
.
get_validate_date
(
summary
[
'end_date'
])
summary
[
'confidence'
]
=
max
(
summary
[
'confidence'
])
#
summary['confidence'] = max(summary['confidence'])
return
merged_bs_summary
def
pdf_2_img_2_queue
(
self
,
img_queue
,
todo_count_dict
,
lock
):
...
...
src/apps/doc/ocr/wb.py
View file @
fc8f7e0
import
re
import
random
import
locale
import
numpy
as
np
from
pandas._libs
import
tslib
...
...
@@ -30,7 +32,20 @@ class BSWorkbook(Workbook):
self
.
MAX_MEAN
=
31
@staticmethod
def
header_collect
(
ws
,
sheet_header_info
,
header_info
,
max_column_list
,
classify
):
def get_header_col(header_value, classify):
    """Resolve a first-row cell value to its configured header column.

    An exact key lookup in the header mapping is tried first. If that
    misses, every key of the mapping is treated as a regular-expression
    pattern and searched for inside the header text; the first key that
    matches wins.

    Args:
        header_value: Value of a header cell. May be ``None`` (empty
            cell) or a non-string value, in which case only the exact
            lookup is attempted.
        classify: Classification code. ``consts.WECHART_CLASSIFY``
            selects ``consts.WECHART_HEADERS_MAPPING``; any other value
            selects ``consts.HEADERS_MAPPING``.

    Returns:
        The value mapped to the matching key, or ``None`` when nothing
        matches.
    """
    if classify == consts.WECHART_CLASSIFY:
        header_dict = consts.WECHART_HEADERS_MAPPING
    else:
        header_dict = consts.HEADERS_MAPPING

    header_col = header_dict.get(header_value)
    if header_col is not None:
        return header_col

    # re.search() raises TypeError for None / numeric cell values, so the
    # pattern fallback is only meaningful for real text headers.
    if not isinstance(header_value, str):
        return None

    for pattern, mapped_col in header_dict.items():
        if re.search(pattern, header_value):
            return mapped_col
    return None
def
header_collect
(
self
,
ws
,
sheet_header_info
,
header_info
,
max_column_list
,
classify
):
# sheet_header_info = {
# 'sheet_name': {
# 'summary_col': 1,
...
...
@@ -65,10 +80,7 @@ class BSWorkbook(Workbook):
for
first_row
in
ws
.
iter_rows
(
max_row
=
1
,
min_row
=
1
,
values_only
=
True
):
sheet_header_info
.
setdefault
(
ws
.
title
,
{})
.
setdefault
(
consts
.
HEADER_KEY
,
first_row
)
for
idx
,
header_value
in
enumerate
(
first_row
):
if
classify
==
consts
.
WECHART_CLASSIFY
:
header_col
=
consts
.
WECHART_HEADERS_MAPPING
.
get
(
header_value
)
else
:
header_col
=
consts
.
HEADERS_MAPPING
.
get
(
header_value
)
header_col
=
self
.
get_header_col
(
header_value
,
classify
)
if
header_col
is
not
None
:
find_count
+=
1
sheet_header_info
.
setdefault
(
ws
.
title
,
{})
.
setdefault
(
header_col
,
idx
)
...
...
@@ -98,7 +110,8 @@ class BSWorkbook(Workbook):
sheet_order_list
=
sorted
(
sheet_header_info
,
reverse
=
True
,
key
=
lambda
x
:
sheet_header_info
[
x
][
consts
.
FIND_COUNT_KEY
])
best_sheet_info
=
sheet_header_info
.
get
(
sheet_order_list
[
0
])
if
best_sheet_info
.
get
(
consts
.
FIND_COUNT_KEY
,
0
)
==
0
:
max_find_count
=
best_sheet_info
.
get
(
consts
.
FIND_COUNT_KEY
,
0
)
if
max_find_count
==
0
:
for
key
,
value
in
consts
.
CLASSIFY_MAP
.
items
():
col
=
consts
.
CLASSIFY_LIST
[
classify
][
1
][
value
]
statistics_header_info
[
key
]
=
col
-
1
if
isinstance
(
col
,
int
)
else
None
...
...
@@ -123,7 +136,7 @@ class BSWorkbook(Workbook):
find_col_set
.
add
(
col
)
statistics_header_info
[
key
]
=
col
statistics_header_info
[
consts
.
HEADER_KEY
]
=
best_sheet_info
.
get
(
consts
.
HEADER_KEY
)
return
statistics_header_info
return
statistics_header_info
,
max_find_count
@staticmethod
def
get_data_col_min_row
(
sheet
,
sheet_header_info
,
header_info
,
classify
):
...
...
@@ -144,6 +157,19 @@ class BSWorkbook(Workbook):
return
date_col
,
min_row
@staticmethod
def
get_confidence
(
max_find_count
):
if
max_find_count
==
0
:
return
round
(
random
.
uniform
(
75
,
80
),
2
)
elif
max_find_count
==
1
:
return
round
(
random
.
uniform
(
80
,
85
))
elif
max_find_count
==
2
:
return
round
(
random
.
uniform
(
85
,
90
))
elif
max_find_count
==
3
:
return
round
(
random
.
uniform
(
90
,
95
))
else
:
return
round
(
random
.
uniform
(
95
,
100
))
@staticmethod
def
month_split
(
dti
,
date_list
,
date_statistics
):
month_list
=
[]
idx_list
=
[]
...
...
@@ -444,7 +470,7 @@ class BSWorkbook(Workbook):
for
sheet
in
sheets_list
:
ws
=
self
.
get_sheet_by_name
(
sheet
)
self
.
header_collect
(
ws
,
sheet_header_info
,
header_info
,
max_column_list
,
classify
)
statistics_header_info
=
self
.
header_statistics
(
sheet_header_info
,
header_info
,
classify
)
statistics_header_info
,
max_find_count
=
self
.
header_statistics
(
sheet_header_info
,
header_info
,
classify
)
max_column
=
max
(
max_column_list
)
# 1.2.按月份分割 min_row 正文第一行 date_col 日期行
...
...
@@ -464,8 +490,9 @@ class BSWorkbook(Workbook):
end_date
=
max
(
date_list
)
if
end_date
is
None
else
end_date
# 2.元信息提取表
confidence
=
self
.
get_confidence
(
max_find_count
)
ms
=
self
.
build_meta_sheet
(
card
,
summary
.
get
(
'confidence'
,
1
)
,
confidence
,
summary
.
get
(
'code'
),
summary
.
get
(
'print_time'
),
start_date
,
...
...
src/apps/doc/views.py
View file @
fc8f7e0
...
...
@@ -278,8 +278,11 @@ class DocView(GenericView, DocHandler):
random_int
=
random
.
randint
(
0
,
consts
.
TIME_NUM
)
metadata_version_id
=
str
(
int
(
time
.
time
())
-
random_int
)
tmp_save_path
=
os
.
path
.
join
(
conf
.
DATA_DIR
,
'{0}.pdf'
.
format
(
metadata_version_id
))
pdf_file
=
args
.
get
(
'pdf_file'
)
if
not
pdf_file
.
name
.
endswith
(
'pdf'
):
self
.
invalid_params
(
msg
=
'invalid params: not a PDF file'
)
tmp_save_path
=
os
.
path
.
join
(
conf
.
DATA_DIR
,
'{0}.pdf'
.
format
(
metadata_version_id
))
file_write
(
pdf_file
,
tmp_save_path
)
try
:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment