Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
test_on_pytorch
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
890ea78a
authored
2022-12-20 19:09:41 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add jwg word2vec
1 parent
c919b68e
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
42 additions
and
8 deletions
data/create_dataset2.py
data/word2vec.py
data/create_dataset2.py
View file @
890ea78
...
...
@@ -7,7 +7,7 @@ import uuid
import
cv2
import
pandas
as
pd
from
tools
import
get_file_paths
,
load_json
from
word2vec
import
simple_word2vec
from
word2vec
import
simple_word2vec
,
jwq_word2vec
def
clean_go_res
(
go_res_dir
):
...
...
@@ -187,28 +187,34 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
X
=
list
()
y_true
=
list
()
text_vec_max_lens
=
15
*
50
dim
=
1
+
5
+
8
+
text_vec_max_lens
num_classes
=
10
for
i
in
range
(
160
):
if
i
>=
valid_lens
:
X
.
append
([
0.
for
_
in
range
(
14
)])
y_true
.
append
([
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
])
X
.
append
([
0.
for
_
in
range
(
dim
)])
y_true
.
append
([
0
for
_
in
range
(
num_classes
)
])
elif
i
in
top_text_idx_set
:
(
x0
,
y0
,
x1
,
y1
,
x2
,
y2
,
x3
,
y3
),
text
=
go_res_list
[
i
]
feature_vec
=
[
1.
]
feature_vec
.
extend
(
simple_word2vec
(
text
))
feature_vec
.
extend
([
x0
/
w
,
y0
/
h
,
x1
/
w
,
y1
/
h
,
x2
/
w
,
y2
/
h
,
x3
/
w
,
y3
/
h
])
feature_vec
.
extend
(
jwq_word2vec
(
text
,
text_vec_max_lens
))
X
.
append
(
feature_vec
)
y_true
.
append
([
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
])
y_true
.
append
([
0
for
_
in
range
(
num_classes
)
])
elif
i
in
label_idx_dict
:
(
x0
,
y0
,
x1
,
y1
,
x2
,
y2
,
x3
,
y3
),
text
=
go_res_list
[
i
]
feature_vec
=
[
0.
]
feature_vec
.
extend
(
simple_word2vec
(
text
))
feature_vec
.
extend
([
x0
/
w
,
y0
/
h
,
x1
/
w
,
y1
/
h
,
x2
/
w
,
y2
/
h
,
x3
/
w
,
y3
/
h
])
feature_vec
.
extend
(
jwq_word2vec
(
text
,
text_vec_max_lens
))
X
.
append
(
feature_vec
)
base_label_list
=
[
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
]
base_label_list
=
[
0
for
_
in
range
(
num_classes
)
]
base_label_list
[
label_idx_dict
[
i
]]
=
1
y_true
.
append
(
base_label_list
)
else
:
...
...
@@ -216,9 +222,10 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
feature_vec
=
[
0.
]
feature_vec
.
extend
(
simple_word2vec
(
text
))
feature_vec
.
extend
([
x0
/
w
,
y0
/
h
,
x1
/
w
,
y1
/
h
,
x2
/
w
,
y2
/
h
,
x3
/
w
,
y3
/
h
])
feature_vec
.
extend
(
jwq_word2vec
(
text
,
text_vec_max_lens
))
X
.
append
(
feature_vec
)
y_true
.
append
([
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
])
y_true
.
append
([
0
for
_
in
range
(
num_classes
)
])
all_data
=
[
X
,
y_true
,
valid_lens
]
...
...
@@ -328,5 +335,6 @@ if __name__ == '__main__':
build_anno_file
(
valid_dataset_dir
,
valid_anno_file_path
)
# print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? '))
# print(jwq_word2vec('发', 15*50))
...
...
data/word2vec.py
View file @
890ea78
import
re
# from gensim.models import word2vec
import
numpy
as
np
from
gensim.models
import
word2vec
word2vec_model
=
word2vec
.
Word2Vec
.
load
(
'/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model'
)
def
simple_word2vec
(
text
):
clean_text
=
text
.
strip
()
...
...
@@ -40,4 +43,27 @@ def simple_word2vec(text):
# print(other_num)
# print('-------------')
return
vec
\ No newline at end of file
return
vec
def jwq_word2vec(text, text_vec_max_lens=1500):
    """Encode *text* as a fixed-length vector of concatenated character embeddings.

    The text is stripped of surrounding whitespace, then each remaining
    character is looked up in the module-level ``word2vec_model`` and the
    per-character vectors are concatenated in order. A character that is not
    a key of the model falls back to the vector stored under ``'unk'``. The
    concatenation is finally truncated, or right-padded with ``0.``, to
    exactly ``text_vec_max_lens`` values.

    Args:
        text: String to encode.
        text_vec_max_lens: Length of the returned vector.

    Returns:
        A one-dimensional float64 NumPy array with ``text_vec_max_lens``
        entries.

    Raises:
        KeyError: If a character is unknown and the model has no ``'unk'``
            entry either.
    """
    sentence_vec = []
    for char in text.strip():
        try:
            word_vec = word2vec_model.wv[char]
        except KeyError:
            # Only a missing key triggers the fallback; any other failure
            # (or Ctrl-C) must propagate instead of being silently masked.
            word_vec = word2vec_model.wv['unk']
        sentence_vec.extend(word_vec)

    if len(sentence_vec) > text_vec_max_lens:
        sentence_vec = sentence_vec[:text_vec_max_lens]
    else:
        # Right-pad with zeros so every sample has the same width.
        sentence_vec.extend([0.] * (text_vec_max_lens - len(sentence_vec)))

    return np.asarray(sentence_vec, dtype=np.float64)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment