Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
test_on_pytorch
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
8631d57e
authored
2022-12-26 18:15:56 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
modify dataset
1 parent
18e1e6ed
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
42 additions
and
11 deletions
data/create_dataset2.py
data/word2vec.py
data/create_dataset2.py
View file @
8631d57
...
...
@@ -245,6 +245,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
X
=
list
()
y_true
=
list
()
X_no_text
=
list
()
# dim = 1 + 5 + 8
# text_vec_max_lens = 15 * 50
...
...
@@ -260,22 +262,31 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
X
.
append
([
0.
for
_
in
range
(
dim
)])
y_true
.
append
([
0
for
_
in
range
(
num_classes
)])
X_no_text
.
append
([
0.
for
_
in
range
(
dim
)])
elif
i
in
top_text_idx_set
:
(
x0
,
y0
,
x1
,
y1
,
x2
,
y2
,
x3
,
y3
),
text
=
go_res_list
[
i
]
feature_vec
=
[
1.
]
feature_vec
.
extend
(
simple_word2vec
(
text
))
feature_vec
.
extend
([(
x0
/
w
)
*
2
-
1
,
(
y0
/
h
)
*
2
-
1
,
(
x1
/
w
)
*
2
-
1
,
(
y1
/
h
)
*
2
-
1
,
(
x2
/
w
)
*
2
-
1
,
(
y2
/
h
)
*
2
-
1
,
(
x3
/
w
)
*
2
-
1
,
(
y3
/
h
)
*
2
-
1
])
feature_vec
.
extend
([(
x0
/
w
)
,
(
y0
/
h
),
(
x1
/
w
),
(
y1
/
h
),
(
x2
/
w
),
(
y2
/
h
),
(
x3
/
w
),
(
y3
/
h
)
])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec
.
extend
(
jieba_and_tencent_word2vec
(
text
,
max_jieba_char
))
X
.
append
(
feature_vec
)
y_true
.
append
([
0
for
_
in
range
(
num_classes
)])
feature_vec_no_text
=
[
1.
]
feature_vec_no_text
.
extend
([
0.
for
_
in
range
(
5
)])
feature_vec_no_text
.
extend
([(
x0
/
w
),
(
y0
/
h
),
(
x1
/
w
),
(
y1
/
h
),
(
x2
/
w
),
(
y2
/
h
),
(
x3
/
w
),
(
y3
/
h
)])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec_no_text
.
extend
([
0.
for
_
in
range
(
text_vec_max_lens
)])
X_no_text
.
append
(
feature_vec_no_text
)
elif
i
in
label_idx_dict
:
(
x0
,
y0
,
x1
,
y1
,
x2
,
y2
,
x3
,
y3
),
text
=
go_res_list
[
i
]
feature_vec
=
[
-
1
.
]
feature_vec
=
[
0
.
]
feature_vec
.
extend
(
simple_word2vec
(
text
))
feature_vec
.
extend
([(
x0
/
w
)
*
2
-
1
,
(
y0
/
h
)
*
2
-
1
,
(
x1
/
w
)
*
2
-
1
,
(
y1
/
h
)
*
2
-
1
,
(
x2
/
w
)
*
2
-
1
,
(
y2
/
h
)
*
2
-
1
,
(
x3
/
w
)
*
2
-
1
,
(
y3
/
h
)
*
2
-
1
])
feature_vec
.
extend
([(
x0
/
w
)
,
(
y0
/
h
),
(
x1
/
w
),
(
y1
/
h
),
(
x2
/
w
),
(
y2
/
h
),
(
x3
/
w
),
(
y3
/
h
)
])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec
.
extend
(
jieba_and_tencent_word2vec
(
text
,
max_jieba_char
))
X
.
append
(
feature_vec
)
...
...
@@ -283,23 +294,43 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
base_label_list
=
[
0
for
_
in
range
(
num_classes
)]
base_label_list
[
label_idx_dict
[
i
]]
=
1
y_true
.
append
(
base_label_list
)
feature_vec_no_text
=
[
0.
]
feature_vec_no_text
.
extend
([
0.
for
_
in
range
(
5
)])
feature_vec_no_text
.
extend
([(
x0
/
w
),
(
y0
/
h
),
(
x1
/
w
),
(
y1
/
h
),
(
x2
/
w
),
(
y2
/
h
),
(
x3
/
w
),
(
y3
/
h
)])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec_no_text
.
extend
([
0.
for
_
in
range
(
text_vec_max_lens
)])
X_no_text
.
append
(
feature_vec_no_text
)
else
:
(
x0
,
y0
,
x1
,
y1
,
x2
,
y2
,
x3
,
y3
),
text
=
go_res_list
[
i
]
feature_vec
=
[
-
1
.
]
feature_vec
=
[
0
.
]
feature_vec
.
extend
(
simple_word2vec
(
text
))
feature_vec
.
extend
([(
x0
/
w
)
*
2
-
1
,
(
y0
/
h
)
*
2
-
1
,
(
x1
/
w
)
*
2
-
1
,
(
y1
/
h
)
*
2
-
1
,
(
x2
/
w
)
*
2
-
1
,
(
y2
/
h
)
*
2
-
1
,
(
x3
/
w
)
*
2
-
1
,
(
y3
/
h
)
*
2
-
1
])
feature_vec
.
extend
([(
x0
/
w
)
,
(
y0
/
h
),
(
x1
/
w
),
(
y1
/
h
),
(
x2
/
w
),
(
y2
/
h
),
(
x3
/
w
),
(
y3
/
h
)
])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec
.
extend
(
jieba_and_tencent_word2vec
(
text
,
max_jieba_char
))
X
.
append
(
feature_vec
)
y_true
.
append
([
0
for
_
in
range
(
num_classes
)])
feature_vec_no_text
=
[
0.
]
feature_vec_no_text
.
extend
([
0.
for
_
in
range
(
5
)])
feature_vec_no_text
.
extend
([(
x0
/
w
),
(
y0
/
h
),
(
x1
/
w
),
(
y1
/
h
),
(
x2
/
w
),
(
y2
/
h
),
(
x3
/
w
),
(
y3
/
h
)])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec_no_text
.
extend
([
0.
for
_
in
range
(
text_vec_max_lens
)])
X_no_text
.
append
(
feature_vec_no_text
)
all_data
=
[
X
,
y_true
,
valid_lens
]
all_data_no_text
=
[
X_no_text
,
y_true
,
valid_lens
]
save_json_name
=
'{0}.json'
.
format
(
uuid
.
uuid3
(
uuid
.
NAMESPACE_DNS
,
img_name
))
with
open
(
os
.
path
.
join
(
save_dir
,
save_json_name
),
'w'
)
as
fp
:
json
.
dump
(
all_data
,
fp
)
save_json_name_2
=
'{0}.json'
.
format
(
uuid
.
uuid3
(
uuid
.
NAMESPACE_DNS
,
'{0}_no_text'
.
format
(
img_name
)))
with
open
(
os
.
path
.
join
(
save_dir
,
save_json_name_2
),
'w'
)
as
fp
:
json
.
dump
(
all_data_no_text
,
fp
)
if
is_create_map
:
create_map
[
img_name
]
=
{
'x_y_valid_lens'
:
save_json_name
,
...
...
@@ -333,7 +364,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
if
__name__
==
'__main__'
:
base_dir
=
'/Users/zhouweiqi/Downloads/gcfp/data'
go_dir
=
os
.
path
.
join
(
base_dir
,
'go_res'
)
dataset_save_dir
=
os
.
path
.
join
(
base_dir
,
'dataset160x414'
)
dataset_save_dir
=
os
.
path
.
join
(
base_dir
,
'dataset160x414
x10-no-text
'
)
label_dir
=
os
.
path
.
join
(
base_dir
,
'labeled'
)
train_go_path
=
os
.
path
.
join
(
go_dir
,
'train'
)
...
...
data/word2vec.py
View file @
8631d57
...
...
@@ -27,12 +27,12 @@ def simple_word2vec(text):
else
:
other_num
+=
1
vec
=
[
(
text_len
/
100
)
*
2
-
1
,
(
cn_num
/
text_len
)
*
2
-
1
,
(
en_num
/
text_len
)
*
2
-
1
,
(
digit_num
/
text_len
)
*
2
-
1
,
vec
=
[
text_len
/
100
,
cn_num
/
text_len
,
en_num
/
text_len
,
digit_num
/
text_len
,
# space_num/text_len,
(
other_num
/
text_len
)
*
2
-
1
,
other_num
/
text_len
,
]
# print(text)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment