8631d57e by 周伟奇

Modify dataset: rescale feature coordinates and simple_word2vec stats from [-1, 1] to [0, 1], change the leading flag feature from -1./0. to 0./1., and additionally build a parallel "no-text" variant of each sample (text features zeroed) saved under a second UUID-derived JSON; dataset output directory renamed to dataset160x414x10-no-text.

1 parent 18e1e6ed
......@@ -245,6 +245,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
X = list()
y_true = list()
X_no_text = list()
# dim = 1 + 5 + 8
# text_vec_max_lens = 15 * 50
......@@ -260,22 +262,31 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
X.append([0. for _ in range(dim)])
y_true.append([0 for _ in range(num_classes)])
X_no_text.append([0. for _ in range(dim)])
elif i in top_text_idx_set:
(x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
feature_vec = [1.]
feature_vec.extend(simple_word2vec(text))
feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1])
feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
X.append(feature_vec)
y_true.append([0 for _ in range(num_classes)])
feature_vec_no_text = [1.]
feature_vec_no_text.extend([0. for _ in range(5)])
feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)])
X_no_text.append(feature_vec_no_text)
elif i in label_idx_dict:
(x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
feature_vec = [-1.]
feature_vec = [0.]
feature_vec.extend(simple_word2vec(text))
feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1])
feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
X.append(feature_vec)
......@@ -283,23 +294,43 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
base_label_list = [0 for _ in range(num_classes)]
base_label_list[label_idx_dict[i]] = 1
y_true.append(base_label_list)
feature_vec_no_text = [0.]
feature_vec_no_text.extend([0. for _ in range(5)])
feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)])
X_no_text.append(feature_vec_no_text)
else:
(x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
feature_vec = [-1.]
feature_vec = [0.]
feature_vec.extend(simple_word2vec(text))
feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1])
feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
X.append(feature_vec)
y_true.append([0 for _ in range(num_classes)])
feature_vec_no_text = [0.]
feature_vec_no_text.extend([0. for _ in range(5)])
feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)])
X_no_text.append(feature_vec_no_text)
all_data = [X, y_true, valid_lens]
all_data_no_text = [X_no_text, y_true, valid_lens]
save_json_name = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, img_name))
with open(os.path.join(save_dir, save_json_name), 'w') as fp:
json.dump(all_data, fp)
save_json_name_2 = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, '{0}_no_text'.format(img_name)))
with open(os.path.join(save_dir, save_json_name_2), 'w') as fp:
json.dump(all_data_no_text, fp)
if is_create_map:
create_map[img_name] = {
'x_y_valid_lens': save_json_name,
......@@ -333,7 +364,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
if __name__ == '__main__':
base_dir = '/Users/zhouweiqi/Downloads/gcfp/data'
go_dir = os.path.join(base_dir, 'go_res')
dataset_save_dir = os.path.join(base_dir, 'dataset160x414')
dataset_save_dir = os.path.join(base_dir, 'dataset160x414x10-no-text')
label_dir = os.path.join(base_dir, 'labeled')
train_go_path = os.path.join(go_dir, 'train')
......
......@@ -27,12 +27,12 @@ def simple_word2vec(text):
else:
other_num += 1
vec = [(text_len/100)*2 - 1,
(cn_num/text_len)*2 - 1,
(en_num/text_len)*2 - 1,
(digit_num/text_len)*2 - 1,
vec = [text_len/100,
cn_num/text_len,
en_num/text_len,
digit_num/text_len,
# space_num/text_len,
(other_num/text_len)*2 - 1,
other_num/text_len,
]
# print(text)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!