modify dataset
Showing 2 changed files with 42 additions and 11 deletions
| ... | @@ -245,6 +245,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -245,6 +245,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
| 245 | X = list() | 245 | X = list() |
| 246 | y_true = list() | 246 | y_true = list() |
| 247 | 247 | ||
| 248 | X_no_text = list() | ||
| 249 | |||
| 248 | # dim = 1 + 5 + 8 | 250 | # dim = 1 + 5 + 8 |
| 249 | 251 | ||
| 250 | # text_vec_max_lens = 15 * 50 | 252 | # text_vec_max_lens = 15 * 50 |
| ... | @@ -260,22 +262,31 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -260,22 +262,31 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
| 260 | X.append([0. for _ in range(dim)]) | 262 | X.append([0. for _ in range(dim)]) |
| 261 | y_true.append([0 for _ in range(num_classes)]) | 263 | y_true.append([0 for _ in range(num_classes)]) |
| 262 | 264 | ||
| 265 | X_no_text.append([0. for _ in range(dim)]) | ||
| 266 | |||
| 263 | elif i in top_text_idx_set: | 267 | elif i in top_text_idx_set: |
| 264 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] | 268 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
| 265 | feature_vec = [1.] | 269 | feature_vec = [1.] |
| 266 | feature_vec.extend(simple_word2vec(text)) | 270 | feature_vec.extend(simple_word2vec(text)) |
| 267 | feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1]) | 271 | feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)]) |
| 268 | # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | 272 | # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) |
| 269 | feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char)) | 273 | feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char)) |
| 270 | X.append(feature_vec) | 274 | X.append(feature_vec) |
| 271 | 275 | ||
| 272 | y_true.append([0 for _ in range(num_classes)]) | 276 | y_true.append([0 for _ in range(num_classes)]) |
| 273 | 277 | ||
| 278 | feature_vec_no_text = [1.] | ||
| 279 | feature_vec_no_text.extend([0. for _ in range(5)]) | ||
| 280 | feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)]) | ||
| 281 | # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | ||
| 282 | feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)]) | ||
| 283 | X_no_text.append(feature_vec_no_text) | ||
| 284 | |||
| 274 | elif i in label_idx_dict: | 285 | elif i in label_idx_dict: |
| 275 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] | 286 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
| 276 | feature_vec = [-1.] | 287 | feature_vec = [0.] |
| 277 | feature_vec.extend(simple_word2vec(text)) | 288 | feature_vec.extend(simple_word2vec(text)) |
| 278 | feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1]) | 289 | feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)]) |
| 279 | # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | 290 | # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) |
| 280 | feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char)) | 291 | feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char)) |
| 281 | X.append(feature_vec) | 292 | X.append(feature_vec) |
| ... | @@ -283,23 +294,43 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -283,23 +294,43 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
| 283 | base_label_list = [0 for _ in range(num_classes)] | 294 | base_label_list = [0 for _ in range(num_classes)] |
| 284 | base_label_list[label_idx_dict[i]] = 1 | 295 | base_label_list[label_idx_dict[i]] = 1 |
| 285 | y_true.append(base_label_list) | 296 | y_true.append(base_label_list) |
| 297 | |||
| 298 | feature_vec_no_text = [0.] | ||
| 299 | feature_vec_no_text.extend([0. for _ in range(5)]) | ||
| 300 | feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)]) | ||
| 301 | # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | ||
| 302 | feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)]) | ||
| 303 | X_no_text.append(feature_vec_no_text) | ||
| 304 | |||
| 286 | else: | 305 | else: |
| 287 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] | 306 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
| 288 | feature_vec = [-1.] | 307 | feature_vec = [0.] |
| 289 | feature_vec.extend(simple_word2vec(text)) | 308 | feature_vec.extend(simple_word2vec(text)) |
| 290 | feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1]) | 309 | feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)]) |
| 291 | # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | 310 | # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) |
| 292 | feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char)) | 311 | feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char)) |
| 293 | X.append(feature_vec) | 312 | X.append(feature_vec) |
| 294 | 313 | ||
| 295 | y_true.append([0 for _ in range(num_classes)]) | 314 | y_true.append([0 for _ in range(num_classes)]) |
| 296 | 315 | ||
| 316 | feature_vec_no_text = [0.] | ||
| 317 | feature_vec_no_text.extend([0. for _ in range(5)]) | ||
| 318 | feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)]) | ||
| 319 | # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | ||
| 320 | feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)]) | ||
| 321 | X_no_text.append(feature_vec_no_text) | ||
| 322 | |||
| 297 | all_data = [X, y_true, valid_lens] | 323 | all_data = [X, y_true, valid_lens] |
| 324 | all_data_no_text = [X_no_text, y_true, valid_lens] | ||
| 298 | 325 | ||
| 299 | save_json_name = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, img_name)) | 326 | save_json_name = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, img_name)) |
| 300 | with open(os.path.join(save_dir, save_json_name), 'w') as fp: | 327 | with open(os.path.join(save_dir, save_json_name), 'w') as fp: |
| 301 | json.dump(all_data, fp) | 328 | json.dump(all_data, fp) |
| 302 | 329 | ||
| 330 | save_json_name_2 = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, '{0}_no_text'.format(img_name))) | ||
| 331 | with open(os.path.join(save_dir, save_json_name_2), 'w') as fp: | ||
| 332 | json.dump(all_data_no_text, fp) | ||
| 333 | |||
| 303 | if is_create_map: | 334 | if is_create_map: |
| 304 | create_map[img_name] = { | 335 | create_map[img_name] = { |
| 305 | 'x_y_valid_lens': save_json_name, | 336 | 'x_y_valid_lens': save_json_name, |
| ... | @@ -333,7 +364,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -333,7 +364,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
| 333 | if __name__ == '__main__': | 364 | if __name__ == '__main__': |
| 334 | base_dir = '/Users/zhouweiqi/Downloads/gcfp/data' | 365 | base_dir = '/Users/zhouweiqi/Downloads/gcfp/data' |
| 335 | go_dir = os.path.join(base_dir, 'go_res') | 366 | go_dir = os.path.join(base_dir, 'go_res') |
| 336 | dataset_save_dir = os.path.join(base_dir, 'dataset160x414') | 367 | dataset_save_dir = os.path.join(base_dir, 'dataset160x414x10-no-text') |
| 337 | label_dir = os.path.join(base_dir, 'labeled') | 368 | label_dir = os.path.join(base_dir, 'labeled') |
| 338 | 369 | ||
| 339 | train_go_path = os.path.join(go_dir, 'train') | 370 | train_go_path = os.path.join(go_dir, 'train') | ... | ... |
| ... | @@ -27,12 +27,12 @@ def simple_word2vec(text): | ... | @@ -27,12 +27,12 @@ def simple_word2vec(text): |
| 27 | else: | 27 | else: |
| 28 | other_num += 1 | 28 | other_num += 1 |
| 29 | 29 | ||
| 30 | vec = [(text_len/100)*2 - 1, | 30 | vec = [text_len/100, |
| 31 | (cn_num/text_len)*2 - 1, | 31 | cn_num/text_len, |
| 32 | (en_num/text_len)*2 - 1, | 32 | en_num/text_len, |
| 33 | (digit_num/text_len)*2 - 1, | 33 | digit_num/text_len, |
| 34 | # space_num/text_len, | 34 | # space_num/text_len, |
| 35 | (other_num/text_len)*2 - 1, | 35 | other_num/text_len, |
| 36 | ] | 36 | ] |
| 37 | 37 | ||
| 38 | # print(text) | 38 | # print(text) | ... | ... |
-
Please register or sign in to post a comment