modify dataset

Showing 2 changed files with 42 additions and 11 deletions

@@ -245,6 +245,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
         X = list()
         y_true = list()

+        X_no_text = list()
+
         # dim = 1 + 5 + 8

         # text_vec_max_lens = 15 * 50
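For orientation: the padding rows below use range(dim), and the commented hint plus the vectors built in the next hunks imply dim = 1 (type flag) + 5 (simple_word2vec stats) + 8 (normalized corner coordinates) + text_vec_max_lens (text embedding). A tiny sanity check of that arithmetic; the 400 is only inferred from the old dataset160x414 directory name and is not visible in this diff:

# Assumed sizes; only the 1 + 5 + 8 split appears in the commented hint above.
type_flag, char_stats, quad_coords = 1, 5, 8
text_vec_max_lens = 400   # assumption inferred from 'dataset160x414', not from the code
dim = type_flag + char_stats + quad_coords + text_vec_max_lens
print(dim)                # 414 under that assumption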
@@ -260,22 +262,31 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
                 X.append([0. for _ in range(dim)])
                 y_true.append([0 for _ in range(num_classes)])

+                X_no_text.append([0. for _ in range(dim)])
+
             elif i in top_text_idx_set:
                 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
                 feature_vec = [1.]
                 feature_vec.extend(simple_word2vec(text))
-                feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1])
+                feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
                 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
                 feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
                 X.append(feature_vec)

                 y_true.append([0 for _ in range(num_classes)])

+                feature_vec_no_text = [1.]
+                feature_vec_no_text.extend([0. for _ in range(5)])
+                feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
+                # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+                feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)])
+                X_no_text.append(feature_vec_no_text)
+
             elif i in label_idx_dict:
                 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
-                feature_vec = [-1.]
+                feature_vec = [0.]
                 feature_vec.extend(simple_word2vec(text))
-                feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1])
+                feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
                 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
                 feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
                 X.append(feature_vec)
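The X_no_text rows added in this and the next hunk mirror X but zero out everything derived from the recognized text, keeping only the type flag and the box geometry. A compact sketch of that layout and masking (the helpers below are illustrative only; the script itself builds the vectors inline as shown):

def make_feature_vec(flag, char_stats, quad, text_embedding):
    # flag: 1. for fixed/top text boxes, 0. otherwise (previously 1. / -1.)
    # char_stats: the 5 values returned by simple_word2vec(text)
    # quad: (x0, y0, ..., x3, y3) divided by image width/height, i.e. in [0, 1]
    # text_embedding: jieba_and_tencent_word2vec(text, max_jieba_char)
    return [flag] + list(char_stats) + list(quad) + list(text_embedding)

def mask_text_features(feature_vec, text_vec_max_lens):
    # Keep the type flag and the 8 coordinate values, zero every text-derived slot.
    flag = feature_vec[0]
    quad = feature_vec[6:14]
    return [flag] + [0.] * 5 + list(quad) + [0.] * text_vec_max_lens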
@@ -283,23 +294,43 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
                 base_label_list = [0 for _ in range(num_classes)]
                 base_label_list[label_idx_dict[i]] = 1
                 y_true.append(base_label_list)
+
+                feature_vec_no_text = [0.]
+                feature_vec_no_text.extend([0. for _ in range(5)])
+                feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
+                # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+                feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)])
+                X_no_text.append(feature_vec_no_text)
+
             else:
                 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
-                feature_vec = [-1.]
+                feature_vec = [0.]
                 feature_vec.extend(simple_word2vec(text))
-                feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1])
+                feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
                 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
                 feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
                 X.append(feature_vec)

                 y_true.append([0 for _ in range(num_classes)])

+                feature_vec_no_text = [0.]
+                feature_vec_no_text.extend([0. for _ in range(5)])
+                feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
+                # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+                feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)])
+                X_no_text.append(feature_vec_no_text)
+
         all_data = [X, y_true, valid_lens]
+        all_data_no_text = [X_no_text, y_true, valid_lens]

         save_json_name = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, img_name))
         with open(os.path.join(save_dir, save_json_name), 'w') as fp:
             json.dump(all_data, fp)

+        save_json_name_2 = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, '{0}_no_text'.format(img_name)))
+        with open(os.path.join(save_dir, save_json_name_2), 'w') as fp:
+            json.dump(all_data_no_text, fp)
+
         if is_create_map:
             create_map[img_name] = {
                 'x_y_valid_lens': save_json_name,
@@ -333,7 +364,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
 if __name__ == '__main__':
     base_dir = '/Users/zhouweiqi/Downloads/gcfp/data'
     go_dir = os.path.join(base_dir, 'go_res')
-    dataset_save_dir = os.path.join(base_dir, 'dataset160x414')
+    dataset_save_dir = os.path.join(base_dir, 'dataset160x414x10-no-text')
     label_dir = os.path.join(base_dir, 'labeled')

     train_go_path = os.path.join(go_dir, 'train')
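Every branch above appends exactly one row to X, X_no_text and y_true, so the two feature sets stay index-aligned and can share the same y_true and valid_lens, which is why all_data_no_text reuses them. The JSON file names come from uuid.uuid3, which is deterministic for a given input, so both files can be re-derived from img_name alone; a quick illustration with a made-up image name:

import uuid

img_name = 'example_0001.jpg'  # hypothetical name, for illustration only
plain_name = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, img_name))
no_text_name = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, '{0}_no_text'.format(img_name)))
# uuid3 is a name-based (MD5) UUID: the same arguments always yield the same
# file name, and the two names differ because of the '_no_text' suffix.
print(plain_name, no_text_name)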
@@ -27,12 +27,12 @@ def simple_word2vec(text):
         else:
             other_num += 1

-    vec = [(text_len/100)*2 - 1,
-           (cn_num/text_len)*2 - 1,
-           (en_num/text_len)*2 - 1,
-           (digit_num/text_len)*2 - 1,
+    vec = [text_len/100,
+           cn_num/text_len,
+           en_num/text_len,
+           digit_num/text_len,
            # space_num/text_len,
-           (other_num/text_len)*2 - 1,
+           other_num/text_len,
            ]

     # print(text)
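Dropping the *2 - 1 rescaling moves the five character statistics from [-1, 1] into [0, 1], matching the [0, 1] coordinate normalization used above. A minimal standalone sketch of the same computation under the new scaling (the character classification here is approximate; the real simple_word2vec also tracks spaces and may differ in detail):

def simple_char_stats(text):
    # Rough stand-in for simple_word2vec: text length plus per-class character ratios.
    text_len = len(text)
    cn_num = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')      # CJK ideographs
    en_num = sum(1 for c in text if c.isascii() and c.isalpha())
    digit_num = sum(1 for c in text if c.isdigit())
    other_num = text_len - cn_num - en_num - digit_num
    return [text_len / 100,
            cn_num / text_len,
            en_num / text_len,
            digit_num / text_len,
            other_num / text_len]

print(simple_char_stats('发票号码No.12345'))  # every entry here falls in [0, 1]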