8631d57e by 周伟奇

modify dataset: add a parallel no-text feature set (X_no_text) whose text-statistic and word-vector slots are zero-filled; normalize box coordinates and simple_word2vec statistics to [0, 1] instead of [-1, 1]; change the leading flag for non-top-text rows from -1.0 to 0.0; save the no-text variant to a second JSON keyed by "<img_name>_no_text"; rename the output dir to dataset160x414x10-no-text

1 parent 18e1e6ed
...@@ -245,6 +245,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save ...@@ -245,6 +245,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
245 X = list() 245 X = list()
246 y_true = list() 246 y_true = list()
247 247
248 X_no_text = list()
249
248 # dim = 1 + 5 + 8 250 # dim = 1 + 5 + 8
249 251
250 # text_vec_max_lens = 15 * 50 252 # text_vec_max_lens = 15 * 50
...@@ -260,22 +262,31 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save ...@@ -260,22 +262,31 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
260 X.append([0. for _ in range(dim)]) 262 X.append([0. for _ in range(dim)])
261 y_true.append([0 for _ in range(num_classes)]) 263 y_true.append([0 for _ in range(num_classes)])
262 264
265 X_no_text.append([0. for _ in range(dim)])
266
263 elif i in top_text_idx_set: 267 elif i in top_text_idx_set:
264 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] 268 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
265 feature_vec = [1.] 269 feature_vec = [1.]
266 feature_vec.extend(simple_word2vec(text)) 270 feature_vec.extend(simple_word2vec(text))
267 feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1]) 271 feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
268 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) 272 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
269 feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char)) 273 feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
270 X.append(feature_vec) 274 X.append(feature_vec)
271 275
272 y_true.append([0 for _ in range(num_classes)]) 276 y_true.append([0 for _ in range(num_classes)])
273 277
278 feature_vec_no_text = [1.]
279 feature_vec_no_text.extend([0. for _ in range(5)])
280 feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
281 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
282 feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)])
283 X_no_text.append(feature_vec_no_text)
284
274 elif i in label_idx_dict: 285 elif i in label_idx_dict:
275 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] 286 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
276 feature_vec = [-1.] 287 feature_vec = [0.]
277 feature_vec.extend(simple_word2vec(text)) 288 feature_vec.extend(simple_word2vec(text))
278 feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1]) 289 feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
279 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) 290 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
280 feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char)) 291 feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
281 X.append(feature_vec) 292 X.append(feature_vec)
...@@ -283,23 +294,43 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save ...@@ -283,23 +294,43 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
283 base_label_list = [0 for _ in range(num_classes)] 294 base_label_list = [0 for _ in range(num_classes)]
284 base_label_list[label_idx_dict[i]] = 1 295 base_label_list[label_idx_dict[i]] = 1
285 y_true.append(base_label_list) 296 y_true.append(base_label_list)
297
298 feature_vec_no_text = [0.]
299 feature_vec_no_text.extend([0. for _ in range(5)])
300 feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
301 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
302 feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)])
303 X_no_text.append(feature_vec_no_text)
304
286 else: 305 else:
287 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] 306 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
288 feature_vec = [-1.] 307 feature_vec = [0.]
289 feature_vec.extend(simple_word2vec(text)) 308 feature_vec.extend(simple_word2vec(text))
290 feature_vec.extend([(x0/w)*2-1, (y0/h)*2-1, (x1/w)*2-1, (y1/h)*2-1, (x2/w)*2-1, (y2/h)*2-1, (x3/w)*2-1, (y3/h)*2-1]) 309 feature_vec.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
291 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) 310 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
292 feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char)) 311 feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
293 X.append(feature_vec) 312 X.append(feature_vec)
294 313
295 y_true.append([0 for _ in range(num_classes)]) 314 y_true.append([0 for _ in range(num_classes)])
296 315
316 feature_vec_no_text = [0.]
317 feature_vec_no_text.extend([0. for _ in range(5)])
318 feature_vec_no_text.extend([(x0/w), (y0/h), (x1/w), (y1/h), (x2/w), (y2/h), (x3/w), (y3/h)])
319 # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
320 feature_vec_no_text.extend([0. for _ in range(text_vec_max_lens)])
321 X_no_text.append(feature_vec_no_text)
322
297 all_data = [X, y_true, valid_lens] 323 all_data = [X, y_true, valid_lens]
324 all_data_no_text = [X_no_text, y_true, valid_lens]
298 325
299 save_json_name = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, img_name)) 326 save_json_name = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, img_name))
300 with open(os.path.join(save_dir, save_json_name), 'w') as fp: 327 with open(os.path.join(save_dir, save_json_name), 'w') as fp:
301 json.dump(all_data, fp) 328 json.dump(all_data, fp)
302 329
330 save_json_name_2 = '{0}.json'.format(uuid.uuid3(uuid.NAMESPACE_DNS, '{0}_no_text'.format(img_name)))
331 with open(os.path.join(save_dir, save_json_name_2), 'w') as fp:
332 json.dump(all_data_no_text, fp)
333
303 if is_create_map: 334 if is_create_map:
304 create_map[img_name] = { 335 create_map[img_name] = {
305 'x_y_valid_lens': save_json_name, 336 'x_y_valid_lens': save_json_name,
...@@ -333,7 +364,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save ...@@ -333,7 +364,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
333 if __name__ == '__main__': 364 if __name__ == '__main__':
334 base_dir = '/Users/zhouweiqi/Downloads/gcfp/data' 365 base_dir = '/Users/zhouweiqi/Downloads/gcfp/data'
335 go_dir = os.path.join(base_dir, 'go_res') 366 go_dir = os.path.join(base_dir, 'go_res')
336 dataset_save_dir = os.path.join(base_dir, 'dataset160x414') 367 dataset_save_dir = os.path.join(base_dir, 'dataset160x414x10-no-text')
337 label_dir = os.path.join(base_dir, 'labeled') 368 label_dir = os.path.join(base_dir, 'labeled')
338 369
339 train_go_path = os.path.join(go_dir, 'train') 370 train_go_path = os.path.join(go_dir, 'train')
......
...@@ -27,12 +27,12 @@ def simple_word2vec(text): ...@@ -27,12 +27,12 @@ def simple_word2vec(text):
27 else: 27 else:
28 other_num += 1 28 other_num += 1
29 29
30 vec = [(text_len/100)*2 - 1, 30 vec = [text_len/100,
31 (cn_num/text_len)*2 - 1, 31 cn_num/text_len,
32 (en_num/text_len)*2 - 1, 32 en_num/text_len,
33 (digit_num/text_len)*2 - 1, 33 digit_num/text_len,
34 # space_num/text_len, 34 # space_num/text_len,
35 (other_num/text_len)*2 - 1, 35 other_num/text_len,
36 ] 36 ]
37 37
38 # print(text) 38 # print(text)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!