e573fab0 by 周伟奇

add tencent word2vec
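Jieba-segment each OCR text and append Tencent AILab 100-dim word embeddings for up to 8 tokens (zero-padded), on top of the existing simple_word2vec and normalized bbox features, growing the per-box feature vector from 14 to 814 dims. Also adds char_length_statistics_jieba to measure the jieba token-count distribution that motivated max_jieba_char = 8.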

1 parent f5b96f19
@@ -5,11 +5,13 @@ import random
 import uuid
 
 import cv2
+import re
 import pandas as pd
 import numpy as np
+import jieba
 from shapely.geometry import Polygon, MultiPoint
 from tools import get_file_paths, load_json
-from word2vec import jwq_word2vec, simple_word2vec
+from word2vec import jwq_word2vec, simple_word2vec, jieba_and_tencent_word2vec
 
 def bbox_iou(go_bbox, label_bbox, mode='iou'):
     # Minimal convex hull of all the points, as a quadrilateral object; the four corner points are computed automatically, in the order: top-left, bottom-left, bottom-right, top-right, top-left
@@ -48,8 +50,6 @@ def bbox_iou(go_bbox, label_bbox, mode='iou'):
     # else:
     #     return inter / union
 
-
-
 def clean_go_res(go_res_dir):
     go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
     for go_res_json_path in go_res_json_paths:
@@ -85,6 +85,28 @@ def char_length_statistics(go_res_dir):
                 target_file_name = go_res_json_path
     return max_char_length, target_file_name
 
+def char_length_statistics_jieba(go_res_dir):
+    max_char_length = None
+    target_file_name = None
+    target_jieba_char_list = None
+    statistics_dict = {}
+    go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
+    for go_res_json_path in go_res_json_paths:
+        print('Info: start {0}'.format(go_res_json_path))
+        src_go_res_list = load_json(go_res_json_path)
+        for _, text in src_go_res_list:
+            jieba_char_list = list(filter(lambda x: re.match(r'[\u4e00-\u9fa5]', x), jieba.lcut(text.strip())))
+            length = len(jieba_char_list)
+            if length in statistics_dict:
+                statistics_dict[length] += 1
+            else:
+                statistics_dict[length] = 1
+            if max_char_length is None or length > max_char_length:
+                target_file_name = go_res_json_path
+                target_jieba_char_list = jieba_char_list
+                max_char_length = length
+    return max_char_length, target_file_name, target_jieba_char_list, statistics_dict
+
 def bbox_statistics(go_res_dir):
     max_seq_count = None
     seq_sum = 0
@@ -223,9 +245,15 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
     X = list()
     y_true = list()
 
+    # dim = 1 + 5 + 8
+
     # text_vec_max_lens = 15 * 50
     # dim = 1 + 5 + 8 + text_vec_max_lens
-    dim = 1 + 5 + 8
+
+    max_jieba_char = 8
+    text_vec_max_lens = max_jieba_char * 100
+    dim = 1 + 5 + 8 + text_vec_max_lens
+
     num_classes = 10
     for i in range(160):
         if i >= valid_lens:
@@ -238,6 +266,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
 
             y_true.append([0 for _ in range(num_classes)])
@@ -248,6 +277,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
 
             base_label_list = [0 for _ in range(num_classes)]
@@ -259,6 +289,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
 
             y_true.append([0 for _ in range(num_classes)])
@@ -276,6 +307,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             'find_value': {go_res_list[k][-1]: group_cn_list[v] for k, v in label_idx_dict.items()}
         }
 
+        # break
+
         # print(create_map)
         # print(is_create_map)
         if create_map:
@@ -300,7 +333,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
 if __name__ == '__main__':
     base_dir = '/Users/zhouweiqi/Downloads/gcfp/data'
     go_dir = os.path.join(base_dir, 'go_res')
-    dataset_save_dir = os.path.join(base_dir, 'dataset160x14-pro-all-valid')
+    dataset_save_dir = os.path.join(base_dir, 'dataset160x814')
     label_dir = os.path.join(base_dir, 'labeled')
 
     train_go_path = os.path.join(go_dir, 'train')
@@ -324,6 +357,12 @@ if __name__ == '__main__':
     # print(max_char_lens)  # 72
     # print(target_file_name)  # train/CH-B103053828-4.json
 
+    # max_char_length, target_file_name, target_jieba_char_list, statistics_dict = char_length_statistics_jieba(go_dir)
+    # print(max_char_length)  # 24
+    # print(target_file_name)  # train/CH-B102551568-6.json
+    # print(target_jieba_char_list)
+    # print(statistics_dict)  # {2: 12077, 1: 12751, 0: 13073, 3: 4423, 4: 1212, 5: 969, 6: 744, 7: 524, 8: 199, 10: 45, 12: 9, 18: 44, 9: 109, 11: 19, 13: 4, 16: 4, 21: 2, 19: 2, 15: 8, 17: 7, 14: 3, 20: 1, 24: 1}
+
     # top_text_list = text_statistics(go_dir)
     # for t in top_text_list:
     #     print(t)
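The commented-out statistics_dict output above is what motivates max_jieba_char = 8: token counts fall off sharply past 8 (199 texts at 8 tokens, then 109, 45, ...). A quick sanity check of the resulting feature dimension follows; this is a sketch, not part of the commit, and the meaning of the 1 and 5 terms is inferred from the earlier dim = 1 + 5 + 8:

# Sketch: the feature-vector arithmetic implied by the diff above.
max_jieba_char = 8    # jieba tokens kept per text
tencent_dim = 100     # Tencent AILab embedding size (the d100 model)
dim = 1 + 5 + 8 + max_jieba_char * tencent_dim
# 1: per-box flag (assumption; not shown in this diff)
# 5: simple_word2vec(text) features (inferred from dim = 1 + 5 + 8)
# 8: normalized bbox coordinates x0/w ... y3/h
assert dim == 814     # hence the rename to dataset160x814 (160 boxes x 814 features)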
@@ -1,7 +1,9 @@
 import re
 import numpy as np
-from gensim.models import word2vec
+import jieba
+from gensim.models import word2vec, KeyedVectors
 word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model')
+wv_from_text = KeyedVectors.load_word2vec_format('/Users/zhouweiqi/Downloads/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt', binary=False)
 
 
 def simple_word2vec(text):
@@ -67,3 +69,26 @@ def jwq_word2vec(text, text_vec_max_lens=1500):
     sentence_vec = np.float64(sentence_vec)
     # print(type(sentence_vec))
     return sentence_vec
+
+def jieba_and_tencent_word2vec(text, max_jieba_char):
+    done_char_count = 0
+    sentence_vec = []
+    for cn_char in filter(lambda x: re.match(r'[\u4e00-\u9fa5]', x.strip()), jieba.lcut(text.strip())):
+        if done_char_count >= max_jieba_char:
+            break
+
+        try:
+            vec = wv_from_text.word_vec(cn_char.strip())
+        except KeyError:
+            # token is not in the Tencent vocabulary; skip it
+            pass
+        else:
+            sentence_vec = np.append(sentence_vec, vec)
+            done_char_count += 1
+
+    if done_char_count < max_jieba_char:
+        # zero-pad so the output is always max_jieba_char * 100 floats
+        sentence_vec = np.append(sentence_vec, np.zeros(((max_jieba_char - done_char_count) * 100, ), dtype=np.float32))
+
+    return sentence_vec
\ No newline at end of file
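A minimal usage sketch of the new helper (not part of the commit; the sample text is invented and assumes its segmented tokens are in the Tencent vocabulary):

from word2vec import jieba_and_tencent_word2vec

vec = jieba_and_tencent_word2vec('货物贸易外汇业务', max_jieba_char=8)
print(vec.shape)  # (800,): always max_jieba_char * 100 floats, zero-padded
                  # for short texts, truncated after max_jieba_char tokens

Out-of-vocabulary tokens are silently skipped and replaced by padding, so an all-OOV text maps to the all-zero vector.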