add tencent word2vec
Showing 2 changed files with 68 additions and 6 deletions
@@ -5,11 +5,13 @@ import random
 import uuid
 
 import cv2
+import re
 import pandas as pd
 import numpy as np
+import jieba
 from shapely.geometry import Polygon, MultiPoint
 from tools import get_file_paths, load_json
-from word2vec import jwq_word2vec, simple_word2vec
+from word2vec import jwq_word2vec, simple_word2vec, jieba_and_tencent_word2vec
 
 def bbox_iou(go_bbox, label_bbox, mode='iou'):
     # Minimal convex representation of all points as a quadrilateral object; the four corner points are computed automatically, final order: top-left, bottom-left, bottom-right, top-right, top-left
@@ -48,8 +50,6 @@ def bbox_iou(go_bbox, label_bbox, mode='iou'):
     # else:
     #     return inter / union
 
-
-
 def clean_go_res(go_res_dir):
     go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
     for go_res_json_path in go_res_json_paths:
@@ -85,6 +85,28 @@ def char_length_statistics(go_res_dir):
                 target_file_name = go_res_json_path
     return max_char_length, target_file_name
 
+def char_length_statistics_jieba(go_res_dir):
+    max_char_length = None
+    target_file_name = None
+    target_jieba_char_list = None
+    statistics_dict = {}
+    go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
+    for go_res_json_path in go_res_json_paths:
+        print('Info: start {0}'.format(go_res_json_path))
+        src_go_res_list = load_json(go_res_json_path)
+        for _, text in src_go_res_list:
+            jieba_char_list = list(filter(lambda x:re.match(r'[\u4e00-\u9fa5]', x), jieba.lcut(text.strip())))
+            length = len(jieba_char_list)
+            if length in statistics_dict:
+                statistics_dict[length] += 1
+            else:
+                statistics_dict[length] = 1
+            if max_char_length is None or length > max_char_length:
+                target_file_name = go_res_json_path
+                target_jieba_char_list = jieba_char_list
+                max_char_length = length
+    return max_char_length, target_file_name, target_jieba_char_list, statistics_dict
+
 def bbox_statistics(go_res_dir):
     max_seq_count = None
     seq_sum = 0
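
For reference, a minimal sketch of what the token filter in char_length_statistics_jieba keeps: jieba.lcut segments the OCR text, and the regex keeps only tokens that start with a CJK character. The sample string below is hypothetical, and the exact segmentation depends on the jieba dictionary in use:

    import re
    import jieba

    text = '保险单号:12345 被保险人'  # hypothetical OCR text
    tokens = jieba.lcut(text.strip())  # mixed Chinese words, digits, punctuation
    cn_tokens = list(filter(lambda x: re.match(r'[\u4e00-\u9fa5]', x), tokens))
    print(len(cn_tokens))  # this per-text count is what feeds statistics_dict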
@@ -223,9 +245,15 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
     X = list()
     y_true = list()
 
+    # dim = 1 + 5 + 8
+
     # text_vec_max_lens = 15 * 50
     # dim = 1 + 5 + 8 + text_vec_max_lens
-    dim = 1 + 5 + 8
+
+    max_jieba_char = 8
+    text_vec_max_lens = max_jieba_char * 100
+    dim = 1 + 5 + 8 + text_vec_max_lens
+
     num_classes = 10
     for i in range(160):
         if i >= valid_lens:
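
For reference, with max_jieba_char = 8 and 100-dimensional Tencent vectors, the per-box feature length works out to 1 + 5 + 8 + 8 * 100 = 814, which is presumably what the dataset160x814 directory name further down refers to (160 boxes per image, 814 features per box).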
@@ -238,6 +266,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
 
             y_true.append([0 for _ in range(num_classes)])
@@ -248,6 +277,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
 
             base_label_list = [0 for _ in range(num_classes)]
@@ -259,6 +289,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
 
             y_true.append([0 for _ in range(num_classes)])
@@ -276,6 +307,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
            'find_value': {go_res_list[k][-1]: group_cn_list[v] for k, v in label_idx_dict.items()}
         }
 
+        # break
+
         # print(create_map)
         # print(is_create_map)
         if create_map:
... | @@ -300,7 +333,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -300,7 +333,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
300 | if __name__ == '__main__': | 333 | if __name__ == '__main__': |
301 | base_dir = '/Users/zhouweiqi/Downloads/gcfp/data' | 334 | base_dir = '/Users/zhouweiqi/Downloads/gcfp/data' |
302 | go_dir = os.path.join(base_dir, 'go_res') | 335 | go_dir = os.path.join(base_dir, 'go_res') |
303 | dataset_save_dir = os.path.join(base_dir, 'dataset160x14-pro-all-valid') | 336 | dataset_save_dir = os.path.join(base_dir, 'dataset160x814') |
304 | label_dir = os.path.join(base_dir, 'labeled') | 337 | label_dir = os.path.join(base_dir, 'labeled') |
305 | 338 | ||
306 | train_go_path = os.path.join(go_dir, 'train') | 339 | train_go_path = os.path.join(go_dir, 'train') |
@@ -324,6 +357,12 @@ if __name__ == '__main__':
     # print(max_char_lens) # 72
     # print(target_file_name) # train/CH-B103053828-4.json
 
+    # max_char_length, target_file_name, target_jieba_char_list, statistics_dict = char_length_statistics_jieba(go_dir)
+    # print(max_char_length) # 24
+    # print(target_file_name) # train/CH-B102551568-6.json
+    # print(target_jieba_char_list)
+    # print(statistics_dict) # {2: 12077, 1: 12751, 0: 13073, 3: 4423, 4: 1212, 5: 969, 6: 744, 7: 524, 8: 199, 10: 45, 12: 9, 18: 44, 9: 109, 11: 19, 13: 4, 16: 4, 21: 2, 19: 2, 15: 8, 17: 7, 14: 3, 20: 1, 24: 1}
+
     # top_text_list = text_statistics(go_dir)
     # for t in top_text_list:
     #     print(t)
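
A quick check of the max_jieba_char = 8 cutoff against the token-length histogram in the commented-out char_length_statistics_jieba run above (numbers copied verbatim from that comment):

    statistics_dict = {2: 12077, 1: 12751, 0: 13073, 3: 4423, 4: 1212, 5: 969,
                       6: 744, 7: 524, 8: 199, 10: 45, 12: 9, 18: 44, 9: 109,
                       11: 19, 13: 4, 16: 4, 21: 2, 19: 2, 15: 8, 17: 7, 14: 3,
                       20: 1, 24: 1}
    total = sum(statistics_dict.values())
    covered = sum(v for k, v in statistics_dict.items() if k <= 8)
    print(round(covered / total, 4))  # ~0.9944, so truncating at 8 tokens affects very few boxes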
@@ -1,7 +1,9 @@
 import re
 import numpy as np
-from gensim.models import word2vec
+import jieba
+from gensim.models import word2vec, KeyedVectors
 word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model')
+wv_from_text = KeyedVectors.load_word2vec_format('/Users/zhouweiqi/Downloads/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt', binary=False)
 
 
 def simple_word2vec(text):
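
Side note: loading the text-format Tencent embedding with KeyedVectors.load_word2vec_format at import time can be slow and memory-hungry even for the d100 "small" release. If that becomes a problem, gensim's limit argument caps how many vectors are read; a sketch only, not part of this commit, and the 500000 cutoff is arbitrary:

    from gensim.models import KeyedVectors

    wv_from_text = KeyedVectors.load_word2vec_format(
        '/Users/zhouweiqi/Downloads/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt',
        binary=False,
        limit=500000,  # read only the first 500000 entries of the file to save memory
    )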
@@ -67,3 +69,24 @@ def jwq_word2vec(text, text_vec_max_lens=1500):
     sentence_vec = np.float64(sentence_vec)
     # print(type(sentence_vec))
     return sentence_vec
+
+def jieba_and_tencent_word2vec(text, max_jieba_char):
+    done_char_count = 0
+    sentence_vec = []
+    for cn_char in filter(lambda x:re.match(r'[\u4e00-\u9fa5]', x.strip()), jieba.lcut(text.strip())):
+        if done_char_count >= max_jieba_char:
+            break
+
+        try:
+            vec = wv_from_text.word_vec(cn_char.strip())
+        except Exception as e:
+            pass
+        else:
+            sentence_vec = np.append(sentence_vec, vec)
+            done_char_count += 1
+
+    if done_char_count < max_jieba_char:
+        sentence_vec = np.append(sentence_vec, np.zeros(((max_jieba_char-done_char_count)*100, ), dtype=np.float32))
+
+    return sentence_vec
+
\ No newline at end of file
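
A minimal usage sketch for the new helper (assumes both embedding files load successfully at import time; the sample text is hypothetical). The returned vector is always max_jieba_char * 100 floats, zero-padded on the right when fewer tokens are found or embedded:

    from word2vec import jieba_and_tencent_word2vec

    vec = jieba_and_tencent_word2vec('中国人民财产保险股份有限公司', max_jieba_char=8)
    print(vec.shape)  # (800,) regardless of how many tokens jieba finds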