# word2vec.py
import re
import numpy as np
import jieba
from gensim.models import word2vec, KeyedVectors
# Self-trained character-level word2vec model, loaded once at import time.
# NOTE(review): hard-coded absolute path — consider moving to config/env var.
word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model')
# Pre-trained Tencent AI Lab Chinese embeddings (100-dim, text format).
# Loading this file is slow and memory-heavy, and it happens on import.
# NOTE(review): hard-coded absolute path — same concern as above.
wv_from_text = KeyedVectors.load_word2vec_format('/Users/zhouweiqi/Downloads/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt', binary=False)
def simple_word2vec(text):
    """Build a 5-dim hand-crafted character-statistics vector for *text*.

    Features (each rescaled from [0, 1] to [-1, 1] via ``x * 2 - 1``):
      0: ``len(text.strip()) / 100`` (not clipped — exceeds 1 for texts
         longer than 100 characters)
      1: fraction of CJK (Chinese) characters
      2: fraction of ASCII letters
      3: fraction of digit characters
      4: fraction of "other" characters (punctuation, symbols, ...)
    Interior whitespace is counted but deliberately excluded from the
    output vector (kept from the original design).

    Returns ``[-1.0] * 5`` for empty or whitespace-only input — the limit
    of every feature as length -> 0. (The previous version raised
    ``ZeroDivisionError`` on such input.)
    """
    clean_text = text.strip()
    text_len = len(clean_text)
    if text_len == 0:
        # Guard against division by zero on empty / whitespace-only text.
        return [-1.0] * 5
    digit_num = en_num = cn_num = space_num = other_num = 0
    for char in clean_text:
        if char.isdigit():
            digit_num += 1
        elif re.match(r'[A-Za-z]', char):
            en_num += 1
        elif char.isspace():
            space_num += 1
        elif re.match(r'[\u4e00-\u9fa5]', char):
            cn_num += 1
        else:
            other_num += 1
    return [
        (text_len / 100) * 2 - 1,
        (cn_num / text_len) * 2 - 1,
        (en_num / text_len) * 2 - 1,
        (digit_num / text_len) * 2 - 1,
        # space ratio intentionally omitted
        (other_num / text_len) * 2 - 1,
    ]
def jwq_word2vec(text, text_vec_max_lens=1500):
    """Embed *text* character-by-character with the self-trained model.

    Each character of the stripped text is looked up in
    ``word2vec_model.wv``; characters missing from the vocabulary fall
    back to the model's ``'unk'`` token vector. The flattened vector is
    then truncated or zero-padded to exactly *text_vec_max_lens* entries.

    Returns a 1-D ``numpy`` float64 array of length *text_vec_max_lens*.
    """
    clean_text = text.strip()
    sentence_vec = list()
    for char in clean_text:
        try:
            word_vec = word2vec_model.wv[char]
        except KeyError:
            # Was a bare `except:` — that also swallowed unrelated errors
            # (e.g. AttributeError). Only OOV lookups should fall back.
            word_vec = word2vec_model.wv['unk']
        sentence_vec.extend(word_vec)
    if len(sentence_vec) > text_vec_max_lens:
        sentence_vec = sentence_vec[:text_vec_max_lens]
    else:
        # Zero-pad in one call instead of appending 0.0 in a Python loop.
        sentence_vec.extend([0.] * (text_vec_max_lens - len(sentence_vec)))
    return np.float64(sentence_vec)
def jieba_and_tencent_word2vec(text, max_jieba_char):
    """Embed up to *max_jieba_char* Chinese jieba tokens of *text* using
    the pre-loaded Tencent embeddings.

    Tokens are produced by ``jieba.lcut``; only tokens starting with a CJK
    character are considered, and tokens missing from the embedding
    vocabulary are skipped (best-effort, as before). The result is
    zero-padded so its length is always ``max_jieba_char * 100``
    (assumes the 100-dim Tencent model — matches the padding width).

    Returns a 1-D float64 ``numpy`` array.
    """
    done_char_count = 0
    # Collect per-token vectors and concatenate once at the end: the old
    # np.append-per-token re-copied the whole buffer every iteration (O(n^2)).
    # The float64 empty seed mirrors the original empty-list start, which
    # promoted the result dtype to float64.
    parts = [np.empty(0)]
    for token in jieba.lcut(text.strip()):
        if done_char_count >= max_jieba_char:
            break
        token = token.strip()
        if not re.match(r'[\u4e00-\u9fa5]', token):
            continue  # keep Chinese tokens only
        try:
            vec = wv_from_text.word_vec(token)
        except KeyError:
            # Was `except Exception: pass` — narrow to the OOV case so real
            # bugs (wrong types, API changes) are no longer silently hidden.
            continue
        parts.append(vec)
        done_char_count += 1
    if done_char_count < max_jieba_char:
        parts.append(np.zeros(((max_jieba_char - done_char_count) * 100,), dtype=np.float32))
    return np.concatenate(parts)