word2vec.py
958 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import re
# from gensim.models import word2vec
def simple_word2vec(text: str):
    """Return a five-element character-composition feature vector for *text*.

    Leading and trailing whitespace is stripped first. Every character of
    the stripped text is then put into exactly one class, tested in this
    order: digit (``str.isdigit``), ASCII letter ``A-Z``/``a-z``,
    whitespace (``str.isspace``), character in the range U+4E00..U+9FA5,
    or "other".

    Args:
        text: The string to describe.

    Returns:
        A list ``[length / 100, cjk_ratio, letter_ratio, digit_ratio,
        other_ratio]`` where ``length`` is the length of the stripped text
        and each ratio is the class count divided by that length. Interior
        whitespace counts toward the length but has no ratio of its own,
        so the four ratios may sum to less than 1. If the stripped text is
        empty, all five elements are ``0.0``.
    """
    clean_text = text.strip()
    text_len = len(clean_text)
    if text_len == 0:
        # Empty or whitespace-only input: there is nothing to measure, and
        # the ratios below would otherwise divide by zero.
        return [0.0, 0.0, 0.0, 0.0, 0.0]

    digit_num = 0
    en_num = 0
    cn_num = 0
    other_num = 0
    for char in clean_text:
        if char.isdigit():
            digit_num += 1
        elif "a" <= char <= "z" or "A" <= char <= "Z":
            en_num += 1
        elif char.isspace():
            # Interior whitespace is deliberately not a feature; this branch
            # only keeps it from being counted as "other".
            continue
        elif "\u4e00" <= char <= "\u9fa5":
            cn_num += 1
        else:
            other_num += 1

    return [
        text_len / 100,
        cn_num / text_len,
        en_num / text_len,
        digit_num / text_len,
        other_num / text_len,
    ]