# word2vec.py
import re
import numpy as np
import jieba
from gensim.models import word2vec, KeyedVectors
# Module-level models, loaded once at import time.
# NOTE(review): both paths are hard-coded to one developer's machine --
# consider making them configurable (env var / argument).
word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model')
# Tencent AI Lab pretrained Chinese embeddings, 100-dim, plain-text format
# (binary=False), used by jieba_and_tencent_word2vec below.
wv_from_text = KeyedVectors.load_word2vec_format('/Users/zhouweiqi/Downloads/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt', binary=False)


def simple_word2vec(text):
    """Encode character-composition statistics of *text* as a 5-dim feature vector.

    Counts digits, ASCII letters, CJK characters (U+4E00..U+9FA5) and
    "other" characters in the stripped text, then scales each ratio from
    [0, 1] to [-1, 1].  Whitespace characters are counted but deliberately
    excluded from the output vector.

    :param text: input string; surrounding whitespace is ignored.
    :return: list of 5 floats:
             [len/100, cn ratio, en ratio, digit ratio, other ratio],
             each mapped to roughly [-1, 1] (the length feature exceeds 1
             for texts longer than 100 characters).
    """
    clean_text = text.strip()
    text_len = len(clean_text)

    digit_num = 0
    en_num = 0
    cn_num = 0
    space_num = 0
    other_num = 0
    for char in clean_text:
        if char.isdigit():
            digit_num += 1
        elif re.match(r'[A-Za-z]', char):
            en_num += 1
        elif char.isspace():
            space_num += 1
        elif re.match(r'[\u4e00-\u9fa5]', char):
            cn_num += 1
        else:
            other_num += 1

    # Guard against empty / whitespace-only input: the ratios below used to
    # raise ZeroDivisionError when text_len == 0.  All counts are 0 in that
    # case, so dividing by 1 yields the neutral vector [-1, -1, -1, -1, -1].
    denom = text_len if text_len else 1

    vec = [(text_len / 100) * 2 - 1,
           (cn_num / denom) * 2 - 1,
           (en_num / denom) * 2 - 1,
           (digit_num / denom) * 2 - 1,
           # space_num intentionally excluded from the feature vector
           (other_num / denom) * 2 - 1,
           ]

    return vec

def jwq_word2vec(text, text_vec_max_lens=1500):
    """Embed *text* character-by-character with the module-level word2vec model.

    Each character's embedding is looked up in ``word2vec_model``;
    out-of-vocabulary characters fall back to the model's ``'unk'`` vector.
    The concatenated vector is truncated or zero-padded to exactly
    *text_vec_max_lens* floats.

    :param text: input string; surrounding whitespace is stripped first.
    :param text_vec_max_lens: fixed output length in floats (default 1500).
    :return: numpy float64 array of shape (text_vec_max_lens,).
    """
    clean_text = text.strip()

    sentence_vec = []
    for char in clean_text:
        try:
            word_vec = word2vec_model.wv[char]
        except KeyError:
            # Out-of-vocabulary character: use the shared 'unk' embedding.
            # (Was a bare `except:`, which also hid KeyboardInterrupt etc.)
            word_vec = word2vec_model.wv['unk']
        sentence_vec.extend(word_vec)

    if len(sentence_vec) > text_vec_max_lens:
        # NOTE(review): truncation can cut the last character's vector
        # mid-way; presumably acceptable for this fixed-length feature.
        sentence_vec = sentence_vec[:text_vec_max_lens]
    else:
        # Zero-pad up to the fixed length.
        sentence_vec.extend([0.] * (text_vec_max_lens - len(sentence_vec)))

    # np.float64(list) happens to build an array; asarray states the intent.
    return np.asarray(sentence_vec, dtype=np.float64)

def jieba_and_tencent_word2vec(text, max_jieba_char):
    """Embed the first *max_jieba_char* Chinese jieba tokens of *text*
    using the Tencent pretrained vectors (100 dims per token).

    Tokens are produced by ``jieba.lcut``; only tokens whose stripped form
    starts with a CJK character (U+4E00..U+9FA5) are kept, and tokens
    missing from the embedding vocabulary are skipped.  The result is
    zero-padded so its length is always ``max_jieba_char * 100``.

    :param text: input string; surrounding whitespace is ignored.
    :param max_jieba_char: number of tokens to embed.
    :return: 1-D numpy array of length max_jieba_char * 100.
    """
    dim = 100  # dimensionality of the Tencent embedding loaded at module level
    done_char_count = 0
    chunks = []
    for token in jieba.lcut(text.strip()):
        if done_char_count >= max_jieba_char:
            break

        word = token.strip()
        if not re.match(r'[\u4e00-\u9fa5]', word):
            continue  # keep only tokens starting with a Chinese character

        try:
            vec = wv_from_text.get_vector(word)
        except KeyError:
            # Out-of-vocabulary token: skip it and keep scanning.
            # (Was `except Exception as e: pass` with `e` unused.)
            continue
        chunks.append(vec)
        done_char_count += 1

    if done_char_count < max_jieba_char:
        # Zero-pad to the fixed output length.
        chunks.append(np.zeros((max_jieba_char - done_char_count) * dim, dtype=np.float32))

    # One concatenation instead of repeated np.append, which re-copied the
    # whole accumulated array on every iteration (O(n^2)).
    return np.concatenate(chunks) if chunks else np.array([])