zl程序教程

您现在的位置是:首页 >  工具

当前栏目

NLP学习笔记

2023-09-11 14:17:47 时间

NLP学习笔记


gensim-word2vec

训练

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing
def create_wordVectors(sentences, embedding_size=128, window=5, min_count=5, word2vec_path=None):
    """Train a Word2Vec model on ``sentences`` and optionally save it to disk.

    Args:
        sentences: Training corpus, passed unchanged as the first argument
            to ``Word2Vec``.
        embedding_size: Dimensionality of the learned word vectors.
        window: Context window size forwarded to ``Word2Vec``.
        min_count: Minimum frequency threshold forwarded to ``Word2Vec``.
        word2vec_path: Destination path for the trained model. When ``None``
            (the default) the model is not written to disk.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # NOTE: `size=` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
    # `vector_size=`. Kept as-is to match the gensim version this file targets.
    w2vModel = Word2Vec(
        sentences,
        size=embedding_size,
        window=window,
        min_count=min_count,
        workers=multiprocessing.cpu_count(),
    )
    # Saving to the default path (None) would fail only after the expensive
    # training step had finished, so skip the save when no path was supplied.
    if word2vec_path is not None:
        w2vModel.save(word2vec_path)
    return w2vModel

载入

def load_wordVectors(word2vec_path):
    """Load a previously saved Word2Vec model from ``word2vec_path`` and return it."""
    return Word2Vec.load(word2vec_path)

映射

def embedding_lookup(w2vModel, sentences):
    """Map every word of every sentence to its embedding vector.

    Args:
        w2vModel: Model object exposing ``vector_size`` and a ``wv`` mapping
            that supports ``word in wv`` and ``wv[word]``.
        sentences: Iterable of sentences, each an iterable of words.

    Returns:
        A list with one entry per sentence; each entry is a list holding one
        vector per word. Words missing from the vocabulary are mapped to an
        all-zero list of length ``w2vModel.vector_size``.
    """
    # Go through `wv` for both the membership test and the lookup:
    # `model[word]` is deprecated in gensim 3.x and removed in 4.0, and
    # `wv.vocab` is removed in 4.0, whereas `word in wv` / `wv[word]`
    # work on both major versions.
    wv = w2vModel.wv
    # A single zero vector is shared by all unknown words; callers should
    # treat it as read-only.
    embeddingUnknown = [0] * w2vModel.vector_size
    all_vectors = []
    for sentence in sentences:
        all_vectors.append(
            [wv[word] if word in wv else embeddingUnknown for word in sentence]
        )
    return all_vectors

获得单词下标和词向量

    # Example: convert between a word, its vocabulary index and its vector.
    # NOTE: `wv.vocab` / `wv.index2word` are the gensim 3.x names; gensim >= 4.0
    # exposes the same information as `wv.key_to_index` / `wv.index_to_key`.
    w2vModel = Word2Vec.load(word2vec_path)
    word = '你'
    index = w2vModel.wv.vocab[word].index  # vocabulary index of `word`
    word2= w2vModel.wv.index2word[index]   # word stored at that index

    vector1 = w2vModel.wv.vectors[index]   # embedding fetched by index
    # Look the vector up through `wv`; `w2vModel[word]` is deprecated in
    # gensim 3.x and removed in 4.0.
    vector2 = w2vModel.wv[word]            # embedding fetched directly by word