import cPickle as pickle
import msgpack

import numpy as np

# Load the vocabulary: maps word -> (row id, corpus frequency).
with open('wmt11.head.vocab', 'rb') as f:
    vocab = msgpack.load(f)

# Load the word-vector matrix (word rows followed by context-word rows).
# NOTE(review): pickle.load is unsafe on untrusted files — only use with
# data you produced yourself.
with open('wmt11.head.vectors', 'rb') as f:
    W = pickle.load(f)

# Reverse lookup: row id -> word.  (Avoid shadowing the builtin `id`.)
id2word = dict((wid, word) for word, (wid, _) in vocab.iteritems())

# Normalize every row to unit length so a plain dot product equals
# cosine similarity.  Vectorized; replaces the per-row Python loop.
W /= np.linalg.norm(W, axis=1)[:, np.newaxis]

# Keep only the word vectors; drop the context vectors appended after them.
W = W[:len(vocab), :]


def most_similar(positive, negative, topn=10, freq_threshold=5):
    """Find the words most cosine-similar to the analogy query.

    Builds the unit-normalized mean of the `positive` word vectors and the
    negated `negative` word vectors, then ranks all vocabulary words by
    cosine similarity to that mean (W is row-normalized, so dot == cosine).

    Parameters:
        positive: list of words contributing positively to the query.
        negative: list of words contributing negatively (subtracted).
        topn: number of results to return.
        freq_threshold: minimum corpus frequency for a result word.

    Returns:
        List of up to `topn` (word, similarity) pairs, most similar first,
        excluding the query words themselves and rare words.

    Raises:
        KeyError: if any query word is not in the vocabulary.
    """
    # Mean of the positive vectors and the negated negative vectors.
    query_vecs = [W[vocab[word][0]] for word in positive]
    query_vecs += [-1 * W[vocab[word][0]] for word in negative]
    mean = np.array(query_vecs).mean(axis=0)
    mean /= np.linalg.norm(mean)

    # Cosine similarity of the query against every word vector.
    dists = np.dot(W, mean)

    # Over-fetch so that filtering (query words, rare words) still leaves
    # at least `topn` candidates in the common case.
    best = np.argsort(dists)[::-1][:topn + len(positive) + len(negative) + 100]

    result = []
    for i in best:
        word = id2word[i]
        # vocab values are (id, frequency) tuples: compare the frequency
        # element.  The original compared the whole tuple to an int, which
        # in Python 2 is always True — the threshold was silently ignored.
        if (vocab[word][1] >= freq_threshold
                and word not in positive
                and word not in negative):
            result.append((word, dists[i]))
    return result[:topn]


most_similar(['king', 'woman'], ['man'], topn=50)
most_similar(['brought', 'seek'], ['bring'], topn=50)