import cPickle as pickle
import msgpack

import numpy as np

# Load the vocabulary: maps word -> (row id, corpus frequency).
with open('wmt11.head.vocab', 'rb') as f:
    vocab = msgpack.load(f)

# Load the word-vector matrix (word rows followed by context-word rows).
# NOTE(review): pickle.load is unsafe on untrusted files — only use with
# data you produced yourself.
with open('wmt11.head.vectors', 'rb') as f:
    W = pickle.load(f)

# Reverse lookup: row id -> word.  (Avoid shadowing the builtin `id`.)
id2word = dict((wid, word) for word, (wid, _) in vocab.iteritems())

# Normalize every row to unit length so a plain dot product equals
# cosine similarity.  Vectorized; replaces the per-row Python loop.
W /= np.linalg.norm(W, axis=1)[:, np.newaxis]

# Keep only the word vectors; drop the context vectors appended after them.
W = W[:len(vocab), :]


def most_similar(positive, negative, topn=10, freq_threshold=5):
    """Find the words most cosine-similar to the analogy query.

    Builds the unit-normalized mean of the `positive` word vectors and the
    negated `negative` word vectors, then ranks all vocabulary words by
    cosine similarity to that mean (W is row-normalized, so dot == cosine).

    Parameters:
        positive: list of words contributing positively to the query.
        negative: list of words contributing negatively (subtracted).
        topn: number of results to return.
        freq_threshold: minimum corpus frequency for a result word.

    Returns:
        List of up to `topn` (word, similarity) pairs, most similar first,
        excluding the query words themselves and rare words.

    Raises:
        KeyError: if any query word is not in the vocabulary.
    """
    # Mean of the positive vectors and the negated negative vectors.
    query_vecs = [W[vocab[word][0]] for word in positive]
    query_vecs += [-1 * W[vocab[word][0]] for word in negative]
    mean = np.array(query_vecs).mean(axis=0)
    mean /= np.linalg.norm(mean)

    # Cosine similarity of the query against every word vector.
    dists = np.dot(W, mean)

    # Over-fetch so that filtering (query words, rare words) still leaves
    # at least `topn` candidates in the common case.
    best = np.argsort(dists)[::-1][:topn + len(positive) + len(negative) + 100]

    result = []
    for i in best:
        word = id2word[i]
        # vocab values are (id, frequency) tuples: compare the frequency
        # element.  The original compared the whole tuple to an int, which
        # in Python 2 is always True — the threshold was silently ignored.
        if (vocab[word][1] >= freq_threshold
                and word not in positive
                and word not in negative):
            result.append((word, dists[i]))
    return result[:topn]


most_similar(['king', 'woman'], ['man'], topn=50)
most_similar(['brought', 'seek'], ['bring'], topn=50)