The models are pickled as a python tuple.
The first element of the pair is a python list. The list represents the vocabulary words sorted by their frequency in the corpus.
The second element is a numpy array in which each row represents a word vector. The embeddings are stored as a 32-bit float numpy array to save space.
import pickle
import numpy
# Load the (words, embeddings) pair. The `with` block closes the file handle
# as soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code; only load model files that come
# from a trusted source.
with open('/home/polyglot/en/words_embeddings_32.pkl', 'rb') as model_file:
    words, embeddings = pickle.load(model_file)
print("Emebddings shape is {}".format(embeddings.shape))
Emebddings shape is (100004, 64)
The shape of the embeddings gives the number of words that make up the vocabulary (100004) and the size of the vector that represents each word (64).
The vocabulary consists of 100004 words. The first four words are special symbols:
# IDs of the four special symbols that open the vocabulary.
Token_ID = {
    "<UNK>": 0,  # out-of-vocabulary word
    "<S>": 1,    # start of sentence
    "</S>": 2,   # end of sentence
    "<PAD>": 3,  # padding character
}
If your sentence is "I visited New York .", then the model was trained on the following five 5-grams:
(("<PAD>", "<S>", "I", "visited", "New"),
("<S>", "I", "visited", "New", "York"),
("I", "visited", "New", "York", "."),
("visited", "New", "York", ".", "</S>"),
("New", "York", ".", "</S>", "<PAD>"))
# Show the first ten vocabulary entries. Notice that the special symbols
# <UNK>, <S>, </S> and <PAD> come first.
words[:10]
(u'<UNK>', u'<S>', u'</S>', u'<PAD>', u',', u'the', u'.', u'of', u'and', u'in')
# A tuple of the word at index 777 ("outside") and its embedding vector,
# which is a float32 numpy array.
words[777], embeddings[777]
(u'outside', array([-0.30594289, -0.10619531, 0.22014943, -0.0418048 , -0.25563559, -0.22289647, 0.24415126, -0.42394561, -0.29082915, 0.2757107 , -0.01486778, -0.82750046, -0.46192446, 0.02731112, 0.36313367, 0.02308739, 0.44220203, 0.63888663, -0.75270784, 0.34825927, 0.33574232, 0.13333255, -1.27148712, -0.17001058, -0.94983661, -0.02366572, 0.58226883, -0.73669076, 0.20364907, 0.53477538, -0.11396599, -0.22912201, -0.18428923, 0.57168871, 0.70096195, -0.01094483, -0.10256457, 0.23729944, 0.16012612, -0.08789989, 0.32947737, -0.19176106, -0.40289786, 0.2634418 , 0.22998494, 0.14719962, -0.03886349, -0.1285357 , -0.05806407, 0.19683087, 0.59862757, -0.15636708, -0.53672892, 0.23510239, -0.34235647, -0.4950844 , -0.29466859, 1.062222 , -0.15154035, 0.22687389, 0.34555328, -0.44103339, 0.43293494, -0.10873429], dtype=float32))
"""KNN Example."""
from operator import itemgetter
from itertools import izip, islice
import re
import numpy
# Special tokens
Token_ID = {"<UNK>": 0, "<S>": 1, "</S>": 2, "<PAD>": 3}
# Reverse lookup from id to special token. items() is used instead of the
# Python-2-only iteritems() so this line runs on Python 2 and Python 3 alike.
ID_Token = {v: k for k, v in Token_ID.items()}

# Map words to indices and vice versa
word_id = {w: i for (i, w) in enumerate(words)}
id_word = dict(enumerate(words))

# Normalize digits by replacing them with #
DIGITS = re.compile("[0-9]", re.UNICODE)

# Number of neighbors to return.
k = 5
def case_normalizer(word, dictionary):
    """Pick the best-ranked case variant of a word missing from the vocabulary.

    The lower-case, upper-case and title-case forms of ``word`` are looked up
    in ``dictionary`` (word -> index). The variant with the smallest index
    wins, which corresponds to the most frequent alternative. When none of the
    variants is present, ``word`` is returned unchanged.
    """
    missing = 1e12  # placeholder index for variants absent from the dictionary
    candidates = [
        (dictionary.get(variant, missing), variant)
        for variant in (word.lower(), word.upper(), word.title())
    ]
    best_index, best_variant = min(candidates)
    if best_index == missing:
        return word
    return best_variant
def normalize(word, word_id):
    """Map ``word`` onto a vocabulary entry, or return None if none is found.

    Fallbacks are tried in order, each one only while the current form is
    still missing from ``word_id``: first every digit is replaced with ``#``,
    then the case variants are tried through ``case_normalizer``.
    """
    if word not in word_id:
        word = DIGITS.sub("#", word)
        if word not in word_id:
            word = case_normalizer(word, word_id)
    return word if word in word_id else None
def l2_nearest(embeddings, word_index, k):
    """Return the ``k`` rows of ``embeddings`` closest to row ``word_index``.

    Closeness is the Euclidean (L2) distance. To rank by cosine distance
    instead, the embeddings have to be normalized so that their L2 norm is 1.

    Args:
        embeddings: 2-D numpy array with one vector per row.
        word_index: row index of the query vector.
        k: number of neighbors to return. The query row has distance 0 to
            itself, so it normally comes first in the result.

    Returns:
        A list ``[indices, distances]`` holding two equally long tuples
        ordered by increasing distance. Both tuples are empty when ``k`` is 0.
    """
    query = embeddings[word_index]
    distances = ((embeddings - query) ** 2).sum(axis=1) ** 0.5
    # A stable argsort keeps ties in index order, exactly like sorted() on the
    # enumerated distances, without a Python-level sort over every row.
    nearest = numpy.argsort(distances, kind="mergesort")[:k]
    # Always return a concrete pair: zip(*...) is a one-shot iterator on
    # Python 3 and yields nothing to unpack when no neighbor is selected.
    indices = tuple(int(i) for i in nearest)
    return [indices, tuple(distances[nearest])]
def knn(word, embeddings, word_id, id_word):
    """Print the nearest neighbors of ``word`` in the embedding space.

    ``word`` is first mapped onto a vocabulary entry with ``normalize``. If
    that fails, "OOV word" is printed and nothing else happens. Otherwise the
    module-level ``k`` closest rows (Euclidean distance, via ``l2_nearest``)
    are printed one per line as rank, word and distance.

    Args:
        word: query word; it does not need to be in the vocabulary verbatim.
        embeddings: numpy array with one word vector per row.
        word_id: dict mapping each vocabulary word to its row index.
        id_word: dict mapping each row index back to its word.

    Returns:
        None. The results are only printed.
    """
    word = normalize(word, word_id)
    # Compare against None explicitly so that a falsy but valid vocabulary
    # entry (such as the empty string) is not reported as out of vocabulary.
    if word is None:
        print("OOV word")
        return
    word_index = word_id[word]
    indices, distances = l2_nearest(embeddings, word_index, k)
    neighbors = [id_word[idx] for idx in indices]
    # Distinct loop names keep the `word` parameter from being overwritten.
    for rank, (neighbor, distance) in enumerate(zip(neighbors, distances)):
        # A single pre-formatted argument keeps print(...) valid both as the
        # Python 2 statement and as the Python 3 function. The spacing follows
        # the Python 2 print statement, which adds no space after a tab.
        print(u"{} \t{} \t\t{}".format(rank, neighbor, distance))
# Query three words. The last two are matched through normalization:
# "1986" by digit replacement and "JAPAN" by case normalization.
knn("Jordan", embeddings, word_id, id_word)
# print("") emits the separating blank line on Python 2 and Python 3 alike,
# whereas a bare `print` is a silent no-op expression on Python 3.
print("")
knn("1986", embeddings, word_id, id_word)
print("")
knn("JAPAN", embeddings, word_id, id_word)
0 Jordan 0.0 1 Holland 1.47199 2 Lucas 1.55305 3 Marshall 1.58405 4 Nelson 1.58547 0 #### 0.0 1 ####EN#### 2.66805 2 ## 2.8479 3 ####EN## 2.88584 4 # 3.05274 0 Japan 0.0 1 China 1.38575 2 Mexico 1.45689 3 Europe 1.50911 4 Brazil 1.52698