# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
Download the data from the source website if necessary.
url = 'http://mattmahoney.net/dc/'
def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename
filename = maybe_download('text8.zip', 31344016)
Found and verified text8.zip
Read the data into a string.
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words."""
  # A with-block closes the file reliably; the original loop returned on the
  # first member and left an unreachable f.close().
  with zipfile.ZipFile(filename) as f:
    return tf.compat.as_str(f.read(f.namelist()[0])).split()
words = read_data(filename)
print('Data size %d' % len(words))
Data size 17005207
Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000
def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words # Hint to reduce memory.
Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156]
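As a quick sanity check (illustrative only, using just the variables defined above), the integer IDs in data can be decoded back into words with reverse_dictionary:
print('Decoded sample:', [reverse_dictionary[i] for i in data[:10]])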
Function to generate a training batch for the skip-gram model.
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels
print('data:', [reverse_dictionary[di] for di in data[:8]])
for num_skips, skip_window in [(2, 1), (4, 2)]:
  data_index = 0
  batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
  print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
  print(' batch:', [reverse_dictionary[bi] for bi in batch])
  print(' labels:', [reverse_dictionary[li] for li in labels.reshape(8)])
data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']
with num_skips = 2 and skip_window = 1:
 batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
 labels: ['as', 'anarchism', 'originated', 'a', 'as', 'term', 'of', 'a']
with num_skips = 4 and skip_window = 2:
 batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
 labels: ['a', 'anarchism', 'originated', 'term', 'term', 'as', 'originated', 'of']
Train a skip-gram model.
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.
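As a small optional check (not part of the original assignment), decoding valid_examples with reverse_dictionary confirms the validation set consists of frequent words:
print('Validation words:', [reverse_dictionary[i] for i in valid_examples])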
graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                               train_labels, num_sampled, vocabulary_size))
  # Optimizer.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  # Compute the similarity between minibatch examples and all embeddings.
  # We use cosine similarity:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
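For intuition, here is a minimal numpy sketch of the same cosine-similarity computation on toy values (the names E, E_norm, and toy_similarity are invented for illustration): after L2-normalizing the rows, a matrix product against the transpose yields cosine similarities.
E = np.random.uniform(-1.0, 1.0, size=(5, 3))  # toy stand-in for the embeddings matrix
E_norm = E / np.sqrt((E ** 2).sum(axis=1, keepdims=True))  # mirrors norm / normalized_embeddings
toy_similarity = np.dot(E_norm[:2], E_norm.T)  # 2 "valid" rows against all 5 rows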
num_steps = 100001
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch(
      batch_size, num_skips, skip_window)
    feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # Note that this is expensive (~20% slowdown if computed every 500 steps).
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
        print(log)
  final_embeddings = normalized_embeddings.eval()
Initialized
Average loss at step 0: 8.264635
Nearest to not: ouadda, geldof, crystallizes, bioinformatics, flaminius, plight, conforms, cohn,
Nearest to of: hyperion, ecclesiae, avery, preliminary, anders, unilateral, ottawa, promotional,
Nearest to but: trusting, forgiving, mummification, gerry, estudios, coffins, fbi, sotho,
Nearest to war: authenticated, shirt, dysplasia, disengage, rourke, federline, operands, abitibi,
Nearest to its: prohibits, euskara, derleth, espionage, collation, deluge, bembo, wh,
Nearest to has: hog, architect, hydroelectric, reconciling, member, muhammed, fulda, audiences,
Nearest to known: ibiza, conversational, enjoys, ava, rare, boyer, thugs, phimosis,
Nearest to time: unsound, perceived, gilded, ndebele, tuxedo, progeny, lina, qassam,
Nearest to are: lunches, subjectivism, rigorous, zodiac, godfather, kappa, yoda, bonding,
Nearest to american: jeux, sourcewatch, calwell, brahms, twh, juveniles, dreamed, combining,
Nearest to over: carta, nikos, my, reprimanded, letterman, biconditional, demons, fanpage,
Nearest to no: wonder, ignaz, puerto, debrecen, horseback, greco, langston, transponder,
Nearest to had: embroidery, smallest, probes, rewrite, cimmerian, congresses, humpback, framework,
Nearest to as: flagella, bastion, cowpox, hilt, spices, subscribed, otherworldly, supposedly,
Nearest to nine: noah, regiments, dissipation, johansen, freising, suriname, weasel, associating,
Nearest to is: gtk, baptise, prefixes, meribbaal, xia, armando, gzip, brzezinski,
Average loss at step 2000: 4.369195
Average loss at step 4000: 3.863247
Average loss at step 6000: 3.785764
Average loss at step 8000: 3.691525
Average loss at step 10000: 3.617537
...
Average loss at step 96000: 3.357772
Average loss at step 98000: 3.245364
Average loss at step 100000: 3.357103
Nearest to not: never, still, almost, usually, nor, generally, sheedy, now,
Nearest to of: including, in, original, and, ritz, blitter, from, grasso,
Nearest to but: however, although, and, though, while, which, that, where,
Nearest to war: disengage, bangkok, swamps, leyden, wars, chalcedonian, discursive, agnesi,
Nearest to its: their, his, the, her, our, firms, infanticide, twiggy,
Nearest to has: had, have, is, was, since, additionally, having, px,
Nearest to known: possible, used, regarded, such, seen, described, considered, defined,
Nearest to time: year, process, course, reason, tarantino, day, constantinople, replicates,
Nearest to are: were, have, although, include, while, contain, these, is,
Nearest to american: british, canadian, italian, australian, belgian, indian, doped, french,
Nearest to over: caudal, across, around, nearly, through, off, costs, dtmf,
Nearest to no: little, any, langston, reliefs, there, uniformitarianism, pieter, inferior,
Nearest to had: has, have, having, was, were, won, thresholds, since,
Nearest to as: like, brainwashing, chosroes, stool, hennessy, vl, within, hereford,
Nearest to nine: eight, seven, six, four, five, three, zero, two,
Nearest to is: was, has, became, becomes, appears, are, makes, be,
num_points = 400
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])
%matplotlib inline
def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15, 15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i, :]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()
words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)
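With final_embeddings in hand, nearest neighbors of any vocabulary word can be queried directly. The helper below is a sketch (nearest_words is not part of the original assignment); since the rows are already unit-normalized, a dot product gives cosine similarity, exactly like the similarity node above.
def nearest_words(word, top_k=8):
  sims = np.dot(final_embeddings, final_embeddings[dictionary[word]])
  return [reverse_dictionary[i] for i in (-sims).argsort()[1:top_k + 1]]
print(nearest_words('three'))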
The main difference between skip-gram and CBOW is that the inputs and outputs are reversed: skip-gram predicts the context given the word, while CBOW predicts the word given the context. The second difference is that, unlike skip-gram, where each context word is a separate label, in CBOW the context words are summed together in the input vector (so instead of a single bit being "activated", two or more bits are turned on). This blog post explains it pretty well, although the exact implementation used here isn't as complicated as the one suggested in the post.
The code is very similar to skip-gram; the main changes are the batch generation function and a segment-sum step in the graph that combines the context embeddings, as sketched below.
Full disclosure: these code updates came from other course participants through discussions about the assignment. There are multiple ways to implement this, so there is no single right answer.
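To see concretely how context embeddings are combined, here is a toy sketch of tf.segment_sum with invented values: with skip_window = 1, each label owns two consecutive context rows, so segment ids of the form [0, 0, 1, 1] sum the rows in pairs.
toy_embed = tf.constant([[1., 0.], [0., 1.], [2., 0.], [0., 2.]])
toy_ids = np.array([0, 0, 1, 1], dtype=np.int32)
with tf.Session() as s:
  print(s.run(tf.segment_sum(toy_embed, toy_ids)))  # -> [[1. 1.] [2. 2.]]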
data_index = 0
def generate_batch_cbow(batch_size, skip_window):
  global data_index
  context_window = 2 * skip_window
  assert batch_size % context_window == 0
  num_labels = batch_size // context_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(num_labels, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(num_labels):
    target = skip_window  # target label at the center of the buffer
    labels[i, 0] = buffer[target]
    targets_to_avoid = [skip_window]
    for j in range(context_window):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * context_window + j] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels
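Mirroring the skip-gram check above, here is a quick look at one CBOW batch (illustrative only; with batch_size=8 and skip_window=1 there are four labels, each with two context words):
data_index = 0
batch, labels = generate_batch_cbow(batch_size=8, skip_window=1)
print(' batch:', [reverse_dictionary[bi] for bi in batch])
print(' labels:', [reverse_dictionary[li] for li in labels.reshape(-1)])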
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
context_window = 2 * skip_window
num_labels = batch_size // context_window
# num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 32 # Number of negative examples to sample.
graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[num_labels, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Create segment ids that group the context words belonging to the same label,
  # e.g. [0, 0, 1, 1, ...] when context_window == 2.
  seq_ids = np.zeros(batch_size, dtype=np.int32)
  cur_id = -1
  for i in range(batch_size):
    if i % context_window == 0:
      cur_id = cur_id + 1
    seq_ids[i] = cur_id
  # Use segment_sum to add together the related words and reduce the output
  # to num_labels rows.
  embed_sum = tf.segment_sum(embed, seq_ids)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed_sum,
                               train_labels, num_sampled, vocabulary_size))
  # Optimizer.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  # Compute the similarity between minibatch examples and all embeddings.
  # We use cosine similarity:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
num_steps = 100001
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch_cbow(batch_size, skip_window)
    feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # Note that this is expensive (~20% slowdown if computed every 500 steps).
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
        print(log)
  final_embeddings = normalized_embeddings.eval()
Initialized
Average loss at step 0: 7.065996
Nearest to UNK: browsers, rotary, graded, poisonous, police, funchal, corn, freeciv,
Nearest to war: reconciled, glasnost, satan, affections, barbaric, separatist, yardbirds, enact,
Nearest to seven: bonnie, marriage, originated, brahma, consuls, justifies, usher, reshaped,
Nearest to united: alarms, excommunication, pic, laramie, mcgregor, grampus, responses, fgth,
Nearest to when: benito, illustrated, hydroponics, metamorphic, intriguing, melanesian, drugs, prohibitively,
Nearest to were: restoring, feel, relate, busch, king, cree, hedgehog, discriminated,
Nearest to had: handlebar, ech, chop, reporter, ridley, vast, improper, officiated,
Nearest to he: wardrobe, ing, functionalism, authoritarian, bio, humours, experimenting, olaf,
Nearest to during: trauma, ipa, hangul, greenhouses, schumpeter, specials, chat, infallibility,
Nearest to known: algebras, vladislav, sayyaf, medes, untrained, mimi, act, ed,
Nearest to into: forgo, pus, discord, abstractly, proliferate, infrastructural, plenty, crucible,
Nearest to all: nio, boac, unchecked, contrasts, jersey, overs, isao, stipulations,
Nearest to there: counterattack, reactance, shortening, marginal, polytheism, scientologists, saito, lamo,
Nearest to than: stellar, utah, precaution, jujutsu, chogm, boolean, earnestly, excision,
Nearest to five: stagnation, duke, haran, folksong, llu, stabilizes, behaviorism, stamps,
Nearest to be: departments, resorted, cdu, slovakian, scotty, reconsider, conscript, biconditional,
Average loss at step 2000: 3.689275
Average loss at step 4000: 3.127331
Average loss at step 6000: 3.030139
Average loss at step 8000: 2.877302
Average loss at step 10000: 2.793477
...
Average loss at step 96000: 2.325339
Average loss at step 98000: 2.205846
Average loss at step 100000: 2.315003
Nearest to UNK: extraneous, ller, tsu, nocs, hallstatt, reggie, formalize, christiansen,
Nearest to war: wars, conflict, leblanc, coup, decomposing, muhammad, season, trade,
Nearest to seven: nine, eight, four, five, six, zero, three, two,
Nearest to united: baltic, confederate, pic, nation, newsreel, city, southern, georgian,
Nearest to when: if, while, after, although, before, though, where, because,
Nearest to were: are, have, brimstone, exist, those, had, modernists, cusco,
Nearest to had: has, have, having, already, refused, was, demography, enjoys,
Nearest to he: she, they, it, there, who, we, initially, eventually,
Nearest to during: throughout, in, despite, through, within, at, until, among,
Nearest to known: defined, described, understood, such, used, possible, opposed, referred,
Nearest to into: through, within, from, around, across, beyond, back, upside,
Nearest to all: various, these, every, both, some, several, many, any,
Nearest to there: they, he, she, still, it, believed, however, we,
Nearest to than: chogm, or, much, acknowledgment, fanfic, mauss, bskyb, bakersfield,
Nearest to five: six, seven, eight, four, nine, zero, two, three,
Nearest to be: have, been, produce, lead, remain, easily, refer, become,
num_points = 400
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])
%matplotlib inline
def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15, 15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i, :]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()
words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)