#!/usr/bin/env python
# coding: utf-8

# In[11]:

import numpy as np
get_ipython().run_line_magic('matplotlib', 'notebook')
import matplotlib.pyplot as plt
import nltk
from bs4 import BeautifulSoup
import re
import gzip
import urllib.request


# In[12]:

response = urllib.request.urlopen('https://en.wikipedia.org/wiki/Julia_(programming_language)')


# In[13]:

text = response.read()


# In[14]:

# parse the raw HTML so the markup can be stripped
text = BeautifulSoup(text, 'lxml')


# In[15]:

text = text.get_text()
text


# In[16]:

# replace newlines, carriage returns and tabs with spaces
# (maketrans requires both strings to have the same length)
text = text.translate(str.maketrans('\n\r\t', '   '))
text


# In[17]:

# position of the first URL still in the text
text.index('http')


# In[18]:

# drop URLs entirely
text = re.sub(r'http\S+', '', text)


# In[19]:

# find() returns -1 once every URL has been stripped (index() would raise ValueError)
text.find('http')


# In[20]:

text = text.lower()


# In[21]:

text


# In[22]:

# normalize curly apostrophes to straight ones so contraction expansion works
text = text.translate(str.maketrans("’", "'"))
text


# In[23]:

small = "I wouldn't recommend this movie"
import contractions


# In[24]:

# expand contractions: "wouldn't" -> "would not"
contractions.fix(small)


# In[25]:

# requires the NLTK stopwords corpus: nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
[word for word in small.split() if word not in stopwords]


# In[26]:

from nltk.stem import WordNetLemmatizer


# In[27]:

# requires the WordNet data: nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


# In[28]:

small = 'I am thinking of not recommending this movie'


# In[29]:

[lemmatizer.lemmatize(w) for w in small.split()]


# In[30]:

# the default part of speech is noun, so 'running' comes back unchanged;
# lemmatize('running', pos='v') would return 'run'
lemmatizer.lemmatize('running')


# In[31]:

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))


# In[32]:

import gensim


# In[33]:

import gensim.test.utils as gtu


# In[34]:

# one review per line, gzip-compressed
reviews = gzip.open('chicago.txt.gz', 'rb').readlines()


# In[35]:

len(reviews)


# In[36]:

reviews[0]


# In[37]:

# lowercase, tokenize and drop very short/long tokens (handles the raw bytes from gzip)
reviews = [gensim.utils.simple_preprocess(review) for review in reviews]


# In[38]:

' '.join(reviews[0])


# In[39]:

# 200-dimensional vectors; the parameter is called `size` in gensim < 4.0
model = gensim.models.Word2Vec(reviews, vector_size=200, window=10, min_count=2)


# In[40]:

# continue training for additional epochs
model.train(reviews, total_examples=len(reviews), epochs=50)


# In[41]:

model.wv.most_similar(positive=['love'])


# In[42]:

import sklearn.manifold as sm


# In[43]:

# for each seed word, collect its 20 nearest neighbours in the embedding space
find_ones_like = ['love', 'like', 'hate', 'awful']
similars = [[word for (word, sim) in model.wv.most_similar(w, topn=20)]
            for w in find_ones_like]


# In[44]:

similars


# In[45]:

# look up the vector for every neighbour
similars_vecs = [[model.wv[w] for w in group] for group in similars]


# In[46]:

len(similars_vecs)


# In[47]:

vecs = np.array(similars_vecs)
vecs.shape


# In[48]:

n_clusters, n_per_cluster, n_dim = vecs.shape


# In[49]:

# project the 200-dimensional vectors down to 2D with t-SNE
# (n_iter is named max_iter in recent scikit-learn releases)
tsne_model = sm.TSNE(perplexity=10, n_components=2, init='pca', n_iter=8000)
points = tsne_model.fit_transform(vecs.reshape(n_clusters * n_per_cluster, n_dim))
points = points.reshape(n_clusters, n_per_cluster, 2)


# In[50]:

import matplotlib.cm as cm

plt.figure(figsize=(10, 10))
colors = cm.rainbow(np.linspace(0, 1, n_clusters))
for label, vectors, words, color in zip(find_ones_like, points, similars, colors):
    x = vectors[:, 0]
    y = vectors[:, 1]
    plt.scatter(x, y, color=color, label=label)
    for i, word in enumerate(words):
        plt.annotate(word, xy=(x[i], y[i]), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom', size=8)
plt.legend()
plt.show()


# In[ ]:
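

# In[ ]:

# Quick sanity checks on the embeddings, as a minimal sketch: besides most_similar(),
# gensim's KeyedVectors expose pairwise cosine similarity and an odd-one-out query.
# The word choices are illustrative and assume these tokens survived the min_count=2 cutoff.
print(model.wv.similarity('love', 'like'))               # cosine similarity of two words
print(model.wv.doesnt_match(['love', 'like', 'awful']))  # word least like the others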
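

# In[ ]:

# A minimal sketch of persisting the trained model so the 50-epoch training run
# does not have to be repeated; the filename is an arbitrary example.
model.save('chicago_w2v.model')
reloaded = gensim.models.Word2Vec.load('chicago_w2v.model')
reloaded.wv.most_similar(positive=['love'])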