#!/usr/bin/env python
# coding: utf-8

# In[11]:

import numpy as np
get_ipython().run_line_magic('matplotlib', 'notebook')
import matplotlib.pyplot as plt
import nltk
from bs4 import BeautifulSoup
import re
import gzip
import urllib.request


# In[12]:

response = urllib.request.urlopen('https://en.wikipedia.org/wiki/Julia_(programming_language)')


# In[13]:

text = response.read()


# In[14]:

# parse the raw HTML so the markup can be stripped
text = BeautifulSoup(text, 'lxml')


# In[15]:

text = text.get_text()
text


# In[16]:

# replace newlines, carriage returns and tabs with spaces
# (maketrans requires both strings to have the same length)
text = text.translate(str.maketrans('\n\r\t', '   '))
text


# In[17]:

# position of the first URL still in the text
text.index('http')


# In[18]:

# drop URLs entirely
text = re.sub(r'http\S+', '', text)


# In[19]:

# find() returns -1 once every URL has been stripped (index() would raise ValueError)
text.find('http')


# In[20]:

text = text.lower()


# In[21]:

text


# In[22]:

# normalize curly apostrophes to straight ones so contraction expansion works
text = text.translate(str.maketrans("’", "'"))
text


# In[23]:

small = "I wouldn't recommend this movie"
import contractions


# In[24]:

# expand contractions: "wouldn't" -> "would not"
contractions.fix(small)


# In[25]:

# requires the NLTK stopwords corpus: nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
[word for word in small.split() if word not in stopwords]


# In[26]:

from nltk.stem import WordNetLemmatizer


# In[27]:

# requires the WordNet data: nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


# In[28]:

small = 'I am thinking of not recommending this movie'


# In[29]:

[lemmatizer.lemmatize(w) for w in small.split()]


# In[30]:

# the default part of speech is noun, so 'running' comes back unchanged;
# lemmatize('running', pos='v') would return 'run'
lemmatizer.lemmatize('running')


# In[31]:

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))


# In[32]:

import gensim


# In[33]:

import gensim.test.utils as gtu


# In[34]:

# one review per line, gzip-compressed
reviews = gzip.open('chicago.txt.gz', 'rb').readlines()


# In[35]:

len(reviews)


# In[36]:

reviews[0]


# In[37]:

# lowercase, tokenize and drop very short/long tokens (handles the raw bytes from gzip)
reviews = [gensim.utils.simple_preprocess(review) for review in reviews]


# In[38]:

' '.join(reviews[0])


# In[39]:

# 200-dimensional vectors; the parameter is called `size` in gensim < 4.0
model = gensim.models.Word2Vec(reviews, vector_size=200, window=10, min_count=2)


# In[40]:

# continue training for additional epochs
model.train(reviews, total_examples=len(reviews), epochs=50)


# In[41]:

model.wv.most_similar(positive=['love'])


# In[42]:

import sklearn.manifold as sm


# In[43]:

# for each seed word, collect its 20 nearest neighbours in the embedding space
find_ones_like = ['love', 'like', 'hate', 'awful']
similars = [[word for (word, sim) in model.wv.most_similar(w, topn=20)]
            for w in find_ones_like]


# In[44]:

similars


# In[45]:

# look up the vector for every neighbour
similars_vecs = [[model.wv[w] for w in group] for group in similars]


# In[46]:

len(similars_vecs)


# In[47]:

vecs = np.array(similars_vecs)
vecs.shape


# In[48]:

n_clusters, n_per_cluster, n_dim = vecs.shape


# In[49]:

# project the 200-dimensional vectors down to 2D with t-SNE
# (n_iter is named max_iter in recent scikit-learn releases)
tsne_model = sm.TSNE(perplexity=10, n_components=2, init='pca', n_iter=8000)
points = tsne_model.fit_transform(vecs.reshape(n_clusters * n_per_cluster, n_dim))
points = points.reshape(n_clusters, n_per_cluster, 2)


# In[50]:

import matplotlib.cm as cm

plt.figure(figsize=(10, 10))
colors = cm.rainbow(np.linspace(0, 1, n_clusters))
for label, vectors, words, color in zip(find_ones_like, points, similars, colors):
    x = vectors[:, 0]
    y = vectors[:, 1]
    plt.scatter(x, y, color=color, label=label)
    for i, word in enumerate(words):
        plt.annotate(word, xy=(x[i], y[i]), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom', size=8)
plt.legend()
plt.show()


# In[ ]:
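

# In[ ]:

# Quick sanity checks on the embeddings, as a minimal sketch: besides most_similar(),
# gensim's KeyedVectors expose pairwise cosine similarity and an odd-one-out query.
# The word choices are illustrative and assume these tokens survived the min_count=2 cutoff.
print(model.wv.similarity('love', 'like'))               # cosine similarity of two words
print(model.wv.doesnt_match(['love', 'like', 'awful']))  # word least like the others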
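

# In[ ]:

# A minimal sketch of persisting the trained model so the 50-epoch training run
# does not have to be repeated; the filename is an arbitrary example.
model.save('chicago_w2v.model')
reloaded = gensim.models.Word2Vec.load('chicago_w2v.model')
reloaded.wv.most_similar(positive=['love'])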