from __future__ import division # python 2, so old school
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
import codecs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA as ICA
from sklearn.decomposition import PCA
pd.set_option('display.max_rows', 10)
%matplotlib inline
# From OPUS: http://opus.lingfil.uu.se/TED2013.php
infile = open('./ted.xml', 'r')
raw = infile.read()
infile.close()
# Strip any non-ASCII bytes, then re-encode as UTF-8
raw = raw.decode('ascii', 'ignore')
raw = raw.encode('utf8', 'ignore')
soup = BeautifulSoup(raw)
text = soup.get_text()
# Splitting on the URL prefix yields one leading chunk plus one chunk per talk
number_of_talks = len(text.split('http://www.ted.com/talks/')); number_of_talks
1170
url_split = text.split('http://www.ted.com/talks/')
url_split=url_split[1:]
url_split[0].split('\n')[:10]
[u'stephen_palumbi_following_the_mercury_trail.html', u"There's a tight and surprising link between the ocean'shealth and ours, says marine biologist Stephen Palumbi. He showshow toxins at the bottom of the ocean food chain find their wayinto our bodies, with a shocking story of toxic contamination froma Japanese fish market. His work points a way forward for savingthe oceans' health -- and humanity's.", u'fish,health,mission blue,oceans,science', u'899', u'Stephen Palumbi: Following the mercury trail', u'', u'It can be a very complicated thing, the ocean.', u'And it can be a very complicated thing, what human healthis.', u'And bringing those two together might seem a very dauntingtask,', u"but what I'm going to try to say is that even in thatcomplexity, there's some simple themes that I think, if weunderstand, we can really move forward."]
# Word list from : http://www.keithv.com/software/wlist/
wordfile = open('./wlist_match7.txt', 'r')
wordlist = wordfile.readlines()
wordfile.close()
dictionary = {word.strip(): '' for word in wordlist}
# Stop list file from http://www.ranks.nl/stopwords + nltk
stopword_file = open('./stopword.txt', 'r')
stopwords_raw = stopword_file.read()
stopword_file.close()
stopwords_list = [w for w in stopwords_raw.split()]
stopwords_list = stopwords_list + nltk.corpus.stopwords.words('english')
stopwords = list(set(stopwords_list))
stopwords.append('ha')  # lemmatizer artifact of 'has'
stopwords.append('wa')  # lemmatizer artifact of 'was'
stopwords[-10:]
['serious', 'e', 'together', 'hello', "we're", "ain't", 'having', 'once', 'ha', 'wa']
def getRealWords(word, dictionary):
    # If word isn't in the dictionary, try splitting it into two words that are
    # (the OPUS extraction glued many words together, e.g. 'oceanshealth')
    if word in dictionary:
        return str(word)
    else:
        wordlength = len(word)
        for i in range(wordlength):
            part = word[:i]
            if part in dictionary:
                if word[i:] in dictionary:
                    return str(part) + ' ' + str(word[i:])
        return str(word)
def processText(text, dictionary):
    # Drop stopwords and de-concatenate whatever remains
    string = u''
    words = text.split()
    for word in words:
        if word not in stopwords:
            string += ' ' + getRealWords(word, dictionary)
    return string
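A quick spot-check of these helpers (hypothetical inputs; the exact output depends on which words appear in wlist_match7.txt and the stopword list):
# Hypothetical spot-check: assumes 'oceans', 'health', and 'matters' are all
# entries in wlist_match7.txt, and that 'the' is in the stopword list
print getRealWords('oceanshealth', dictionary)
# -> 'oceans health'
print processText('the oceanshealth matters', dictionary)
# -> ' oceans health matters' (note the leading space)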
lemmatizer = nltk.stem.WordNetLemmatizer()
def scrub(text):
    # The first lines of each chunk are metadata (url, description, topics,
    # an id, title); the transcript proper starts at line 5
    lines = text.splitlines()
    url = lines[0]
    topics = lines[2]
    author = lines[4]
    tokens = [t for t in nltk.tokenize.word_tokenize(' '.join(lines[5:]))]
    clean_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens
                    if re.search(ur'^[a-zA-Z]+', token)]
    clean = processText(' '.join(clean_tokens), dictionary).split()
    clean = [w for w in clean if w not in stopwords if w in dictionary]
    return author, topics, url, clean
a = scrub(url_split[0]); a[3][:10]
[u'complicated', u'thing', u'ocean', u'complicated', u'thing', u'human', u'heal', u'bringing', u'daunting', u'task']
scrubbed = []
total = len(url_split)
for talk in url_split:
    scrubbed.append(scrub(talk))
df = pd.DataFrame(scrubbed, columns=['author', 'topics', 'url', 'text'])
df.head()
 | author | topics | url | text
---|---|---|---|---
0 | Stephen Palumbi: Following the mercury trail | fish,health,mission blue,oceans,science | stephen_palumbi_following_the_mercury_trail.html | [complicated, thing, ocean, complicated, thing... |
1 | Jessa Gamble: Our natural sleep cycle | evolution,humanity,personal growth,science,self | jessa_gamble_how_to_sleep.html | [start, day, night, life, evolved, condition, ... |
2 | Handspring Puppet Co.: The genius puppetry beh... | animals,arts,design,entertainment,theater | handpring_puppet_co_the_genius_puppetry_behind... | [adrian, kohler, today, talk, evolution, puppe... |
3 | Katherine Fulton: You are the future ofphilant... | activism,bottom-up,community,globalissues,phil... | katherine_fulton_you_are_the_future_of_philant... | [philanthropy, wha, tit, relationship, offer, ... |
4 | Chris Gerdes: The future race car -- 150mph, a... | cars,future,technology | chris_gerdes_the_future_race_car_150mph_and_no... | [wheel, car, driving, road, long, day, wanted,... |
df['text'] = df['text'].map(lambda x: ' '.join(x))
df.head()
 | author | topics | url | text
---|---|---|---|---
0 | Stephen Palumbi: Following the mercury trail | fish,health,mission blue,oceans,science | stephen_palumbi_following_the_mercury_trail.html | complicated thing ocean complicated thing huma... |
1 | Jessa Gamble: Our natural sleep cycle | evolution,humanity,personal growth,science,self | jessa_gamble_how_to_sleep.html | start day night life evolved condition light d... |
2 | Handspring Puppet Co.: The genius puppetry beh... | animals,arts,design,entertainment,theater | handpring_puppet_co_the_genius_puppetry_behind... | adrian kohler today talk evolution puppet hors... |
3 | Katherine Fulton: You are the future ofphilant... | activism,bottom-up,community,globalissues,phil... | katherine_fulton_you_are_the_future_of_philant... | philanthropy wha tit relationship offer vision... |
4 | Chris Gerdes: The future race car -- 150mph, a... | cars,future,technology | chris_gerdes_the_future_race_car_150mph_and_no... | wheel car driving road long day wanted tired f... |
import cPickle
cPickle.dump(df, open('df.pkl', 'wb'))  # never have to do this again!
! ls
10kdocterm.pkl UN.en-es.en stopword.txt un.ipynb LICENSE data-projector ted.ipynb wlist_match7.txt README.md data.json ted.xml TED2013 df.pkl ted_old.ipynb
# What does the text actually look like now?
df.text
0 complicated thing ocean complicated thing huma... 1 start day night life evolved condition light d... 2 adrian kohler today talk evolution puppet hors... ... 1166 explain thing assume explain achieve thing def... 1167 park big parking lot remember parked car probl... 1168 gon talk bit security security start kind sens... Name: text, Length: 1169, dtype: object
topic_words = []
for topics in df.topics:
    for topic in topics.split(','):
        topic_words.append(topic)
clean_topics = processText(' '.join(topic_words), dictionary).split()
tpx = pd.DataFrame(clean_topics, columns=['topics'])
tpx.topics.value_counts()[:10]
technology 410 science 321 culture 317 design 285 global 278 issues 278 entertainment 207 business 199 arts 138 health 110 dtype: int64
tpx.topics.value_counts().plot(rot=90, figsize=(12,8), fontsize=20)
[bar plot: frequency of every topic tag]
tpx.topics.value_counts()[:35].plot(rot=90, xticks=range(35), figsize=(12,8), fontsize=20)
[bar plot: top 35 topic tags]
tpx.topics.value_counts()[:20].plot(rot=90, xticks=range(20), figsize=(12,8), fontsize=20)
[bar plot: top 20 topic tags]
tpx.topics.value_counts()[:10].plot(rot=90, xticks=range(10), figsize=(12,8), fontsize=20)
[bar plot: top 10 topic tags]
How are we going to represent the words numerically so that we can cluster them?
tf–idf is one way to do it!
http://en.wikipedia.org/wiki/Tf%E2%80%93idf
tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval and text mining. The tf-idf value increases proportionally to the number of times a word appears in the document, but is offset by the frequency of the word in the corpus, which helps to adjust for the fact that some words appear more frequently in general. Variations of the tf–idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document's relevance given a user query. tf–idf can be successfully used for stop-words filtering in various subject fields including text summarization and classification.
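As a toy illustration (hypothetical three-document corpus; sklearn's variant adds its own smoothing and L2 normalization, so the numbers differ slightly from the textbook formula):
toy_corpus = ['the ocean is deep',
              'the ocean is blue',
              'robots are not blue']
toy_vec = TfidfVectorizer()
toy_tfidf = toy_vec.fit_transform(toy_corpus)
# 'ocean' appears in two of three documents and 'robots' in only one,
# so 'robots' gets the higher idf weight
print zip(toy_vec.get_feature_names(), toy_vec.idf_)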
# Vectorize: we've already applied a ton of stopword lists above, but why not do it again.
# smooth_idf adds one to document frequencies, avoiding division by zero
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
tfidf = vectorizer.fit_transform(df.text)
/Library/Python/2.7/site-packages/numpy-1.9.1-py2.7-macosx-10.9-intel.egg/numpy/core/fromnumeric.py:2499: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`. VisibleDeprecationWarning)
# Cosine Similarity
similarity_matrix = tfidf.dot(tfidf.T)
similarity_matrix = Normalizer(copy=False).fit_transform(similarity_matrix)
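Since TfidfVectorizer L2-normalizes each row by default, the dot product above is already the pairwise cosine similarity. A minimal sketch to convince yourself, using sklearn's helper:
from sklearn.metrics.pairwise import cosine_similarity
# Rows of tfidf are unit length, so tfidf * tfidf.T == cosine similarity
print np.allclose(cosine_similarity(tfidf), tfidf.dot(tfidf.T).toarray())
# -> True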
# Estimating K - http://www.slideshare.net/SarahGuido/kmeans-clustering-with-scikitlearn
k_range = range(5, 50, 5)
k_euclid = [KMeans(n_clusters=k).fit(similarity_matrix) for k in k_range]
k_centroids = [X.cluster_centers_ for X in k_euclid]
k_cosine = [cdist(similarity_matrix.toarray(), cent, 'cosine') for cent in k_centroids]
distances = [np.min(ke, axis=1) for ke in k_cosine]
# Within-cluster sum of squares
wcss = [sum(d**2) for d in distances]
# Total sum of squares
tss = sum(pdist(similarity_matrix.toarray()**2/similarity_matrix.toarray().shape[0]))
# Between cluster sum of squares
bss = tss - wcss
plt.plot(k_range, bss/tss*100)
plt.xlabel("Number of Clusters")
plt.ylabel("% Variance Explained")
[plot: % variance explained vs. number of clusters]
# So, we can see that even with just 5 clusters we have over 73% variance explained.
from sklearn.metrics import silhouette_score, silhouette_samples
silhouette_scores = [silhouette_score(tfidf, k.labels_) for k in k_euclid]
/Library/Python/2.7/site-packages/numpy-1.9.1-py2.7-macosx-10.9-intel.egg/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice. warnings.warn("Mean of empty slice.", RuntimeWarning)
plt.plot(k_range, silhouette_scores)
[plot: silhouette score vs. number of clusters]
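(For each talk, the silhouette is (b - a) / max(a, b), where a is its mean distance to the rest of its own cluster and b is its mean distance to the nearest other cluster; values near 1 mean tight, well-separated clusters, values near 0 mean overlapping ones.)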
tfmat = pd.DataFrame(tfidf.todense(), index=df.author, columns=vectorizer.get_feature_names())
# Reduce the data to the top 10,000 most important words
some = tfmat.sum(axis=0)
sorter = some.argsort()
srtd = pd.DataFrame(sorter)
sorted_index = srtd.sort(columns=0).index
# NB: this slices the first 10,000 *rows*, not columns -- with only 1169 talks
# it's a no-op, and the output below still shows all 35,071 words. Slicing
# columns instead (something like tfmat[sorted_index[-10000:]]) would do what
# the comment above promises.
reduced = tfmat[sorted_index][:10000]
#cPickle.dump(reduced, open('10kdocterm.pkl', 'w'))
! ls
10kdocterm.pkl UN.en-es.en stopword.txt un.ipynb LICENSE data-projector ted.ipynb wlist_match7.txt README.md data.json ted.xml TED2013 df.pkl ted_old.ipynb
reduced.head(1)
author | sel | breather | fraser | fractured | sei | aceh | crawler | accepted | pla | crept | ... | ipl | disintegrated | assent | app | nietzsche | bleep | charging | alluded | nerd | intercollegiate
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
Stephen Palumbi: Following the mercury trail | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
1 rows × 35071 columns
similarity_matrix = reduced.dot(reduced.T)
similarity_matrix.describe()
 | Stephen Palumbi: Following the mercury trail | Jessa Gamble: Our natural sleep cycle | Handspring Puppet Co.: The genius puppetry behind WarHorse | Katherine Fulton: You are the future ofphilanthropy | Chris Gerdes: The future race car -- 150mph, and nodriver | Stefana Broadbent: How the Internet enablesintimacy | Majora Carter: 3 stories of localeco-entrepreneurship | Britta Riley: A garden in my apartment | Nicholas Negroponte on One Laptop per Child, two yearson | Rodney Brooks says robots will invade our lives | ... | Craig Venter on DNA and the sea | Paul Romer's radical idea: Charter cities | Philip Zimbardo prescribes a healthy take on time | Carolyn Porco flies us to Saturn | Kirk Citron: And now, the real news | Lalitesh Katragadda: Making maps to fight disaster, buildeconomies | Julia Bacha: Pay attention to nonviolence | Simon Sinek: How great leaders inspire action | Neil Burgess: How your brain tells you where youare | James Stavridis: How NATO's Supreme Commander thinks aboutglobal security
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | ... | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 |
mean | 0.041158 | 0.039579 | 0.024480 | 0.040698 | 0.037514 | 0.059773 | 0.076774 | 0.053765 | 0.074385 | 0.034230 | ... | 0.050592 | 0.057745 | 0.037777 | 0.032613 | 0.044480 | 0.029542 | 0.029258 | 0.064016 | 0.036052 | 0.048827 |
std | 0.037136 | 0.033824 | 0.030999 | 0.034068 | 0.039601 | 0.038727 | 0.044644 | 0.038444 | 0.043496 | 0.056849 | ... | 0.044985 | 0.043135 | 0.033596 | 0.037049 | 0.037689 | 0.035060 | 0.032516 | 0.039824 | 0.043407 | 0.038045 |
min | 0.000880 | 0.000000 | 0.000967 | 0.000000 | 0.001010 | 0.000441 | 0.000653 | 0.000000 | 0.001494 | 0.000711 | ... | 0.000309 | 0.001801 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.004357 | 0.000000 | 0.000000 |
25% | 0.026108 | 0.025549 | 0.015786 | 0.026687 | 0.022243 | 0.038263 | 0.050517 | 0.034564 | 0.049623 | 0.019186 | ... | 0.030418 | 0.035169 | 0.024366 | 0.018994 | 0.026349 | 0.015775 | 0.017751 | 0.041783 | 0.018024 | 0.031943 |
50% | 0.036715 | 0.036446 | 0.021726 | 0.037505 | 0.031550 | 0.056583 | 0.071622 | 0.049844 | 0.070002 | 0.026857 | ... | 0.042873 | 0.049978 | 0.034817 | 0.026776 | 0.041076 | 0.024674 | 0.025713 | 0.060156 | 0.026972 | 0.044455 |
75% | 0.048296 | 0.048669 | 0.029211 | 0.050845 | 0.043238 | 0.076600 | 0.096492 | 0.066155 | 0.092490 | 0.034833 | ... | 0.058156 | 0.072267 | 0.047740 | 0.037061 | 0.056010 | 0.037590 | 0.036140 | 0.081782 | 0.041046 | 0.059218 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 1169 columns
n = Normalizer(copy=False)
normal = n.fit_transform(similarity_matrix)
normalized = pd.DataFrame(normal)
normalized.describe()
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 1159 | 1160 | 1161 | 1162 | 1163 | 1164 | 1165 | 1166 | 1167 | 1168
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | ... | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 | 1169.000000 |
mean | 0.019521 | 0.018878 | 0.011925 | 0.019041 | 0.017776 | 0.027939 | 0.035744 | 0.025148 | 0.034669 | 0.016225 | ... | 0.023905 | 0.026738 | 0.017889 | 0.015709 | 0.020838 | 0.013901 | 0.013969 | 0.029911 | 0.017143 | 0.022997 |
std | 0.018245 | 0.017694 | 0.021978 | 0.016936 | 0.019386 | 0.014810 | 0.014737 | 0.015766 | 0.014404 | 0.026438 | ... | 0.019598 | 0.016756 | 0.018183 | 0.020270 | 0.017689 | 0.020287 | 0.020226 | 0.014522 | 0.021096 | 0.016771 |
min | 0.000853 | 0.000000 | 0.000800 | 0.000000 | 0.000979 | 0.000407 | 0.000633 | 0.000000 | 0.001359 | 0.000690 | ... | 0.000300 | 0.001667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.003677 | 0.000000 | 0.000000 |
25% | 0.014099 | 0.013528 | 0.008192 | 0.014548 | 0.011560 | 0.021010 | 0.027281 | 0.018725 | 0.027139 | 0.010477 | ... | 0.016031 | 0.019049 | 0.013040 | 0.009763 | 0.014633 | 0.008395 | 0.009466 | 0.022712 | 0.009182 | 0.016917 |
50% | 0.017207 | 0.017242 | 0.010218 | 0.017983 | 0.014834 | 0.027451 | 0.034363 | 0.023530 | 0.034019 | 0.012788 | ... | 0.019946 | 0.024279 | 0.017016 | 0.012770 | 0.018916 | 0.011841 | 0.012077 | 0.029065 | 0.012784 | 0.020862 |
75% | 0.020987 | 0.022067 | 0.012941 | 0.022207 | 0.019354 | 0.033798 | 0.042632 | 0.029272 | 0.040632 | 0.015307 | ... | 0.026324 | 0.031517 | 0.020693 | 0.016981 | 0.024320 | 0.016390 | 0.016003 | 0.035802 | 0.018514 | 0.025942 |
max | 0.527706 | 0.561876 | 0.740655 | 0.551155 | 0.536301 | 0.410707 | 0.329363 | 0.442571 | 0.339461 | 0.440889 | ... | 0.432106 | 0.405845 | 0.578646 | 0.592703 | 0.501759 | 0.638106 | 0.668809 | 0.387987 | 0.518463 | 0.472586 |
8 rows × 1169 columns
similarity_matrix = normalized
# Estimating K
k_range = range(5, 100, 5)
k_variance = [KMeans(n_clusters=k).fit(similarity_matrix) for k in k_range]
k_centroids = [X.cluster_centers_ for X in k_variance]
k_cosine = [cdist(similarity_matrix, cent, 'cosine') for cent in k_centroids]
distances = [np.min(ke, axis=1) for ke in k_cosine]
# Within-cluster sum of squares
wcss = [sum(d**2) for d in distances]
# Total sum of squares
tss = sum(pdist(similarity_matrix**2/similarity_matrix.shape[0]))
# Between cluster sum of squares
bss = tss - wcss
plt.plot(k_range, bss/tss*100)
plt.xlabel("Number of Clusters")
plt.ylabel("% Variance Explained")
[plot: % variance explained vs. number of clusters]
So when k is 10, we get around 75%! We only gain another 10% or so by pushing k toward 100. And if we look at the topic distributions we plotted earlier, there isn't much reason to go beyond 10-20 topics.
Now that we've clustered the documents in this huge vector space, we can look through the clusters and see whether their members correspond to concepts we find useful or interesting. I'm going to do this with k=10.
ten = KMeans(n_clusters=10).fit(similarity_matrix)
tendf = pd.DataFrame(columns=['cluster_id', 'author','topics', 'text'])
tendf.topics = df.topics
tendf.author = df.author
tendf.cluster_id = ten.labels_
tendf.text = df.text
tendf.head()
 | cluster_id | author | topics | text
---|---|---|---|---
0 | 8 | Stephen Palumbi: Following the mercury trail | fish,health,mission blue,oceans,science | complicated thing ocean complicated thing huma... |
1 | 4 | Jessa Gamble: Our natural sleep cycle | evolution,humanity,personal growth,science,self | start day night life evolved condition light d... |
2 | 7 | Handspring Puppet Co.: The genius puppetry beh... | animals,arts,design,entertainment,theater | adrian kohler today talk evolution puppet hors... |
3 | 1 | Katherine Fulton: You are the future ofphilant... | activism,bottom-up,community,globalissues,phil... | philanthropy wha tit relationship offer vision... |
4 | 5 | Chris Gerdes: The future race car -- 150mph, a... | cars,future,technology | wheel car driving road long day wanted tired f... |
tendf.cluster_id.hist()
[plot: histogram of cluster sizes]
tendf['length'] = [len(t) for t in tendf.text]
tendf.text[0].split()[:5]
[u'complicated', u'thing', u'ocean', u'complicated', u'thing']
cluster_topics = []
cluster_text = []
for cluster_id in tendf.cluster_id.value_counts().index:
    cluster_df = tendf[tendf.cluster_id==cluster_id]
    topic_words = []
    for topics in cluster_df.topics:
        for topic in topics.split(','):
            topic_words.append(topic)
    clean_topics = processText(' '.join(topic_words), dictionary).split()
    clean_df = pd.DataFrame(clean_topics, columns=['topics'])
    cluster_text.append(pd.DataFrame(' '.join([text for text in cluster_df.text]).split(), columns=['text']))
    cluster_topics.append(clean_df)
So this is really just one way of looking at our clusters. Our clusters are made of text, so let's look at the most frequently occurring terms, per cluster.
technology 160
design 145
culture 116
science 96
business 73
entertainment 66
arts 52
art 49
creativity 42
education 40
Well, that seems pretty general. We can't take too much from that, except that it talks a lot about technology, design, and culture... This is the most general category; it covers the T, E, and D of TED. Technology is number one, with design a close second; from here on we start to see our other topics show up.
culture 119
entertainment 78
issues 71
global 71
arts 49
storytelling 46
technology 45
design 40
education 38
business 35
Much more focused here on culture, entertainment, and global issues.
technology 81
science 53
design 51
business 42
global 42
issues 42
environment 36
green 35
energy 30
invention 25
Hmm, once again technology, design, and business, but drifting toward green energy at the end there.
issues 106
global 106
politics 49
culture 44
business 34
economics 31
health 29
Africa 28
technology 28
war 22
Here we actually see a departure from the earlier topics; we're talking more about global issues, business, war, and politics.
music 38
entertainment 37
performance 19
talk 16
arts 16
short 16
technology 16
design 13
live 12
culture 11
This is obviously more about art! Music, performance, live performances.
science 53
technology 37
medicine 25
health 22
brain 19
biology 15
cancer 10
care 10
research 9
medical 9
science 33
oceans 28
technology 15
issues 12
mission 12
global 12
fish 12
blue 12
environment 10
exploration 10
science 27
physics 17
universe 17
technology 16
astronomy 12
cosmos 6
space 6
exploration 6
education 4
change 4
robots 12
technology 12
design 8
science 5
entertainment 3
engineering 3
evolution 3
animals 2
demo 2
AI 2
animals 3
issues 2
oceans 2
global 2
science 2
biodiversity 1
storytelling 1
culture 1
photography 1
creativity 1
for i in cluster_topics:
    print i.topics.value_counts()[:10]
    print '$' * 70
design 154 technology 149 culture 107 entertainment 72 science 72 arts 67 business 60 art 59 education 50 creativity 46 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ global 143 issues 143 culture 93 business 78 politics 67 technology 58 economics 52 health 49 science 39 Africa 39 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ music 70 entertainment 69 culture 33 technology 33 design 30 arts 29 performance 27 talk 22 short 22 live 18 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ culture 53 issues 39 global 39 women 30 storytelling 27 entertainment 26 arts 20 education 19 politics 19 children 16 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ science 78 technology 57 medicine 33 health 30 brain 28 biology 25 design 14 care 13 cancer 10 business 10 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ science 45 oceans 29 technology 27 issues 21 global 21 exploration 17 animals 15 mission 13 design 13 blue 13 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ food 21 science 20 design 19 issues 16 environment 16 global 16 technology 15 biology 13 green 12 business 10 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ technology 37 energy 27 design 18 green 17 business 17 environment 14 science 12 transportation 11 culture 9 sustainability 9 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ science 27 physics 16 universe 15 technology 14 astronomy 13 cosmos 6 space 6 change 4 exploration 4 education 4 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ robots 12 technology 12 design 8 science 5 entertainment 3 engineering 3 evolution 3 animals 2 demo 2 AI 2 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
for i in cluster_text:
    print i.text.value_counts()[:10]
    print '-'*70
thing 3388 people 2888 time 2106 kind 1684 year 1571 world 1538 work 1523 lot 1136 life 1091 idea 1047 dtype: int64 ---------------------------------------------------------------------- people 3856 world 2048 thing 1683 year 1594 time 1378 country 1027 life 879 lot 798 good 795 problem 772 dtype: int64 ---------------------------------------------------------------------- thing 390 people 366 time 311 music 310 world 264 year 240 good 233 life 218 sound 206 kind 189 dtype: int64 ---------------------------------------------------------------------- woman 927 people 632 year 492 time 472 story 463 child 450 thing 428 girl 425 world 413 life 384 dtype: int64 ---------------------------------------------------------------------- brain 975 cell 774 people 563 thing 544 cancer 520 time 476 year 465 patient 357 life 332 body 329 dtype: int64 ---------------------------------------------------------------------- year 595 water 490 ocean 471 thing 449 time 433 life 353 people 334 world 302 planet 271 earth 259 dtype: int64 ---------------------------------------------------------------------- food 450 people 307 year 276 thing 269 plant 210 world 205 time 195 lot 166 kind 159 tree 152 dtype: int64 ---------------------------------------------------------------------- energy 426 car 424 people 410 thing 406 year 385 time 282 world 262 technology 241 oil 225 city 218 dtype: int64 ---------------------------------------------------------------------- universe 462 galaxy 222 thing 205 star 199 year 199 space 198 planet 167 time 165 earth 153 life 140 dtype: int64 ---------------------------------------------------------------------- robot 344 thing 60 foot 59 animal 58 time 54 leg 49 doe 45 people 42 work 41 kind 37 dtype: int64 ----------------------------------------------------------------------
# Agglomerative Clustering?
from sklearn.cluster import AgglomerativeClustering
ten = AgglomerativeClustering(n_clusters=10).fit(similarity_matrix)
tendf = pd.DataFrame(columns=['cluster_id', 'author','topics'])
tendf.topics = df.topics
tendf.author = df.author
tendf.cluster_id = ten.labels_
tendf['text'] = df.text
tendf.head()
 | cluster_id | author | topics | text
---|---|---|---|---
0 | 1 | Stephen Palumbi: Following the mercury trail | fish,health,mission blue,oceans,science | complicated thing ocean complicated thing huma... |
1 | 1 | Jessa Gamble: Our natural sleep cycle | evolution,humanity,personal growth,science,self | start day night life evolved condition light d... |
2 | 0 | Handspring Puppet Co.: The genius puppetry beh... | animals,arts,design,entertainment,theater | adrian kohler today talk evolution puppet hors... |
3 | 5 | Katherine Fulton: You are the future ofphilant... | activism,bottom-up,community,globalissues,phil... | philanthropy wha tit relationship offer vision... |
4 | 1 | Chris Gerdes: The future race car -- 150mph, a... | cars,future,technology | wheel car driving road long day wanted tired f... |
tendf.cluster_id.hist()
tendf['length'] = [len(t) for t in tendf.text]
tendf.text[0].split()[:5]
[u'complicated', u'thing', u'ocean', u'complicated', u'thing']
tendf.head(1)
 | cluster_id | author | topics | text | length
---|---|---|---|---|---
0 | 1 | Stephen Palumbi: Following the mercury trail | fish,health,mission blue,oceans,science | complicated thing ocean complicated thing huma... | 6680 |
cluster_topics = []
cluster_text = []
for cluster_id in tendf.cluster_id.value_counts().index:
    cluster_df = tendf[tendf.cluster_id==cluster_id]
    topic_words = []
    for topics in cluster_df.topics:
        for topic in topics.split(','):
            topic_words.append(topic)
    clean_topics = processText(' '.join(topic_words), dictionary).split()
    clean_df = pd.DataFrame(clean_topics, columns=['topics'])
    cluster_text.append(pd.DataFrame(' '.join([text for text in cluster_df.text]).split(), columns=['text']))
    cluster_topics.append(clean_df)
for i in cluster_topics:
    print i.topics.value_counts()[:10]
    print '$' * 70
design 140 culture 139 technology 114 entertainment 72 science 72 arts 67 art 61 business 61 education 55 global 49 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ global 105 issues 105 technology 83 business 76 culture 66 economics 43 politics 43 science 38 design 37 Africa 37 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ technology 78 science 77 design 48 issues 36 global 36 environment 32 oceans 30 energy 27 biology 24 business 23 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ global 67 issues 67 culture 61 war 37 politics 34 entertainment 33 women 27 storytelling 27 arts 25 technology 16 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ music 68 entertainment 61 technology 25 arts 25 performance 22 design 22 culture 21 live 20 short 19 talk 19 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ science 50 technology 39 health 39 medicine 27 biology 17 care 16 business 14 design 13 culture 13 issues 12 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ science 28 physics 17 technology 16 universe 16 astronomy 13 cosmos 6 design 5 space 5 education 4 exploration 4 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ science 25 brain 21 technology 15 neurology 7 neuroscience 6 computers 5 mind 5 biology 5 design 4 consciousness 4 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ technology 12 science 11 cancer 9 medicine 9 health 8 biology 5 medical 5 care 3 business 3 research 2 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ robots 12 technology 12 design 8 science 5 entertainment 3 engineering 3 evolution 3 animals 2 demo 2 AI 2 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
for i in cluster_text:
    print i.text.value_counts()[:10]
    print '-'*70
people 3078 thing 3043 time 2123 year 1666 world 1619 kind 1556 work 1329 life 1194 lot 1058 good 987 dtype: int64 ---------------------------------------------------------------------- people 3291 thing 1825 world 1716 year 1386 time 1242 country 835 work 802 lot 796 kind 719 problem 706 dtype: int64 ---------------------------------------------------------------------- year 1147 thing 1044 people 957 time 871 water 671 world 670 lot 568 life 567 food 525 ocean 461 dtype: int64 ---------------------------------------------------------------------- people 1005 woman 835 world 696 year 577 story 576 thing 573 time 573 life 432 child 405 girl 394 dtype: int64 ---------------------------------------------------------------------- music 333 play 312 thing 270 people 245 sound 241 time 228 year 173 good 167 world 164 yeah 152 dtype: int64 ---------------------------------------------------------------------- thing 539 people 525 year 490 life 415 time 370 technology 333 cell 328 world 297 work 278 patient 249 dtype: int64 ---------------------------------------------------------------------- universe 471 thing 245 galaxy 222 year 207 star 205 space 201 time 189 planet 168 earth 150 life 138 dtype: int64 ---------------------------------------------------------------------- brain 719 people 173 thing 158 cell 154 time 146 neuron 139 human 97 kind 89 pattern 89 called 84 dtype: int64 ---------------------------------------------------------------------- cancer 430 cell 320 disease 131 tumor 118 body 118 patient 111 stem 98 drug 97 woman 93 year 90 dtype: int64 ---------------------------------------------------------------------- robot 344 thing 60 foot 59 animal 58 time 54 leg 49 doe 45 people 42 work 41 kind 37 dtype: int64 ----------------------------------------------------------------------
import gensim
all_text = [doc.split() for doc in df.text]
gensim_d = gensim.corpora.Dictionary(all_text)
corpus = [gensim_d.doc2bow(text) for text in all_text]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=gensim_d, num_topics=10, update_every=1, chunksize=100, passes=1)
lda_topics = lda.print_topics(10)
lda_tops = [topic.split('+') for topic in lda_topics]
for topic in lda_tops:
    for pair in topic:
        print pair.split('*')[0] + '\t' + pair.split('*')[1]
    print '%' * 70
0.015 cell 0.011 patient 0.011 food 0.008 disease 0.008 cancer 0.007 body 0.007 brain 0.006 heart 0.006 people 0.006 year %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.014 space 0.009 universe 0.008 particle 0.007 thing 0.007 earth 0.007 light 0.006 planet 0.006 tree 0.006 theory 0.006 time %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.017 brain 0.014 human 0.009 thing 0.008 people 0.006 life 0.006 time 0.006 year 0.006 gene 0.004 evolution 0.004 genome %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.014 people 0.013 world 0.013 country 0.009 africa 0.009 year 0.007 woman 0.006 government 0.005 war 0.005 aid 0.005 india %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.026 people 0.012 world 0.012 thing 0.006 time 0.005 kind 0.005 idea 0.005 good 0.005 year 0.005 work 0.004 lot %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.010 life 0.008 music 0.008 compassion 0.007 people 0.006 time 0.006 sound 0.005 thing 0.005 world 0.005 god 0.004 year %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.013 year 0.009 technology 0.008 thing 0.008 people 0.007 energy 0.007 time 0.006 water 0.006 percent 0.005 world 0.005 system %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.011 thing 0.010 kind 0.007 time 0.006 water 0.006 animal 0.006 data 0.005 ocean 0.005 lot 0.005 robot 0.005 design %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.013 thing 0.010 people 0.010 time 0.007 work 0.007 year 0.006 day 0.006 life 0.005 kid 0.005 story 0.005 school %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.009 people 0.007 language 0.007 baby 0.007 child 0.006 love 0.006 time 0.005 year 0.005 thing 0.004 learning 0.004 english %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
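To see where a single talk lands, you can query the model with its bag-of-words vector; a minimal sketch (the printed mixture is hypothetical, since LDA topics vary from run to run):
# Topic mixture for the first talk: (topic_id, probability) pairs
print lda[corpus[0]]
# e.g. [(0, 0.41), (7, 0.33), (4, 0.18)] -- actual values vary per run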
# Nice husl from seaborn
colors = sns.husl_palette(n_colors=10)
sns.palplot(colors)
colors[9]=[0,0,0]
colors[8]=[1,1,1]
colors.reverse()
sns.palplot(colors)
Google:
In machine learning and statistics, dimensionality reduction or dimension reduction is the process of reducing the number of random variables under consideration, and can be divided into feature selection and feature extraction.
However, it's quite hard to visualize thousands of dimensions. So below, I go about plotting our clusters in three dimensions; yes, we are losing tons of information! But this reduction lets us actually see the data. To get there, I compare the following dimensionality reduction algorithms.
Simply put, PCA is a way of finding the most important parts of a data set. More exactly, it's an orthogonal transformation of the observations into some number of linearly uncorrelated variables, in this case trying to summarize thousands of dimensions in three. The first principal component is the one that accounts for the highest variance in the data (it explains the most), and each subsequent component explains the most remaining variance while being orthogonal to (i.e. uncorrelated with) the previous components.
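A minimal sketch of that idea on random data (hypothetical shapes, just to show the explained-variance bookkeeping):
# Project 50-dimensional points onto their top 3 principal components
X = np.random.randn(200, 50)
pca = PCA(n_components=3)
X3 = pca.fit_transform(X)
# Fraction of total variance captured by each component, in decreasing order
print pca.explained_variance_ratio_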
Very similar to PCA. As a gross simplification: SVD is a way of factorizing a large matrix into three parts. Multiplied together, those parts recreate the matrix exactly, so by truncating them we get a smaller, approximate copy of the original.
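The factorization itself, in numpy terms (a sketch; TruncatedSVD does the equivalent on sparse input without materializing the full factors):
# X ~ U * diag(s) * Vt; keeping the k largest singular values gives the
# best rank-k approximation of X
X = np.random.randn(200, 50)
U, s, Vt = np.linalg.svd(X, full_matrices=False)
k = 3
X_approx = U[:, :k].dot(np.diag(s[:k])).dot(Vt[:k, :])
print X_approx.shape  # same shape as X, but only rank 3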
This is a fascinating algorithm. It has two main parts. First, it builds a probability distribution that represents pairwise similarity between points in the high-dimensional space. Then it defines a similar distribution over the low-dimensional space and minimizes the Kullback-Leibler divergence between the two.
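Minimal usage looks like any other reducer (a sketch on random data; t-SNE is much slower than PCA or SVD, which is one reason I feed it an already-reduced matrix below):
# Embed 50-dimensional points into 3 dimensions by matching the high- and
# low-dimensional similarity distributions
X = np.random.randn(200, 50)
X3 = TSNE(n_components=3).fit_transform(X)
print X3.shape  # (200, 3)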
Wikipedia:
ICA finds the independent components (also called factors, latent variables or sources) by maximizing the statistical independence of the estimated components.
Basically:
Typical algorithms for ICA use centering (subtract the mean to create a zero mean signal), whitening (usually with the eigenvalue decomposition), and dimensionality reduction as preprocessing steps in order to simplify and reduce the complexity of the problem for the actual iterative algorithm. Whitening and dimension reduction can be achieved with principal component analysis or singular value decomposition.
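A sketch of those preprocessing steps done by hand (FastICA handles centering and whitening internally, so in practice you just call fit_transform):
X = np.random.randn(200, 50)
Xc = X - X.mean(axis=0)  # centering: zero-mean signal
Xw = PCA(n_components=10, whiten=True).fit_transform(Xc)  # whiten + reduce
S = ICA(n_components=3).fit_transform(Xw)  # estimated independent components
print S.shape  # (200, 3)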
def plot_reduction_kmeans(first_reduction, first_num, second_reduction, second_num, matrix=similarity_matrix):
    # Reduction #1: original dimensions -> first_num dimensions
    f = first_reduction(n_components=first_num)
    f_matrix = f.fit_transform(matrix)
    # Reduction #2: first_num dimensions -> second_num (3) dimensions
    s = second_reduction(n_components=second_num)
    s_matrix = s.fit_transform(f_matrix)
    kmeans = KMeans(init='k-means++', n_clusters=10, n_init=100)
    kmeans.fit(s_matrix)
    d = {i: colors[i] for i in range(10)}
    kcolors = [d[i] for i in kmeans.labels_]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(s_matrix[:,0], s_matrix[:,1], s_matrix[:,2], c=kcolors, alpha=.6)
plot_reduction_kmeans(TruncatedSVD, 100, TSNE, 3)
plot_reduction_kmeans(TruncatedSVD, 100, PCA, 3)
plot_reduction_kmeans(TruncatedSVD, 500, PCA, 3)
plot_reduction_kmeans(TruncatedSVD, 100, ICA, 3)
plot_reduction_kmeans(TruncatedSVD, 500, ICA, 3)
plot_reduction_kmeans(PCA, 100, TSNE, 3)
plot_reduction_kmeans(PCA, 500, TSNE, 3)
plot_reduction_kmeans(PCA, 100, TruncatedSVD, 3)
plot_reduction_kmeans(PCA, 500, TruncatedSVD, 3)
plot_reduction_kmeans(PCA, 100, ICA, 3)
plot_reduction_kmeans(PCA, 500, ICA, 3)
plot_reduction_kmeans(ICA, 100, TSNE, 3)
plot_reduction_kmeans(TruncatedSVD, 300, TSNE, 3)
def plot_reduction_agg(first_reduction, first_num, second_reduction, second_num, matrix=similarity_matrix, affinity='cosine', linkage='complete'):
    # Reduction #1: original dimensions -> first_num dimensions
    f = first_reduction(n_components=first_num)
    f_matrix = f.fit_transform(matrix)
    # Reduction #2: first_num dimensions -> second_num (3) dimensions
    s = second_reduction(n_components=second_num)
    s_matrix = s.fit_transform(f_matrix)
    agg = AgglomerativeClustering(n_clusters=10, affinity=affinity, linkage=linkage)
    agg.fit(s_matrix)
    d = {i: colors[i] for i in range(10)}
    acolors = [d[i] for i in agg.labels_]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(s_matrix[:,0], s_matrix[:,1], s_matrix[:,2], c=acolors, alpha=.6)
plot_reduction_agg(TruncatedSVD, 100, TSNE, 3)
plot_reduction_agg(TruncatedSVD, 100, PCA, 3)
plot_reduction_agg(TruncatedSVD, 100, ICA, 3)
plot_reduction_agg(PCA, 100, TruncatedSVD, 3)
plot_reduction_agg(PCA, 100, TSNE, 3)
plot_reduction_agg(PCA, 100, ICA, 3)
plot_reduction_agg(PCA, 100, PCA, 3)
plot_reduction_agg(ICA, 100, TruncatedSVD, 3)
plot_reduction_agg(ICA, 100, TSNE, 3)
plot_reduction_agg(ICA, 100, PCA, 3)
plot_reduction_agg(ICA, 100, ICA, 3)
## My favorite - for now
plot_reduction_kmeans(PCA, 500, TruncatedSVD, 3)
Interactive Visualization! https://github.com/datacratic/data-projector http://opensource.datacratic.com/data-projector/
# Reduction #1
f = PCA(n_components=500)
f_matrix = f.fit_transform(similarity_matrix)
# Reduction #2: 500 dimensions -> 3 dimensions
s = TruncatedSVD(n_components=3)
s_matrix = s.fit_transform(f_matrix)
kmeans = KMeans(init='k-means++', n_clusters=10, n_init=100)
kmeans.fit(s_matrix)
data_matrix = s_matrix.copy()
data_matrix = pd.DataFrame(data_matrix*200) # Gotta make everything a bit larger
data_matrix['cid'] = kmeans.labels_
data_matrix = data_matrix[[1,0,2,'cid']]
data_matrix.columns=['y','x','z','cid']
data_matrix.cid = data_matrix.cid.astype(int)
data_matrix = data_matrix.astype(str)
data_matrix
 | y | x | z | cid
---|---|---|---|---
0 | -13.0348188266 | -10.0552407715 | 34.6405097522 | 4 |
1 | -15.0734196879 | -16.0528409209 | 0.0902980978558 | 7 |
2 | 16.0918851228 | -29.1836998758 | -11.0857616069 | 3 |
3 | -0.935772800338 | 21.9657836093 | 5.74032885375 | 0 |
4 | -11.3979443438 | -18.4993158635 | -7.80118007466 | 7 |
... | ... | ... | ... | ... |
1164 | 3.52751145427 | -0.971211989561 | 22.1971858237 | 2 |
1165 | 33.7620250944 | 9.99174149862 | 13.9537345578 | 5 |
1166 | -11.1691445003 | 28.9441680889 | -11.2355379091 | 6 |
1167 | -22.3281534736 | -69.6905826011 | -0.572318236642 | 7 |
1168 | 14.2762038784 | 28.56306616 | 16.2803658902 | 8 |
1169 rows × 4 columns
data_matrix.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1169 entries, 0 to 1168 Data columns (total 4 columns): y 1169 non-null object x 1169 non-null object z 1169 non-null object cid 1169 non-null object dtypes: object(4) memory usage: 45.7+ KB
d = [{k:data_matrix.values[i][v] for v,k in enumerate(data_matrix.columns)} for i in range(len(data_matrix)) ]
datajson = {"points": d}
import json
json.dump(datajson, open('data.json', 'w'))
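The resulting file is just a flat list of point dicts; schematically (values taken from the first row of the table above, key order arbitrary):
# data.json:
# {"points": [{"y": "-13.0348188266", "x": "-10.0552407715",
#              "z": "34.6405097522", "cid": "4"}, ...]}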
! ls
10kdocterm.pkl UN.en-es.en stopword.txt un.ipynb LICENSE data-projector ted.ipynb wlist_match7.txt README.md data.json ted.xml TED2013 df.pkl ted_old.ipynb