from __future__ import division  # python 2, so old school
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
import codecs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA as ICA
from sklearn.decomposition import PCA

pd.set_option('display.max_rows', 10)
%matplotlib inline

# TED talk transcripts from OPUS: http://opus.lingfil.uu.se/TED2013.php
infile = open('./ted.xml', 'r')
raw = infile.read()
infile.close()

# Force everything down to plain ASCII before parsing
raw = raw.decode('ascii', 'ignore')
raw = raw.encode('utf8', 'ignore')

soup = BeautifulSoup(raw)
text = soup.get_text()

number_of_talks = len(text.split('http://www.ted.com/talks/')); number_of_talks

# Each talk starts with its URL, so splitting on the URL prefix gives one chunk per talk
url_split = text.split('http://www.ted.com/talks/')
url_split = url_split[1:]
url_split[0].split('\n')[:10]

# Word list from: http://www.keithv.com/software/wlist/
wordfile = open('./wlist_match7.txt', 'r')
wordlist = wordfile.readlines()
wordfile.close()
dictionary = {word.strip(): '' for word in wordlist}

# Stop list file from http://www.ranks.nl/stopwords + nltk
stopword_file = open('./stopword.txt', 'r')
stopwords_raw = stopword_file.read()
stopword_file.close()
stopwords_list = [w for w in stopwords_raw.split()]
stopwords_list = stopwords_list + nltk.corpus.stopwords.words('english')
stopwords = list(set(stopwords_list))
stopwords.append('ha')  # the lemmatizer turns 'has'/'was' into these
stopwords.append('wa')
stopwords[-10:]


# If a token isn't a dictionary word, try to split it into two words that are
def getRealWords(word, dictionary):
    if word in dictionary:
        return str(word)
    else:
        wordlength = len(word)
        for i in range(wordlength):
            part = word[:i]
            if part in dictionary:
                if word[i:] in dictionary:
                    return str(part) + ' ' + str(word[i:])
        return str(word)


# Drop stopwords and run every remaining token through getRealWords
def processText(text, dictionary):
    string = u''
    words = text.split()
    for word in words:
        if word in stopwords:
            pass
        else:
            string += ' ' + getRealWords(word, dictionary)
    return string


lemmatizer = nltk.stem.WordNetLemmatizer()


# Pull the url/topics/author header lines off a talk, then tokenize, lemmatize, and clean
def scrub(text):
    lines = text.splitlines()
    url = lines[0]
    topics = lines[2]
    author = lines[4]
    tokens = [t for t in nltk.tokenize.word_tokenize(' '.join(lines[5:]))]
    clean_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens
                    if re.search(ur'^[a-zA-Z]+', token)]
    clean = processText(' '.join(clean_tokens), dictionary).split()
    clean = [w for w in clean if w not in stopwords if w in dictionary]
    return author, topics, url, clean


a = scrub(url_split[0]); a[3][:10]

scrubbed = []
total = len(url_split)
for talk in url_split:
    scrubbed.append(scrub(talk))

df = pd.DataFrame(scrubbed, columns=['author', 'topics', 'url', 'text'])
df.head()

# Collapse each token list back into a single string per talk
df['text'] = df['text'].map(lambda x: ' '.join(x))
df.head()

import cPickle
cPickle.dump(df, open('df.pkl', 'w'))  # never have to do this again!
!ls
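# A quick illustration of what getRealWords does (a sketch, not part of the original
# pipeline): run-together tokens get split into two dictionary words when possible,
# and anything unrecognized passes through unchanged. The tiny toy_dictionary below
# is made up purely for this demo.
toy_dictionary = {'thank': '', 'you': '', 'climate': '', 'change': ''}
print getRealWords('thankyou', toy_dictionary)       # -> 'thank you'
print getRealWords('climatechange', toy_dictionary)  # -> 'climate change'
print getRealWords('xylophones', toy_dictionary)     # -> 'xylophones' (unchanged)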
# What does the text actually look like now?
df.text

# Pull every topic tag off every talk and look at the overall distribution
topic_words = []
for topics in df.topics:
    for topic in topics.split(','):
        topic_words.append(topic)
clean_topics = processText(' '.join(topic_words), dictionary).split()
tpx = pd.DataFrame(clean_topics, columns=['topics'])
tpx.topics.value_counts()[:10]

tpx.topics.value_counts().plot(rot=90, figsize=(12, 8), fontsize=20)
tpx.topics.value_counts()[:35].plot(rot=90, xticks=range(35), figsize=(12, 8), fontsize=20)
tpx.topics.value_counts()[:20].plot(rot=90, xticks=range(20), figsize=(12, 8), fontsize=20)
tpx.topics.value_counts()[:10].plot(rot=90, xticks=range(10), figsize=(12, 8), fontsize=20)

# Vectorize: we've already used a ton of stopword lists up above, but why not do it again.
# smooth_idf adds one to document frequencies to avoid division by zero.
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
tfidf = vectorizer.fit_transform(df.text)

# Cosine similarity
similarity_matrix = tfidf.dot(tfidf.T)
similarity_matrix = Normalizer(copy=False).fit_transform(similarity_matrix)

# Estimating K - http://www.slideshare.net/SarahGuido/kmeans-clustering-with-scikitlearn
k_range = range(5, 50, 5)
k_euclid = [KMeans(n_clusters=k).fit(similarity_matrix) for k in k_range]
k_centroids = [X.cluster_centers_ for X in k_euclid]
k_cosine = [cdist(similarity_matrix.toarray(), cent, 'cosine') for cent in k_centroids]
distances = [np.min(ke, axis=1) for ke in k_cosine]
# Within-cluster sum of squares
wcss = [sum(d**2) for d in distances]
# Total sum of squares
tss = sum(pdist(similarity_matrix.toarray()**2 / similarity_matrix.toarray().shape[0]))
# Between-cluster sum of squares
bss = tss - wcss

plt.plot(k_range, bss / tss * 100)
plt.xlabel("Number of Clusters")
plt.ylabel("% Variance Explained")

# So, we can see that even with just 5 clusters we have over 73% variance explained.

from sklearn.metrics import silhouette_score, silhouette_samples
silhouette_scores = [silhouette_score(tfidf, k.labels_) for k in k_euclid]
plt.plot(k_range, silhouette_scores)

tfmat = pd.DataFrame(tfidf.todense(), index=df.author,
                     columns=vectorizer.get_feature_names())

# Reduce the data to the top 10,000 most important words
some = tfmat.sum(axis=0)
sorter = some.argsort()
srtd = pd.DataFrame(sorter)
sorted_index = srtd.sort(columns=0).index
reduced = tfmat[sorted_index][:10000]
# cPickle.dump(reduced, open('10kdocterm.pkl', 'w'))
!ls
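# The cell above is meant to keep the 10,000 highest-weight words. A more explicit
# (hypothetical) variant would rank the columns of tfmat by their total tf-idf mass
# and select those columns directly; note that the integer slice above ([:10000])
# keeps the first 10,000 rows, i.e. talks. Nothing downstream uses reduced_alt --
# it is only here for comparison.
top_positions = np.argsort(tfmat.sum(axis=0).values)[::-1][:10000]
top_words = tfmat.columns[top_positions]
reduced_alt = tfmat[top_words]
reduced_alt.shape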
reduced.head(1)

# Document-document similarity on the reduced matrix
similarity_matrix = reduced.dot(reduced.T)
similarity_matrix.describe()

n = Normalizer(copy=False)
normal = n.fit_transform(similarity_matrix)
normalized = pd.DataFrame(normal)
normalized.describe()

similarity_matrix = normalized

# Estimating K
k_range = range(5, 100, 5)
k_variance = [KMeans(n_clusters=k).fit(similarity_matrix) for k in k_range]
k_centroids = [X.cluster_centers_ for X in k_variance]
k_cosine = [cdist(similarity_matrix, cent, 'cosine') for cent in k_centroids]
distances = [np.min(ke, axis=1) for ke in k_cosine]
# Within-cluster sum of squares
wcss = [sum(d**2) for d in distances]
# Total sum of squares
tss = sum(pdist(similarity_matrix**2 / similarity_matrix.shape[0]))
# Between-cluster sum of squares
bss = tss - wcss

plt.plot(k_range, bss / tss * 100)
plt.xlabel("Number of Clusters")
plt.ylabel("% Variance Explained")

# KMeans with 10 clusters
ten = KMeans(n_clusters=10).fit(similarity_matrix)
tendf = pd.DataFrame(columns=['cluster_id', 'author', 'topics', 'text'])
tendf.topics = df.topics
tendf.author = df.author
tendf.cluster_id = ten.labels_
tendf.text = df.text
tendf.head()

tendf.cluster_id.hist()

tendf['length'] = [len(t) for t in tendf.text]
tendf.text[0].split()[:5]

# For each cluster, collect its topic tags and its full text
cluster_topics = []
cluster_text = []
for cluster_id in tendf.cluster_id.value_counts().index:
    cluster_df = tendf[tendf.cluster_id == cluster_id]
    topic_words = []
    for topics in cluster_df.topics:
        for topic in topics.split(','):
            topic_words.append(topic)
    clean_topics = processText(' '.join(topic_words), dictionary).split()
    clean_df = pd.DataFrame(clean_topics, columns=['topics'])
    cluster_text.append(pd.DataFrame(' '.join([text for text in cluster_df.text]).split(),
                                     columns=['text']))
    cluster_topics.append(clean_df)

# Most common topic tags per cluster
for i in cluster_topics:
    print i.topics.value_counts()[:10]
    print '$' * 70

# Most common words per cluster
for i in cluster_text:
    print i.text.value_counts()[:10]
    print '-' * 70
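# A small follow-up sketch (not in the original notebook): for each KMeans cluster,
# peek at the three talks nearest its centroid. KMeans.transform gives the distance
# from every talk to every centroid; the author names are often easier to eyeball
# than raw word counts.
centroid_distances = ten.transform(similarity_matrix)  # shape: (n_talks, 10)
for cid in range(10):
    closest = np.argsort(centroid_distances[:, cid])[:3]
    print 'cluster', cid, ':', ', '.join(tendf.author.iloc[i] for i in closest)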
# Agglomerative Clustering?
from sklearn.cluster import AgglomerativeClustering

ten = AgglomerativeClustering(n_clusters=10).fit(similarity_matrix)
tendf = pd.DataFrame(columns=['cluster_id', 'author', 'topics'])
tendf.topics = df.topics
tendf.author = df.author
tendf.cluster_id = ten.labels_
tendf['text'] = df.text
tendf.head()

tendf.cluster_id.hist()

tendf['length'] = [len(t) for t in tendf.text]
tendf.text[0].split()[:5]
tendf.head(1)

# Same per-cluster topic/text summaries as before, now for the agglomerative labels
cluster_topics = []
cluster_text = []
for cluster_id in tendf.cluster_id.value_counts().index:
    cluster_df = tendf[tendf.cluster_id == cluster_id]
    topic_words = []
    for topics in cluster_df.topics:
        for topic in topics.split(','):
            topic_words.append(topic)
    clean_topics = processText(' '.join(topic_words), dictionary).split()
    clean_df = pd.DataFrame(clean_topics, columns=['topics'])
    cluster_text.append(pd.DataFrame(' '.join([text for text in cluster_df.text]).split(),
                                     columns=['text']))
    cluster_topics.append(clean_df)

for i in cluster_topics:
    print i.topics.value_counts()[:10]
    print '$' * 70

for i in cluster_text:
    print i.text.value_counts()[:10]
    print '-' * 70

# LDA topic modeling with gensim
import gensim
all_text = [doc.split() for doc in df.text]
gensim_d = gensim.corpora.Dictionary(all_text)
corpus = [gensim_d.doc2bow(text) for text in all_text]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=gensim_d, num_topics=10,
                                      update_every=1, chunksize=100, passes=1)
lda_topics = lda.print_topics(10)
lda_tops = [topic.split('+') for topic in lda_topics]
for topic in lda_tops:
    for pair in topic:
        print pair.split('*')[0] + '\t' + pair.split('*')[1]
    print '%' * 70

# Nice husl palette from seaborn; swap two colors out for black and white
colors = sns.husl_palette(n_colors=10)
sns.palplot(colors)
colors[9] = [0, 0, 0]
colors[8] = [1, 1, 1]
colors.reverse()
sns.palplot(colors)


def plot_reduction_kmeans(first_reduction, first_num, second_reduction, second_num,
                          matrix=similarity_matrix):
    # Reduction #1: similarity matrix -> first_num dimensions
    f = first_reduction(n_components=first_num)
    f_matrix = f.fit_transform(matrix)
    # Reduction #2: first_num dimensions -> second_num (3) dimensions
    s = second_reduction(n_components=second_num)
    s_matrix = s.fit_transform(f_matrix)
    kmeans = KMeans(init='k-means++', n_clusters=10, n_init=100)
    kmeans.fit(s_matrix)
    d = {i: colors[i] for i in range(10)}
    kcolors = [d[i] for i in kmeans.labels_]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(s_matrix[:, 0], s_matrix[:, 1], s_matrix[:, 2], c=kcolors, alpha=.6)


plot_reduction_kmeans(TruncatedSVD, 100, TSNE, 3)
plot_reduction_kmeans(TruncatedSVD, 100, PCA, 3)
plot_reduction_kmeans(TruncatedSVD, 500, PCA, 3)
plot_reduction_kmeans(TruncatedSVD, 100, ICA, 3)
plot_reduction_kmeans(TruncatedSVD, 500, ICA, 3)
plot_reduction_kmeans(PCA, 100, TSNE, 3)
plot_reduction_kmeans(PCA, 500, TSNE, 3)
plot_reduction_kmeans(PCA, 100, TruncatedSVD, 3)
plot_reduction_kmeans(PCA, 500, TruncatedSVD, 3)
plot_reduction_kmeans(PCA, 100, ICA, 3)
plot_reduction_kmeans(PCA, 500, ICA, 3)
plot_reduction_kmeans(ICA, 100, TSNE, 3)
plot_reduction_kmeans(TruncatedSVD, 300, TSNE, 3)


def plot_reduction_agg(first_reduction, first_num, second_reduction, second_num,
                       matrix=similarity_matrix, affinity='cosine', linkage='complete'):
    # Reduction #1: similarity matrix -> first_num dimensions
    f = first_reduction(n_components=first_num)
    f_matrix = f.fit_transform(matrix)
    # Reduction #2: first_num dimensions -> second_num (3) dimensions
    s = second_reduction(n_components=second_num)
    s_matrix = s.fit_transform(f_matrix)
    agg = AgglomerativeClustering(n_clusters=10, affinity=affinity, linkage=linkage)
    agg.fit(s_matrix)
    d = {i: colors[i] for i in range(10)}
    acolors = [d[i] for i in agg.labels_]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(s_matrix[:, 0], s_matrix[:, 1], s_matrix[:, 2], c=acolors, alpha=.6)
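# A sketch (not in the original notebook) for putting a number on these
# reduce-then-cluster pipelines instead of only eyeballing the 3-D scatter plots:
# a silhouette score on the reduced coordinates. The helper name and the two
# pipelines tried below are assumptions; everything it uses is already imported above.
def reduction_silhouette(first_reduction, first_num, second_reduction, second_num,
                         matrix=similarity_matrix):
    f_matrix = first_reduction(n_components=first_num).fit_transform(matrix)
    s_matrix = second_reduction(n_components=second_num).fit_transform(f_matrix)
    labels = KMeans(init='k-means++', n_clusters=10, n_init=100).fit_predict(s_matrix)
    return silhouette_score(s_matrix, labels)

print reduction_silhouette(PCA, 100, TruncatedSVD, 3)
print reduction_silhouette(TruncatedSVD, 100, PCA, 3)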
plot_reduction_agg(TruncatedSVD, 100, TSNE, 3)
plot_reduction_agg(TruncatedSVD, 100, PCA, 3)
plot_reduction_agg(TruncatedSVD, 100, ICA, 3)
plot_reduction_agg(PCA, 100, TruncatedSVD, 3)
plot_reduction_agg(PCA, 100, TSNE, 3)
plot_reduction_agg(PCA, 100, TruncatedSVD, 3)
plot_reduction_agg(PCA, 100, ICA, 3)
plot_reduction_agg(PCA, 100, PCA, 3)
plot_reduction_agg(ICA, 100, TruncatedSVD, 3)
plot_reduction_agg(ICA, 100, TSNE, 3)
plot_reduction_agg(ICA, 100, PCA, 3)
plot_reduction_agg(ICA, 100, ICA, 3)

## My favorite - for now
plot_reduction_kmeans(PCA, 500, TruncatedSVD, 3)

# Rebuild the favorite pipeline step by step so we can keep the 3-D coordinates
# Reduction #1: 500 principal components
f = PCA(n_components=500)
f_matrix = f.fit_transform(similarity_matrix)
# Reduction #2: 500 dimensions -> 3 dimensions
s = TruncatedSVD(n_components=3)
s_matrix = s.fit_transform(f_matrix)
kmeans = KMeans(init='k-means++', n_clusters=10, n_init=100)
kmeans.fit(s_matrix)

data_matrix = s_matrix.copy()
data_matrix = pd.DataFrame(data_matrix * 200)  # Gotta make everything a bit larger
data_matrix['cid'] = kmeans.labels_
data_matrix = data_matrix[[1, 0, 2, 'cid']]
data_matrix.columns = ['y', 'x', 'z', 'cid']
data_matrix.cid = data_matrix.cid.astype(int)
data_matrix = data_matrix.astype(str)
data_matrix
data_matrix.info()

# One dict per talk: {'y': ..., 'x': ..., 'z': ..., 'cid': ...}
d = [{k: data_matrix.values[i][v] for v, k in enumerate(data_matrix.columns)}
     for i in range(len(data_matrix))]
datajson = {"points": d}

import json
json.dump(datajson, open('data.json', 'w'))
!ls
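# Quick sanity check on the export (a sketch, not in the original): reload data.json
# and confirm every point carries the x/y/z/cid keys that whatever visualization
# consumes this file would expect.
points = json.load(open('data.json'))['points']
print len(points), points[0]
assert all(set(p.keys()) == set(['x', 'y', 'z', 'cid']) for p in points)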