from __future__ import division  # python 2, so old school
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
import codecs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA as ICA
from sklearn.decomposition import PCA

pd.set_option('display.max_rows', 10)
%matplotlib inline

# TED talk transcripts from OPUS: http://opus.lingfil.uu.se/TED2013.php
infile = open('./ted.xml', 'r')
raw = infile.read()
infile.close()

# Force everything down to plain ASCII before parsing
raw = raw.decode('ascii', 'ignore')
raw = raw.encode('utf8', 'ignore')

soup = BeautifulSoup(raw)
text = soup.get_text()

number_of_talks = len(text.split('http://www.ted.com/talks/')); number_of_talks

# Each talk starts with its URL, so splitting on the URL prefix gives one chunk per talk
url_split = text.split('http://www.ted.com/talks/')
url_split = url_split[1:]
url_split[0].split('\n')[:10]

# Word list from: http://www.keithv.com/software/wlist/
wordfile = open('./wlist_match7.txt', 'r')
wordlist = wordfile.readlines()
wordfile.close()
dictionary = {word.strip(): '' for word in wordlist}

# Stop list file from http://www.ranks.nl/stopwords + nltk
stopword_file = open('./stopword.txt', 'r')
stopwords_raw = stopword_file.read()
stopword_file.close()
stopwords_list = [w for w in stopwords_raw.split()]
stopwords_list = stopwords_list + nltk.corpus.stopwords.words('english')
stopwords = list(set(stopwords_list))
stopwords.append('ha')  # the lemmatizer turns 'has'/'was' into these
stopwords.append('wa')
stopwords[-10:]


# If a token isn't a dictionary word, try to split it into two words that are
def getRealWords(word, dictionary):
    if word in dictionary:
        return str(word)
    else:
        wordlength = len(word)
        for i in range(wordlength):
            part = word[:i]
            if part in dictionary:
                if word[i:] in dictionary:
                    return str(part) + ' ' + str(word[i:])
        return str(word)


# Drop stopwords and run every remaining token through getRealWords
def processText(text, dictionary):
    string = u''
    words = text.split()
    for word in words:
        if word in stopwords:
            pass
        else:
            string += ' ' + getRealWords(word, dictionary)
    return string


lemmatizer = nltk.stem.WordNetLemmatizer()


# Pull the url/topics/author header lines off a talk, then tokenize, lemmatize, and clean
def scrub(text):
    lines = text.splitlines()
    url = lines[0]
    topics = lines[2]
    author = lines[4]
    tokens = [t for t in nltk.tokenize.word_tokenize(' '.join(lines[5:]))]
    clean_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens
                    if re.search(ur'^[a-zA-Z]+', token)]
    clean = processText(' '.join(clean_tokens), dictionary).split()
    clean = [w for w in clean if w not in stopwords if w in dictionary]
    return author, topics, url, clean


a = scrub(url_split[0]); a[3][:10]

scrubbed = []
total = len(url_split)
for talk in url_split:
    scrubbed.append(scrub(talk))

df = pd.DataFrame(scrubbed, columns=['author', 'topics', 'url', 'text'])
df.head()

# Collapse each token list back into a single string per talk
df['text'] = df['text'].map(lambda x: ' '.join(x))
df.head()

import cPickle
cPickle.dump(df, open('df.pkl', 'w'))  # never have to do this again!
!ls
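# A quick illustration of what getRealWords does (a sketch, not part of the original
# pipeline): run-together tokens get split into two dictionary words when possible,
# and anything unrecognized passes through unchanged. The tiny toy_dictionary below
# is made up purely for this demo.
toy_dictionary = {'thank': '', 'you': '', 'climate': '', 'change': ''}
print getRealWords('thankyou', toy_dictionary)       # -> 'thank you'
print getRealWords('climatechange', toy_dictionary)  # -> 'climate change'
print getRealWords('xylophones', toy_dictionary)     # -> 'xylophones' (unchanged)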
# What does the text actually look like now?
df.text

# Pull every topic tag off every talk and look at the overall distribution
topic_words = []
for topics in df.topics:
    for topic in topics.split(','):
        topic_words.append(topic)
clean_topics = processText(' '.join(topic_words), dictionary).split()
tpx = pd.DataFrame(clean_topics, columns=['topics'])
tpx.topics.value_counts()[:10]

tpx.topics.value_counts().plot(rot=90, figsize=(12, 8), fontsize=20)
tpx.topics.value_counts()[:35].plot(rot=90, xticks=range(35), figsize=(12, 8), fontsize=20)
tpx.topics.value_counts()[:20].plot(rot=90, xticks=range(20), figsize=(12, 8), fontsize=20)
tpx.topics.value_counts()[:10].plot(rot=90, xticks=range(10), figsize=(12, 8), fontsize=20)

# Vectorize: we've already used a ton of stopword lists up above, but why not do it again.
# smooth_idf adds one to document frequencies to avoid division by zero.
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
tfidf = vectorizer.fit_transform(df.text)

# Cosine similarity
similarity_matrix = tfidf.dot(tfidf.T)
similarity_matrix = Normalizer(copy=False).fit_transform(similarity_matrix)

# Estimating K - http://www.slideshare.net/SarahGuido/kmeans-clustering-with-scikitlearn
k_range = range(5, 50, 5)
k_euclid = [KMeans(n_clusters=k).fit(similarity_matrix) for k in k_range]
k_centroids = [X.cluster_centers_ for X in k_euclid]
k_cosine = [cdist(similarity_matrix.toarray(), cent, 'cosine') for cent in k_centroids]
distances = [np.min(ke, axis=1) for ke in k_cosine]
# Within-cluster sum of squares
wcss = [sum(d**2) for d in distances]
# Total sum of squares
tss = sum(pdist(similarity_matrix.toarray()**2 / similarity_matrix.toarray().shape[0]))
# Between-cluster sum of squares
bss = tss - wcss

plt.plot(k_range, bss / tss * 100)
plt.xlabel("Number of Clusters")
plt.ylabel("% Variance Explained")

# So, we can see that even with just 5 clusters we have over 73% variance explained.

from sklearn.metrics import silhouette_score, silhouette_samples
silhouette_scores = [silhouette_score(tfidf, k.labels_) for k in k_euclid]
plt.plot(k_range, silhouette_scores)

tfmat = pd.DataFrame(tfidf.todense(), index=df.author,
                     columns=vectorizer.get_feature_names())

# Reduce the data to the top 10,000 most important words
some = tfmat.sum(axis=0)
sorter = some.argsort()
srtd = pd.DataFrame(sorter)
sorted_index = srtd.sort(columns=0).index
reduced = tfmat[sorted_index][:10000]
# cPickle.dump(reduced, open('10kdocterm.pkl', 'w'))
!ls
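# The cell above is meant to keep the 10,000 highest-weight words. A more explicit
# (hypothetical) variant would rank the columns of tfmat by their total tf-idf mass
# and select those columns directly; note that the integer slice above ([:10000])
# keeps the first 10,000 rows, i.e. talks. Nothing downstream uses reduced_alt --
# it is only here for comparison.
top_positions = np.argsort(tfmat.sum(axis=0).values)[::-1][:10000]
top_words = tfmat.columns[top_positions]
reduced_alt = tfmat[top_words]
reduced_alt.shape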
reduced.head(1)

# Document-document similarity on the reduced matrix
similarity_matrix = reduced.dot(reduced.T)
similarity_matrix.describe()

n = Normalizer(copy=False)
normal = n.fit_transform(similarity_matrix)
normalized = pd.DataFrame(normal)
normalized.describe()

similarity_matrix = normalized

# Estimating K
k_range = range(5, 100, 5)
k_variance = [KMeans(n_clusters=k).fit(similarity_matrix) for k in k_range]
k_centroids = [X.cluster_centers_ for X in k_variance]
k_cosine = [cdist(similarity_matrix, cent, 'cosine') for cent in k_centroids]
distances = [np.min(ke, axis=1) for ke in k_cosine]
# Within-cluster sum of squares
wcss = [sum(d**2) for d in distances]
# Total sum of squares
tss = sum(pdist(similarity_matrix**2 / similarity_matrix.shape[0]))
# Between-cluster sum of squares
bss = tss - wcss

plt.plot(k_range, bss / tss * 100)
plt.xlabel("Number of Clusters")
plt.ylabel("% Variance Explained")

# KMeans with 10 clusters
ten = KMeans(n_clusters=10).fit(similarity_matrix)
tendf = pd.DataFrame(columns=['cluster_id', 'author', 'topics', 'text'])
tendf.topics = df.topics
tendf.author = df.author
tendf.cluster_id = ten.labels_
tendf.text = df.text
tendf.head()

tendf.cluster_id.hist()

tendf['length'] = [len(t) for t in tendf.text]
tendf.text[0].split()[:5]

# For each cluster, collect its topic tags and its full text
cluster_topics = []
cluster_text = []
for cluster_id in tendf.cluster_id.value_counts().index:
    cluster_df = tendf[tendf.cluster_id == cluster_id]
    topic_words = []
    for topics in cluster_df.topics:
        for topic in topics.split(','):
            topic_words.append(topic)
    clean_topics = processText(' '.join(topic_words), dictionary).split()
    clean_df = pd.DataFrame(clean_topics, columns=['topics'])
    cluster_text.append(pd.DataFrame(' '.join([text for text in cluster_df.text]).split(),
                                     columns=['text']))
    cluster_topics.append(clean_df)

# Most common topic tags per cluster
for i in cluster_topics:
    print i.topics.value_counts()[:10]
    print '$' * 70

# Most common words per cluster
for i in cluster_text:
    print i.text.value_counts()[:10]
    print '-' * 70
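# A small follow-up sketch (not in the original notebook): for each KMeans cluster,
# peek at the three talks nearest its centroid. KMeans.transform gives the distance
# from every talk to every centroid; the author names are often easier to eyeball
# than raw word counts.
centroid_distances = ten.transform(similarity_matrix)  # shape: (n_talks, 10)
for cid in range(10):
    closest = np.argsort(centroid_distances[:, cid])[:3]
    print 'cluster', cid, ':', ', '.join(tendf.author.iloc[i] for i in closest)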
# Agglomerative Clustering?
from sklearn.cluster import AgglomerativeClustering

ten = AgglomerativeClustering(n_clusters=10).fit(similarity_matrix)
tendf = pd.DataFrame(columns=['cluster_id', 'author', 'topics'])
tendf.topics = df.topics
tendf.author = df.author
tendf.cluster_id = ten.labels_
tendf['text'] = df.text
tendf.head()

tendf.cluster_id.hist()

tendf['length'] = [len(t) for t in tendf.text]
tendf.text[0].split()[:5]
tendf.head(1)

# Same per-cluster topic/text summaries as before, now for the agglomerative labels
cluster_topics = []
cluster_text = []
for cluster_id in tendf.cluster_id.value_counts().index:
    cluster_df = tendf[tendf.cluster_id == cluster_id]
    topic_words = []
    for topics in cluster_df.topics:
        for topic in topics.split(','):
            topic_words.append(topic)
    clean_topics = processText(' '.join(topic_words), dictionary).split()
    clean_df = pd.DataFrame(clean_topics, columns=['topics'])
    cluster_text.append(pd.DataFrame(' '.join([text for text in cluster_df.text]).split(),
                                     columns=['text']))
    cluster_topics.append(clean_df)

for i in cluster_topics:
    print i.topics.value_counts()[:10]
    print '$' * 70

for i in cluster_text:
    print i.text.value_counts()[:10]
    print '-' * 70

# LDA topic modeling with gensim
import gensim
all_text = [doc.split() for doc in df.text]
gensim_d = gensim.corpora.Dictionary(all_text)
corpus = [gensim_d.doc2bow(text) for text in all_text]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=gensim_d, num_topics=10,
                                      update_every=1, chunksize=100, passes=1)
lda_topics = lda.print_topics(10)
lda_tops = [topic.split('+') for topic in lda_topics]
for topic in lda_tops:
    for pair in topic:
        print pair.split('*')[0] + '\t' + pair.split('*')[1]
    print '%' * 70

# Nice husl palette from seaborn; swap two colors out for black and white
colors = sns.husl_palette(n_colors=10)
sns.palplot(colors)
colors[9] = [0, 0, 0]
colors[8] = [1, 1, 1]
colors.reverse()
sns.palplot(colors)


def plot_reduction_kmeans(first_reduction, first_num, second_reduction, second_num,
                          matrix=similarity_matrix):
    # Reduction #1: similarity matrix -> first_num dimensions
    f = first_reduction(n_components=first_num)
    f_matrix = f.fit_transform(matrix)
    # Reduction #2: first_num dimensions -> second_num (3) dimensions
    s = second_reduction(n_components=second_num)
    s_matrix = s.fit_transform(f_matrix)
    kmeans = KMeans(init='k-means++', n_clusters=10, n_init=100)
    kmeans.fit(s_matrix)
    d = {i: colors[i] for i in range(10)}
    kcolors = [d[i] for i in kmeans.labels_]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(s_matrix[:, 0], s_matrix[:, 1], s_matrix[:, 2], c=kcolors, alpha=.6)


plot_reduction_kmeans(TruncatedSVD, 100, TSNE, 3)
plot_reduction_kmeans(TruncatedSVD, 100, PCA, 3)
plot_reduction_kmeans(TruncatedSVD, 500, PCA, 3)
plot_reduction_kmeans(TruncatedSVD, 100, ICA, 3)
plot_reduction_kmeans(TruncatedSVD, 500, ICA, 3)
plot_reduction_kmeans(PCA, 100, TSNE, 3)
plot_reduction_kmeans(PCA, 500, TSNE, 3)
plot_reduction_kmeans(PCA, 100, TruncatedSVD, 3)
plot_reduction_kmeans(PCA, 500, TruncatedSVD, 3)
plot_reduction_kmeans(PCA, 100, ICA, 3)
plot_reduction_kmeans(PCA, 500, ICA, 3)
plot_reduction_kmeans(ICA, 100, TSNE, 3)
plot_reduction_kmeans(TruncatedSVD, 300, TSNE, 3)


def plot_reduction_agg(first_reduction, first_num, second_reduction, second_num,
                       matrix=similarity_matrix, affinity='cosine', linkage='complete'):
    # Reduction #1: similarity matrix -> first_num dimensions
    f = first_reduction(n_components=first_num)
    f_matrix = f.fit_transform(matrix)
    # Reduction #2: first_num dimensions -> second_num (3) dimensions
    s = second_reduction(n_components=second_num)
    s_matrix = s.fit_transform(f_matrix)
    agg = AgglomerativeClustering(n_clusters=10, affinity=affinity, linkage=linkage)
    agg.fit(s_matrix)
    d = {i: colors[i] for i in range(10)}
    acolors = [d[i] for i in agg.labels_]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(s_matrix[:, 0], s_matrix[:, 1], s_matrix[:, 2], c=acolors, alpha=.6)
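# A sketch (not in the original notebook) for putting a number on these
# reduce-then-cluster pipelines instead of only eyeballing the 3-D scatter plots:
# a silhouette score on the reduced coordinates. The helper name and the two
# pipelines tried below are assumptions; everything it uses is already imported above.
def reduction_silhouette(first_reduction, first_num, second_reduction, second_num,
                         matrix=similarity_matrix):
    f_matrix = first_reduction(n_components=first_num).fit_transform(matrix)
    s_matrix = second_reduction(n_components=second_num).fit_transform(f_matrix)
    labels = KMeans(init='k-means++', n_clusters=10, n_init=100).fit_predict(s_matrix)
    return silhouette_score(s_matrix, labels)

print reduction_silhouette(PCA, 100, TruncatedSVD, 3)
print reduction_silhouette(TruncatedSVD, 100, PCA, 3)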
plot_reduction_agg(TruncatedSVD, 100, TSNE, 3)
plot_reduction_agg(TruncatedSVD, 100, PCA, 3)
plot_reduction_agg(TruncatedSVD, 100, ICA, 3)
plot_reduction_agg(PCA, 100, TruncatedSVD, 3)
plot_reduction_agg(PCA, 100, TSNE, 3)
plot_reduction_agg(PCA, 100, TruncatedSVD, 3)
plot_reduction_agg(PCA, 100, ICA, 3)
plot_reduction_agg(PCA, 100, PCA, 3)
plot_reduction_agg(ICA, 100, TruncatedSVD, 3)
plot_reduction_agg(ICA, 100, TSNE, 3)
plot_reduction_agg(ICA, 100, PCA, 3)
plot_reduction_agg(ICA, 100, ICA, 3)

## My favorite - for now
plot_reduction_kmeans(PCA, 500, TruncatedSVD, 3)

# Rebuild the favorite pipeline step by step so we can keep the 3-D coordinates
# Reduction #1: 500 principal components
f = PCA(n_components=500)
f_matrix = f.fit_transform(similarity_matrix)
# Reduction #2: 500 dimensions -> 3 dimensions
s = TruncatedSVD(n_components=3)
s_matrix = s.fit_transform(f_matrix)
kmeans = KMeans(init='k-means++', n_clusters=10, n_init=100)
kmeans.fit(s_matrix)

data_matrix = s_matrix.copy()
data_matrix = pd.DataFrame(data_matrix * 200)  # Gotta make everything a bit larger
data_matrix['cid'] = kmeans.labels_
data_matrix = data_matrix[[1, 0, 2, 'cid']]
data_matrix.columns = ['y', 'x', 'z', 'cid']
data_matrix.cid = data_matrix.cid.astype(int)
data_matrix = data_matrix.astype(str)
data_matrix
data_matrix.info()

# One dict per talk: {'y': ..., 'x': ..., 'z': ..., 'cid': ...}
d = [{k: data_matrix.values[i][v] for v, k in enumerate(data_matrix.columns)}
     for i in range(len(data_matrix))]
datajson = {"points": d}

import json
json.dump(datajson, open('data.json', 'w'))
!ls
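# Quick sanity check on the export (a sketch, not in the original): reload data.json
# and confirm every point carries the x/y/z/cid keys that whatever visualization
# consumes this file would expect.
points = json.load(open('data.json'))['points']
print len(points), points[0]
assert all(set(p.keys()) == set(['x', 'y', 'z', 'cid']) for p in points)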