%matplotlib inline from __future__ import division import logging, sys, random from time import time from sklearn.cluster import KMeans from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import BernoulliNB from sklearn.decomposition import PCA from sklearn.decomposition import TruncatedSVD from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer from sklearn.cross_validation import cross_val_score from sklearn import metrics from sklearn.metrics import pairwise_distances import numpy as np from scipy.stats import mode import matplotlib.pyplot as plt, mpld3 from mpld3 import plugins # mpld3.enable_notebook() def find_movie(os_id): return filter(lambda movie: movie['osID'] == str(os_id), movies) def make_histogram(innerDict): x = np.arange(len(innerDict.keys())) y = innerDict.values() fig = plt.figure(figsize=(20,10)) ax = fig.add_subplot(1,1,1) ax.bar(x, y) ax.set_xticks(x) ax.set_xticklabels(innerDict.keys(), rotation=70) plt.show() from load import movies movies = np.array(movies) print "loaded data", len(movies) genres = set() for movie in movies: for genre in movie.get('Genre', []): genres.add(genre) genres = list(genres) print genres N_CLUSTERS = 5 reduce_dimensionality = True k_means = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=100, n_init=1, verbose=True) vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.1, stop_words='english') lsa = TruncatedSVD(2) text = [movie['script'] for movie in movies] vectors = vectorizer.fit_transform(text) if reduce_dimensionality == True: X = lsa.fit_transform(vectors) else: X = vectors km = k_means.fit(X) k_means_labels = k_means.labels_ k_means_cluster_centers = k_means.cluster_centers_ k_means_labels_unique = np.unique(k_means_labels) terms = vectorizer.get_feature_names() if reduce_dimensionality == False: order_centroids = k_means_cluster_centers.argsort()[:, ::-1] for i in range(N_CLUSTERS): print [(terms[ind], k_means_cluster_centers[i][ind]) for ind in order_centroids[i, :100]] print "" for k in range(N_CLUSTERS): z = vectors.toarray()[k_means_labels == k] wordz_tfidf = [(terms[i], z[:,i].sum()) for i in range(z.shape[1])] wordz_tfidf = sorted(wordz_tfidf, key=lambda x: x[1], reverse=True ) print wordz_tfidf[:100] print "" if reduce_dimensionality == True: mpld3.enable_notebook() fig,ax = plt.subplots(figsize=(15,10)) #.figure(figsize=(20,10)) # ax = fig.add_subplot(1,1,1) # ax.grid(True, alpha=0.3) colors = [(random.random(), random.random(), random.random()) for x in range(N_CLUSTERS)] for k, col in zip(range(N_CLUSTERS), colors): my_members = k_means_labels == k cluster_center = k_means_cluster_centers[k] points = ax.plot(X[my_members, 0], X[my_members, 1], 'w', markerfacecolor=col, marker='.', label='Cluster %i' % k) centers = ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6) labels = [] for movie in movies[k_means_labels == k]: labels.append(movie.get('Title', '') + " " + movie.get('osID', '') + " " + movie.get('imdbID', '') + " " + ", ".join(movie.get('Genre', '')) + " " + movie.get('', '') + " ") tooltip = plugins.PointHTMLTooltip(points[0], labels, voffset=10, hoffset=10) plugins.connect(fig, tooltip) ax.set_title('KMeans') ax.set_xticks(()) ax.set_yticks(()) ax.legend() mpld3.disable_notebook() # find_movie(3909) genre_histogram = [] for k in range(N_CLUSTERS): innerDict = dict.fromkeys(genres, 0) for movie in movies[k_means_labels == k]: current_genres = movie.get('Genre', '') for genre in current_genres: if genre in innerDict: innerDict[genre] = innerDict[genre]+1 else: innerDict[genre] = 0 genre_histogram.append(innerDict) year_histogram = [] for k in range(N_CLUSTERS): innerDict = {} for movie in movies[k_means_labels == k]: year = movie.get('Year', '') if year in innerDict: innerDict[year] = innerDict[year]+1 else: innerDict[year] = 0 year_histogram.append(innerDict) # make_histogram(genre_histogram[0]) # make_histogram(genre_histogram[1]) # make_histogram(genre_histogram[2])