"""Exploratory visualizations, converted from a Jupyter notebook.

Contents:
  * TF-IDF + linear SVM sentiment classifier on the Cornell movie-review
    polarity corpus, with top-feature bar chart and word clouds.
  * 2-D t-SNE of iris; t-SNE vs PCA and 3-D LDA of an MNIST subsample.
  * Kaggle submission-time analysis and KMeans clustering of per-user
    hour-of-week submission patterns (reads output/database.sqlite).
  * An xkcd-style "productivity under a deadline" sketch.

The original cell magics (%matplotlib inline, !wget, !tar) are replaced
with portable Python so the file runs as a plain script.
"""
import calendar
import sqlite3
import tarfile
import urllib.request
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import mpld3
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_openml, load_files, load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.svm import LinearSVC
from wordcloud import WordCloud

# Download and extract the movie-review polarity corpus (was: !wget / !tar).
# Skipped when the extracted directory already exists, so reruns are cheap.
_REVIEW_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz'
if not Path('txt_sentoken').exists():
    archive, _ = urllib.request.urlretrieve(_REVIEW_URL, 'review_polarity.tar.gz')
    with tarfile.open(archive) as tar:
        tar.extractall()

# Vectorize the reviews with TF-IDF and fit a linear SVM sentiment classifier.
sent_data = load_files('txt_sentoken')
tfidf_vec = TfidfVectorizer()
sent_X = tfidf_vec.fit_transform(sent_data.data)
sent_y = sent_data.target
lsvc = LinearSVC()
lsvc.fit(sent_X, sent_y)


def display_top_features(weights, names, top_n):
    """Bar-plot the top_n features with the largest absolute weight.

    Bars for negative (negative-sentiment) weights are recoloured red;
    positive ones stay blue.
    """
    top = sorted(zip(weights, names), key=lambda x: abs(x[0]), reverse=True)[:top_n]
    top_weights = [w for w, _ in top]
    top_names = [name for _, name in top]

    fig, ax = plt.subplots(figsize=(16, 8))
    ind = np.arange(top_n)
    bars = ax.bar(ind, top_weights, color='blue', edgecolor='black')
    for bar, w in zip(bars, top_weights):
        if w < 0:
            bar.set_facecolor('red')
    # Bug fix: since matplotlib 2.0 bars are centred on `ind`, so the ticks
    # belong at `ind` as well -- the original offset them by 0.30, which
    # mislabelled every bar.
    ax.set_xticks(ind)
    ax.set_xticklabels(top_names, rotation=45, fontsize=12)
    # plt.show() takes no figure argument; the original plt.show(fig) misused
    # the `block` parameter.
    plt.show()


# get_feature_names() was removed in scikit-learn 1.2; the replacement is
# get_feature_names_out().
display_top_features(lsvc.coef_[0], tfidf_vec.get_feature_names_out(), 20)


def generate_word_cloud(weights, names):
    """Build a WordCloud image from parallel weight/name sequences."""
    # Bug fix: generate_from_frequencies() requires a dict mapping word to
    # frequency, not a bare iterable of pairs.
    return WordCloud(width=350, height=250).generate_from_frequencies(
        dict(zip(names, weights)))


def display_word_cloud(weights, names):
    """Show side-by-side clouds of positive- and negative-weighted words."""
    fig, ax = plt.subplots(1, 2, figsize=(28, 10))
    names = np.asarray(names)
    panels = [
        ('Positive', weights[weights > 0], names[weights > 0]),
        # Word clouds need non-negative frequencies, hence the abs().
        ('Negative', np.abs(weights[weights < 0]), names[weights < 0]),
    ]
    for i, (label, panel_weights, panel_names) in enumerate(panels):
        wc = generate_word_cloud(panel_weights, panel_names)
        ax[i].imshow(wc)
        ax[i].set_axis_off()
        ax[i].set_title('{} words'.format(label), fontsize=24)
    plt.show()


display_word_cloud(lsvc.coef_[0], tfidf_vec.get_feature_names_out())

iris = load_iris()


def display_iris(data):
    """Scatter a 2-D t-SNE embedding of iris, plain and colour-labelled."""
    X_tsne = TSNE(n_components=2, perplexity=20,
                  learning_rate=50).fit_transform(data.data)
    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    ax[0].scatter(X_tsne[:, 0], X_tsne[:, 1])
    ax[0].set_title('All instances', fontsize=14)
    ax[1].scatter(X_tsne[:, 0], X_tsne[:, 1], c=data.target)
    ax[1].set_title('All instances labeled with color', fontsize=14)
    return mpld3.display(fig)


display_iris(iris)

# fetch_mldata() was removed in scikit-learn 0.24; fetch the same digits from
# OpenML instead.  as_frame=False keeps .data/.target as numpy arrays like the
# old loader did.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)


def display_mnist(data, n_samples):
    """Compare 2-D t-SNE and PCA embeddings of a random MNIST subsample."""
    X = data.data / 255.0
    # OpenML targets arrive as strings; cast to int so they work as colours.
    y = data.target.astype(int)
    # downsample as the scikit-learn implementation of t-SNE is unable to
    # handle too much data
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X_train, y_train = X[indices[:n_samples]], y[indices[:n_samples]]
    X_tsne = TSNE(n_components=2, perplexity=30).fit_transform(X_train)
    X_pca = PCA(n_components=2).fit_transform(X_train)

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    for axis, X_emb, title in ((ax[0], X_tsne, 't-SNE'), (ax[1], X_pca, 'PCA')):
        points = axis.scatter(X_emb[:, 0], X_emb[:, 1], c=y_train)
        tooltip = mpld3.plugins.PointLabelTooltip(points, labels=y_train.tolist())
        mpld3.plugins.connect(fig, tooltip)
        axis.set_title(title)
    return mpld3.display(fig)


display_mnist(mnist, 1000)


def display_mnist_3d(data, n_samples):
    """Scatter a 3-D LDA projection of a random MNIST subsample."""
    X = data.data / 255.0
    y = data.target.astype(int)
    # downsample as the scikit-learn implementation of t-SNE is unable to
    # handle too much data
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X_train, y_train = X[indices[:n_samples]], y[indices[:n_samples]]
    # sklearn.lda.LDA was removed; LinearDiscriminantAnalysis is the
    # replacement.  n_components=3 is valid because MNIST has 10 classes
    # (LDA allows at most n_classes - 1 components).
    X_lda = LinearDiscriminantAnalysis(n_components=3).fit_transform(X_train, y_train)

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': '3d'})
    ax.scatter(X_lda[:, 0], X_lda[:, 1], X_lda[:, 2], c=y_train)
    ax.set_title('LDA')
    ax.set_xlim((-6, 6))
    ax.set_ylim((-6, 6))
    plt.show()


display_mnist_3d(mnist, 1000)

# Load all Kaggle submissions from the local Meta Kaggle sqlite dump.
con = sqlite3.connect('output/database.sqlite')
kaggle_df = pd.read_sql_query('''
SELECT *
FROM Submissions''', con)
kaggle_df.head()
# Grammar fix in the user-facing message: "There is" -> "There are".
print('There are {} submissions'.format(kaggle_df.shape[0]))

# convert time strings to DatetimeIndex
kaggle_df['timestamp'] = pd.to_datetime(kaggle_df['DateSubmitted'])
print('The earliest and latest submissions are on {} and {}'.format(
    kaggle_df['timestamp'].min(), kaggle_df['timestamp'].max()))
kaggle_df['weekday'] = kaggle_df['timestamp'].dt.weekday
# Hour-of-week in [0, 167]: Monday 00:00 is 0, Sunday 23:00 is 167.
kaggle_df['weekhr'] = kaggle_df['weekday'] * 24 + kaggle_df['timestamp'].dt.hour


def display_kaggle(df):
    """Plot submission counts per weekday and per hour of the week."""
    fig, ax = plt.subplots(1, 2, figsize=(16, 8))
    ax[0].set_title('submissions per weekday')
    # Bug fix: rename_axis(callable) renames the axis *name* in modern
    # pandas; rename(index=...) is the supported way to relabel index values.
    (df['weekday'].value_counts().sort_index()
        .rename(index=lambda d: calendar.day_name[d])
        .plot.bar(ax=ax[0]))
    ax[1].set_title('submissions per hour of week')
    ax[1].set_xticks(np.linspace(0, 24 * 7, 8))
    df['weekhr'].value_counts().sort_index().plot(color='red', ax=ax[1])
    plt.show()


display_kaggle(kaggle_df)


def display_hr(df, n_clusters):
    """KMeans-cluster users by their normalised hour-of-week submission
    pattern and plot each cluster's mean pattern."""
    hrs_per_user = (df[['SubmittedUserId', 'weekhr', 'Id']]
                    .groupby(['SubmittedUserId', 'weekhr']).count())
    # sum(level=0) was deprecated and removed; groupby(level=0).sum() is the
    # supported spelling of the same per-user total.
    total_per_user = hrs_per_user.groupby(level=0).sum()
    # Fraction of each user's submissions falling into each hour-of-week.
    user_patterns = (hrs_per_user / total_per_user)['Id']

    vectors = defaultdict(lambda: np.zeros(24 * 7))
    for (user, hr), ratio in user_patterns.items():
        vectors[user][hr] = ratio
    X_hr = np.array(list(vectors.values()))
    y = KMeans(n_clusters=n_clusters, random_state=3).fit_predict(X_hr)

    for i in range(n_clusters):
        fig, ax = plt.subplots(figsize=(6, 6))
        X = X_hr[y == i]
        ax.plot(np.arange(24 * 7), X.mean(axis=0))
        ax.set_xticks(np.linspace(0, 24 * 7, 8))
        ax.set_xlim((0, 24 * 7))
        ax.set_title('Cluster #{}, n = {}'.format(i, len(X)), fontsize=14)
        plt.show()


display_hr(kaggle_df, 9)


def xkcd():
    """Draw the xkcd-style 'productivity under a deadline' sketch."""
    with plt.xkcd():
        fig, ax = plt.subplots()
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_ylim([-1, 10])

        # Piecewise productivity curve: slow ramp, panic spike, post-deadline slump.
        data = np.zeros(100)
        data[:60] += np.linspace(-1, 0, 60)
        data[60:75] += np.arange(15)
        data[75:] -= np.ones(25)

        ax.annotate('DEADLINE', xy=(71, 7),
                    arrowprops=dict(arrowstyle='->'), xytext=(30, 2))
        ax.plot(data)
        # Bug fix: 'k-' (black) conflicted with color='red'; modern matplotlib
        # rejects conflicting colour specs.  Keep the intended red line.
        ax.plot([72, 72], [-1, 15], '-', color='red')
        ax.set_xlabel('time')
        ax.set_ylabel('productivity')
        ax.set_title('productivity under a deadline')
        plt.show()


xkcd()