import math, pdb, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dateutil as du
from sklearn import datasets, neighbors, linear_model, svm, naive_bayes, ensemble

# allows for matplotlib plots to be embedded directly inline
%pylab inline

# You can get data files from our site (caution large files):
#
# training_subset (152M): https://s3.amazonaws.com/zipf_data/training_subset.csv
# training_full (529M):   https://s3.amazonaws.com/zipf_data/training_full.csv

# change this file path to where you downloaded the data
df = pd.read_csv("data/grockit_all_data/training_subset.csv")
#full_data = pd.read_csv("data/grockit_all_data/training_full.csv")

df

df.head(50)

print "There are %d students in the dataset" % df['user_id'].nunique()

counts = df['user_id'].value_counts()
counts.head(1000)

counts.mean()

# remove an outlier user
filtered = df[df['user_id'] != 133472]

print "There are %d rows in our dataset. Each row corresponds to a user answering a question." % len(filtered)

filtered['question_id'].nunique()

# find the 1,000 most answered questions
top1000 = filtered['question_id'].value_counts().head(1000)
top1000

# select these from our data set
top_questions = filtered[filtered['question_id'].isin(top1000.index)]
top_questions['question_id'].nunique()

# select the 15,000 users who answered the most of the top 1,000 questions
users15000 = top_questions['user_id'].value_counts().head(15000)
top_users = top_questions[top_questions['user_id'].isin(users15000.index)]
top_users.to_csv("users_sample.csv")
top_questions.to_csv("questions_sample.csv")

users15000

top_users.head()

# get rid of duplicate user answers for the same question
no_dups = top_users.drop_duplicates(cols=['user_id', 'question_id'], take_last=True)

# pivot the matrix -- each row becomes a user, and each column a question
pivot = no_dups.pivot(index='user_id', columns='question_id', values='outcome')
pivot.iloc[:20, 20:40]

# convert unanswered questions to 0
pivot_fill = pivot.fillna(0)
pivot_fill.iloc[:20, 20:40]

# transform skipped and not finished (3, 4) => incorrect (2); keep correct (1) and unanswered (0)
mapping = {1: 1, 0: 0, 2: 2, 3: 2, 4: 2}
for name in pivot_fill.columns:
    pivot_fill[name] = pivot_fill[name].map(mapping)
pivot_fill.iloc[:20, 20:40]

# hold out question 1272 as the target; all other questions are features
col_names = pivot_fill.columns
test_columns = list(col_names)
test_columns.remove(1272)
print len(col_names)
print len(test_columns)

# shuffle data to randomize the cross validation split
pivot_fill = pivot_fill.reindex(np.random.permutation(pivot_fill.index))

# split data into features and target variable
train_x = pivot_fill[test_columns]
train_y = pivot_fill[1272]

# map unanswered to incorrect in our target variable
train_y = train_y.map({0: 2, 2: 2, 1: 1})

# create cross validation splits -- 90% train ;; 10% test
n_samples = len(pivot_fill)
n_train = int(.9 * n_samples)
X_train = train_x.as_matrix()[:n_train]
y_train = np.array(train_y[:n_train])
X_test = train_x.as_matrix()[n_train:]
y_test = np.array(train_y[n_train:])
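# Aside (not in the original notebook): the manual 90/10 slice above can also be
# done with scikit-learn's train_test_split, which shuffles and splits in one call.
# A minimal sketch assuming train_x / train_y as built above; the *_alt names and
# random_state value are illustrative only, and the cells below keep using the
# X_train / X_test arrays from the previous cell.
from sklearn.cross_validation import train_test_split
X_train_alt, X_test_alt, y_train_alt, y_test_alt = train_test_split(
    train_x.as_matrix(), np.array(train_y), test_size=0.1, random_state=0)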
# create classifier objects with parameters
logistic = linear_model.LogisticRegression(C=1e5, penalty="l1")
knn = neighbors.KNeighborsClassifier()
sgd = linear_model.SGDClassifier(loss="log", penalty="l1")
bayes = naive_bayes.BernoulliNB()
svmm = svm.SVC(kernel='rbf')
rf = ensemble.RandomForestClassifier()

%%time
# train Stochastic Gradient Descent on training set
sgd_model = sgd.fit(X_train, y_train)

%%time
# train Random Forest on training set
rf_model = rf.fit(X_train, y_train)

%%time
# train SVM on training set
svm_model = svmm.fit(X_train, y_train)

%%time
# train Bernoulli Naive Bayes on training set
bayes_model = bayes.fit(X_train, y_train)

%%time
# train k-Nearest Neighbors on training set
knn_model = knn.fit(X_train, y_train)

%%time
print('sgd score: %f' % sgd_model.score(X_test, y_test))

%%time
print('Random Forest score: %f' % rf_model.score(X_test, y_test))

%%time
print('bayes score: %f' % bayes_model.score(X_test, y_test))

%%time
print('svm score: %f' % svm_model.score(X_test, y_test))

%%time
print('KNN score: %f' % knn_model.score(X_test, y_test))

%%time
print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))

# load the dataset
user_knowledge = pd.read_table('data/uci_students_train.tsv', sep='\t')
knowledge_test = pd.read_table('data/uci_students_test.tsv', sep='\t')

user_knowledge

user_knowledge.head()

# get rid of extraneous column
user_knowledge = user_knowledge.iloc[:, :-1]
user_knowledge.head()

# remove any spaces in column titles
user_knowledge.columns = user_knowledge.columns.map(str.strip)

# create a separate feature vector and label vector
# all features are on an appropriate scale, but in some cases
# normalization/feature scaling may be necessary
student_train_frame = user_knowledge.iloc[:, :-1]
student_label_frame = user_knowledge.iloc[:, -1:]
student_train_frame.head()

# convert to NumPy arrays for sklearn
student_train = student_train_frame.as_matrix()
student_label = student_label_frame.as_matrix()

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4)
kmeans.fit(student_train)

student_label

y_hat = kmeans.labels_
y_hat

pl = plt.scatter(student_train[:, 0], student_train[:, 1], c=y_hat)

import itertools
list(itertools.combinations([0, 1, 2, 3, 4], 2))

col = user_knowledge.columns
col

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
ground_truth = le.fit_transform(student_label)

def plot_kmeans(title="kmeans", fignum=1, size=(10, 6), columns=(0, 1), labels=None, axes=["x_axis", "y_axis"]):
    fig = plt.figure(fignum, figsize=size)
    plt.scatter(student_train[:, columns[0]], student_train[:, columns[1]], c=labels, label=labels)
    plt.title(title)
    plt.xlabel(axes[columns[0]])
    plt.ylabel(axes[columns[1]])
    return fignum + 1

f = 1
for x, y in itertools.combinations([0, 1, 2, 3, 4], 2):
    # plot kmeans clustering
    f = plot_kmeans(fignum=f, columns=(x, y), labels=y_hat, axes=col)
    # plot true labels
    f = plot_kmeans(title="Ground Truth", fignum=f, columns=(x, y), labels=ground_truth, axes=col)
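# Aside (not in the original notebook): the scatter plots above only give a visual
# comparison between the k-means assignments and the true labels. A minimal sketch
# of one way to quantify that agreement, assuming y_hat and ground_truth as computed
# above; adjusted_rand_score ignores the arbitrary numbering of the clusters.
from sklearn import metrics
print "Adjusted Rand index: %f" % metrics.adjusted_rand_score(ground_truth, y_hat)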
# Example from the scikit-learn site: http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html
import numpy as np
import pylab as pl
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets

np.random.seed(5)

centers = [[1, 1], [-1, -1], [1, -1]]

iris = datasets.load_iris()
X = iris.data
y = iris.target

estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_8': KMeans(n_clusters=8),
              'k_means_iris_bad_init': KMeans(n_clusters=3, n_init=1, init='random')}

fignum = 1
for name, est in estimators.iteritems():
    fig = pl.figure(fignum, figsize=(10, 6))
    pl.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    pl.cla()
    est.fit(X)
    labels = est.labels_

    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(np.float))

    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    fignum = fignum + 1

# Plot the ground truth
fig = pl.figure(fignum, figsize=(10, 6))
pl.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
pl.cla()

for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]:
    ax.text3D(X[y == label, 3].mean(),
              X[y == label, 0].mean() + 1.5,
              X[y == label, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))

# Reorder the labels to have colors matching the cluster results
y = np.choose(y, [1, 2, 0]).astype(np.float)
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y)

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')
pl.show()

# Euclidean distance function
def euclidean(vec1, vec2):
    diff = vec1 - vec2
    total_sq = np.sum(diff ** 2)
    return math.sqrt(total_sq)
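# Quick sanity check for euclidean() (not in the original notebook): the result
# should match NumPy's built-in norm of the difference vector.
a = np.array([0.0, 3.0])
b = np.array([4.0, 0.0])
print euclidean(a, b)        # 5.0
print np.linalg.norm(a - b)  # 5.0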