import math, pdb, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dateutil as du
from sklearn import datasets, neighbors, linear_model, svm, naive_bayes, ensemble

# allows for matplotlib plots to be embedded directly inline
%pylab inline

# You can get data files from our site (caution large files):
#
# training_subset (152M): https://s3.amazonaws.com/zipf_data/training_subset.csv
# training_full (529M):   https://s3.amazonaws.com/zipf_data/training_full.csv

# change this file path to where you downloaded the data
df = pd.read_csv("data/grockit_all_data/training_subset.csv")
#full_data = pd.read_csv("data/grockit_all_data/training_full.csv")

df

df.head(50)

print "There are %d students in the dataset" % df['user_id'].nunique()

counts = df['user_id'].value_counts()
counts.head(1000)

counts.mean()

# remove an outlier user
filtered = df[df['user_id'] != 133472]

print "There are %d rows in our dataset. Each row corresponds to a user answering a question." % len(filtered)

filtered['question_id'].nunique()

# find the 1,000 most answered questions
top1000 = filtered['question_id'].value_counts().head(1000)
top1000

# select these from our data set
top_questions = filtered[filtered['question_id'].isin(top1000.index)]
top_questions['question_id'].nunique()

# select the 15,000 users who answered the most of the top 1,000 questions
users15000 = top_questions['user_id'].value_counts().head(15000)
top_users = top_questions[top_questions['user_id'].isin(users15000.index)]
top_users.to_csv("users_sample.csv")
top_questions.to_csv("questions_sample.csv")

users15000

top_users.head()

# get rid of duplicate user answers for the same question
no_dups = top_users.drop_duplicates(cols=['user_id', 'question_id'], take_last=True)

# pivot the matrix -- each row becomes a user, and each column a question
pivot = no_dups.pivot(index='user_id', columns='question_id', values='outcome')
pivot.iloc[:20, 20:40]

# convert unanswered questions to 0
pivot_fill = pivot.fillna(0)
pivot_fill.iloc[:20, 20:40]

# transform skipped and not finished (3, 4) => incorrect (2); keep correct (1) and unanswered (0)
mapping = {1: 1, 0: 0, 2: 2, 3: 2, 4: 2}
for name in pivot_fill.columns:
    pivot_fill[name] = pivot_fill[name].map(mapping)
pivot_fill.iloc[:20, 20:40]

# hold out question 1272 as the target; all other questions are features
col_names = pivot_fill.columns
test_columns = list(col_names)
test_columns.remove(1272)
print len(col_names)
print len(test_columns)

# shuffle data to randomize the cross validation split
pivot_fill = pivot_fill.reindex(np.random.permutation(pivot_fill.index))

# split data into features and target variable
train_x = pivot_fill[test_columns]
train_y = pivot_fill[1272]

# map unanswered to incorrect in our target variable
train_y = train_y.map({0: 2, 2: 2, 1: 1})

# create cross validation splits -- 90% train ;; 10% test
n_samples = len(pivot_fill)
n_train = int(.9 * n_samples)
X_train = train_x.as_matrix()[:n_train]
y_train = np.array(train_y[:n_train])
X_test = train_x.as_matrix()[n_train:]
y_test = np.array(train_y[n_train:])
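# Aside (not in the original notebook): the manual 90/10 slice above can also be
# done with scikit-learn's train_test_split, which shuffles and splits in one call.
# A minimal sketch assuming train_x / train_y as built above; the *_alt names and
# random_state value are illustrative only, and the cells below keep using the
# X_train / X_test arrays from the previous cell.
from sklearn.cross_validation import train_test_split
X_train_alt, X_test_alt, y_train_alt, y_test_alt = train_test_split(
    train_x.as_matrix(), np.array(train_y), test_size=0.1, random_state=0)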
# create classifier objects with parameters
logistic = linear_model.LogisticRegression(C=1e5, penalty="l1")
knn = neighbors.KNeighborsClassifier()
sgd = linear_model.SGDClassifier(loss="log", penalty="l1")
bayes = naive_bayes.BernoulliNB()
svmm = svm.SVC(kernel='rbf')
rf = ensemble.RandomForestClassifier()

%%time
# train Stochastic Gradient Descent on training set
sgd_model = sgd.fit(X_train, y_train)

%%time
# train Random Forest on training set
rf_model = rf.fit(X_train, y_train)

%%time
# train SVM on training set
svm_model = svmm.fit(X_train, y_train)

%%time
# train Bernoulli Naive Bayes on training set
bayes_model = bayes.fit(X_train, y_train)

%%time
# train k-Nearest Neighbors on training set
knn_model = knn.fit(X_train, y_train)

%%time
print('sgd score: %f' % sgd_model.score(X_test, y_test))

%%time
print('Random Forest score: %f' % rf_model.score(X_test, y_test))

%%time
print('bayes score: %f' % bayes_model.score(X_test, y_test))

%%time
print('svm score: %f' % svm_model.score(X_test, y_test))

%%time
print('KNN score: %f' % knn_model.score(X_test, y_test))

%%time
print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))

# load the dataset
user_knowledge = pd.read_table('data/uci_students_train.tsv', sep='\t')
knowledge_test = pd.read_table('data/uci_students_test.tsv', sep='\t')

user_knowledge

user_knowledge.head()

# get rid of extraneous column
user_knowledge = user_knowledge.iloc[:, :-1]
user_knowledge.head()

# remove any spaces in column titles
user_knowledge.columns = user_knowledge.columns.map(str.strip)

# create a separate feature vector and label vector
# all features are on an appropriate scale, but in some cases
# normalization/feature scaling may be necessary
student_train_frame = user_knowledge.iloc[:, :-1]
student_label_frame = user_knowledge.iloc[:, -1:]
student_train_frame.head()

# convert to NumPy arrays for sklearn
student_train = student_train_frame.as_matrix()
student_label = student_label_frame.as_matrix()

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4)
kmeans.fit(student_train)

student_label

y_hat = kmeans.labels_
y_hat

pl = plt.scatter(student_train[:, 0], student_train[:, 1], c=y_hat)

import itertools
list(itertools.combinations([0, 1, 2, 3, 4], 2))

col = user_knowledge.columns
col

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
ground_truth = le.fit_transform(student_label)

def plot_kmeans(title="kmeans", fignum=1, size=(10, 6), columns=(0, 1), labels=None, axes=["x_axis", "y_axis"]):
    fig = plt.figure(fignum, figsize=size)
    plt.scatter(student_train[:, columns[0]], student_train[:, columns[1]], c=labels, label=labels)
    plt.title(title)
    plt.xlabel(axes[columns[0]])
    plt.ylabel(axes[columns[1]])
    return fignum + 1

f = 1
for x, y in itertools.combinations([0, 1, 2, 3, 4], 2):
    # plot kmeans clustering
    f = plot_kmeans(fignum=f, columns=(x, y), labels=y_hat, axes=col)
    # plot true labels
    f = plot_kmeans(title="Ground Truth", fignum=f, columns=(x, y), labels=ground_truth, axes=col)
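# Aside (not in the original notebook): the scatter plots above only give a visual
# comparison between the k-means assignments and the true labels. A minimal sketch
# of one way to quantify that agreement, assuming y_hat and ground_truth as computed
# above; adjusted_rand_score ignores the arbitrary numbering of the clusters.
from sklearn import metrics
print "Adjusted Rand index: %f" % metrics.adjusted_rand_score(ground_truth, y_hat)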
# Example from the scikit-learn site: http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html
import numpy as np
import pylab as pl
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets

np.random.seed(5)

centers = [[1, 1], [-1, -1], [1, -1]]

iris = datasets.load_iris()
X = iris.data
y = iris.target

estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_8': KMeans(n_clusters=8),
              'k_means_iris_bad_init': KMeans(n_clusters=3, n_init=1, init='random')}

fignum = 1
for name, est in estimators.iteritems():
    fig = pl.figure(fignum, figsize=(10, 6))
    pl.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    pl.cla()
    est.fit(X)
    labels = est.labels_

    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(np.float))

    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    fignum = fignum + 1

# Plot the ground truth
fig = pl.figure(fignum, figsize=(10, 6))
pl.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
pl.cla()

for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]:
    ax.text3D(X[y == label, 3].mean(),
              X[y == label, 0].mean() + 1.5,
              X[y == label, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))

# Reorder the labels to have colors matching the cluster results
y = np.choose(y, [1, 2, 0]).astype(np.float)
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y)

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')
pl.show()

# Euclidean distance function
def euclidean(vec1, vec2):
    diff = vec1 - vec2
    total_sq = np.sum(diff ** 2)
    return math.sqrt(total_sq)
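# Quick sanity check for euclidean() (not in the original notebook): the result
# should match NumPy's built-in norm of the difference vector.
a = np.array([0.0, 3.0])
b = np.array([4.0, 0.0])
print euclidean(a, b)        # 5.0
print np.linalg.norm(a - b)  # 5.0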