"""Classification experiments on the Smarket stock-market data.

Fits logistic regression, linear discriminant analysis and k-nearest
neighbours to predict the ``Direction`` column ("Up"/"Down"), first on the
whole data set and then with a hold-out split (years before 2005 for
training, 2005 onwards for testing).  The inline comments relate each step
to the equivalent R command.
"""
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

try:
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
except ImportError:  # older scikit-learn releases only ship sklearn.lda
    from sklearn.lda import LDA

try:
    from pandas.plotting import scatter_matrix
except ImportError:  # older pandas releases only ship pandas.tools.plotting
    from pandas.tools.plotting import scatter_matrix

# Equivalent of the ``%matplotlib inline`` notebook magic.  Written as a
# guarded call so the file is still valid Python outside IPython.
try:
    get_ipython().run_line_magic("matplotlib", "inline")  # noqa: F821
except NameError:
    pass  # plain interpreter: no IPython shell, nothing to configure

# Fixed label encoding.  pd.factorize() numbers labels in order of first
# appearance, so encoding the full, training and test frames separately can
# give "Up" a different integer in each of them; a single explicit mapping
# keeps every y vector (and every confusion matrix) comparable.
DOWN, UP = 0, 1
DIRECTION_CODES = {"Down": DOWN, "Up": UP}


def encode_direction(directions):
    """Return the 0/1 codes (Down=0, Up=1) for a Series of direction labels.

    Labels missing from ``DIRECTION_CODES`` become NaN instead of being
    silently assigned to one of the two classes.
    """
    return directions.map(DIRECTION_CODES).values


def report(title, value):
    """Print ``value`` under a short ``title``.

    Only single-argument print calls are used, so the output is the same
    with the Python 2 print statement and the Python 3 print function.
    """
    print(title)
    print(value)
    print("")


def fit_and_score(model, X_fit, y_fit, X_eval, y_eval, name):
    """Fit ``model``, score it on the evaluation data, return its predictions.

    Prints the confusion matrix (rows: actual class, columns: predicted
    class, both ordered Down, Up) and the accuracy on the evaluation data.
    ``model`` is fitted in place.
    """
    model.fit(X_fit, y_fit)
    predicted = model.predict(X_eval)
    report(name + " - confusion matrix", confusion_matrix(y_eval, predicted))
    report(name + " - accuracy", accuracy_score(y_eval, predicted))
    return predicted


smarket_df = pd.read_csv("../data/Smarket.csv")
report("First rows of the data", smarket_df.head())

# equivalent to the R pairs(df) command.
axes = scatter_matrix(smarket_df, color="brown")

# Predictor columns: everything except the first column and the last two
# (presumably Lag1..Lag5 and Volume -- confirm against the CSV header).
all_feature_cols = smarket_df.columns[1:-2]
# Reduced predictor set: the second and third columns only
# (presumably Lag1 and Lag2 -- confirm against the CSV header).
reduced_feature_cols = smarket_df.columns[1:3]

# ---- Logistic regression on the full data set ----
X = smarket_df[all_feature_cols]
y = encode_direction(smarket_df["Direction"])
clf = LogisticRegression()
clf.fit(X, y)
report("Logistic regression (intercept, coefficients)",
       (clf.intercept_, clf.coef_))

# predict_proba() returns one column per class, ordered like clf.classes_.
# R's predict(type="response") reports P(Direction == "Up"), so select that
# column by looking up the Up code rather than assuming its position.
probs = clf.predict_proba(X)
up_column = list(clf.classes_).index(UP)
up_probs = probs[:, up_column]
report("P(Up) for the first five rows", list(up_probs[0:5]))

ypreds = ["Up" if up_prob > 0.5 else "Down" for up_prob in up_probs]
report("Predicted direction for the first five rows", ypreds[0:5])

# R uses table() to tabulate the confusion matrix below
yacts = [str(x) for x in smarket_df["Direction"].values]
report("Full data - confusion matrix", confusion_matrix(yacts, ypreds))
report("Full data - accuracy", accuracy_score(yacts, ypreds))

# ---- Hold-out evaluation ----
# Split dataset into training and test sets
smarket_train_df = smarket_df[smarket_df["Year"] < 2005]
smarket_test_df = smarket_df[smarket_df["Year"] >= 2005]
ytrain = encode_direction(smarket_train_df["Direction"])
ytest = encode_direction(smarket_test_df["Direction"])

# Logistic regression, all predictors
clf2 = LogisticRegression()
Xtrain = smarket_train_df[all_feature_cols]
Xtest = smarket_test_df[all_feature_cols]
ypred = fit_and_score(clf2, Xtrain, ytrain, Xtest, ytest,
                      "Logistic regression, all predictors")

# Logistic regression, reduced predictor set
clf3 = LogisticRegression()
Xtrain = smarket_train_df[reduced_feature_cols]
Xtest = smarket_test_df[reduced_feature_cols]
ypred = fit_and_score(clf3, Xtrain, ytrain, Xtest, ytest,
                      "Logistic regression, reduced predictors")

# Linear discriminant analysis, reduced predictor set
clf4 = LDA()
ypred = fit_and_score(clf4, Xtrain, ytrain, Xtest, ytest,
                      "Linear discriminant analysis")
report("LDA predictions for the first five test rows", ypred[0:5])

# k-nearest neighbours (library default settings), reduced predictor set
clf5 = KNeighborsClassifier()
ypred = fit_and_score(clf5, Xtrain, ytrain, Xtest, ytest,
                      "K-nearest neighbours")