import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
# Load the Boston housing dataset
boston = load_boston()
# Load the data into x and y
data = pd.DataFrame(boston['data'], columns=boston['feature_names'][:-1])
# Add an axis so that single-feature slices of x stay 2D, as sklearn expects
x = boston.data[:, np.newaxis]
# Add median housing prices as y
y = boston.target
# Use pandas to describe the target and the features
print(pd.Series(boston['target'], name=boston['feature_names'][-1]).describe())
print(pd.DataFrame(boston['data'], columns=boston['feature_names'][:-1]).describe())
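As a quick sanity check (a minimal sketch; the shapes assume the standard 506-sample, 13-feature Boston data), the extra axis makes x 3D so that single-feature slices stay 2D:
# Sanity-check the shapes (assumes the standard 506 x 13 Boston data)
print(boston.data.shape)  # (506, 13)
print(x.shape)            # (506, 1, 13) -- extra axis from np.newaxis
print(x[:, :, 5].shape)   # (506, 1) -- a 2D column, ready for sklearn
print(y.shape)            # (506,)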
from sklearn.linear_model import LinearRegression
# Create a new variable which is just the number of rooms (column 5, RM)
rooms = x[:, :, 5]
# LinearRegression?
# LinearRegression(self, fit_intercept=True, normalize=False, copy_X=True)
lr = LinearRegression(fit_intercept=True)
lr.fit(rooms, y)
# Calculate R^2
print("R^2: {}".format(lr.score(rooms, y)))
# Calculate the mean squared residual error
print("Mean squared error: {}".format(np.mean((lr.predict(rooms) - y) ** 2)))
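As a cross-check, the same two numbers can be computed with sklearn's built-in metrics, r2_score and mean_squared_error from sklearn.metrics:
from sklearn.metrics import mean_squared_error, r2_score
pred = lr.predict(rooms)
print("R^2 (sklearn.metrics): {}".format(r2_score(y, pred)))
print("MSE (sklearn.metrics): {}".format(mean_squared_error(y, pred)))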
We will now build a K-folds cross-validation function to use in future analyses. Don't be afraid to use ? for help in IPython notebooks.
KFold divides all the samples into k groups of samples, called folds, of equal size (where possible); if k = n, this is equivalent to the Leave-One-Out strategy. The prediction function is learned using k - 1 folds, and the fold left out is used for testing.
KFold?
KFold(self, n, n_folds=3, indices=True, shuffle=False, random_state=None, k=None)
Provides train/test indices to split data into train/test sets. Splits the dataset into k consecutive folds (without shuffling by default).
Each fold is then used as a validation set once while the k - 1 remaining folds form the training set.
n : int -- total number of elements
n_folds : int, default=3 -- number of folds (must be at least 2)
indices : boolean, optional (default True) -- return train/test splits as arrays of indices rather than boolean masks
shuffle : boolean, optional -- whether to shuffle the data before splitting into folds
random_state : int or RandomState -- pseudo-random number generator state, used when shuffle=True
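Before wrapping KFold in a function, here is a minimal sketch of what iterating over it yields (the 6-element toy size is purely illustrative):
from sklearn.cross_validation import KFold
# Toy example: 6 samples split into 3 folds
for train, test in KFold(6, n_folds=3):
    # Each iteration yields arrays of train and test indices
    print("train: {} test: {}".format(train, test))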
from sklearn.cross_validation import KFold

def cross_vd(x, y, folds):
    kf = KFold(len(x), n_folds=folds, shuffle=True)
    lr = LinearRegression(fit_intercept=True)
    scores = []
    # train and test are index arrays yielded by kf
    for train, test in kf:
        lr.fit(x[train], y[train])
        # Score the held-out fold with R^2
        score = lr.score(x[test], y[test])
        scores.append(score)
    # Average R^2 across folds
    return sum(scores) / len(scores)

cross_vd(rooms, y, 10)
from sklearn.cross_validation import cross_val_score, ShuffleSplit
cross_val_score?
Now, the only thing we really need to worry about is specifying a custom cross-validation iterator with ShuffleSplit. Then we simply pass our estimator, data, and iterator to cross_val_score.
The beauty of this approach is that cross_val_score handles the fit/score loop for us, and we can pass a different scoring parameter depending on which type of model we are fitting: regression ('r2', 'mean_squared_error'), classification ('accuracy', 'precision', etc.).
The full list of scoring options can be found in the scikit-learn model evaluation documentation.
cv = ShuffleSplit(len(x), n_iter=10, test_size=0.3, random_state=0)
lr = LinearRegression(fit_intercept=True, normalize=True)
scores = cross_val_score(lr, rooms, y, cv=cv, scoring='mean_squared_error')
scores
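One caveat: depending on the scikit-learn version, the 'mean_squared_error' scorer follows the greater-is-better convention and returns negated values. A minimal sketch, under that assumption, to convert the scores to RMSE:
# Assumes the scorer negates MSE (greater-is-better convention);
# drop the minus sign if your version returns positive values
rmse = np.sqrt(-scores)
print("Mean RMSE across splits: {}".format(rmse.mean()))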
We got a little sidetracked there, but let's try improving our regression model with more input features, while at the same time talking about more of the theory, assumptions, and best practices for multiple linear OLS regression.
Firstly, regression (or machine learning, for that matter) is not all about the predictive power of the final model. Regression also gives us the ability to make estimates about which features are driving the model. For example, it would be useful to know which house variables contribute to the median value, and which do not contribute significantly. With regression, we can begin to make claims like "as crime rates go up, housing prices go down".
# Load all the possible predictors into one dataframe
X = pd.DataFrame(boston['data'], columns = boston['feature_names'][:-1])
y = boston.target
Let's improve our cross-validation model fitting function so that we can analyze the variables.
def cross_vd(x, y, folds):
    kf = KFold(len(x), n_folds=folds, shuffle=True)
    lr = LinearRegression(fit_intercept=True)
    scores = []
    coefs = []
    # train and test are index arrays yielded by kf
    for train, test in kf:
        lr.fit(x[train], y[train])
        score = lr.score(x[test], y[test])
        scores.append(score)
        # Keep each fold's coefficients so we can analyze the variables
        coefs.append(lr.coef_)
    # Return the average R^2 along with the fold-averaged coefficients
    return sum(scores) / len(scores), np.mean(coefs, axis=0)

# Pass the underlying array (.values) so integer-array row indexing works
cross_vd(X.values, y, 10)
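To make the variable analysis concrete, here is a minimal sketch (using only the names defined above) that pairs the fold-averaged coefficients with their feature names, largest magnitude first. Keep in mind that the coefficients are on the raw feature scales, so their magnitudes are only directly comparable if the inputs are standardized first.
avg_r2, avg_coefs = cross_vd(X.values, y, 10)
print("Average R^2: {}".format(avg_r2))
# Sort features by absolute coefficient size, largest first
order = np.argsort(-np.abs(avg_coefs))
for name, coef in zip(X.columns[order], avg_coefs[order]):
    print("{}: {}".format(name, coef))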