from __future__ import division

# any other imports here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# note: while it's considered common to import pandas as pd and matplotlib with plt,
# i'm unaware of a "best practice" for sklearn modules.
from sklearn import feature_selection as f_select
from sklearn import linear_model as lm
from sklearn import cross_validation as cv
from sklearn import metrics
from sklearn.datasets import load_boston

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)
# Load the Boston housing data into a DataFrame: one column per predictor,
# plus the regression target stored under the name held in y_column.
y_column = 'MEDV'
boston = load_boston()
desc = boston.DESCR  # human-readable dataset description, printed further down
bostondf = pd.DataFrame(boston.data, columns=boston.feature_names)
bostondf[y_column] = boston.target
# Candidate predictors are every column except the target.
x_columns = [name for name in bostondf.columns if name != y_column]
# Univariate screening: test each predictor on its own against the target and
# keep those whose p-value is below 0.05. `pvals` is kept in the same order as
# `significant_columns` so the two can be tabulated side by side later.
significant_columns = []
pvals = []
for feature in x_columns:
    # f_regression returns (F-scores, p-values), one entry per input column.
    # Only one column is passed in, so entry 0 is this feature's p-value.
    _, feature_pvals = f_select.f_regression(bostondf[[feature]], bostondf[y_column])
    feature_pval = feature_pvals[0]
    # Phrased as "not (p < 0.05)" so a NaN p-value is rejected, exactly as a
    # plain `p < 0.05` acceptance test would reject it.
    if not (feature_pval < 0.05):
        continue
    significant_columns.append(feature)
    pvals.append(feature_pval)
# Hold out roughly a third of the rows for testing (fixed seed so the split is
# reproducible), then fit ordinary least squares on the training part using
# only the predictors that passed the screening step.
features = bostondf[significant_columns]
target = bostondf[y_column]
x_train, x_test, y_train, y_test = cv.train_test_split(
    features,
    target,
    test_size=0.333,
    random_state=1234,
)
model = lm.LinearRegression()
model.fit(x_train, y_train)
# Report the fitted model: one row per selected feature with its regression
# coefficient and the univariate p-value that got it selected.
coef_table = pd.DataFrame({
    'column': significant_columns,
    'coef': model.coef_,
    'p-value': pvals,
}).set_index('column')

# Single-argument print(...) and print('') are used instead of the Python 2
# print statement: they emit the same text on Python 2 and are also valid
# syntax on Python 3.
print(coef_table)
print('')
# R^2 on the training split, computed two ways (estimator's own score and
# metrics.r2_score on its predictions) as a cross-check.
print(model.score(x_train, y_train))
print(metrics.r2_score(y_train, model.predict(x_train)))
print('')
# Same pair of R^2 values on the held-out test split.
print(model.score(x_test, y_test))
print(metrics.r2_score(y_test, model.predict(x_test)))
#               coef       p-value
# column
# CRIM     -0.101202  2.083550e-19
# ZN        0.062922  5.713584e-17
# INDUS    -0.024451  4.900260e-31
# CHAS      2.743707  7.390623e-05
# NOX     -22.724112  7.065042e-24
# RM        2.415003  2.487229e-74
# AGE       0.004942  1.569982e-18
# DIS      -1.904339  1.206612e-08
# RAD       0.389172  5.465933e-19
# TAX      -0.014100  5.637734e-29
# PTRATIO  -1.118604  1.609509e-34
# B         0.007027  1.318113e-14
# LSTAT    -0.587270  5.081103e-88
#
# 0.72839057184
# 0.72839057184
#
# 0.736360506263
# 0.736360506263
# Show the dataset's bundled description text. The parenthesized form prints
# the same text on Python 2 and is also valid syntax on Python 3.
print(desc)
# Boston House Prices dataset
#
# Notes
# ------
# Data Set Characteristics:
#
#     :Number of Instances: 506
#
#     :Number of Attributes: 13 numeric/categorical predictive
#
#     :Median Value (attribute 14) is usually the target
#
#     :Attribute Information (in order):
#         - CRIM     per capita crime rate by town
#         - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
#         - INDUS    proportion of non-retail business acres per town
#         - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
#         - NOX      nitric oxides concentration (parts per 10 million)
#         - RM       average number of rooms per dwelling
#         - AGE      proportion of owner-occupied units built prior to 1940
#         - DIS      weighted distances to five Boston employment centres
#         - RAD      index of accessibility to radial highways
#         - TAX      full-value property-tax rate per $10,000
#         - PTRATIO  pupil-teacher ratio by town
#         - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
#         - LSTAT    % lower status of the population
#         - MEDV     Median value of owner-occupied homes in $1000's
#
#     :Missing Attribute Values: None
#
#     :Creator: Harrison, D. and Rubinfeld, D.L.
#
# This is a copy of UCI ML housing dataset.
# http://archive.ics.uci.edu/ml/datasets/Housing
#
# This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
#
# The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
# prices and the demand for clean air', J. Environ. Economics & Management,
# vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
# ...', Wiley, 1980. N.B. Various transformations are used in the table on
# pages 244-261 of the latter.
#
# The Boston house-price data has been used in many machine learning papers that address regression
# problems.
#
# **References**
#
#    - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
#    - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
#    - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)