"""Show the effect of doing feature selection outside cross-validation.

The target depends on only two of one million random columns, and there are
just 20 observations.  The script first selects the two "best" columns using
all rows and cross-validates a linear model on those columns, then repeats
the exercise with the selection step re-run inside every fold, and prints
the resulting scores so the two procedures can be compared.
"""
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
# ``sklearn.cross_validation`` was removed from scikit-learn; the same names
# now live in ``sklearn.model_selection``.
from sklearn.model_selection import KFold, cross_val_score

try:
    # Equivalent of the ``%matplotlib inline`` magic when run under IPython.
    get_ipython().run_line_magic("matplotlib", "inline")  # noqa: F821
except NameError:
    # Plain Python interpreter: there is no IPython shell, keep the default
    # matplotlib backend.
    pass


def hidden_model(x: np.ndarray) -> np.ndarray:
    """Return the target vector: column 5 plus column 10 plus small noise.

    Args:
        x: 2-D array indexed as ``x[:, j]``; it needs at least 11 columns
            because column index 10 is read.

    Returns:
        1-D array with one value per row of ``x``.
    """
    # y is a linear combination of columns 5 and 10...
    result = x[:, 5] + x[:, 10]
    # ... with a little noise
    result += np.random.normal(0, .005, result.shape)
    return result


def make_x(nobs: int) -> np.ndarray:
    """Return a ``(nobs, 10 ** 6)`` array of draws from ``uniform(0, 3)``.

    Args:
        nobs: Number of rows (observations) to generate.
    """
    return np.random.uniform(0, 3, (nobs, 10 ** 6))


# --- Generate the data -------------------------------------------------------
x = make_x(20)
y = hidden_model(x)
print(x.shape)

# --- Select the two highest-scoring columns using ALL of the rows ------------
selector = SelectKBest(f_regression, k=2).fit(x, y)
best_features = np.where(selector.get_support())[0]
print(best_features)

# One figure per selected column.
for b in best_features:
    plt.plot(x[:, b], y, 'o')
    plt.title("Column %i" % b)
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.show()

# --- Fit and score on the very rows that were used for the selection ---------
xt = x[:, best_features]
clf = LinearRegression().fit(xt, y)
print("Score is ", clf.score(xt, y))

yp = clf.predict(xt)
plt.plot(yp, y, 'o')
plt.plot(y, y, 'r-')
plt.xlabel("Predicted")
plt.ylabel("Observed")
# Close this figure so the following plots do not land on the same axes.
plt.show()

# --- Cross-validate, but only the regression step ----------------------------
# The columns in ``xt`` were chosen using every row, including the rows that
# serve as test folds below.
print("CV Score with features selected on all data is ",
      cross_val_score(clf, xt, y, cv=5).mean())

for train, test in KFold(n_splits=10).split(xt):
    xtrain, xtest, ytrain, ytest = xt[train], xt[test], y[train], y[test]
    clf.fit(xtrain, ytrain)
    yp = clf.predict(xtest)

    plt.plot(yp, ytest, 'o')
    plt.plot(ytest, ytest, 'r-')

plt.xlabel("Predicted")
plt.ylabel("Observed")
plt.show()

# --- Apply the model to freshly generated data -------------------------------
# NOTE: ``clf`` still holds the fit from the last fold of the loop above.
x2 = make_x(100)
y2 = hidden_model(x2)
x2 = x2[:, best_features]

y2p = clf.predict(x2)
plt.plot(y2p, y2, 'o')
plt.plot(y2, y2, 'r-')
plt.xlabel("Predicted")
plt.ylabel("Observed")
plt.show()

# --- Cross-validate with the feature selection repeated inside each fold -----
scores = []
for train, test in KFold(n_splits=5).split(x):
    xtrain, xtest, ytrain, ytest = x[train], x[test], y[train], y[test]

    # Choose the columns from the training rows only.
    b = SelectKBest(f_regression, k=2)
    b.fit(xtrain, ytrain)
    mask = b.get_support()
    xtrain = xtrain[:, mask]
    xtest = xtest[:, mask]

    clf.fit(xtrain, ytrain)
    scores.append(clf.score(xtest, ytest))

    yp = clf.predict(xtest)
    plt.plot(yp, ytest, 'o')
    plt.plot(ytest, ytest, 'r-')

plt.xlabel("Predicted")
plt.ylabel("Observed")
plt.show()

print("CV Score is ", np.mean(scores))