# special IPython command to prepare the notebook for matplotlib %matplotlib inline import requests import StringIO import zipfile import numpy as np import pandas as pd # pandas import matplotlib.pyplot as plt # module for plotting # If this module is not already installed, you may need to install it. # You can do this by typing 'pip install seaborn' in the command line import seaborn as sns import sklearn import sklearn.datasets import sklearn.cross_validation import sklearn.decomposition import sklearn.grid_search import sklearn.neighbors import sklearn.metrics ### Your code here ### def getZIP(zipFileName): r = requests.get(zipFileName).content s = StringIO.StringIO(r) zf = zipfile.ZipFile(s, 'r') # Read in a list of zipped files return zf url = 'http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip' zf = getZIP(url) tablenames = zf.namelist() print tablenames teams = pd.read_csv(zf.open(tablenames[tablenames.index('Teams.csv')])) players = pd.read_csv(zf.open(tablenames[tablenames.index('Batting.csv')])) salaries = pd.read_csv(zf.open(tablenames[tablenames.index('Salaries.csv')])) fielding = pd.read_csv(zf.open(tablenames[tablenames.index('Fielding.csv')])) master = pd.read_csv(zf.open(tablenames[tablenames.index('Master.csv')])) ### Your code here ### print "Dimensions of teams DataFrame:", teams.shape print "Dimensions of players DataFrame:", players.shape print "Dimensions of salaries DataFrame:", salaries.shape print "Dimensions of fielding DataFrame:", fielding.shape print "Dimensions of master DataFrame:", master.shape ### Your code here ### byPlayerID = salaries.groupby('playerID')['playerID','salary'].median() medianSalaries = pd.merge(master[['playerID', 'nameFirst', 'nameLast']], byPlayerID, \ left_on='playerID', right_index = True, how="inner") medianSalaries.head() ### Your code here ### subTeams = teams[(teams['G'] == 162) & (teams['yearID'] > 1947)].copy() subTeams["1B"] = subTeams.H - subTeams["2B"] - subTeams["3B"] - subTeams["HR"] 
# Plate appearances (simplified here as walks + at-bats).
subTeams["PA"] = subTeams.BB + subTeams.AB
# Convert raw counts into per-plate-appearance rates.
for col in ["1B","2B","3B","HR","BB"]:
    subTeams[col] = subTeams[col]/subTeams.PA
# Keep only the columns used in the regression below.
stats = subTeams[["teamID","yearID","W","1B","2B","3B","HR","BB"]].copy()
stats.head()

### Your code here ###
# One scatter plot per statistic: rate vs. season.
for col in ["1B","2B","3B","HR","BB"]:
    plt.scatter(stats.yearID, stats[col], c="g", alpha=0.5)
    plt.title(col)
    plt.xlabel('Year')
    plt.ylabel('Rate')
    plt.show()

### Your code here ###
# League-wide mean of each rate, per season.
# NOTE(review): tuple-style column selection on a groupby is a
# deprecated old-pandas idiom (newer pandas requires a list).
stats.groupby('yearID')["1B","2B","3B","HR","BB"].mean().head()

def meanNormalizeRates(df):
    # Subtract each season's league mean from every team's rates, so the
    # rates are comparable across eras.
    # NOTE(review): this mutates the group `df` in place and also returns
    # it — behavior relied upon by groupby.apply in this pandas version.
    subRates = df[["1B","2B","3B","HR","BB"]]
    df[["1B","2B","3B","HR","BB"]] = subRates - subRates.mean(axis=0)
    return df

stats = stats.groupby('yearID').apply(meanNormalizeRates)

### Your code here ###
# Fit wins ~ mean-normalized rates on pre-2002 seasons; hold out 2002+.
from sklearn import linear_model
clf = linear_model.LinearRegression()
stat_train = stats[stats.yearID < 2002]
stat_test = stats[stats.yearID >= 2002]
XX_train = stat_train[["1B","2B","3B","HR","BB"]].values
XX_test = stat_test[["1B","2B","3B","HR","BB"]].values
YY_train = stat_train.W.values
YY_test = stat_test.W.values
clf.fit(XX_train,YY_train)
clf.coef_  # displayed in the notebook: one coefficient per rate
print("Mean squared error: %.2f" % np.mean((YY_test - clf.predict(XX_test)) ** 2))

### Your code here ###
# Same rate construction as for teams, now per player-season, keeping
# only player-seasons with more than 500 plate appearances after 1947.
subPlayers = players[(players.AB + players.BB > 500) & (players.yearID > 1947)].copy()
subPlayers["1B"] = subPlayers.H - subPlayers["2B"] - subPlayers["3B"] - subPlayers["HR"]
subPlayers["PA"] = subPlayers.BB + subPlayers.AB
for col in ["1B","2B","3B","HR","BB"]:
    subPlayers[col] = subPlayers[col]/subPlayers.PA
# Create playerstats DataFrame
playerstats = subPlayers[["playerID","yearID","1B","2B","3B","HR","BB"]].copy()
# Mean-normalize per season, reusing the helper defined above.
playerstats = playerstats.groupby('yearID').apply(meanNormalizeRates)

### Your code here ###
playerstats.head()

### Your code here ###
def meanNormalizePlayerLS(df):
    # Career-average rates for one player (mean over their seasons).
    # Non-numeric playerID is dropped by mean() in this pandas version.
    df = df[['playerID', '1B','2B','3B','HR','BB']].mean()
    return df

def getyear(x):
    # Extract the 4-digit year from a 'YYYY-MM-DD'-style date string.
    return int(x[0:4])

playerLS = playerstats.groupby('playerID').apply(meanNormalizePlayerLS).reset_index()
# Attach debut / final-game dates from the Master table.
playerLS = master[["playerID","debut","finalGame"]].merge(playerLS, how='inner', on="playerID")
playerLS.head()
playerLS["debut"] = playerLS.debut.apply(getyear)
playerLS["finalGame"] = playerLS.finalGame.apply(getyear)
# Rename debut/finalGame (columns 1 and 2) to minYear/maxYear.
cols = list(playerLS.columns)
cols[1:3]=["minYear","maxYear"]
playerLS.columns = cols
playerLS.head()

### Your code here ###
# Predicted "wins" for a team made of 9 copies of each player, using the
# team-level regression fitted above (OPW = predicted wins).
avgRates = playerLS[["1B","2B","3B","HR","BB"]].values
playerLS["OPW"] = clf.predict(avgRates)
playerLS.head()

### Your code here ###
from collections import defaultdict

def find_pos(df):
    # Most frequently played fielding position for one player.
    # iteritems() is the Python 2 dict iterator.
    positions = df.POS
    d = defaultdict(int)
    for pos in positions:
        d[pos] += 1
    result = max(d.iteritems(), key=lambda x: x[1])
    return result[0]

positions_df = fielding.groupby("playerID").apply(find_pos)
positions_df = positions_df.reset_index()
positions_df = positions_df.rename(columns={0:"POS"})  # apply() named the column 0
# Combine position, career rates/OPW, and median salary into one table.
playerLS_merged = positions_df.merge(playerLS, how='inner', on="playerID")
playerLS_merged = playerLS_merged.merge(medianSalaries, how='inner', on=['playerID'])

### Your code here ###
playerLS_merged.head()

### Your code here ###
# "Active" players: career spans 2002-2003 and lasted at least 3 years.
active = playerLS_merged[(playerLS_merged["minYear"] <= 2002) & \
    (playerLS_merged["maxYear"] >= 2003) & \
    (playerLS_merged["maxYear"] - playerLS_merged["minYear"] >= 3) ]
fig = plt.figure()
ax = fig.gca()
ax.scatter(active.salary/10**6, active.OPW, alpha=0.5, c='red')
ax.set_xscale('log')
ax.set_xlabel('Salary (in Millions) on log')
ax.set_ylabel('OPW')
ax.set_title('Relationship between Salary and Predicted Number of Wins')
plt.show()

### Your code here ###
def meanNormalizeOPW(df):
    # Center the 'resid' column on its per-position median, so players
    # are compared against others at the same position.
    # NOTE(review): mutates the group in place, like meanNormalizeRates.
    tmp = df[['resid']]
    df[['resid']]=tmp-tmp.median(axis=0)
    return df

# NOTE(review): `active` is a filtered view of playerLS_merged; assigning
# into it is chained assignment (SettingWithCopy in newer pandas) — the
# original relies on old-pandas behavior here.
active['resid']=active['OPW']
active = active.groupby('POS').apply(meanNormalizeOPW)
# Residual of position-centered OPW against log-salary: how much better a
# player is than his salary predicts. `clf` is rebound here — the team
# regression above must already have been used before this point.
Y = active.resid.values
X = np.log(active[["salary"]])
clf = linear_model.LinearRegression()
clf.fit(X,Y)
active['resid'] = Y - clf.predict(X)
# Keep only players who over-perform their salary.
active = active[active.resid >= 0]

def getMinSalary(s):
    # Cheapest available (over-performing) player at one position.
    return s["salary"].min()

minSalaryByPos = active.groupby('POS').apply(getMinSalary)
# Greedy roster selection: pick one over-performing player per position
# under a $20M budget.
# NOTE(review): Series.sort() here is the deprecated in-place sort of
# old pandas (sort_values in newer versions) — most expensive minimum first.
minSalaryByPos.sort(ascending=False)
posleft = list(minSalaryByPos.index)
print posleft
moneyleft = 20*10**6
# indexes will contain the indexes of the players we chose
indexes=[]
for i in range(len(posleft)):
    # you need to have at least this much left to not go over in the next picks
    # NOTE(review): reserves the minimum salaries of all but one of the
    # still-unfilled positions; presumably the excluded one is the
    # position about to be filled — TODO confirm the [:-1] slice is the
    # intended reservation given posleft shrinks each iteration.
    maxmoney = moneyleft - sum([minSalaryByPos[x] for x in posleft[:-1] ])
    # consider only players in positions we have not selected
    index = [True if elem in posleft else False for elem in active.POS.values]
    left = active[index & (active.salary <= maxmoney)]
    # pick the one that stands out the most from what is expected given his salary
    # NOTE(review): in this pandas version Series.argmax() returns the
    # index *label* of the max (idxmax in newer pandas).
    j = left["resid"].argmax()
    indexes.append(j)
    # remove position we just filled from posleft
    posleft.remove(left.loc[j].POS)
    moneyleft = moneyleft - left.loc[j].salary
topPicks=active.loc[indexes,:]
# DataFrame.sort(...) is the deprecated old-pandas spelling of sort_values.
topPicks=topPicks.sort(["OPW"],ascending=False)
topPicks['salary'].sum()          # total payroll of the chosen roster
round(topPicks['OPW'].mean())     # mean predicted wins (notebook display)

### Your code here ###
def round1000(x):
    # Scale a per-PA rate to a rate per 1000 PAs and round it.
    return np.round(x*1000)

topPicks[["1B","2B","3B", "HR","BB"]] = topPicks[["1B","2B","3B", "HR","BB"]].apply(round1000)
topPicks[["OPW"]] = np.round(topPicks[["OPW"]])
# Final display table for the selected roster.
topPicks[["nameFirst","nameLast","POS","1B","2B","3B", "HR","BB","OPW","salary","minYear","maxYear"]]

#load the iris data set
iris = sklearn.datasets.load_iris()
X = iris.data
Y = iris.target
print X.shape, Y.shape

### Your code here ###
# put test data aside
X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(
    X, Y, test_size=0.33, random_state=42)
print X_train.shape
print X_test.shape
print Y_train.shape
print Y_test.shape

### Your code here ###
# make a scatter plot of the data in two dimensions
# TruncatedSVD on centered data is equivalent to PCA here.
svd = sklearn.decomposition.TruncatedSVD(n_components=2)
X_train_centered = X_train - np.mean(X_train, axis=0)
X_2d = svd.fit_transform(X_train_centered)
sns.set_style('white')
plt.scatter(X_2d[:,0], X_2d[:,1], c=Y_train, s = 50, cmap=plt.cm.prism)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('First two PCs using iris data')
plt.show()

### Your code here ###
# use cross validation to find the optimal value for k
k = np.arange(20)+1   # candidate neighborhood sizes 1..20
parameters = {'n_neighbors': k}
knn = sklearn.neighbors.KNeighborsClassifier()
# GridSearchCV from the legacy sklearn.grid_search module (10-fold CV).
clf = sklearn.grid_search.GridSearchCV(knn, parameters, cv=10)
clf.fit(X_train, Y_train)
# clf.grid_scores_

### Your code here ###
# Distribution of the 10 fold-scores for each candidate k.
a = clf.grid_scores_
scores = [b.cv_validation_scores for b in a]
score_means = np.mean(scores, axis=1)
sns.boxplot(scores)
plt.scatter(k,score_means, c='k', zorder=2)  # overlay per-k means
plt.ylim(0.8, 1.1)
plt.title('Accuracy as a function of $k$')
plt.ylabel('Accuracy')
plt.xlabel('Choice of k')
plt.show()

### Your code here ###
clf.best_params_   # chosen k (notebook display)

def computeTestScores(test_x, test_y, clf, cv):
    """Score a fitted classifier on `cv` disjoint folds of held-out data.

    Returns a list of `cv` accuracy scores, one per fold, so a mean and
    a standard deviation can be reported instead of a single number.
    """
    kFolds = sklearn.cross_validation.KFold(test_x.shape[0], n_folds=cv)
    scores = []
    for _, test_index in kFolds:
        # only the test half of each fold split is used for scoring
        test_data = test_x[test_index]
        test_labels = test_y[test_index]
        scores.append(sklearn.metrics.accuracy_score(test_labels, clf.predict(test_data)))
    return scores

### Your code here ###
## no measurement without standard deviation
scores = computeTestScores(test_x = X_test, test_y = Y_test, clf=clf, cv=5)
print np.mean(scores), np.std(scores)

# Switch to the handwritten-digits data set (64 pixel features).
digits = sklearn.datasets.load_digits()
X = digits.data
Y = digits.target
print X.shape, Y.shape

### Your code here ###
X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(
    X, Y, test_size=0.33, random_state=42)
# Center the test data with the TRAINING means (no test-set leakage).
X_train_means = np.mean(X_train, axis=0)
X_train_centered = X_train - X_train_means
X_test_centered = X_test - X_train_means
print X_train.shape
print X_test.shape
print Y_train.shape
print Y_test.shape

### Your code here ###
## make a scatter plot of the data in two dimensions
svd = sklearn.decomposition.TruncatedSVD(n_components=2)
X_2d = svd.fit_transform(X_train_centered)
plt.scatter(X_2d[:,0], X_2d[:,1], c=Y_train, s = 50, cmap=plt.cm.Paired)
plt.colorbar()
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('First two PCs using digits data')
plt.show()

### Your code here ###
# Boolean mask selecting only the digits 1 and 4 for a two-class view.
ind = np.logical_or(Y_train==4, Y_train==1)
# Re-plot the first two PCs restricted to the two classes selected above.
plt.scatter(X_2d[ind,0], X_2d[ind,1], c=Y_train[ind], s = 50, cmap=plt.cm.Paired)
plt.colorbar()
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('First two PCs using digits data (Only two classes)')
plt.show()

def computeTestScores(test_x, test_y, clf, cv):
    """Score a fitted classifier on `cv` disjoint folds of held-out data.

    Returns a list of `cv` accuracy scores (one per fold); identical to
    the helper defined earlier in the notebook.
    """
    kFolds = sklearn.cross_validation.KFold(test_x.shape[0], n_folds=cv)
    scores = []
    for _, test_index in kFolds:
        test_data = test_x[test_index]
        test_labels = test_y[test_index]
        scores.append(sklearn.metrics.accuracy_score(test_labels, clf.predict(test_data)))
    return scores

### Your cross validation and evaluation code here ###
# use cross validation to find the optimal value for k
k = np.arange(20)+1
parameters = {'n_neighbors': k}
knn = sklearn.neighbors.KNeighborsClassifier()
clf = sklearn.grid_search.GridSearchCV(knn, parameters, cv=10)
all_scores = []
all_k = []
all_d = [1,2,3,4,5,6,7,8,9,10]   # candidate reduced dimensionalities
# First pass: reduce with SVD fitted on the training set, tune k on the
# training set, then score on the fixed held-out test set.
for d in all_d:
    print d   # progress indicator
    svd = sklearn.decomposition.TruncatedSVD(n_components=d)
    # d < 64 always holds for all_d above; the else branch is a guard for
    # requesting the full 64-pixel space, where SVD would be a no-op.
    if d<64:
        X_d = svd.fit_transform(X_train_centered)
        X_d_test = svd.transform(X_test_centered)
    else:
        X_d = X_train
        X_d_test = X_test
    clf.fit(X_d, Y_train)
    all_scores.append(computeTestScores(test_x=X_d_test, test_y=Y_test, clf=clf, cv=10))
    all_k.append(clf.best_params_['n_neighbors'])

# Second pass: proper nested cross-validation — the SVD, centering, and
# k-selection are all redone inside every outer fold on that fold's
# training split only, so no information leaks from the outer test split.
# use cross validation to find the optimal value for k
k = np.arange(20)+1
parameters = {'n_neighbors': k}
knn = sklearn.neighbors.KNeighborsClassifier()
clf = sklearn.grid_search.GridSearchCV(knn, parameters, cv=10)
all_scores = []
all_k = []
all_d = [1,2,3,4,5,6,7,8,9,10]
kFolds = sklearn.cross_validation.KFold(X.shape[0], n_folds=10)
for d in all_d:
    print d
    svd = sklearn.decomposition.TruncatedSVD(n_components=d)
    #get the data for this iteration of the outer cross validation loop
    scores = []
    for train_index, test_index in kFolds:
        train_data, test_data = X[train_index], X[test_index]
        train_labels, test_labels = Y[train_index], Y[test_index]
        if d<64:
            # Center with the fold's training mean only, then project.
            data_mean = np.mean(train_data, axis=0)
            train_data_centered = train_data - data_mean
            test_data_centered = test_data - data_mean
            X_d = svd.fit_transform(train_data_centered)
            X_d_test = svd.transform(test_data_centered)
        else:
            X_d = train_data
            X_d_test = test_data
        # Inner 10-fold grid search re-selects k within this outer fold.
        clf.fit(X_d, train_labels)
        scores.append(sklearn.metrics.accuracy_score(test_labels, clf.predict(X_d_test)))
    all_scores.append(scores)
    # NOTE(review): records the best k from the LAST outer fold only.
    all_k.append(clf.best_params_['n_neighbors'])

### Your boxplot code here ###
# Accuracy distribution across outer folds, one box per dimensionality.
# NOTE(review): boxplot(..., names=...) is the legacy seaborn signature.
all_s = np.asarray(all_scores)
sns.boxplot(all_s.T,names = [np.str(dd) for dd in all_d])
plt.ylabel("Accuracy")
plt.xlabel("Number of Dimensions")
plt.title('Accuracy as a function of Number of Dimensions')
plt.show()

### Your code here ###
# Same plot, with the selected k annotated in each box label.
all_s = np.asarray(all_scores)
sns.boxplot(all_s.T,names = [np.str(dd) + ", k=" + np.str(kk) for dd, kk in zip(all_d, all_k)])
plt.ylabel("Accuracy")
plt.xlabel("Number of Dimensions")
plt.title('Accuracy as a function of Number of Dimensions')
plt.show()