# special IPython command to prepare the notebook for matplotlib %matplotlib inline import numpy as np import pandas as pd import scipy.stats as stats import matplotlib.pyplot as plt import statsmodels.api as sm # To load in the baseball data import requests import StringIO import zipfile # special matplotlib argument for improved plots from matplotlib import rcParams #colorbrewer2 Dark2 qualitative color table dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.facecolor'] = 'white' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'white' rcParams['patch.facecolor'] = dark2_colors[0] rcParams['font.family'] = 'StixGeneral' faithful = sm.datasets.get_rdataset("faithful") # Let's look at the help file # sm.datasets.get_rdataset? # faithful? faithful.title faithful = faithful.data faithful.head() faithful.shape plt.hist(faithful.waiting) plt.xlabel('Waiting time to next eruption (in mins)') plt.ylabel('Frequency') plt.title('Old Faithful Geyser time between eruption') plt.show() plt.scatter(faithful.waiting, faithful.eruptions) plt.xlabel('Waiting time to next eruption (in mins)') plt.ylabel('Eruption time (in mins)') plt.title('Old Faithful Geyser') plt.show() X = faithful.waiting y = faithful.eruptions model = sm.OLS(y, X) # Let's look at the options in model # model. results = model.fit() # Let's look at the options in results # results. print results.summary() results.params.values X = sm.add_constant(X) X.head() modelW0 = sm.OLS(y, X) resultsW0 = modelW0.fit() print resultsW0.summary() newX = np.array([1,75]) resultsW0.params[0]*newX[0] + resultsW0.params[1] * newX[1] resultsW0.predict(newX) plt.scatter(faithful.waiting, faithful.eruptions) plt.xlabel('Waiting time to next eruption (in mins)') plt.ylabel('Eruption time (in mins)') plt.title('Old Faithful Geyser') plt.plot(faithful.waiting, resultsW0.fittedvalues, color='blue', linewidth=3) plt.show() resids = faithful.eruptions - resultsW0.predict(X) resids = resultsW0.resid plt.plot(faithful.waiting, resids, 'o') plt.hlines(y = 0, xmin=40, xmax = 100) plt.xlabel('Waiting time') plt.ylabel('Residuals') plt.title('Residual Plot') plt.show() print np.sum((faithful.eruptions - resultsW0.predict(X)) ** 2) print np.mean((faithful.eruptions - resultsW0.predict(X)) ** 2) X = sm.add_constant(faithful.waiting) y = faithful.eruptions np.dot(X.T, X) np.linalg.inv(np.dot(X.T, X)) beta = np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(y) print "Directly estimating beta:", beta print "Estimating beta using statmodels: ", resultsW0.params.values def getZIP(zipFileName): r = requests.get(zipFileName).content s = StringIO.StringIO(r) zf = zipfile.ZipFile(s, 'r') # Read in a list of zipped files return zf url = 'http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip' zf = getZIP(url) tablenames = zf.namelist() salaries = pd.read_csv(zf.open(tablenames[tablenames.index('Salaries.csv')])) teams = pd.read_csv(zf.open(tablenames[tablenames.index('Teams.csv')])) teams = teams[['yearID', 'teamID', 'W']] totSalaries = salaries.groupby(['yearID','teamID'], as_index=False).sum() joined = pd.merge(totSalaries, teams, how="inner", on=['yearID', 'teamID']) joined.head() teamName = 'OAK' years = np.arange(1999, 2005) residData = pd.DataFrame() for yr in years: df = joined[joined['yearID'] == yr] X = df['salary'].values / 1e6 X = sm.add_constant(X) y = df['W'].values # least squares estimates model = sm.OLS(y, X) results = model.fit() # fit the linear regression model beta = results.params # least squares coefficients yhat = (beta[0] + beta[1]*X[:,1]) # regression line residData[yr] = results.resid # residuals = y - yhat residData.index = df['teamID'] residData = residData.T residData.index = residData.index.format() residData.plot(title = 'Residuals from least squares estimates across years', figsize = (15, 8), color=map(lambda x: 'blue' if x=='OAK' else 'gray',df.teamID)) plt.xlabel('Year') plt.ylabel('Residuals') plt.show() # your turn mtcars = sm.datasets.get_rdataset("mtcars") mtcars = mtcars.data mtcars['mpg'].hist() plt.title('Distribution of MPG') plt.xlabel('Miles Per Gallon') plt.plot(mtcars.cyl, mtcars.mpg, 'o') plt.xlim(3, 9) plt.xlabel('Cylinders') plt.ylabel('MPG') plt.title('Relationship between cylinders and MPG') plt.plot(mtcars.hp, mtcars.mpg, 'o') plt.xlabel('Horsepower') plt.ylabel('MPG') plt.title('Relationship between horsepower and MPG') # your turn y = mtcars.mpg X = mtcars[['cyl', 'hp']] X = sm.add_constant(X) model = sm.OLS(y,X) results = model.fit() print results.summary() newX = np.array([1, 6, 180]) results.predict(newX) # your turn newX = np.array([1, 4, 120]) results.predict(newX) # your turn beta = np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(y) beta