# Notebook walkthrough, part 1: fetch the `pods` helper package, load the
# Olympic men's marathon data and fit the line f = m*x + c, first by
# alternating closed-form updates of c and m, then in matrix form.

# download the software
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2 fallback

urlretrieve('https://github.com/sods/ods/archive/master.zip', 'master.zip')

# unzip the software
import zipfile

# NOTE: the archive comes straight from the network and is unpacked into the
# current directory without inspection; only run this against a trusted URL.
# The context manager closes the archive and avoids shadowing the builtin `zip`.
with zipfile.ZipFile('./master.zip', 'r') as archive:
    archive.extractall('.')

# add the module location to the python path.
import sys

sys.path.append("./ods-master/")

#import pods
#pods.notebook.code_toggle()

import pods

pods.notebook.display_google_book(id='spcAAAAAMAAJ', page=72)

data = pods.datasets.olympic_marathon_men()
x = data['X']
y = data['Y']

print(x)
print(y)

# `%matplotlib inline` is IPython-only syntax; request it through the IPython
# API when running under IPython and skip it in a plain Python interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import pylab as plt

plt.plot(x, y, 'rx')
plt.xlabel('year')
plt.ylabel('pace in min/km')

# initial guesses for the slope and the offset
m = -0.4
c = 80

# set c to the minimum
c = (y - m*x).mean()
print(c)

# with c held fixed, set m to its minimising value
m = ((y - c)*x).sum()/(x**2).sum()
print(m)

import numpy as np

# column of evenly spaced years used for drawing the fitted line
x_test = np.linspace(1890, 2020, 130)[:, None]

f_test = m*x_test + c

plt.plot(x_test, f_test, 'b-')
plt.plot(x, y, 'rx')

# Alternate the two closed-form updates for a fixed number of sweeps.
# NOTE(review): the loop body extent was lost when the notebook was flattened;
# this assumes both updates are inside the loop and the prints follow it.
for i in np.arange(10):
    m = ((y - c)*x).sum()/(x*x).sum()
    c = (y-m*x).sum()/y.shape[0]
print(m)
print(c)

f_test = m*x_test + c
plt.plot(x_test, f_test, 'b-')
plt.plot(x, y, 'rx')

# Question 3 Answer Code
# Write code for you answer to this question in this box
# Do not delete these comments, otherwise you will get zero for this answer.
# Make sure your code has run and the answer is correct *before* submitting your notebook for marking.

# define the vector w
# The first column of X below is the constant 1 and the second is x, so the
# offset c must be w[0] and the slope m must be w[1] for X w to equal m*x + c.
w = np.zeros(shape=(2, 1))
w[0] = c
w[1] = m

# assumes x is a 2-D column array so that hstack yields two columns -- TODO confirm
X = np.hstack((np.ones_like(x), x))
print(X)

f = np.dot(X, w) # np.dot does matrix multiplication in python
resid = (y-f)
E = np.dot(resid.T, resid) # matrix multiplication on a single vector is equivalent to a dot product.
print("Error function is: {}".format(E))

# plain-Python equivalent of the IPython help query `np.linalg.solve?`
help(np.linalg.solve)
# Notebook walkthrough, part 2: solve the least-squares problem through the
# normal equations, repeat it on the movie body count data with several
# features, and finally re-solve the same system via a QR decomposition.
# Relies on np, plt, pods, x, y, X and x_test set up earlier in the file.

# solve (X^T X) w = X^T y for w
w = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))
print(w)

# the slope is read from w[1] and the offset from w[0]
m = w[1]
c = w[0]
f_test = m*x_test + c
print(m)
print(c)
plt.plot(x_test, f_test, 'b-')
plt.plot(x, y, 'rx')

data = pods.datasets.movie_body_count()
movies = data['Y']
print(', '.join(movies.columns))

select_features = ['Year', 'Body_Count', 'Length_Minutes']
# take an explicit copy so that adding a column below neither writes into
# `movies` nor triggers pandas' chained-assignment warning
X = movies[select_features].copy()
X['Eins'] = 1 # add a column for the offset
y = movies[['IMDB_Rating']]

import pandas as pd

w = pd.DataFrame(data=np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y)),  # solve linear regression here
                 index=X.columns,  # columns of X become rows of w
                 columns=['regression_coefficient'])  # the column of X is the value of regression coefficient

# histogram of the residuals y - X w
(y - np.dot(X, w)).hist()

from IPython.display import YouTubeVideo, display

# explicit display calls: a bare expression only renders as the last
# statement of a notebook cell and does nothing in a script
display(w)

display(YouTubeVideo('ui-uNlFHoms'))

display(YouTubeVideo('78YNphT90-k'))

import scipy as sp
import scipy.linalg  # load the submodule explicitly; `import scipy` alone may not expose it

# X = Q R, so the normal equations reduce to the triangular system R w = Q^T y
Q, R = np.linalg.qr(X)
w = sp.linalg.solve_triangular(R, np.dot(Q.T, y))
w = pd.DataFrame(w, index=X.columns)
display(w)