# Import the useful data science packages!
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the data.
# You can download the dataset from
# https://docs.google.com/document/d/1w7KhqotVi5eoKE3I_AZHbsxdr-NmcWsLTIiZrpxWx4w/pub
# and then run all this locally.
prosper = pd.read_csv('/Users/charlie/Downloads/prosperLoanData.csv')
prosper.columns


def normalise(data):
    """Return *data* rescaled so each column has mean 0 and std dev 1.

    Does the same as the function in Lesson 3 of Intro to DS.
    """
    mean = data.mean()
    stdev = data.std()
    return (data - mean) / stdev


# Choose some of the many columns from the dataset. We're going to attempt to
# predict the 'LoanOriginalAmount' from some of the other data.
# .copy() ensures the in-place dropna/drop_duplicates below operate on an
# independent frame rather than a view (avoids SettingWithCopyWarning).
prosper = prosper[['CreditScoreRangeLower', 'StatedMonthlyIncome',
                   'IsBorrowerHomeowner', 'CreditScoreRangeUpper',
                   'EmploymentStatus', 'Term', 'BorrowerRate', 'LenderYield',
                   'LoanOriginalAmount']].copy()

# Select just the numerical variables: we'll normalise these, and we'll be
# creating dummy variables from the categorical variables.
numerical_variables = ['CreditScoreRangeLower', 'StatedMonthlyIncome',
                       'Term', 'CreditScoreRangeUpper', 'BorrowerRate',
                       'LenderYield', 'LoanOriginalAmount']

# Just remove the missing data and any duplication for simplicity!
prosper.dropna(inplace=True)
prosper.drop_duplicates(inplace=True)

# Choose the numerical variables from prosper; remove the target to create
# the feature matrix.
features = prosper[numerical_variables].drop(['LoanOriginalAmount'], axis=1)
# Normalising numerical features improves the performance of fitting
# algorithms (don't normalise the dummy variables though, that's generally a
# bad idea!)
features = normalise(features)

# Create dataframes of homeowner and employment dummies.
home_dum = pd.get_dummies(prosper.IsBorrowerHomeowner, prefix="homeowner")
job_dum = pd.get_dummies(prosper.EmploymentStatus, prefix="job")

# Uncomment to add a constant (intercept) column:
#features = sm.add_constant(features)

# Uncomment these to add the dummy variables to the features:
#features = features.join(job_dum)
#features = features.join(home_dum)

# Uncomment these to drop a single dummy variable from each full set
# (but only if you've previously added them!):
#features.drop(['job_Employed'], axis=1, inplace=True)
#features.drop(['homeowner_True'], axis=1, inplace=True)

# Set the target values to fit the linear regression model.
values = prosper.LoanOriginalAmount

# Watch out for strongly correlated features!
features.corr()

# Create, fit and summarise the model.
# Check out the coefficients and the condition number to look for
# multicollinearity. A good resource for understanding all of this summary
# output can be found in the excellent online statistics textbook here:
# http://work.thaslwanter.at/Stats/html/statsModels.html#linear-regression-analysis-with-python
sm.OLS(values, features).fit().summary()