# Import the useful data science packages!
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the data.
# You can download the dataset from
# https://docs.google.com/document/d/1w7KhqotVi5eoKE3I_AZHbsxdr-NmcWsLTIiZrpxWx4w/pub
# and then run all this locally.
prosper = pd.read_csv('/Users/charlie/Downloads/prosperLoanData.csv')
prosper.columns


def normalise(data):
    """Return *data* rescaled so each column has mean 0 and std dev 1.

    Does the same as the function in Lesson 3 of Intro to DS.
    """
    mean = data.mean()
    stdev = data.std()
    return (data - mean) / stdev


# Choose some of the many columns from the dataset. We're going to attempt to
# predict the 'LoanOriginalAmount' from some of the other data.
# .copy() ensures the in-place dropna/drop_duplicates below operate on an
# independent frame rather than a view (avoids SettingWithCopyWarning).
prosper = prosper[['CreditScoreRangeLower', 'StatedMonthlyIncome',
                   'IsBorrowerHomeowner', 'CreditScoreRangeUpper',
                   'EmploymentStatus', 'Term', 'BorrowerRate', 'LenderYield',
                   'LoanOriginalAmount']].copy()

# Select just the numerical variables: we'll normalise these, and we'll be
# creating dummy variables from the categorical variables.
numerical_variables = ['CreditScoreRangeLower', 'StatedMonthlyIncome',
                       'Term', 'CreditScoreRangeUpper', 'BorrowerRate',
                       'LenderYield', 'LoanOriginalAmount']

# Just remove the missing data and any duplication for simplicity!
prosper.dropna(inplace=True)
prosper.drop_duplicates(inplace=True)

# Choose the numerical variables from prosper; remove the target to create
# the feature matrix.
features = prosper[numerical_variables].drop(['LoanOriginalAmount'], axis=1)
# Normalising numerical features improves the performance of fitting
# algorithms (don't normalise the dummy variables though, that's generally a
# bad idea!)
features = normalise(features)

# Create dataframes of homeowner and employment dummies.
home_dum = pd.get_dummies(prosper.IsBorrowerHomeowner, prefix="homeowner")
job_dum = pd.get_dummies(prosper.EmploymentStatus, prefix="job")

# Uncomment to add a constant (intercept) column:
#features = sm.add_constant(features)

# Uncomment these to add the dummy variables to the features:
#features = features.join(job_dum)
#features = features.join(home_dum)

# Uncomment these to drop a single dummy variable from each full set
# (but only if you've previously added them!):
#features.drop(['job_Employed'], axis=1, inplace=True)
#features.drop(['homeowner_True'], axis=1, inplace=True)

# Set the target values to fit the linear regression model.
values = prosper.LoanOriginalAmount

# Watch out for strongly correlated features!
features.corr()

# Create, fit and summarise the model.
# Check out the coefficients and the condition number to look for
# multicollinearity. A good resource for understanding all of this summary
# output can be found in the excellent online statistics textbook here:
# http://work.thaslwanter.at/Stats/html/statsModels.html#linear-regression-analysis-with-python
sm.OLS(values, features).fit().summary()