"""Exploratory analysis of interest rates in the loans data set.

The script downloads the loans CSV, normalises the column names and the
percentage / duration columns, draws descriptive plots of the interest
rate, saves a scatter plot of interest rate against FICO range to
``interest.png`` and fits an OLS model of interest rate on loan length and
FICO range.

Bare expressions such as ``lm.summary()`` only display a result when the
code is run in an interactive session (IPython / Jupyter).
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot

# ---------------------------------------------------------------------------
# Load and clean
# ---------------------------------------------------------------------------
loansData = pd.read_csv('https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv')
loansData = loansData.reset_index(drop=True)

# Build a real list: a lazy ``map`` object is not a reliable value for
# ``DataFrame.columns`` on Python 3.
loansData.columns = [name.replace('.', '_').lower() for name in loansData.columns]

loansData['amount_requested'] = loansData['amount_requested'].astype(float)

# Vectorised string handling keeps missing values as NaN (they are removed by
# ``dropna`` below) instead of raising inside a row-wise lambda.
loansData['interest_rate'] = loansData['interest_rate'].str.strip('%').astype(float)
loansData['debt_to_income_ratio'] = (
    loansData['debt_to_income_ratio'].str.strip('%').astype(float) / 100
)
# ``strip('months')`` removes any of the characters m/o/n/t/h/s from both
# ends, which leaves the numeric part of the value plus surrounding blanks.
loansData['loan_length'] = (
    loansData['loan_length'].str.strip('months').str.strip().astype(float)
)

loansData = loansData.dropna()

# ---------------------------------------------------------------------------
# Descriptive statistics and plots
# ---------------------------------------------------------------------------
loansData['interest_rate'].describe()

# Each plot gets its own figure so that, outside a notebook, successive plots
# are not drawn on top of each other on the current axes.
plt.figure()
loansData.interest_rate.plot(kind='kde')

# Restrict to numeric/boolean columns explicitly: recent pandas versions raise
# on string columns instead of silently dropping them.
loansData.select_dtypes(include=['number', 'bool']).corr()

plt.figure()
loansData.boxplot(column='interest_rate')

# Share of loans per loan length, used as box widths. Sorted by loan length so
# the widths line up with the (sorted) groups drawn by ``boxplot(by=...)``
# rather than depending on the frequency order of ``value_counts``.
props = loansData.loan_length.value_counts(normalize=True).sort_index()
loansData.boxplot(column='interest_rate', by='loan_length', widths=props.tolist())

# ---------------------------------------------------------------------------
# Interest rate vs. FICO range, coloured by loan length
# ---------------------------------------------------------------------------
grouped = loansData.groupby('loan_length')
fr = pd.Categorical(loansData.fico_range)
cols = ['Yellow', 'MediumOrchid']

f, a = plt.subplots(figsize=(12, 7))
# NOTE: ``zip`` stops at the shorter input, so only as many groups as there
# are colours in ``cols`` are drawn.
for colour, (length, group) in zip(cols, grouped):
    # Encode against the global categories so that every group uses the same
    # integer position for a given FICO range, matching the tick labels below
    # even when a group does not contain every range.
    codes = pd.Categorical(group.fico_range, categories=fr.categories).codes
    a.scatter(
        codes,
        group.interest_rate,
        c=colour,
        label=str(int(length)) + ' months',
        s=30,
    )

a.set_xticks(np.arange(len(fr.categories)))
a.set_xticklabels(fr.categories, rotation=30)
a.set_xlabel('FICO range', fontsize=12)
a.set_ylabel('Interest rate', fontsize=12)
a.legend()
f.suptitle('Interest rate as a function of FICO range and loan length', fontsize=15)
f.savefig('interest.png')

# ---------------------------------------------------------------------------
# Linear model
# ---------------------------------------------------------------------------
lm = ols('interest_rate ~ C(loan_length) + fico_range', loansData).fit()
lm.summary()
lm.pvalues
qqplot(lm.resid, line='s')