import pandas as pd
import numpy as np
import matplotlib.pyplot as pl

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("./data/credit-data-trainingset.csv")
df.head()

features = np.array(['revolving_utilization_of_unsecured_lines', 'age',
                     'number_of_time30-59_days_past_due_not_worse', 'debt_ratio',
                     'monthly_income', 'number_of_open_credit_lines_and_loans',
                     'number_of_times90_days_late', 'number_real_estate_loans_or_lines',
                     'number_of_time60-89_days_past_due_not_worse', 'number_of_dependents'])

# fit a random forest on the raw features; feature importances are computed
# automatically (the old compute_importances flag no longer exists in scikit-learn)
clf = RandomForestClassifier()
clf.fit(df[features], df['serious_dlqin2yrs'])

# from the calculated importances, order them from most to least important
# and make a barplot so we can visualize what is/isn't important
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(features)) + 0.5
pl.barh(padding, importances[sorted_idx], align='center')
pl.yticks(padding, features[sorted_idx])
pl.xlabel("Relative Importance")
pl.title("Variable Importance")
pl.show()

# bin monthly income into 15 equal-width buckets
df['income_bins'] = pd.cut(df.monthly_income, bins=15)
pd.value_counts(df['income_bins'])
# not very helpful -- a few extreme incomes stretch the bins

# cap monthly income at 15,000 so outliers don't dominate the binning
def cap_values(x, cap):
    if x > cap:
        return cap
    else:
        return x

df.monthly_income = df.monthly_income.apply(lambda x: cap_values(x, 15000))
df.monthly_income.describe()

df['income_bins'] = pd.cut(df.monthly_income, bins=15, labels=False)
pd.value_counts(df.income_bins)

# delinquency rate per income bin
df[["income_bins", "serious_dlqin2yrs"]].groupby("income_bins").mean()

cols = ["income_bins", "serious_dlqin2yrs"]
df[cols].groupby("income_bins").mean().plot()

# delinquency rate by age
cols = ['age', 'serious_dlqin2yrs']
age_means = df[cols].groupby("age").mean()
age_means.plot()

# bucket ages: under 20, 5-year buckets from 20 to 75, then 75+
mybins = [0] + list(range(20, 80, 5)) + [120]
df['age_bucket'] = pd.cut(df.age, bins=mybins)
pd.value_counts(df['age_bucket'])

df[["age_bucket", "serious_dlqin2yrs"]].groupby("age_bucket").mean()
df[["age_bucket", "serious_dlqin2yrs"]].groupby("age_bucket").mean().plot()

# convert the age-bucket intervals to integer codes so they can be used as a feature
labels, levels = pd.factorize(df.age_bucket)
df.age_bucket = labels

# bin debt_ratio by quantile; include the 0th quantile so the smallest
# values land in the first bin instead of falling outside the edges
bins = []
for q in [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]:
    bins.append(df.debt_ratio.quantile(q))

debt_ratio_binned = pd.cut(df.debt_ratio, bins=bins)
debt_ratio_binned
print(pd.value_counts(debt_ratio_binned))

# standardize monthly income (zero mean, unit variance)
df['monthly_income_scaled'] = StandardScaler().fit_transform(df[['monthly_income']]).ravel()

print(df.monthly_income_scaled.describe())
print()
print("Mean at 0?", round(df.monthly_income_scaled.mean(), 10) == 0)

pl.hist(df.monthly_income_scaled)

# refit the random forest with the engineered features included
features = np.array(['revolving_utilization_of_unsecured_lines', 'age',
                     'number_of_time30-59_days_past_due_not_worse', 'debt_ratio',
                     'monthly_income', 'number_of_open_credit_lines_and_loans',
                     'number_of_times90_days_late', 'number_real_estate_loans_or_lines',
                     'number_of_time60-89_days_past_due_not_worse', 'number_of_dependents',
                     'income_bins', 'age_bucket', 'monthly_income_scaled'])

clf = RandomForestClassifier()
clf.fit(df[features], df['serious_dlqin2yrs'])

importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(features)) + 0.5
pl.barh(padding, importances[sorted_idx], align='center')
pl.yticks(padding, features[sorted_idx])
pl.xlabel("Relative Importance")
pl.title("Variable Importance")
pl.show()

# features ordered from most to least important
best_features = features[sorted_idx][::-1]
best_features
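
# A minimal follow-on sketch, not part of the original walkthrough: one way to
# use best_features is to retrain the forest on only the top-ranked columns.
# The cutoff of 5 features is an arbitrary choice for illustration, and the
# in-sample score below is only a rough sanity check, not a proper evaluation.
n_keep = 5
top_features = best_features[:n_keep]

clf_top = RandomForestClassifier()
clf_top.fit(df[top_features], df['serious_dlqin2yrs'])

print(top_features)
print("in-sample accuracy:", clf_top.score(df[top_features], df['serious_dlqin2yrs']))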