Handling missing values
Imputation is fairly intuitive. For the missing data in our dataset, we're going to replace it with values that come from similar records in our dataset that aren't null. To do this we're going to use the NearestNeighbors algorithm.
import pandas as pd
import numpy as np
import pylab as pl
df = pd.read_csv("./data/credit-data-post-import.csv")
We're going to use the simplest type of cross validation. we'll simply split our data into 2 groups: training and test. we'll use the training set to calibrate our model and then use the test set to evaluate how effective it is.
is_test = np.random.uniform(0, 1, len(df)) > 0.75
train = df[is_test==False]
test = df[is_test==True]
len(train), len(test)
(112415, 37585)
from sklearn.neighbors import KNeighborsRegressor
income_imputer = KNeighborsRegressor(n_neighbors=1)
#split our data into 2 groups; data containing nulls and data
# not containing nulls we'll train on the latter and make
# 'predictions' on the null data to impute monthly_income
train_w_monthly_income = train[train.monthly_income.isnull()==False]
train_w_null_monthly_income = train[train.monthly_income.isnull()==True]
train_w_monthly_income.corr()
serious_dlqin2yrs | revolving_utilization_of_unsecured_lines | age | number_of_time30-59_days_past_due_not_worse | debt_ratio | monthly_income | number_of_open_credit_lines_and_loans | number_of_times90_days_late | number_real_estate_loans_or_lines | number_of_time60-89_days_past_due_not_worse | number_of_dependents | |
---|---|---|---|---|---|---|---|---|---|---|---|
serious_dlqin2yrs | 1.000000 | -0.001968 | -0.102272 | 0.120134 | -0.002382 | -0.019663 | -0.026876 | 0.108921 | -0.005976 | 0.091995 | 0.047273 |
revolving_utilization_of_unsecured_lines | -0.001968 | 1.000000 | -0.003601 | -0.001269 | 0.000075 | 0.007388 | -0.010750 | -0.001091 | 0.007419 | -0.000975 | 0.002690 |
age | -0.102272 | -0.003601 | 1.000000 | -0.052251 | 0.000083 | 0.037199 | 0.186420 | -0.051162 | 0.066435 | -0.046745 | -0.206503 |
number_of_time30-59_days_past_due_not_worse | 0.120134 | -0.001269 | -0.052251 | 1.000000 | -0.001488 | -0.010458 | -0.047053 | 0.977052 | -0.026220 | 0.981492 | 0.007043 |
debt_ratio | -0.002382 | 0.000075 | 0.000083 | -0.001488 | 1.000000 | -0.026406 | 0.008032 | -0.002738 | 0.015818 | -0.001608 | 0.005896 |
monthly_income | -0.019663 | 0.007388 | 0.037199 | -0.010458 | -0.026406 | 1.000000 | 0.089260 | -0.012778 | 0.123512 | -0.011264 | 0.064060 |
number_of_open_credit_lines_and_loans | -0.026876 | -0.010750 | 0.186420 | -0.047053 | 0.008032 | 0.089260 | 1.000000 | -0.074116 | 0.426890 | -0.063968 | 0.041384 |
number_of_times90_days_late | 0.108921 | -0.001091 | -0.051162 | 0.977052 | -0.002738 | -0.012778 | -0.074116 | 1.000000 | -0.041557 | 0.990128 | -0.000703 |
number_real_estate_loans_or_lines | -0.005976 | 0.007419 | 0.066435 | -0.026220 | 0.015818 | 0.123512 | 0.426890 | -0.041557 | 1.000000 | -0.035835 | 0.119559 |
number_of_time60-89_days_past_due_not_worse | 0.091995 | -0.000975 | -0.046745 | 0.981492 | -0.001608 | -0.011264 | -0.063968 | 0.990128 | -0.035835 | 1.000000 | -0.002197 |
number_of_dependents | 0.047273 | 0.002690 | -0.206503 | 0.007043 | 0.005896 | 0.064060 | 0.041384 | -0.000703 | 0.119559 | -0.002197 | 1.000000 |
train_w_monthly_income.corr().ix[:,5]
serious_dlqin2yrs -0.019663 revolving_utilization_of_unsecured_lines 0.007388 age 0.037199 number_of_time30-59_days_past_due_not_worse -0.010458 debt_ratio -0.026406 monthly_income 1.000000 number_of_open_credit_lines_and_loans 0.089260 number_of_times90_days_late -0.012778 number_real_estate_loans_or_lines 0.123512 number_of_time60-89_days_past_due_not_worse -0.011264 number_of_dependents 0.064060 Name: monthly_income
cols = ['number_real_estate_loans_or_lines', 'number_of_open_credit_lines_and_loans']
income_imputer.fit(train_w_monthly_income[cols], train_w_monthly_income.monthly_income)
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=1, p=2, weights='uniform')
new_values = income_imputer.predict(train_w_null_monthly_income[cols])
train_w_null_monthly_income['monthly_income'] = new_values
new_values
array([ 7666., 3235., 9666., ..., 2155., 3771., 10000.])
#combine the data back together
train = train_w_monthly_income.append(train_w_null_monthly_income)
len(train)
112415
test['monthly_income_imputed'] = income_imputer.predict(test[cols])
test.head()
serious_dlqin2yrs | revolving_utilization_of_unsecured_lines | age | number_of_time30-59_days_past_due_not_worse | debt_ratio | monthly_income | number_of_open_credit_lines_and_loans | number_of_times90_days_late | number_real_estate_loans_or_lines | number_of_time60-89_days_past_due_not_worse | number_of_dependents | monthly_income_imputed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 0 | 0.233810 | 30 | 0 | 0.036050 | 3300 | 5 | 0 | 0 | 0 | 0 | 6017 |
13 | 1 | 0.964673 | 40 | 3 | 0.382965 | 13700 | 9 | 3 | 1 | 1 | 2 | 2850 |
16 | 0 | 0.061086 | 78 | 0 | 2058.000000 | NaN | 10 | 0 | 2 | 0 | 0 | 2500 |
23 | 0 | 0.075427 | 32 | 0 | 0.085512 | 7916 | 6 | 0 | 0 | 0 | 0 | 4145 |
24 | 0 | 0.046560 | 58 | 0 | 0.241622 | 2416 | 9 | 0 | 1 | 0 | 0 | 2850 |
test['monthly_income'] = np.where(test.monthly_income.isnull(), test.monthly_income_imputed,
test.monthly_income)
print pd.value_counts(train.monthly_income.isnull())
print pd.value_counts(test.monthly_income.isnull())
False 112415 False 37585
train.to_csv("./data/credit-data-trainingset.csv", index=False)
test.to_csv("./data/credit-data-testset.csv", index=False)
Partner with the person next to you and think of other ways you might be able to impute missing valus for the monthly_income
column.
Things to consider:
number_of_open_credit_lines_and_loans
and number_real_estate_loans_or_lines
the best predictors?monthly_income