from lifelines import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ggplot import *
from pylab import *
%load_ext ipycache
data=pd.read_csv("E:/Github Stuff/srepho.github.io/Survival/survival.csv")
#data=pd.read_csv("D:/Users/soates/Documents/Github/srepho.github.io/Survival/survival.csv")
ggplot(aes(x="Account_Length"), data=data) + geom_histogram() + facet_grid("Churn")
<ggplot: (24903691)>
%pylab inline
figsize(12,12)
from lifelines.plotting import plot_lifetimes
#plt.xlim(0,25)
#plt.vlines(10,0,30,lw=2, linestyles="--")
plt.xlabel('time')
plt.title('Births and deaths of our population, at $t=10$')
plot_lifetimes(data.Account_Length, event_observed=data.Churn)
Populating the interactive namespace from numpy and matplotlib warning: you may want to subsample to less than 100 individuals.
data.Churn.describe()
count 3333 mean 0.1449145 std 0.3520674 min False 25% 0 50% 0 75% 0 max True dtype: object
kmf = KaplanMeierFitter()
kmf.fit(data["Account_Length"], event_observed=data["Churn"])
<lifelines.KaplanMeierFitter: fitted with 3333 observations, 2850 censored>
kmf.survival_function_.plot()
<matplotlib.axes.AxesSubplot at 0x17c02198>
naf = NelsonAalenFitter()
naf.fit(data["Account_Length"], event_observed=data["Churn"])
<lifelines.NelsonAalenFitter: fitted with 3333 observations, 2850 censored>
print naf.cumulative_hazard_.head()
naf.plot()
NA-estimate timeline 0 0.000000 1 0.000300 2 0.000601 3 0.000601 4 0.000601
<matplotlib.axes.AxesSubplot at 0x1855a9b0>
del data["Phone"]
del data["State"]
del data["Area_Code"]
del data["Intl_Plan"]
del data["Vmail_Plan"]
data.head()
State | Account_Length | Area_Code | Phone | Intl_Plan | Vmail_Plan | Vmail_Message | Day_Mins | Day_Calls | Day_Charge | ... | Eve_Calls | Eve_Charge | Night_Mins | Night_Calls | Night_Charge | Intl_Mins | Intl_Calls | Intl_Charge | CustServ_Calls | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | KS | 128 | 415 | 382-4657 | no | yes | 25 | 265.1 | 110 | 45.07 | ... | 99 | 16.78 | 244.7 | 91 | 11.01 | 10.0 | 3 | 2.70 | 1 | True |
1 | OH | 107 | 415 | 371-7191 | no | yes | 26 | 161.6 | 123 | 27.47 | ... | 103 | 16.62 | 254.4 | 103 | 11.45 | 13.7 | 3 | 3.70 | 1 | True |
2 | NJ | 137 | 415 | 358-1921 | no | no | 0 | 243.4 | 114 | 41.38 | ... | 110 | 10.30 | 162.6 | 104 | 7.32 | 12.2 | 5 | 3.29 | 0 | True |
3 | OH | 84 | 408 | 375-9999 | yes | no | 0 | 299.4 | 71 | 50.90 | ... | 88 | 5.26 | 196.9 | 89 | 8.86 | 6.6 | 7 | 1.78 | 2 | True |
4 | OK | 75 | 415 | 330-6626 | yes | no | 0 | 166.7 | 113 | 28.34 | ... | 122 | 12.61 | 186.9 | 121 | 8.41 | 10.1 | 3 | 2.73 | 3 | True |
5 rows × 21 columns
s = ' + '.join(data.columns) + ' -1'
print s
State + Account_Length + Area_Code + Phone + Intl_Plan + Vmail_Plan + Vmail_Message + Day_Mins + Day_Calls + Day_Charge + Eve_Mins + Eve_Calls + Eve_Charge + Night_Mins + Night_Calls + Night_Charge + Intl_Mins + Intl_Calls + Intl_Charge + CustServ_Calls + Churn -1
import patsy
X = patsy.dmatrix("State + Account_Length + Area_Code + Phone + Intl_Plan + Vmail_Plan + Vmail_Message + Day_Mins + Day_Calls + Day_Charge + Eve_Mins + Eve_Calls + Eve_Charge + Night_Mins + Night_Calls + Night_Charge + Intl_Mins + Intl_Calls + Intl_Charge + CustServ_Calls -1", data, return_type='dataframe')
X.head()
State[AK] | State[AL] | State[AR] | State[AZ] | State[CA] | State[CO] | State[CT] | State[DC] | State[DE] | State[FL] | ... | Eve_Mins | Eve_Calls | Eve_Charge | Night_Mins | Night_Calls | Night_Charge | Intl_Mins | Intl_Calls | Intl_Charge | CustServ_Calls | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 197.4 | 99 | 16.78 | 244.7 | 91 | 11.01 | 10.0 | 3 | 2.70 | 1 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 195.5 | 103 | 16.62 | 254.4 | 103 | 11.45 | 13.7 | 3 | 3.70 | 1 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 121.2 | 110 | 10.30 | 162.6 | 104 | 7.32 | 12.2 | 5 | 3.29 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 61.9 | 88 | 5.26 | 196.9 | 89 | 8.86 | 6.6 | 7 | 1.78 | 2 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 148.3 | 122 | 12.61 | 186.9 | 121 | 8.41 | 10.1 | 3 | 2.73 | 3 |
5 rows × 3401 columns
X.columns
Index([u'State[AK]', u'State[AL]', u'State[AR]', u'State[AZ]', u'State[CA]', u'State[CO]', u'State[CT]', u'State[DC]', u'State[DE]', u'State[FL]', u'State[GA]', u'State[HI]', u'State[IA]', u'State[ID]', u'State[IL]', u'State[IN]', u'State[KS]', u'State[KY]', u'State[LA]', u'State[MA]', u'State[MD]', u'State[ME]', u'State[MI]', u'State[MN]', u'State[MO]', u'State[MS]', u'State[MT]', u'State[NC]', u'State[ND]', u'State[NE]', u'State[NH]', u'State[NJ]', u'State[NM]', u'State[NV]', u'State[NY]', u'State[OH]', u'State[OK]', u'State[OR]', u'State[PA]', u'State[RI]', u'State[SC]', u'State[SD]', u'State[TN]', u'State[TX]', u'State[UT]', u'State[VA]', u'State[VT]', u'State[WA]', u'State[WI]', u'State[WV]', u'State[WY]', u'Intl_Plan[T.yes]', u'Vmail_Plan[T.yes]', u'Churn[T.True]', u'Account_Length', u'Area_Code', u'Vmail_Message', u'Day_Mins', u'Day_Calls', u'Day_Charge', u'Eve_Mins', u'Eve_Calls', u'Eve_Charge', u'Night_Mins', u'Night_Calls', u'Night_Charge', u'Intl_Mins', u'Intl_Calls', u'Intl_Charge', u'CustServ_Calls'], dtype='object')
data.head()
State | Account Length | Area Code | Intl Plan | VMail Plan | VMail Message | Day Mins | Day Calls | Day Charge | Eve Mins | Eve Calls | Eve Charge | Night Mins | Night Calls | Night Charge | Intl Mins | Intl Calls | Intl Charge | CustServ Calls | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | KS | 128 | 415 | no | yes | 25 | 265.1 | 110 | 45.07 | 197.4 | 99 | 16.78 | 244.7 | 91 | 11.01 | 10.0 | 3 | 2.70 | 1 | True |
1 | OH | 107 | 415 | no | yes | 26 | 161.6 | 123 | 27.47 | 195.5 | 103 | 16.62 | 254.4 | 103 | 11.45 | 13.7 | 3 | 3.70 | 1 | True |
2 | NJ | 137 | 415 | no | no | 0 | 243.4 | 114 | 41.38 | 121.2 | 110 | 10.30 | 162.6 | 104 | 7.32 | 12.2 | 5 | 3.29 | 0 | True |
3 | OH | 84 | 408 | yes | no | 0 | 299.4 | 71 | 50.90 | 61.9 | 88 | 5.26 | 196.9 | 89 | 8.86 | 6.6 | 7 | 1.78 | 2 | True |
4 | OK | 75 | 415 | yes | no | 0 | 166.7 | 113 | 28.34 | 148.3 | 122 | 12.61 | 186.9 | 121 | 8.41 | 10.1 | 3 | 2.73 | 3 | True |
del X["Churn[T.True]"]
X["Churn"] = data["Churn"]
from lifelines.datasets import generate_rossi_dataset
from lifelines import CoxPHFitter
from lifelines import statsitics.concordance_index
rossi_dataset = generate_rossi_dataset()
cf = CoxPHFitter(alpha=0.95, tie_method='Efron')
cf.fit(rossi_dataset, duration_col='week', event_col='arrest')
print cf.summary()
lifelines.statsitics.concordance_index
coef exp(coef) se(coef) z p lower 0.95 upper 0.95 fin -0.379022 0.684531 0.191364 -1.980629 0.047633 -0.754172 -0.003872 age -0.057246 0.944362 0.021983 -2.604155 0.009210 -0.100340 -0.014151 race 0.314130 1.369067 0.308019 1.019839 0.307805 -0.289709 0.917969 wexp -0.151115 0.859749 0.212125 -0.712386 0.476226 -0.566963 0.264733 mar -0.432783 0.648702 0.381797 -1.133541 0.256987 -1.181255 0.315690 paro -0.084983 0.918528 0.195748 -0.434144 0.664184 -0.468726 0.298760 prio 0.091112 1.095391 0.028630 3.182426 0.001460 0.034986 0.147237 None
cf.durations.head()
313 1 100 2 183 3 416 4 79 5 Name: week, dtype: int64
rossi_dataset.head()
week | arrest | fin | age | race | wexp | mar | paro | prio | |
---|---|---|---|---|---|---|---|---|---|
0 | 20 | 1 | 0 | 27 | 1 | 0 | 0 | 1 | 3 |
1 | 17 | 1 | 0 | 18 | 1 | 0 | 0 | 1 | 8 |
2 | 25 | 1 | 0 | 19 | 0 | 1 | 0 | 1 | 13 |
3 | 52 | 0 | 1 | 23 | 1 | 1 | 1 | 1 | 1 |
4 | 52 | 0 | 0 | 19 | 0 | 1 | 0 | 1 | 3 |
from lifelines import statistics
statistics.concordance_index( rossi_dataset['week'], cf.durations, event_observed=rossi_dataset['arrest'])
0.48917215904975736
from lifelines.utils import k_fold_cross_validation
cf = CoxPHFitter(alpha=0.95, tie_method='Efron')
cf.fit(X, duration_col='Account_Length', event_col='Churn')
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-10-26f6078a002a> in <module>() ----> 1 cf.fit(X, duration_col='Account_Length', event_col='Churn') C:\Users\Stephen\AppData\Local\Enthought\Canopy\User\lib\site-packages\lifelines\estimation.pyc in fit(self, df, duration_col, event_col, show_progress, initial_beta, include_likelihood) 995 hazards_ = self._newton_rhaphson(df, T, E, initial_beta=initial_beta, 996 show_progress=show_progress, --> 997 include_likelihood=include_likelihood) 998 999 self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, C:\Users\Stephen\AppData\Local\Enthought\Canopy\User\lib\site-packages\lifelines\estimation.pyc in _newton_rhaphson(self, X, T, E, initial_beta, step_size, epsilon, show_progress, include_likelihood) 938 beta = delta + beta 939 if pd.isnull(delta).sum() > 1: --> 940 raise ValueError("delta contains nan value(s). Converge halted.") 941 if norm(delta) < epsilon: 942 converging = False ValueError: delta contains nan value(s). Converge halted.
cf.summary()
coef exp(coef) se(coef) z p lower 0.95 upper 0.95 Vmail_Message 0.003974 1.003982 0.001351 2.940537 0.003276 0.001325 0.006623 Day_Mins -0.134447 0.874200 1.106259 -0.121533 0.903269 -2.303151 2.034258 Day_Calls -0.002494 0.997509 0.000933 -2.672612 0.007526 -0.004323 -0.000665 Day_Charge 0.780957 2.183560 6.507411 0.120010 0.904475 -11.976138 13.538052 Eve_Mins 0.108991 1.115152 0.556185 0.195962 0.844640 -0.981351 1.199334 Eve_Calls -0.000125 0.999875 0.000942 -0.132629 0.894487 -0.001972 0.001722 Eve_Charge -1.288985 0.275550 6.543494 -0.196987 0.843837 -14.116818 11.538848 Night_Mins 0.077512 1.080595 0.298030 0.260080 0.794802 -0.506746 0.661769 Night_Calls 0.000554 1.000554 0.000975 0.568107 0.569962 -0.001358 0.002466 Night_Charge -1.728339 0.177579 6.622481 -0.260981 0.794108 -14.711017 11.254340 Intl_Mins 0.819233 2.268760 1.760719 0.465283 0.641728 -2.632471 4.270938 Intl_Calls 0.006350 1.006370 0.007608 0.834627 0.403928 -0.008565 0.021265 Intl_Charge -3.072109 0.046323 6.521514 -0.471073 0.637589 -15.856851 9.712634 CustServ_Calls -0.059976 0.941787 0.014996 -3.999558 0.000063 -0.089374 -0.030579
scores=k_fold_cross_validation(cf, data, duration_col='Account_Length', event_col='Churn')
scores.mean()
0.4954231541874119
cf.summary()
coef exp(coef) se(coef) z p lower 0.95 upper 0.95 Vmail_Message 0.003974 1.003982 0.001351 2.940537 0.003276 0.001325 0.006623 Day_Mins -0.134447 0.874200 1.106259 -0.121533 0.903269 -2.303151 2.034258 Day_Calls -0.002494 0.997509 0.000933 -2.672612 0.007526 -0.004323 -0.000665 Day_Charge 0.780957 2.183560 6.507411 0.120010 0.904475 -11.976138 13.538052 Eve_Mins 0.108991 1.115152 0.556185 0.195962 0.844640 -0.981351 1.199334 Eve_Calls -0.000125 0.999875 0.000942 -0.132629 0.894487 -0.001972 0.001722 Eve_Charge -1.288985 0.275550 6.543494 -0.196987 0.843837 -14.116818 11.538848 Night_Mins 0.077512 1.080595 0.298030 0.260080 0.794802 -0.506746 0.661769 Night_Calls 0.000554 1.000554 0.000975 0.568107 0.569962 -0.001358 0.002466 Night_Charge -1.728339 0.177579 6.622481 -0.260981 0.794108 -14.711017 11.254340 Intl_Mins 0.819233 2.268760 1.760719 0.465283 0.641728 -2.632471 4.270938 Intl_Calls 0.006350 1.006370 0.007608 0.834627 0.403928 -0.008565 0.021265 Intl_Charge -3.072109 0.046323 6.521514 -0.471073 0.637589 -15.856851 9.712634 CustServ_Calls -0.059976 0.941787 0.014996 -3.999558 0.000063 -0.089374 -0.030579
foo=cf.baseline_hazard_
foo.iloc[6]
baseline hazard 0.001312 Name: 6, dtype: float64