#!/usr/bin/env python # coding: utf-8 # 통계적 사고 (2판) 연습문제 ([thinkstats2.com](thinkstats2.com), [think-stat.xwmooc.org](http://think-stat.xwmooc.org))
# Allen Downey / 이광춘(xwMOOC) # ## 연습문제 13.1 # # NSFG 주기 6과 7에서, cmdivorcx 변수는 응답자의 만약 적용되다면 년도-월(century-month)로 부호화된 첫 혼인에 대한 이혼 날짜 정보를 담고 있다. # 이혼으로 끝나는 결혼기간과 지금까지 지속되는 혼인 기간을 계산하시오. 결혼기간에 대한 위험함수와 생존함수를 추정하시오. # # 표집 가중치를 고려해서 재표집을 사용하고, 표집오차를 시각화하는데 재표집으로 나온 표본에서 데이터를 도식화하시오. # # 응답자를 10년주기 출생으로 집단화하고, 가능하면 첫번째 혼인 연령으로 나누는 것을 고려한다. # In[3]: get_ipython().run_line_magic('matplotlib', 'inline') from __future__ import print_function import pandas import numpy as np import thinkplot import thinkstats2 import survival def CleanData(resp): """Cleans respondent data. resp: DataFrame """ resp.cmdivorcx.replace([9998, 9999], np.nan, inplace=True) resp['notdivorced'] = resp.cmdivorcx.isnull().astype(int) resp['duration'] = (resp.cmdivorcx - resp.cmmarrhx) / 12.0 resp['durationsofar'] = (resp.cmintvw - resp.cmmarrhx) / 12.0 month0 = pandas.to_datetime('1899-12-15') dates = [month0 + pandas.DateOffset(months=cm) for cm in resp.cmbirth] resp['decade'] = (pandas.DatetimeIndex(dates).year - 1900) // 10 def ResampleDivorceCurve(resps): """Plots divorce curves based on resampled data. resps: list of respondent DataFrames """ for _ in range(41): samples = [thinkstats2.ResampleRowsWeighted(resp) for resp in resps] sample = pandas.concat(samples, ignore_index=True) PlotDivorceCurveByDecade(sample, color='#225EA8', alpha=0.1) thinkplot.Show(xlabel='years', axis=[0, 28, 0, 1]) def ResampleDivorceCurveByDecade(resps): """Plots divorce curves for each birth cohort. resps: list of respondent DataFrames """ for i in range(41): samples = [thinkstats2.ResampleRowsWeighted(resp) for resp in resps] sample = pandas.concat(samples, ignore_index=True) groups = sample.groupby('decade') if i == 0: survival.AddLabelsByDecade(groups, alpha=0.7) EstimateSurvivalByDecade(groups, alpha=0.1) thinkplot.Show(xlabel='years', axis=[0, 28, 0, 1]) def EstimateSurvivalByDecade(groups, **options): """Groups respondents by decade and plots survival curves. groups: GroupBy object """ thinkplot.PrePlot(len(groups)) for name, group in groups: print(name, len(group)) _, sf = EstimateSurvival(group) thinkplot.Plot(sf, **options) def EstimateSurvival(resp): """Estimates the survival curve. resp: DataFrame of respondents returns: pair of HazardFunction, SurvivalFunction """ complete = resp[resp.notdivorced == 0].duration ongoing = resp[resp.notdivorced == 1].durationsofar hf = survival.EstimateHazardFunction(complete, ongoing) sf = hf.MakeSurvival() return hf, sf def main(): resp6 = survival.ReadFemResp2002() CleanData(resp6) married6 = resp6[resp6.evrmarry==1] resp7 = survival.ReadFemResp2010() CleanData(resp7) married7 = resp7[resp7.evrmarry==1] ResampleDivorceCurveByDecade([married6, married7]) if __name__ == '__main__': main() # In[ ]: