Style-Sheet
%pylab inline
Populating the interactive namespace from numpy and matplotlib
cd ../src
/cellar/users/agross/TCGA_Code/TCGA/src
from Processing.Imports import *
# Run configuration shared by the project notebooks (one value per key).
params = pd.read_table('../global_params.txt', header=None, squeeze=True,
                       index_col=0)
run_path = '{}/Firehose__{}/'.format(params.ix['OUT_PATH'], params.ix['RUN_DATE'])
run = get_run(run_path, 'Run_' + params.ix['VERSION'])
cancer = run.load_cancer(params.ix['CANCER'])
clinical = cancer.load_clinical()

# Mutation and broad copy-number data sets are uncompressed right after loading.
mut = cancer.load_data('Mutation')
mut.uncompress()
cn = cancer.load_data('CN_broad')
cn.uncompress()

# Expression objects are read from pickles stored under the cancer directory.
# The files are opened in `with` blocks so the handles are closed as soon as
# the objects are loaded (the bare `pickle.load(open(...))` form leaked them).
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(cancer.path + '/mRNASeq/store/no_hpv.p', 'rb') as handle:
    rna = pickle.load(handle)
#meth = pickle.load(open(cancer.path + '/Methylation/store/no_hpv.p', 'rb'))
with open(cancer.path + '/miRNASeq/store/no_hpv.p', 'rb') as handle:
    mirna = pickle.load(handle)
# Outcome and covariates taken from the clinical object.
surv = clinical.survival.survival_5y
age = clinical.clinical.age
hpv_inferred = clinical.hpv_inferred

# Patients kept for the analysis: inferred HPV status equal to 0, present in
# the mutation, copy-number, mRNA and miRNA feature tables and in the survival
# table, and with a recorded age below 85.
keepers_o = (
    true_index(hpv_inferred == 0)
    .intersection(mut.features.columns)
    .intersection(cn.features.columns)
    .intersection(surv.unstack().index)
    .intersection(rna.features.columns)
    .intersection(mirna.features.columns)
    .intersection(true_index(age < 85))
)
# Pathologic stage of the kept patients; a missing stage is recorded as 'nx'.
stage = clinical.stage.pathologicstage.ix[keepers_o].fillna('nx')
# Sub-stages are collapsed by deleting every 'a' and 'b'. That also strips the
# 'a' out of the word 'stage' (leaving 'stge'), which the last replace restores
# in capitalised form. Presumably values look like 'stage iva' -- TODO confirm.
stage = stage.dropna().map(
    lambda label: label.replace('a', '').replace('b', '').replace('stge', 'Stage'))

# Lymph-node (N) stage truncated to its first two characters.
lymph_stage = clinical.stage.pathologicn.ix[keepers_o]
lymph_stage = lymph_stage.dropna().map(lambda n_stage: n_stage[:2])

# Age group. Missing ages are dropped first: `NaN >= 75` evaluates to False, so
# without the dropna a patient of unknown age would be labelled 'Age < 75'.
# NOTE(review): the 'Age > 75' label also covers patients aged exactly 75.
old_age = (age.dropna() >= 75).map({True: 'Age > 75', False: 'Age < 75'})

# Pack-years smoked as floats, missing values removed (`py` is kept as an alias).
pack_years = py = clinical.clinical.numberpackyearssmoked.dropna().astype(float)
# Lower-case anatomic sites that make up each coarse tumour location;
# `group[k]` is labelled `groups[k]`. The hypopharynx entry is commented out,
# so tumours recorded with that site receive no label.
group = [
    ['oral tongue', 'oral cavity', 'floor of mouth', 'buccal mucosa',
     'alveolar ridge', 'hard palate', 'lip'],
    ['oropharynx', 'tonsil', 'base of tongue'],
    #['hypopharynx'],
    ['larynx'],
]
groups = ['oral cavity', 'oropharynx', 'larynx']

# One label per patient whose recorded site (case-insensitive) is in a group.
tumor_subdivision = pd.Series({
    patient: label
    for label, sites in zip(groups, group)
    for patient, site in clinical.clinical.anatomicneoplasmsubdivision.iteritems()
    if site.lower() in sites
})
# Perineural invasion: literal 'nan' strings become real missing values and
# the remaining answers are lower-cased.
invasion = (clinical.clinical.perineuralinvasionpresent
            .replace('nan', nan)
            .str.lower())

# Extracapsular spread collapsed to 'yes'/'no'; answers outside the mapping
# (and missing ones) are dropped.
spread = clinical.clinical.presenceofpathologicalnodalextracapsularspread
spread = (
    spread.map(str.lower, na_action='ignore')
    .map({'no extranodal extension': 'no',
          'microscopic extension': 'yes',
          'gross extension': 'yes'})
    .dropna()
)

# Year of diagnosis as float, with '[Discrepancy]' entries treated as missing.
year = (clinical.clinical.yearofinitialpathologicdiagnosis
        .replace('[Discrepancy]', nan)
        .astype(float))
# Nodal involvement flag for the kept patients: anything other than 'n0'.
# NOTE(review): a patient in keepers_o with no N stage would come back as NaN
# here, and NaN != 'n0' is True -- confirm that counting them as node-positive
# is intended.
lymph = lymph_stage.ix[keepers_o] != 'n0'

# Three-level status built from nodal involvement and extracapsular spread.
# `combine` is a project helper; its labels are assumed to be 'neither', the
# name of either input, or 'both' -- any label missing from the dict maps to NaN.
lymph_status = combine(lymph, spread.ix[keepers_o] == 'yes').map({
    'neither': 'n0',
    lymph.name: 'lymph_node',
    'both': 'extra_capsular_spread',
})
#lymph_status = (lymph_status == 'extra_capsular_spread').astype(float)
from Stats.Classification import SVC_fill
# Self-reported smoking history, lower-cased.
smoker = clinical.clinical.tobaccosmokinghistory.str.lower()

# Binary training label restricted to the two unambiguous categories:
# True for current smokers, False for lifelong non-smokers.
extreme_habits = smoker.isin(['current smoker', 'lifelong non-smoker'])
smoker_binary = smoker[extreme_habits] == 'current smoker'

smoker.value_counts()
current smoker                                 128
current reformed smoker for < or = 15 years    101
lifelong non-smoker                             80
current reformed smoker for > 15 years          59
dtype: int64
# Run the project helper SVC_fill on the binary smoking label and the 'real'
# slice of the mRNA features. The result is indexed below by 'auc',
# 'decision_function' and 'filled_feature'; presumably a support-vector
# classifier, per the helper's name -- TODO confirm.
ret = SVC_fill(smoker_binary, rna.features.ix['real'])
# Display the reported AUC.
ret['auc']
0.90839694656488545
# Display order for the smoking categories on the x axis.
o = [
    'current smoker',
    'current reformed smoker for < or = 15 years',
    'current reformed smoker for > 15 years',
    'lifelong non-smoker',
]
fun = ret['decision_function']

# Classifier decision value per reported smoking category.
figsize(6, 4)
violin_plot_pandas(smoker, fun, order=o)
ax = plt.gca()
t = ax.set_xticklabels(o, rotation=20)
prettify_ax(ax)

# Survival by binary smoking label among the kept patients; patients without
# a binary label are grouped under 'Missing'.
get_surv_fit_lr(surv, smoker_binary.ix[keepers_o].fillna('Missing'))
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
7.66 | 0.0217 | |||||||||
Missing | 123 | 50 | 4 | 2.3 | NaN | 0.487 | 0.39 | 0.609 | ||
True | 84 | 38 | 1.6 | 1.35 | NaN | 0.284 | 0.146 | 0.553 | ||
False | 44 | 14 | 4.71 | 2.96 | NaN | 0.393 | 0.19 | 0.813 |
4 rows × 10 columns
# Recorded binary label where available, classifier-filled value otherwise,
# converted to 0./1. floats.
smoker_filled = smoker_binary.combine_first(ret['filled_feature'])
smoker_inferred = 1. * smoker_filled
smoker_inferred.name = 'smoker_inferred'

# Cross-tabulate the inferred label against the reported history
# ('M' marks patients with no reported history).
reported_history = smoker.ix[smoker_inferred.index].fillna('M')
pd.crosstab(smoker_inferred, reported_history).T
smoker_inferred | 0.0 | 1.0 |
---|---|---|
tobaccosmokinghistory | ||
M | 4 | 6 |
current reformed smoker for < or = 15 years | 22 | 52 |
current reformed smoker for > 15 years | 27 | 17 |
current smoker | 0 | 128 |
lifelong non-smoker | 80 | 0 |
5 rows × 2 columns
# Survival comparison between the inferred smoking groups.
# NOTE(review): every other get_surv_fit_lr call visible here passes `surv`
# (clinical.survival.survival_5y); this one passes clinical.survival.survival.
# Confirm the different outcome is intended.
get_surv_fit_lr(clinical.survival.survival, smoker_inferred)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
12.8 | 0.000342 | |||||||||
1 | 203 | 91 | 2.21 | 1.6 | 3.29 | 0.323 | 0.237 | 0.439 | ||
0 | 132 | 41 | 5.48 | 4.71 | NaN | 0.553 | 0.428 | 0.713 |
3 rows × 10 columns
# Survival summary for the recorded binary label, over the inferred index.
survival_and_stats(smoker_binary.ix[smoker_inferred.index], surv)

# Four-way label: recorded smokers/non-smokers keep their own label, the
# remaining patients get the '_inf' label from the classifier.
s = smoker_binary.map({True: 'smoker', False: 'non-smoker'})
si = smoker_inferred.map({1: 'smoker_inf', 0: 'non-smoker_inf'})
survival_and_stats(s.combine_first(si), surv)

# Distribution of the reported amount of alcohol consumed per day.
figsize(6, 4)
clinical.clinical.amountofalcoholconsumptionperday.astype(float).hist()
<matplotlib.axes.AxesSubplot at 0xb8ff990>
# Tally of the 'alcohol history documented' answers.
clinical.clinical.alcoholhistorydocumented.value_counts()
yes    254
no     118
dtype: int64
# Reported drinking frequency and amount per day, both as floats.
freq = clinical.clinical.frequencyofalcoholconsumption.astype(float)
count = clinical.clinical.amountofalcoholconsumptionperday.astype(float)

# Drinker flag for the kept patients: frequency * amount above 10, using only
# patients for whom both values are recorded.
#drinker = drinker[(drinker < 8) + (drinker > 14)]
drinker = (freq * count).dropna().ix[keepers_o].dropna() > 10

drinker.value_counts()
True     57
False    38
dtype: int64
# Run the project helper SVC_fill on the drinker flag and the 'real' slice of
# the mRNA features; `ret` replaces the result of the previous SVC_fill call.
ret = SVC_fill(drinker, rna.features.ix['real'])
# Display the reported AUC.
ret['auc']
0.88421052631578945
fun = ret['decision_function']

# Decision value by drinker flag.
violin_plot_pandas(drinker, fun)

# Decision value against the underlying frequency * amount score.
alcohol_score = (freq * count).dropna()
series_scatter(alcohol_score, fun)
xlim(-1, 20)
(-1, 20)
# Survival by recorded drinker flag (passed as 0./1. floats).
get_surv_fit_lr(surv, 1. * drinker)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
3.1 | 0.0783 | |||||||||
1 | 57 | 24 | 2.5 | 1.79 | NaN | 0.319 | 0.176 | 0.577 | ||
0 | 38 | 8 | NaN | NaN | NaN | 0.732 | 0.583 | 0.919 |
3 rows × 10 columns
# Recorded flag where available, classifier-filled value otherwise, as floats.
drinker_filled = drinker.combine_first(ret['filled_feature'])
drinker_inferred = 1. * drinker_filled

get_surv_fit_lr(surv, drinker_inferred)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
6.03 | 0.0141 | |||||||||
1 | 179 | 84 | 2.2 | 1.6 | 3.53 | 0.351 | 0.264 | 0.467 | ||
0 | 80 | 25 | NaN | 4.49 | NaN | 0.562 | 0.431 | 0.734 |
3 rows × 10 columns
# Four-way label: recorded drinkers/non-drinkers keep their own label, the
# remaining patients get the '_inf' label from the classifier.
s = drinker.map({True: 'drinker', False: 'non-drinker'})
si = drinker_inferred.map({1: 'drinker_inf', 0: 'non-drinker_inf'})
survival_and_stats(s.combine_first(si), surv)

survival_and_stats(drinker_inferred, surv)

invasion.value_counts()
no     137
yes    126
dtype: int64
# Train only on patients with an explicit yes/no perineural-invasion answer.
invasion_known = invasion[invasion.isin(['yes', 'no'])]
ret = SVC_fill(invasion_known == 'yes', rna.features.ix['real'])
ret['auc']
0.84375
fun = ret['decision_function']
violin_plot_pandas(invasion, fun)

# Recorded answer where available, classifier-filled value otherwise, as floats.
invasion_observed = invasion.dropna() == 'yes'
invasion_inferred = 1. * invasion_observed.combine_first(ret['filled_feature'])

get_surv_fit_lr(surv, invasion)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
6.88 | 0.032 | |||||||||
no | 137 | 34 | NaN | 4 | NaN | 0.506 | 0.356 | 0.72 | ||
yes | 126 | 54 | 2.58 | 1.49 | NaN | 0.388 | 0.285 | 0.529 |
3 rows × 10 columns
# Same survival comparison, now on the classifier-completed invasion label.
get_surv_fit_lr(surv, invasion_inferred)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
4.87 | 0.0273 | |||||||||
0 | 179 | 55 | 4.71 | 2.84 | NaN | 0.439 | 0.325 | 0.594 | ||
1 | 151 | 68 | 2.54 | 1.54 | 4.49 | 0.369 | 0.277 | 0.493 |
3 rows × 10 columns
survival_and_stats(invasion_inferred, surv)

# Recorded 'yes'/'no' answers where present, the inferred 0/1 value elsewhere.
invasion_labels = invasion.ix[invasion_inferred.index].combine_first(invasion_inferred)
survival_and_stats(invasion_labels, surv)

# Repeat the classifier fill for extracapsular spread.
has_spread = spread == 'yes'
ret = SVC_fill(has_spread, rna.features.ix['real'])
ret['auc']
0.88636363636363635
fun = ret['decision_function']
violin_plot_pandas(spread, fun)

# Recorded answer where available, classifier-filled value otherwise, as floats.
spread_observed = spread.dropna() == 'yes'
spread_inferred = 1. * spread_observed.combine_first(ret['filled_feature'])

get_surv_fit_lr(surv, spread)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
12.5 | 0.000413 | |||||||||
no | 168 | 50 | NaN | 2.99 | NaN | 0.507 | 0.4 | 0.641 | ||
yes | 71 | 34 | 1.42 | 1.25 | 2.08 | 0.282 | 0.17 | 0.469 |
3 rows × 10 columns
# Same survival comparison, now on the classifier-completed spread label.
get_surv_fit_lr(surv, spread_inferred)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
19.9 | 7.97e-06 | |||||||||
0 | 227 | 70 | 4.71 | 3.29 | NaN | 0.493 | 0.399 | 0.608 | ||
1 | 95 | 49 | 1.42 | 1.25 | 1.71 | 0.246 | 0.153 | 0.397 |
3 rows × 10 columns
# Survival summary (project helper) for the recorded spread label, then for
# the classifier-completed one.
survival_and_stats(spread, surv)
survival_and_stats(spread_inferred, surv)
# Era of diagnosis. Missing years are dropped before the comparison because
# `NaN < 2000` evaluates to False, which would otherwise label every unknown
# (or '[Discrepancy]') year as 'post_2000'; those patients now stay missing.
diagnosis_era = (year.dropna() < 2000).map({True: 'pre_2000', False: 'post_2000'})

# Collect the recorded and classifier-completed features into one table, one
# column per feature, aligned on the shared index.
clinical_processed = pd.concat(
    {
        'spread': spread,
        'spread_inferred': spread_inferred,
        'invasion': invasion,
        'invasion_inferred': invasion_inferred,
        'hpv': clinical.hpv,
        'hpv_inferred': hpv_inferred,
        'smoker': smoker,
        'smoker_inferred': smoker_inferred,
        'drinker': drinker,
        'drinker_inferred': drinker_inferred,
        'stage': stage,
        'lymph_stage': lymph_stage,
        'age': age,
        'old_age': old_age,
        'pack_years': pack_years,
        'year': diagnosis_era,
        'lymph_status': lymph_status,
        'tumor_subdivision': tumor_subdivision,
    },
    axis=1)

# Attach the table to the clinical object and persist it.
clinical.processed = clinical_processed
clinical.save()