%pylab inline
Populating the interactive namespace from numpy and matplotlib
cd ../src
/cellar/users/agross/TCGA_Code/TCGA/src
from Processing.Imports import *
from Figures.Survival import survival_and_stats
# Read the global run parameters from a header-less table whose first column
# becomes the index; squeeze=True asks pandas for a Series when only one data
# column remains, so values can be looked up by parameter name.
params = pd.read_table('../global_params.txt', header=None, squeeze=True,
index_col=0)
# Build the run directory path from the OUT_PATH and RUN_DATE parameters.
run_path = '{}/Firehose__{}/'.format(params.ix['OUT_PATH'], params.ix['RUN_DATE'])
# get_run comes from the star import above; its return type is not visible here.
run = get_run(run_path, 'Run_' + params.ix['VERSION'])
# Load the cancer object named by the CANCER parameter.
cancer = run.load_cancer(params.ix['CANCER'])
# Load the clinical data and each molecular data set for this cancer.
clinical = cancer.load_clinical()
mut = cancer.load_data('Mutation')
# NOTE(review): uncompress() is a project method; presumably it expands the
# stored features in place -- confirm against the data-object class.
mut.uncompress()
cn = cancer.load_data('CN_broad')
cn.uncompress()
rna = cancer.load_data('mRNASeq')
mirna = cancer.load_data('miRNASeq')
rppa = cancer.load_data('RPPA')
#meth = cancer.load_data('Methylation')
# Short aliases for the clinical tables and columns used below.
clinical_processed = clinical.processed
#clinical_processed = clinical_processed.replace('yes', 1.).replace('no', 0.)
hpv_inferred = clinical_processed.hpv_inferred
surv = clinical.survival.survival_5y
# Age is cast to float so it can be compared against numeric thresholds below.
age = clinical.clinical.age.astype(float)
# Start from the labels where hpv_inferred == 0 (true_index presumably returns
# the index labels of the True entries -- confirm against its definition).
keepers_o = true_index(hpv_inferred == 0)

# Narrow the set, in this order, to labels that also appear in the columns of
# the mutation, copy-number, mRNA and miRNA feature tables, in the unstacked
# survival index, and among the labels with age < 85.
required_indexes = [
    mut.features.columns,
    cn.features.columns,
    surv.unstack().index,
    rna.features.columns,
    mirna.features.columns,
    true_index(age < 85),
]
for required in required_indexes:
    keepers_o = keepers_o.intersection(required)
# Bin age into three groups: the sum of the two indicator terms is 0, 1 or 2.
# A NaN age fails both comparisons and therefore lands in bin 0.
old = pd.Series(1.*(age>=75) + 1.*(age>=85), name='age')
old = old.map({0:'younger than 75', 1:'between 75 and 85', 2:'older than 85'})
# Keep every processed clinical column whose name does not contain 'hpv',
# then add race and gender from the raw clinical table.
ff = clinical_processed.select(lambda s: ('hpv' not in s), axis=1)
ff = ff.join(clinical.clinical.race).join(clinical.clinical.gender)
# Copy 'year' under the display name used in the summary table.
ff['year of dx.'] = ff['year']
ff['age'] = old
# Collapse smoking history to yes / no / 'reformed / missing'; any value that
# is not a key of this dict becomes NaN under Series.map.
ff['smoker'] = ff['smoker'].map({'current smoker':'yes', 'lifelong non-smoker':'no',
'current reformed smoker for < or = 15 years':'reformed / missing',
'current reformed smoker for > 15 years':'reformed / missing',
nan:'reformed / missing'})
# Stage: take the second whitespace-separated word, upper-cased; non-strings
# and strings without a space become 'NX'.
ff['stage'] = ff['stage'].map(lambda s: s.split()[1].upper() if (isinstance(s, str) and ' ' in s) else 'NX')
# Lymph stage: upper-case strings starting with 'n'; everything else is 'NX'.
# NOTE(review): s[0] raises IndexError on an empty string -- confirm none occur.
ff['lymph_stage'] = ff['lymph_stage'].map(lambda s: s.upper() if (isinstance(s, str) and s[0] == 'n') else 'NX')
# Treat the literal '[Unknown]' as missing, restrict to the selected patients,
# and label all remaining missing values 'missing'.
ff = ff.replace('[Unknown]', np.nan)
ff = ff.ix[keepers_o].fillna('missing')
ff['tumor_subdivision'] = ff['tumor_subdivision'].replace('missing','missing / other')
# This runs after fillna, so missing drinker values arrive as the string 'missing'.
ff['drinker'] = ff['drinker'].map({True: 'yes', False: 'no', 'missing': 'missing / moderate'})
# Run get_surv_fit_lr for every column of ff that has between 2 and 9 distinct
# non-null values, and stack the results keyed by column name.
clin_uni = pd.concat({g: get_surv_fit_lr(surv, f)
for g,f in ff.iteritems()
if len(f.dropna().unique()) in range(2,10)})
# Pull out the three result columns reported in the summary table.
pat = clin_uni[('Stats','# Patients')]
med_surv = clin_uni[('Median Survival','Median')]
surv_p = clin_uni[('Log-Rank','p')]
tab = pd.concat([pat, med_surv, surv_p], keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)
# Variables to report, in display order.
o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision','stage','lymph_stage','invasion',
'invasion_inferred','spread','spread_inferred', 'drinker','drinker_inferred','smoker','smoker_inferred']
tab = tab.ix[o]
# Reuse `o` as a name -> position lookup, prepend that position as an extra
# index level, and sort by it and then by descending patient count.
o = {v:i for i,v in enumerate(o)}
tab.index = pd.MultiIndex.from_tuples([(o[i[0]], i[0], i[1]) for i in tab.index if i[0] in o])
tab = tab.reset_index(level=0).sort(['level_0','# Patients'], ascending=[True,False])
# The helper ordering column is no longer needed once the rows are sorted.
del tab['level_0']
Read in UPMC data and scrub clinical data
# Sample metadata, indexed by the first CSV column.
meta = pd.read_csv('../Extra_Data/UPMC_cohort/meta.csv', index_col=0)
# Split each ID at its last '-' into a two-level index:
# (everything before the last '-', the part after it).
meta.index = pd.MultiIndex.from_tuples([('-'.join(i.split('-')[:-1]), i.split('-')[-1])
for i in meta.index])
clin = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data.csv', index_col=0)
# Strip surrounding whitespace from the tumor-type labels.
clin.Tumor_type = clin.Tumor_type.map(str.strip)
# Build a stacked survival Series with 'event' and 'days' entries per row of
# clin; os_5yr_mons is scaled by 30.5 (presumably months -> days).
# This rebinds `surv`, replacing the TCGA survival Series defined earlier.
surv = pd.concat([clin.os_5yr, clin.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()
clin2 = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data_update.csv', index_col=0)
surv2 = pd.concat([clin2.os_5yr, clin2.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()
# Values from the update file take precedence; the first file fills any gaps.
surv = surv2.combine_first(surv)
# Displayed as the cell output: count of each HPV status.
clin.HPV.value_counts()
Negative 63 Positive 11 dtype: int64
# Flag metadata rows that have a SNP array ID and whose
# Final_74_exome_analysis column equals 1.
keepers = meta['SNP array ID'].notnull() & (meta['Final_74_exome_analysis'] == 1)
# Collapse to one flag per first-level index label, keeping the last row's value.
# NOTE(review): the `> 0` test below reads as if it was written for a `.sum()`
# aggregation (any qualifying row per label); an earlier, immediately
# overwritten `.sum()` version of these two statements was removed as dead
# code. Confirm that `.last()` is the intended aggregation.
keepers = keepers.groupby(level=0).last()
# Additionally require a 'Negative' HPV status in the clinical table.
keepers = (keepers > 0) & (clin.HPV == 'Negative')
# Restrict to labels present in the survival index; labels that cannot be
# aligned come back as NaN and are dropped.
keepers = keepers.ix[surv.index.get_level_values(0)].dropna()
# The survival index repeats each label (one row per stacked field), so
# collapse the duplicates again.
keepers = keepers.groupby(level=0).last()
keepers = true_index(keepers)
# Displayed as the cell output: number of retained labels.
len(keepers)
48
# Variables to report, in display order (same list as used for the TCGA table).
o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision','stage','lymph_stage','invasion',
'invasion_inferred','spread','spread_inferred', 'drinker','drinker_inferred','smoker','smoker_inferred']
# Restrict the UPMC clinical table to the retained patients.
clin = clin.ix[keepers]
# Recode UPMC columns under the names and category labels of the TCGA table.
clin['gender'] = clin['Gender']
clin['race'] = clin['Race'].map(str.lower).replace('black','black or african american')
# Classify by the second-to-last character of Date_Dx: '0' or '1' -> post_2000.
# NOTE(review): assumes Date_Dx is a string ending in a two-digit year -- confirm.
clin['year of dx.'] = clin.Date_Dx.map(lambda s: 'post_2000' if s[-2] in ['0','1'] else 'pre_2000')
# Only two age bins are produced here; any value that is not >= 75 (including
# NaN, for which the comparison is False) becomes 'younger than 75'.
clin['age'] = clin.Age_Dx.map(lambda s: 'between 75 and 85' if s >= 75 else 'younger than 75')
def subd(s):
    """Translate a primary-site label into a tumor-subdivision category.

    'Oral cavity', 'Larynx' and 'Oropharynx' map to 'oral cavity', 'larynx'
    and 'oropharynx' respectively; any other value (including non-strings)
    maps to 'missing / other'. Matching is exact and case-sensitive.
    """
    known_sites = (
        ('Oral cavity', 'oral cavity'),
        ('Larynx', 'larynx'),
        ('Oropharynx', 'oropharynx'),
    )
    for site_label, category in known_sites:
        if s == site_label:
            return category
    return 'missing / other'
# Map each primary site onto the tumor-subdivision categories via subd.
clin['tumor_subdivision'] = clin.Primary_site.map(subd)
# Stage: keep the first two characters, with 'Unstaged' recoded as 'NX'.
# NOTE(review): s[:2] would turn a value starting with 'III' into 'II', merging
# stage III into stage II -- confirm against the raw clinical_stage values.
clin['stage'] = clin.clinical_stage.map(lambda s: s[:2] if s != 'Unstaged' else 'NX')
# Nodal stage: keep the first two characters, with '.' recoded as 'NX'.
clin['lymph_stage'] = clin.Nodal_stage.map(lambda s: s[:2] if s != '.' else 'NX')
# PNI and EPS feed the 'invasion' and 'spread' variables; 'unknown' is
# relabelled 'missing'.
clin['invasion'] = clin.PNI.replace('unknown','missing')
clin['spread'] = clin.EPS.replace('unknown','missing')
def alch(v):
    """Classify one record's drinking status as 'yes', 'no' or 'missing / moderate'.

    v is a mapping (for example a DataFrame row) that provides the keys
    'Alcohol' and 'Alcohol_amt'.

    Returns 'no' when 'Alcohol' is 'no' or the integer amount is at most 6,
    'yes' when the integer amount is above 13, and 'missing / moderate' when
    the amount is missing, cannot be parsed as an integer, or lies in 7..13.
    """
    if v['Alcohol'] == 'no':
        return 'no'

    amount = v['Alcohol_amt']
    if pd.isnull(amount):
        return 'missing / moderate'

    try:
        # Parse once; int() truncation matches the original thresholds.
        amount = int(amount)
    except (TypeError, ValueError):
        # A non-numeric entry carries no usable amount: treat it as missing
        # rather than aborting the whole DataFrame.apply call.
        return 'missing / moderate'

    if amount <= 6:
        return 'no'
    if amount > 13:
        return 'yes'
    return 'missing / moderate'
# Classify drinking status row by row (axis=1) and copy smoking history
# under the shared variable name.
clin['drinker'] = clin.apply(alch,1)
clin['smoker'] = clin.Smoking_history
# Run get_surv_fit_lr for every reported variable that exists in clin and has
# more than one distinct value.
clin_uni2 = pd.concat({v: get_surv_fit_lr(surv, clin[v])
for v in o if v in clin and len(clin[v].unique()) > 1})
# Patient counts are taken directly from value_counts, so single-valued
# variables (skipped in the fit above) still get a count.
pat = pd.concat({v: clin[v].value_counts() for v in o if v in clin})
#pat = clin_uni2[('Stats','# Patients')]
med_surv = clin_uni2[('Median Survival','Median')]
surv_p = clin_uni2[('Log-Rank','p')]
tab2 = pd.concat([pat, med_surv, surv_p], keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)
# Align the UPMC table to the TCGA table's rows and place the two side by side
# under the top-level column keys 'TCGA' and 'UPMC'.
tab_combo = pd.concat([tab, tab2.ix[tab.index]], keys=['TCGA','UPMC'], axis=1)
# Replace NaN patient counts in the UPMC column with empty strings
# (presumably so rows without a count render blank in the displayed table).
tab_combo[('UPMC', '# Patients')] = tab_combo[('UPMC', '# Patients')].fillna('')
# Display order of the combined table. Each variable contributes a
# (variable, '') entry followed by one (variable, level) entry per listed
# level, in the order given here.
row_layout = [
    ('gender', ['male', 'female']),
    ('race', ['white', 'black or african american', 'missing', 'asian',
              'american indian or alaska native']),
    ('year of dx.', ['post_2000', 'pre_2000']),
    ('age', ['younger than 75', 'between 75 and 85']),
    ('tumor_subdivision', ['oral cavity', 'larynx', 'oropharynx',
                           'missing / other']),
    ('stage', ['IV', 'III', 'II', 'I', 'NX']),
    ('lymph_stage', ['N3', 'N2', 'N1', 'N0', 'NX']),
    ('invasion', ['yes', 'no', 'missing']),
    ('invasion_inferred', [1, 0]),
    ('spread', ['yes', 'no', 'missing']),
    ('spread_inferred', [1, 0]),
    ('drinker', ['yes', 'no', 'missing / moderate']),
    ('drinker_inferred', [1, 0]),
    ('smoker', ['yes', 'no', 'reformed / missing']),
    ('smoker_inferred', [1, 0]),
]
o = []
for variable, levels in row_layout:
    o.append((variable, ''))
    o.extend((variable, level) for level in levels)
# Displayed as the cell output: the combined table in the order built above.
tab_combo.ix[o]
TCGA | UPMC | ||||||
---|---|---|---|---|---|---|---|
# Patients | Median Surv. | Log-rank P | # Patients | Median Surv. | Log-rank P | ||
gender | 0.671 | 0.472 | |||||
male | 177 | 4 | 37 | 1.85 | |||
female | 74 | 2.96 | 11 | NaN | |||
race | 0.409 | 0.182 | |||||
white | 214 | 4 | 47 | 2.81 | |||
black or african american | 25 | 1.48 | 1 | 1 | |||
missing | 6 | 1.6 | NaN | NaN | |||
asian | 5 | NaN | NaN | NaN | |||
american indian or alaska native | 1 | NaN | NaN | NaN | |||
year of dx. | 4.67e-06 | NaN | NaN | ||||
post_2000 | 204 | NaN | 48 | NaN | NaN | ||
pre_2000 | 47 | 1.53 | NaN | NaN | |||
age | 0.00995 | 0.0929 | |||||
younger than 75 | 221 | 4 | 45 | 1.85 | |||
between 75 and 85 | 30 | 1.81 | 3 | NaN | |||
tumor_subdivision | 0.116 | 0.42 | |||||
oral cavity | 161 | 4.36 | 25 | 2.11 | |||
larynx | 76 | 2.25 | 11 | NaN | |||
oropharynx | 13 | 1.43 | 4 | 0.844 | |||
missing / other | 1 | 0.986 | 8 | 3.7 | |||
stage | 0.232 | 0.167 | |||||
IV | 136 | 2.3 | 35 | 1.5 | |||
III | 36 | 4.49 | NaN | NaN | |||
II | 37 | NaN | 12 | NaN | |||
I | 14 | NaN | NaN | NaN | |||
NX | 28 | 2.25 | 1 | NaN | |||
lymph_stage | 0.000908 | 0.000435 | |||||
N3 | 5 | 0.962 | 3 | 0.894 | |||
N2 | 86 | 1.9 | 23 | 1.3 | |||
N1 | 29 | NaN | 7 | 3.7 | |||
N0 | 84 | NaN | 14 | NaN | |||
NX | 47 | 1.83 | 1 | NaN | |||
invasion | 0.0038 | 0.0188 | |||||
yes | 96 | 2.58 | 22 | 1.36 | |||
no | 89 | NaN | 20 | NaN | |||
missing | 66 | 1.9 | 6 | 3.25 | |||
invasion_inferred | 0.0159 | NaN | NaN | ||||
1 | 121 | 2.2 | NaN | NaN | |||
0 | 130 | 4.71 | NaN | NaN | |||
spread | 0.00126 | 0.0448 | |||||
yes | 51 | 1.42 | 16 | 1.37 | |||
no | 119 | NaN | 14 | 2.11 | |||
missing | 81 | 3.53 | 18 | NaN | |||
spread_inferred | 6.13e-06 | NaN | NaN | ||||
1 | 74 | 1.43 | NaN | NaN | |||
0 | 177 | NaN | NaN | NaN | |||
drinker | 0.133 | 0.0252 | |||||
yes | 57 | 2.5 | 8 | 1.04 | |||
no | 38 | NaN | 27 | NaN | |||
missing / moderate | 156 | 2.84 | 13 | 1.37 | |||
drinker_inferred | 0.0124 | NaN | NaN | ||||
1 | 173 | 2.21 | NaN | NaN | |||
0 | 78 | NaN | NaN | NaN | |||
smoker | 0.0217 | 0.285 | |||||
yes | 84 | 1.6 | 41 | 1.85 | |||
no | 44 | 4.71 | 7 | 5.01 | |||
reformed / missing | 123 | 4 | NaN | NaN | |||
smoker_inferred | 0.00134 | NaN | NaN | ||||
1 | 157 | 2.16 | NaN | NaN | |||
0 | 94 | NaN | NaN | NaN |
60 rows × 6 columns
# Five side-by-side bar charts sharing a y axis, one per clinical variable,
# each showing value counts for the recorded column and its *_inferred column.
# NOTE(review): `clin_all` is not defined in the visible part of this notebook.
fig, axs = subplots(1,5, figsize=(15,3), sharey=True)
# Panel 0: HPV status. The inferred value is blanked wherever a recorded value
# exists, so the inferred bars count only rows lacking a recorded value.
# NOTE(review): this assigns through attribute access plus .ix (chained
# assignment); confirm it modifies `c` rather than a temporary copy.
c = clin_all[['hpv', 'hpv_inferred']].replace({True:'HPV+', False:'HPV-'})
c.hpv_inferred.ix[true_index(c.hpv.isnull() == False)] = nan
c.apply(pd.value_counts).T.plot(kind='bar', ax=axs[0])
# Panel 1: drinking status, same masking as above.
c = clin_all[['drinker', 'drinker_inferred']].replace({True:'yes', False:'no'})
c.drinker_inferred.ix[true_index(c.drinker.isnull() == False)] = nan
c.apply(pd.value_counts).T.plot(kind='bar', ax=axs[1])
# Panel 2: smoking status. Current smokers and lifelong non-smokers are recoded
# to yes/no, reformed smokers are set to NaN, and the inferred value is
# blanked only where the recorded value is 'yes' or 'no'.
c = clin_all[['smoker', 'smoker_inferred']].replace({True:'yes', False:'no'})
c = c.replace({'current smoker':'yes', 'lifelong non-smoker':'no'})
c = c.replace({'current reformed smoker for < or = 15 years':nan,
'current reformed smoker for > 15 years':nan})
c.smoker_inferred.ix[true_index(c.smoker.isin(['yes','no']) == True)] = nan
c.apply(pd.value_counts).T.plot(kind='bar', ax=axs[2])
# Panel 3: invasion; the literal 'Missing' is additionally mapped to None.
c = clin_all[['invasion', 'invasion_inferred']].replace({True:'yes', False:'no','Missing':None})
c.invasion_inferred.ix[true_index(c.invasion.isnull() == False)] = nan
c.apply(pd.value_counts).T.plot(kind='bar', ax=axs[3])
# Panel 4: spread, same masking as panels 0 and 1.
c = clin_all[['spread', 'spread_inferred']].replace({True:'yes', False:'no'})
c.spread_inferred.ix[true_index(c.spread.isnull() == False)] = nan
c.apply(pd.value_counts).T.plot(kind='bar', ax=axs[4])
<matplotlib.axes.AxesSubplot at 0xceab650>