Overview of the clinical variables in the TCGA HNSCC cohort and their association with patient prognosis. Here we mainly process the data needed to compile Supplemental Table 1.
import NotebookImport
from Imports import *
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory line_width has been deprecated, use display.width instead (currently both are identical) populating namespace with data
# Build the table of categorical clinical variables (`ff`) for the TCGA cohort.
# `age`, `clinical` and `keepers_o` come from outside this section.

# Age bin: the two indicator terms add up to 0, 1 or 2, which is then labelled.
# NOTE(review): a missing age compares False on both tests and would be
# labelled 'younger than 75' -- confirm `age` has no missing values.
old = pd.Series(1.*(age>=75) + 1.*(age>=85), name='age')
old = old.map({0:'younger than 75', 1:'between 75 and 85', 2:'older than 85'})
# Keep only the processed columns whose label does not contain 'hpv', then
# add race and gender from the raw clinical table.
ff = clinical.processed.select(lambda s: ('hpv' not in s), axis=1)
ff = ff.join(clinical.clinical.race).join(clinical.clinical.gender)
# Copy 'year' under the label used in the table (the 'year' column is kept).
ff['year of dx.'] = ff['year']
ff['age'] = old
# Collapse smoking status to three levels; both "reformed" categories and
# missing values share one label.
ff['smoker'] = ff['smoker'].map({'current smoker':'yes', 'lifelong non-smoker':'no',
'current reformed smoker for < or = 15 years':'reformed / missing',
'current reformed smoker for > 15 years':'reformed / missing',
nan:'reformed / missing'})
# Stage: second whitespace-separated token, upper-cased; non-strings and
# strings without a space become 'NX'.
ff['stage'] = ff['stage'].map(lambda s: s.split()[1].upper() if (isinstance(s, str) and ' ' in s) else 'NX')
# Nodal stage: strings starting with 'n' are upper-cased; anything else is 'NX'.
ff['lymph_stage'] = ff['lymph_stage'].map(lambda s: s.upper() if (isinstance(s, str) and s[0] == 'n') else 'NX')
# Turn the '[Unknown]' placeholder into NaN so the fillna below catches it.
ff = ff.replace('[Unknown]', np.nan)
# Restrict to the rows selected by `keepers_o` and label every remaining gap.
ff = ff.ix[keepers_o].fillna('missing')
ff['tumor_subdivision'] = ff['tumor_subdivision'].replace('missing','missing / other')
# Only True, False and 'missing' are mapped; any other value would become NaN.
ff['drinker'] = ff['drinker'].map({True: 'yes', False: 'no', 'missing': 'missing / moderate'})
# Univariate survival summary for the TCGA cohort.
# `surv` and `get_surv_fit_lr` come from outside this section.

# Run get_surv_fit_lr on every column of `ff` that has between 2 and 9
# distinct non-null values; results are stacked under the column name.
clin_uni = pd.concat({g: get_surv_fit_lr(surv, f)
for g,f in ff.iteritems()
if len(f.dropna().unique()) in range(2,10)})
# Pull out the three columns reported in the table.
pat = clin_uni[('Stats','# Patients')]
med_surv = clin_uni[('Median Survival','Median')]
surv_p = clin_uni[('Log-Rank','p')]
tab = pd.concat([pat, med_surv, surv_p], keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)
# Presentation order of the variables.
o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision','stage','lymph_stage','invasion',
'spread', 'drinker', 'smoker']
tab = tab.ix[o]
# From here on `o` is a dict mapping variable name -> position in that order.
o = {v:i for i,v in enumerate(o)}
# Prepend the position as an extra index level so rows can be sorted by
# variable order first and by patient count (descending) second.
# NOTE(review): if the `if i[0] in o` filter ever dropped a row, the new index
# would be shorter than the table -- it relies on the `.ix[o]` selection above.
tab.index = pd.MultiIndex.from_tuples([(o[i[0]], i[0], i[1]) for i in tab.index if i[0] in o])
tab = tab.reset_index(level=0).sort(['level_0','# Patients'], ascending=[True,False])
# Drop the helper ordering column again.
del tab['level_0']
Read in the UPMC cohort data and clean up its clinical annotations.
# Load the UPMC cohort metadata, clinical table and survival data.

meta = pd.read_csv('../Extra_Data/UPMC_cohort/meta.csv', index_col=0)
# Split each identifier at its last '-' into a two-level index:
# (everything before the last '-', the piece after it).
meta.index = pd.MultiIndex.from_tuples([('-'.join(i.split('-')[:-1]), i.split('-')[-1])
for i in meta.index])
clin = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data.csv', index_col=0)
# Remove stray leading/trailing whitespace from the tumor-type labels.
clin.Tumor_type = clin.Tumor_type.map(str.strip)
# Long-format survival series indexed by (row id, 'event' | 'days').
# os_5yr_mons is scaled by 30.5 -- presumably months converted to days.
# NOTE: this rebinds `surv`, which the TCGA analysis above also used.
surv = pd.concat([clin.os_5yr, clin.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()
clin2 = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data_update.csv', index_col=0)
surv2 = pd.concat([clin2.os_5yr, clin2.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()
# Values from the update file take precedence; the original file only fills
# entries the update does not have.
surv = surv2.combine_first(surv)
# Display the HPV status counts.
clin.HPV.value_counts()
Negative 63 Positive 11 dtype: int64
# Select the UPMC patients used in the analysis.
# A metadata row qualifies when it has a SNP array ID and is flagged as part
# of the final 74-sample exome analysis.
keepers = meta['SNP array ID'].notnull() & (meta['Final_74_exome_analysis'] == 1)
# One flag per first-level index value; the last row of each group decides.
keepers = keepers.groupby(level=0).last()
# Additionally require HPV-negative status.
# NOTE(review): `> 0` looks like a no-op on boolean flags -- confirm before
# simplifying.
keepers = (keepers > 0) & (clin.HPV == 'Negative')
# Align to the ids present in the survival data. `surv` is a stacked series,
# so each id can appear more than once; ids without a flag are dropped and
# the duplicates are collapsed again.
keepers = keepers.ix[surv.index.get_level_values(0)].dropna()
keepers = keepers.groupby(level=0).last()
# Keep only the ids whose flag is true.
keepers = true_index(keepers)
len(keepers)
48
# Recode the UPMC clinical table into the same variable names and levels as
# the TCGA table.

# `o` was rebound to a dict earlier, so the ordered variable list is rebuilt.
o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision','stage','lymph_stage','invasion',
'spread', 'drinker', 'smoker']
# Restrict to the selected patients.
clin = clin.ix[keepers]
clin['gender'] = clin['Gender']
# Lower-case race and rename 'black' to the TCGA label.
clin['race'] = clin['Race'].map(str.lower).replace('black','black or african american')
# The second-to-last character of the date string decides the bin:
# '0' or '1' -> 'post_2000'.
# assumes Date_Dx ends in the year, so this is its decade digit -- TODO confirm
clin['year of dx.'] = clin.Date_Dx.map(lambda s: 'post_2000' if s[-2] in ['0','1'] else 'pre_2000')
# Two age bins only.
# NOTE(review): anyone aged 85 or more is also labelled 'between 75 and 85';
# confirm the cohort has no such patient.
clin['age'] = clin.Age_Dx.map(lambda s: 'between 75 and 85' if s >= 75 else 'younger than 75')
def subd(s):
    """Translate a primary-site label into a tumor-subdivision category.

    The three recognised sites ('Oral cavity', 'Larynx', 'Oropharynx') are
    matched exactly and returned as lower-case labels; every other value
    is reported as 'missing / other'.
    """
    site_labels = {
        'Oral cavity': 'oral cavity',
        'Larynx': 'larynx',
        'Oropharynx': 'oropharynx',
    }
    return site_labels.get(s, 'missing / other')
# Site, staging and pathology variables for the UPMC cohort.

clin['tumor_subdivision'] = clin.Primary_site.map(subd)
# Keep the first two characters of the stage; 'Unstaged' becomes 'NX'.
# NOTE(review): s[:2] would turn 'III' into 'II' -- confirm the format of
# clinical_stage (the UPMC column of the final table has no stage III rows).
clin['stage'] = clin.clinical_stage.map(lambda s: s[:2] if s != 'Unstaged' else 'NX')
# Same truncation for nodal stage; '.' is the placeholder mapped to 'NX'.
clin['lymph_stage'] = clin.Nodal_stage.map(lambda s: s[:2] if s != '.' else 'NX')
# PNI / EPS are relabelled to the TCGA variable names; 'unknown' -> 'missing'.
# presumably perineural invasion and extracapsular spread -- TODO confirm
clin['invasion'] = clin.PNI.replace('unknown','missing')
clin['spread'] = clin.EPS.replace('unknown','missing')
def alch(v):
    """Classify drinking status from one row of the clinical table.

    Parameters
    ----------
    v : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'Alcohol_amt' and 'Alcohol'.

    Returns
    -------
    str
        'no'                  if 'Alcohol' is 'no' or the amount is <= 6,
        'yes'                 if the amount is > 13,
        'missing / moderate'  if the amount is missing, cannot be read as
                              a number, or lies in between.

    The units of 'Alcohol_amt' are not stated in this file.
    """
    amount = v['Alcohol_amt']
    status = v['Alcohol']
    # An explicit 'no' wins regardless of any recorded amount.
    if status == 'no':
        return 'no'
    if pd.isnull(amount):
        return 'missing / moderate'
    # Convert once. A free-text or placeholder amount is treated as missing
    # instead of aborting the whole row-wise apply.
    try:
        amount = int(amount)
    except (TypeError, ValueError, OverflowError):
        return 'missing / moderate'
    if amount <= 6:
        return 'no'
    if amount > 13:
        return 'yes'
    return 'missing / moderate'
# Finish the UPMC recoding, build its univariate table and join it to TCGA.

# Row-wise drinking classification.
clin['drinker'] = clin.apply(alch,1)
# Smoking history is used as-is.
# presumably already coded 'yes' / 'no' -- TODO confirm
clin['smoker'] = clin.Smoking_history
# Survival fit for every listed variable that exists in `clin` and has more
# than one distinct value.
clin_uni2 = pd.concat({v: get_surv_fit_lr(surv, clin[v])
for v in o if v in clin and len(clin[v].unique()) > 1})
pat = pd.concat({v: clin[v].value_counts() for v in o if v in clin})
# Counts come from value_counts rather than clin_uni2[('Stats','# Patients')]
# so that variables with a single level, which are skipped above, still get
# their patient counts.
med_surv = clin_uni2[('Median Survival','Median')]
surv_p = clin_uni2[('Log-Rank','p')]
tab2 = pd.concat([pat, med_surv, surv_p], keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)
# Align the UPMC rows to the TCGA table's rows and put the two cohorts side
# by side under 'TCGA' / 'UPMC' column groups.
tab_combo = pd.concat([tab, tab2.ix[tab.index]], keys=['TCGA','UPMC'], axis=1)
# Blank out missing UPMC patient counts instead of leaving NaN.
tab_combo[('UPMC', '# Patients')] = tab_combo[('UPMC', '# Patients')].fillna('')
# Row order for the final table, as (variable, level) pairs. Each variable
# starts with a blank-level entry, followed by its levels in display order.
# A generator expression is used so no loop variable is left behind in the
# notebook namespace.
o = list(
    (variable, level)
    for variable, levels in (
        ('gender', ('male', 'female')),
        ('race', ('white', 'black or african american', 'missing', 'asian',
                  'american indian or alaska native')),
        ('year of dx.', ('post_2000', 'pre_2000')),
        ('age', ('younger than 75', 'between 75 and 85')),
        ('tumor_subdivision', ('oral cavity', 'larynx', 'oropharynx',
                               'missing / other')),
        ('stage', ('IV', 'III', 'II', 'I', 'NX')),
        ('lymph_stage', ('N3', 'N2', 'N1', 'N0', 'NX')),
        ('invasion', ('yes', 'no', 'missing')),
        ('spread', ('yes', 'no', 'missing')),
        ('drinker', ('yes', 'no', 'missing / moderate')),
        ('smoker', ('yes', 'no', 'reformed / missing')),
    )
    for level in ('',) + levels
)
# Display the combined TCGA / UPMC table with its rows in the order given by `o`.
tab_combo.ix[o]
TCGA | UPMC | ||||||
---|---|---|---|---|---|---|---|
# Patients | Median Surv. | Log-rank P | # Patients | Median Surv. | Log-rank P | ||
gender | 0.696 | 0.472 | |||||
male | 176 | 4 | 37 | 1.85 | |||
female | 74 | 2.96 | 11 | NaN | |||
race | 0.418 | 0.182 | |||||
white | 213 | 4 | 47 | 2.81 | |||
black or african american | 25 | 1.48 | 1 | 1 | |||
missing | 6 | 1.6 | NaN | NaN | |||
asian | 5 | NaN | NaN | NaN | |||
american indian or alaska native | 1 | NaN | NaN | NaN | |||
year of dx. | 5.46e-06 | NaN | NaN | ||||
post_2000 | 203 | NaN | 48 | NaN | NaN | ||
pre_2000 | 47 | 1.53 | NaN | NaN | |||
age | 0.0106 | 0.0929 | |||||
younger than 75 | 220 | 4 | 45 | 1.85 | |||
between 75 and 85 | 30 | 1.81 | 3 | NaN | |||
tumor_subdivision | 0.116 | 0.42 | |||||
oral cavity | 161 | 4.36 | 25 | 2.11 | |||
larynx | 75 | 2.25 | 11 | NaN | |||
oropharynx | 13 | 1.43 | 4 | 0.844 | |||
missing / other | 1 | 0.986 | 8 | 3.7 | |||
stage | 0.221 | 0.167 | |||||
IV | 135 | 2.3 | 35 | 1.5 | |||
III | 36 | 4.49 | NaN | NaN | |||
II | 37 | NaN | 12 | NaN | |||
I | 14 | NaN | NaN | NaN | |||
NX | 28 | 2.25 | 1 | NaN | |||
lymph_stage | 0.00083 | 0.000435 | |||||
N3 | 5 | 0.962 | 3 | 0.894 | |||
N2 | 85 | 1.9 | 23 | 1.3 | |||
N1 | 29 | NaN | 7 | 3.7 | |||
N0 | 84 | NaN | 14 | NaN | |||
NX | 47 | 1.83 | 1 | NaN | |||
invasion | 0.00342 | 0.0188 | |||||
yes | 95 | 2.5 | 22 | 1.36 | |||
no | 89 | NaN | 20 | NaN | |||
missing | 66 | 1.9 | 6 | 3.25 | |||
spread | 0.00146 | 0.0448 | |||||
yes | 51 | 1.42 | 16 | 1.37 | |||
no | 118 | NaN | 14 | 2.11 | |||
missing | 81 | 3.53 | 18 | NaN | |||
drinker | 0.0875 | 0.0252 | |||||
yes | 45 | 4 | 8 | 1.04 | |||
no | 35 | NaN | 27 | NaN | |||
missing / moderate | 170 | 2.5 | 13 | 1.37 | |||
smoker | 0.0231 | 0.285 | |||||
yes | 84 | 1.6 | 41 | 1.85 | |||
no | 44 | 4.71 | 7 | 5.01 | |||
reformed / missing | 122 | 4 | NaN | NaN |