HNSCC HPV- Cohort Clinical Variable Exploration

Overview of clinical variables in the TCGA HNSCC cohort and their implications towards patient prognosis. Here we are mainly just processing data for compilation of Supplemental Table 1.

Import Data and Packages¶

For full list of data and packages imported see the Imports notebook.

In [1]:

import NotebookImport
from Imports import *

importing IPython notebook from Imports.ipynb
Populating the interactive namespace from numpy and matplotlib
changing to source dirctory
line_width has been deprecated, use display.width instead (currently both are
identical)

populating namespace with data

Supplemtal Table 1: Clinical characteristics of HPV- cohort¶

In [2]:

old = pd.Series(1.*(age>=75) + 1.*(age>=85), name='age')
old = old.map({0:'younger than 75', 1:'between 75 and 85', 2:'older than 85'})

In [3]:

ff = clinical.processed.select(lambda s: ('hpv' not in s), axis=1)
ff = ff.join(clinical.clinical.race).join(clinical.clinical.gender)
ff['year of dx.'] = ff['year']
ff['age'] = old
ff['smoker'] = ff['smoker'].map({'current smoker':'yes', 'lifelong non-smoker':'no',
                                 'current reformed smoker for < or = 15 years':'reformed / missing',
                                 'current reformed smoker for > 15 years':'reformed / missing',
                                 nan:'reformed / missing'})
ff['stage'] = ff['stage'].map(lambda s: s.split()[1].upper() if (isinstance(s, str) and ' ' in s) else 'NX')
ff['lymph_stage'] = ff['lymph_stage'].map(lambda s: s.upper() if (isinstance(s, str) and s[0] == 'n') else 'NX')
ff = ff.replace('[Unknown]', np.nan)
ff = ff.ix[keepers_o].fillna('missing')
ff['tumor_subdivision'] = ff['tumor_subdivision'].replace('missing','missing / other')
ff['drinker'] = ff['drinker'].map({True: 'yes', False: 'no', 'missing': 'missing / moderate'})

clin_uni = pd.concat({g: get_surv_fit_lr(surv, f)
                      for g,f in ff.iteritems() 
                      if len(f.dropna().unique()) in range(2,10)})

In [4]:

pat = clin_uni[('Stats','# Patients')]
med_surv = clin_uni[('Median Survival','Median')]
surv_p = clin_uni[('Log-Rank','p')]
tab = pd.concat([pat, med_surv, surv_p], keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)

In [5]:

o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision','stage','lymph_stage','invasion',
     'spread', 'drinker', 'smoker']
tab = tab.ix[o]
o = {v:i for i,v in enumerate(o)}

In [6]:

tab.index = pd.MultiIndex.from_tuples([(o[i[0]], i[0], i[1]) for i in tab.index if i[0] in o])
tab = tab.reset_index(level=0).sort(['level_0','# Patients'], ascending=[True,False])
del tab['level_0']

UPMC Cohort¶

Read in UPMC data and scrub clinical data

In [7]:

meta = pd.read_csv('../Extra_Data/UPMC_cohort/meta.csv', index_col=0)
meta.index = pd.MultiIndex.from_tuples([('-'.join(i.split('-')[:-1]), i.split('-')[-1]) 
                                        for i in meta.index])
clin = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data.csv', index_col=0)
clin.Tumor_type = clin.Tumor_type.map(str.strip)
surv = pd.concat([clin.os_5yr, clin.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()
clin2 = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data_update.csv', index_col=0)
surv2 = pd.concat([clin2.os_5yr, clin2.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()
surv = surv2.combine_first(surv)

In [8]:

clin.HPV.value_counts()

Out[8]:

Negative    63
Positive    11
dtype: int64

In [9]:

keepers = meta['SNP array ID'].notnull() & (meta['Final_74_exome_analysis'] == 1)
keepers = keepers.groupby(level=0).sum() 

In [10]:

keepers = meta['SNP array ID'].notnull() & (meta['Final_74_exome_analysis'] == 1)
keepers = keepers.groupby(level=0).last() 
keepers = (keepers > 0) & (clin.HPV == 'Negative')
keepers = keepers.ix[surv.index.get_level_values(0)].dropna()
keepers = keepers.groupby(level=0).last()
keepers = true_index(keepers)
len(keepers)

Out[10]:

In [11]:

o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision','stage','lymph_stage','invasion',
     'spread', 'drinker', 'smoker']

In [12]:

clin = clin.ix[keepers]
clin['gender'] = clin['Gender']
clin['race'] = clin['Race'].map(str.lower).replace('black','black or african american')
clin['year of dx.'] = clin.Date_Dx.map(lambda s: 'post_2000' if s[-2] in ['0','1'] else 'pre_2000')
clin['age'] = clin.Age_Dx.map(lambda s: 'between 75 and 85' if s >= 75 else 'younger than 75')
def subd(s):
    if s == 'Oral cavity':
        return 'oral cavity'
    elif s == 'Larynx':
        return 'larynx'
    elif s == 'Oropharynx':
        return 'oropharynx'
    else:
        return 'missing / other'
clin['tumor_subdivision'] = clin.Primary_site.map(subd)
clin['stage'] = clin.clinical_stage.map(lambda s: s[:2] if s != 'Unstaged' else 'NX')
clin['lymph_stage'] = clin.Nodal_stage.map(lambda s: s[:2] if s != '.' else 'NX')
clin['invasion'] = clin.PNI.replace('unknown','missing')
clin['spread'] = clin.EPS.replace('unknown','missing')
def alch(v):
    s = v['Alcohol_amt']
    c = v['Alcohol']
    if c == 'no':
        return 'no'
    if pd.isnull(s):
        return 'missing / moderate'
    elif int(s) <= 6:
        return 'no'
    elif int(s) > 13:
        return 'yes'
    else:
        return 'missing / moderate'
clin['drinker'] = clin.apply(alch,1)
clin['smoker'] = clin.Smoking_history

In [13]:

clin_uni2 = pd.concat({v: get_surv_fit_lr(surv, clin[v])
                       for v in o if v in clin and len(clin[v].unique()) > 1})
pat = pd.concat({v: clin[v].value_counts() for v in o if v in clin})
#pat = clin_uni2[('Stats','# Patients')]
med_surv = clin_uni2[('Median Survival','Median')]
surv_p = clin_uni2[('Log-Rank','p')]
tab2 = pd.concat([pat, med_surv, surv_p], keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)

In [17]:

tab_combo = pd.concat([tab, tab2.ix[tab.index]], keys=['TCGA','UPMC'], axis=1)
tab_combo[('UPMC', '# Patients')] = tab_combo[('UPMC', '# Patients')].fillna('')

In [16]:

o = [('gender', ''),
 ('gender', 'male'),
 ('gender', 'female'),
 ('race', ''),
 ('race', 'white'),
 ('race', 'black or african american'),
 ('race', 'missing'),
 ('race', 'asian'),
 ('race', 'american indian or alaska native'),
 ('year of dx.', ''),
 ('year of dx.', 'post_2000'),
 ('year of dx.', 'pre_2000'),
 ('age', ''),
 ('age', 'younger than 75'),
 ('age', 'between 75 and 85'),
 ('tumor_subdivision', ''),
 ('tumor_subdivision', 'oral cavity'),
 ('tumor_subdivision', 'larynx'),
 ('tumor_subdivision', 'oropharynx'),
 ('tumor_subdivision', 'missing / other'),
 ('stage', ''),
 ('stage', 'IV'),
 ('stage', 'III'),
 ('stage', 'II'),
 ('stage', 'I'),
 ('stage', 'NX'),
 ('lymph_stage', ''),
 ('lymph_stage', 'N3'),
 ('lymph_stage', 'N2'),
 ('lymph_stage', 'N1'),
 ('lymph_stage', 'N0'),
 ('lymph_stage', 'NX'), 
 ('invasion', ''),
 ('invasion', 'yes'),
 ('invasion', 'no'),
 ('invasion', 'missing'),
 ('spread', ''),
 ('spread', 'yes'),
 ('spread', 'no'),
 ('spread', 'missing'),
 ('drinker', ''),
 ('drinker', 'yes'),
 ('drinker', 'no'),
 ('drinker', 'missing / moderate'),
 ('smoker', ''),
 ('smoker', 'yes'),
 ('smoker', 'no'),
 ('smoker', 'reformed / missing'),]
tab_combo.ix[o]

Out[16]:

		TCGA			UPMC
		# Patients	Median Surv.	Log-rank P	# Patients	Median Surv.	Log-rank P
gender				0.696			0.472
	male	176	4		37	1.85
	female	74	2.96		11	NaN
race				0.418			0.182
	white	213	4		47	2.81
	black or african american	25	1.48		1	1
	missing	6	1.6			NaN	NaN
	asian	5	NaN			NaN	NaN
	american indian or alaska native	1	NaN			NaN	NaN
year of dx.				5.46e-06		NaN	NaN
	post_2000	203	NaN		48	NaN	NaN
	pre_2000	47	1.53			NaN	NaN
age				0.0106			0.0929
	younger than 75	220	4		45	1.85
	between 75 and 85	30	1.81		3	NaN
tumor_subdivision				0.116			0.42
	oral cavity	161	4.36		25	2.11
	larynx	75	2.25		11	NaN
	oropharynx	13	1.43		4	0.844
	missing / other	1	0.986		8	3.7
stage				0.221			0.167
	IV	135	2.3		35	1.5
	III	36	4.49			NaN	NaN
	II	37	NaN		12	NaN
	I	14	NaN			NaN	NaN
	NX	28	2.25		1	NaN
lymph_stage				0.00083			0.000435
	N3	5	0.962		3	0.894
	N2	85	1.9		23	1.3
	N1	29	NaN		7	3.7
	N0	84	NaN		14	NaN
	NX	47	1.83		1	NaN
invasion				0.00342			0.0188
	yes	95	2.5		22	1.36
	no	89	NaN		20	NaN
	missing	66	1.9		6	3.25
spread				0.00146			0.0448
	yes	51	1.42		16	1.37
	no	118	NaN		14	2.11
	missing	81	3.53		18	NaN
drinker				0.0875			0.0252
	yes	45	4		8	1.04
	no	35	NaN		27	NaN
	missing / moderate	170	2.5		13	1.37
smoker				0.0231			0.285
	yes	84	1.6		41	1.85
	no	44	4.71		7	5.01
	reformed / missing	122	4			NaN	NaN