# Builds a table of per-covariate patient counts, median survival and
# log-rank p-values for two cohorts (labelled 'TCGA' and 'UPMC' in the
# combined table) and places them side by side.
#
# NOTE(review): pd, np, nan, clinical, age, surv, keepers_o,
# get_surv_fit_lr and true_index are not defined in this file; presumably
# they are supplied by the star-import of Imports -- confirm.
import NotebookImport
from Imports import *

# ---------------------------------------------------------------------------
# First cohort (labelled 'TCGA' in the final table)
# ---------------------------------------------------------------------------

# Three-level age bucket: each boolean contributes 1, giving 0, 1 or 2.
old = pd.Series(1.*(age >= 75) + 1.*(age >= 85), name='age')
old = old.map({0: 'younger than 75',
               1: 'between 75 and 85',
               2: 'older than 85'})

# Drop every processed column whose name contains 'hpv', then add race and
# gender from the raw clinical table.
ff = clinical.processed.select(lambda s: ('hpv' not in s), axis=1)
ff = ff.join(clinical.clinical.race).join(clinical.clinical.gender)
ff['year of dx.'] = ff['year']
ff['age'] = old

# Collapse smoking history; both reformed categories and NaN share one label.
ff['smoker'] = ff['smoker'].map({
    'current smoker': 'yes',
    'lifelong non-smoker': 'no',
    'current reformed smoker for < or = 15 years': 'reformed / missing',
    'current reformed smoker for > 15 years': 'reformed / missing',
    nan: 'reformed / missing',
})

# Keep the second whitespace-separated token of the stage, upper-cased;
# anything that is not a string containing a space becomes 'NX'.
ff['stage'] = ff['stage'].map(
    lambda s: s.split()[1].upper()
    if (isinstance(s, str) and ' ' in s) else 'NX')

# Upper-case nodal stages that start with 'n'; everything else becomes 'NX'.
# startswith() is used so an empty string yields 'NX' instead of raising
# IndexError on s[0].
ff['lymph_stage'] = ff['lymph_stage'].map(
    lambda s: s.upper()
    if (isinstance(s, str) and s.startswith('n')) else 'NX')

ff = ff.replace('[Unknown]', np.nan)
ff = ff.ix[keepers_o].fillna('missing')
ff['tumor_subdivision'] = ff['tumor_subdivision'].replace(
    'missing', 'missing / other')
ff['drinker'] = ff['drinker'].map({True: 'yes',
                                   False: 'no',
                                   'missing': 'missing / moderate'})

# Survival fit per covariate, restricted to columns with 2..9 distinct
# non-null values.
clin_uni = pd.concat({g: get_surv_fit_lr(surv, f)
                      for g, f in ff.iteritems()
                      if len(f.dropna().unique()) in range(2, 10)})

pat = clin_uni[('Stats', '# Patients')]
med_surv = clin_uni[('Median Survival', 'Median')]
surv_p = clin_uni[('Log-Rank', 'p')]
tab = pd.concat([pat, med_surv, surv_p],
                keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)

# Display order of the covariates.
o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision', 'stage',
     'lymph_stage', 'invasion', 'spread', 'drinker', 'smoker']
tab = tab.ix[o]

# Prepend each covariate's position as a temporary index level so rows can
# be sorted by covariate order, then by descending patient count.
o = {v: i for i, v in enumerate(o)}
tab.index = pd.MultiIndex.from_tuples([(o[i[0]], i[0], i[1])
                                       for i in tab.index if i[0] in o])
tab = tab.reset_index(level=0).sort(['level_0', '# Patients'],
                                    ascending=[True, False])
del tab['level_0']

# ---------------------------------------------------------------------------
# Second cohort (labelled 'UPMC' in the final table)
# ---------------------------------------------------------------------------

# Split each meta index label on its last '-' into a two-level index.
meta = pd.read_csv('../Extra_Data/UPMC_cohort/meta.csv', index_col=0)
meta.index = pd.MultiIndex.from_tuples(
    [('-'.join(i.split('-')[:-1]), i.split('-')[-1]) for i in meta.index])

clin = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data.csv',
                   index_col=0)
clin.Tumor_type = clin.Tumor_type.map(str.strip)

# NOTE(review): `surv` is rebound here; the survival fit above used the
# `surv` that was in scope before this point.
# os_5yr_mons is multiplied by 30.5 and stored under the 'days' key.
surv = pd.concat([clin.os_5yr, clin.os_5yr_mons*30.5],
                 keys=['event', 'days'], axis=1).stack()
clin2 = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data_update.csv',
                    index_col=0)
surv2 = pd.concat([clin2.os_5yr, clin2.os_5yr_mons*30.5],
                  keys=['event', 'days'], axis=1).stack()
# Values from the update file take precedence; the original fills the gaps.
surv = surv2.combine_first(surv)

# Bare expression left over from the notebook (cell output only).
clin.HPV.value_counts()

# Sample selection. The original computed this mask twice and overwrote a
# groupby(level=0).sum() result with the .last() result below; only the
# effective computation is kept.
keepers = (meta['SNP array ID'].notnull()
           & (meta['Final_74_exome_analysis'] == 1))
keepers = keepers.groupby(level=0).last()
keepers = (keepers > 0) & (clin.HPV == 'Negative')
keepers = keepers.ix[surv.index.get_level_values(0)].dropna()
keepers = keepers.groupby(level=0).last()
keepers = true_index(keepers)

# Bare expression left over from the notebook (cell output only).
len(keepers)

# `o` was rebound to a dict above, so the covariate order is restated.
o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision', 'stage',
     'lymph_stage', 'invasion', 'spread', 'drinker', 'smoker']

clin = clin.ix[keepers]
clin['gender'] = clin['Gender']
clin['race'] = clin['Race'].map(str.lower).replace(
    'black', 'black or african american')
# Classified by the second-to-last character of Date_Dx.
# NOTE(review): presumably the tens digit of a two-digit year -- confirm
# the Date_Dx format.
clin['year of dx.'] = clin.Date_Dx.map(
    lambda s: 'post_2000' if s[-2] in ['0', '1'] else 'pre_2000')
# NOTE(review): unlike the three-level `old` series above, every age >= 75
# (including >= 85) is labelled 'between 75 and 85' here -- confirm this is
# intended.
clin['age'] = clin.Age_Dx.map(
    lambda s: 'between 75 and 85' if s >= 75 else 'younger than 75')


def subd(s):
    """Map a Primary_site value to a tumor_subdivision label.

    'Oral cavity', 'Larynx' and 'Oropharynx' map to their lower-case
    form; any other value maps to 'missing / other'.
    """
    if s == 'Oral cavity':
        return 'oral cavity'
    elif s == 'Larynx':
        return 'larynx'
    elif s == 'Oropharynx':
        return 'oropharynx'
    else:
        return 'missing / other'


clin['tumor_subdivision'] = clin.Primary_site.map(subd)
clin['stage'] = clin.clinical_stage.map(
    lambda s: s[:2] if s != 'Unstaged' else 'NX')
clin['lymph_stage'] = clin.Nodal_stage.map(
    lambda s: s[:2] if s != '.' else 'NX')
clin['invasion'] = clin.PNI.replace('unknown', 'missing')
clin['spread'] = clin.EPS.replace('unknown', 'missing')


def alch(v):
    """Classify one row's drinking status.

    Reads v['Alcohol'] and v['Alcohol_amt'].

    Returns 'no' when Alcohol is 'no' or the amount is <= 6, 'yes' when
    the amount is > 13, and 'missing / moderate' when the amount is null
    or falls in between.
    """
    s = v['Alcohol_amt']
    c = v['Alcohol']
    if c == 'no':
        return 'no'
    if pd.isnull(s):
        return 'missing / moderate'
    elif int(s) <= 6:
        return 'no'
    elif int(s) > 13:
        return 'yes'
    else:
        return 'missing / moderate'


clin['drinker'] = clin.apply(alch, 1)
clin['smoker'] = clin.Smoking_history

# Survival fit per covariate, skipping constant or absent columns.
clin_uni2 = pd.concat({v: get_surv_fit_lr(surv, clin[v])
                       for v in o
                       if v in clin and len(clin[v].unique()) > 1})
# Patient counts are taken directly from value_counts() rather than from
# clin_uni2[('Stats', '# Patients')] (that alternative was commented out in
# the original).
pat = pd.concat({v: clin[v].value_counts() for v in o if v in clin})
med_surv = clin_uni2[('Median Survival', 'Median')]
surv_p = clin_uni2[('Log-Rank', 'p')]
tab2 = pd.concat([pat, med_surv, surv_p],
                 keys=['# Patients', 'Median Surv.', 'Log-rank P'], axis=1)

# ---------------------------------------------------------------------------
# Combined table
# ---------------------------------------------------------------------------

# Align the second cohort's rows to the first cohort's index.
tab_combo = pd.concat([tab, tab2.ix[tab.index]],
                      keys=['TCGA', 'UPMC'], axis=1)
tab_combo[('UPMC', '# Patients')] = \
    tab_combo[('UPMC', '# Patients')].fillna('')

# Explicit row order for display; the (covariate, '') entries are header
# rows.
o = [
    ('gender', ''),
    ('gender', 'male'),
    ('gender', 'female'),
    ('race', ''),
    ('race', 'white'),
    ('race', 'black or african american'),
    ('race', 'missing'),
    ('race', 'asian'),
    ('race', 'american indian or alaska native'),
    ('year of dx.', ''),
    ('year of dx.', 'post_2000'),
    ('year of dx.', 'pre_2000'),
    ('age', ''),
    ('age', 'younger than 75'),
    ('age', 'between 75 and 85'),
    ('tumor_subdivision', ''),
    ('tumor_subdivision', 'oral cavity'),
    ('tumor_subdivision', 'larynx'),
    ('tumor_subdivision', 'oropharynx'),
    ('tumor_subdivision', 'missing / other'),
    ('stage', ''),
    ('stage', 'IV'),
    ('stage', 'III'),
    ('stage', 'II'),
    ('stage', 'I'),
    ('stage', 'NX'),
    ('lymph_stage', ''),
    ('lymph_stage', 'N3'),
    ('lymph_stage', 'N2'),
    ('lymph_stage', 'N1'),
    ('lymph_stage', 'N0'),
    ('lymph_stage', 'NX'),
    ('invasion', ''),
    ('invasion', 'yes'),
    ('invasion', 'no'),
    ('invasion', 'missing'),
    ('spread', ''),
    ('spread', 'yes'),
    ('spread', 'no'),
    ('spread', 'missing'),
    ('drinker', ''),
    ('drinker', 'yes'),
    ('drinker', 'no'),
    ('drinker', 'missing / moderate'),
    ('smoker', ''),
    ('smoker', 'yes'),
    ('smoker', 'no'),
    ('smoker', 'reformed / missing'),
]
# Final notebook expression: the combined table in display order.
tab_combo.ix[o]