import NotebookImport
from Imports import *

# Bin patient age into three labelled groups. Each comparison contributes 1.0
# when it holds, so the sum is 0, 1 or 2 depending on how many of the two
# thresholds (75 and 85) the age reaches.
# NOTE(review): a missing age fails both comparisons and would therefore be
# labelled 'younger than 75' -- confirm `age` contains no missing values.
reached_75 = 1. * (age >= 75)
reached_85 = 1. * (age >= 85)
age_group_labels = {
    0: 'younger than 75',
    1: 'between 75 and 85',
    2: 'older than 85',
}
old = pd.Series(reached_75 + reached_85, name='age').map(age_group_labels)

# Assemble the covariate table `ff`: the processed clinical columns whose name
# does not contain 'hpv', plus race and gender from the raw clinical table.
ff = clinical.processed.select(lambda col: 'hpv' not in col, axis=1)
ff = ff.join(clinical.clinical.race)
ff = ff.join(clinical.clinical.gender)
ff['year of dx.'] = ff['year']
ff['age'] = old

# Collapse smoking history to yes / no / "reformed or missing". Values that
# are not keys of this mapping come out of `map` as NaN and are picked up by
# the fillna('missing') further down.
smoker_labels = {
    'current smoker': 'yes',
    'lifelong non-smoker': 'no',
    'current reformed smoker for < or = 15 years': 'reformed / missing',
    'current reformed smoker for > 15 years': 'reformed / missing',
    nan: 'reformed / missing',
}
ff['smoker'] = ff['smoker'].map(smoker_labels)


def _stage_label(s):
    """Return the upper-cased second word of a stage string, else 'NX'."""
    if isinstance(s, str) and ' ' in s:
        return s.split()[1].upper()
    return 'NX'


def _lymph_stage_label(s):
    """Upper-case a lymph stage string starting with 'n'; otherwise 'NX'."""
    if isinstance(s, str) and s[0] == 'n':
        return s.upper()
    return 'NX'


ff['stage'] = ff['stage'].map(_stage_label)
ff['lymph_stage'] = ff['lymph_stage'].map(_lymph_stage_label)

# Treat the literal '[Unknown]' as missing, restrict to the `keepers_o` rows,
# then label every remaining gap explicitly as 'missing'.
ff = ff.replace('[Unknown]', np.nan).ix[keepers_o].fillna('missing')

# Rename the generic 'missing' level where the final table uses a merged label.
ff['tumor_subdivision'] = ff['tumor_subdivision'].replace('missing',
                                                          'missing / other')
drinker_labels = {
    True: 'yes',
    False: 'no',
    'missing': 'missing / moderate',
}
ff['drinker'] = ff['drinker'].map(drinker_labels)

# Run get_surv_fit_lr on every covariate of `ff` that has between 2 and 9
# distinct non-missing values, and stack the results keyed by covariate name.
clin_uni = pd.concat({
    variable: get_surv_fit_lr(surv, values)
    for variable, values in ff.iteritems()
    if 2 <= len(values.dropna().unique()) <= 9
})

# Pull out the three columns reported in the summary table.
pat = clin_uni[('Stats', '# Patients')]
med_surv = clin_uni[('Median Survival', 'Median')]
surv_p = clin_uni[('Log-Rank', 'p')]
tab = pd.concat([pat, med_surv, surv_p], axis=1,
                keys=['# Patients', 'Median Surv.', 'Log-rank P'])

# Covariates to report, in display order.
o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision', 'stage',
     'lymph_stage', 'invasion', 'spread', 'drinker', 'smoker']
tab = tab.ix[o]
# From here on `o` maps each covariate name to its display position.
o = dict(zip(o, range(len(o))))

# Prepend the display position as a temporary outer index level so rows can
# be sorted by covariate order first and by patient count (descending) second.
tab.index = pd.MultiIndex.from_tuples(
    [(o[key[0]], key[0], key[1]) for key in tab.index if key[0] in o])
tab = tab.reset_index(level=0)
tab = tab.sort(['level_0', '# Patients'], ascending=[True, False])
# The position column was only needed for sorting.
tab = tab.drop('level_0', axis=1)

def _split_on_last_dash(label):
    """Split `label` at its last '-' into a (prefix, suffix) pair.

    A label without any '-' yields ('', label).
    """
    prefix, _, suffix = label.rpartition('-')
    return prefix, suffix


def _five_year_os(frame):
    """Stack `os_5yr` and `os_5yr_mons * 30.5` into one long Series.

    The two inputs become the 'event' and 'days' entries of the inner index
    level. The 30.5 factor presumably converts months to days -- confirm
    against the data dictionary.
    """
    return pd.concat([frame.os_5yr, frame.os_5yr_mons * 30.5],
                     keys=['event', 'days'], axis=1).stack()


# UPMC cohort sample metadata, re-indexed by (prefix, suffix) of each label.
meta = pd.read_csv('../Extra_Data/UPMC_cohort/meta.csv', index_col=0)
meta.index = pd.MultiIndex.from_tuples(
    [_split_on_last_dash(label) for label in meta.index])

# UPMC clinical table; tumour type strings are stripped of surrounding
# whitespace.
clin = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data.csv',
                   index_col=0)
clin['Tumor_type'] = clin['Tumor_type'].map(str.strip)

# NOTE: this rebinds `surv`; anything below this point sees the UPMC survival
# data. Values from the update file take precedence and the original file
# only fills the gaps.
surv = _five_year_os(clin)
clin2 = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data_update.csv',
                    index_col=0)
surv2 = _five_year_os(clin2)
surv = surv2.combine_first(surv)

# Notebook display cell: tabulate HPV status in the UPMC clinical table.
clin.HPV.value_counts()

# Select the UPMC rows to analyse.
# (An earlier `.groupby(level=0).sum()` version of the first two steps was
# computed and then immediately overwritten by this one; that dead
# computation has been removed.)
#
# 1. Flag metadata rows that have a SNP array ID and are marked as part of
#    the final 74-exome analysis.
keepers = meta['SNP array ID'].notnull() & (meta['Final_74_exome_analysis'] == 1)
# 2. Reduce to one flag per outer index label, taking the last row of each.
keepers = keepers.groupby(level=0).last()
# 3. Additionally require an HPV-negative entry in the clinical table.
keepers = (keepers > 0) & (clin.HPV == 'Negative')
# 4. Restrict to labels present in the survival data. `surv` is stacked, so
#    labels repeat; the second groupby collapses the repeats again.
keepers = keepers.ix[surv.index.get_level_values(0)].dropna()
keepers = keepers.groupby(level=0).last()
# 5. Reduce the boolean Series with `true_index` (defined outside this
#    section; presumably returns the labels whose flag is True).
keepers = true_index(keepers)
# Notebook display cell: size of the selected set.
len(keepers)

# Covariates to report, in display order (`o` was rebound to a dict earlier
# in the script, so the list is spelled out again here).
o = ['gender', 'race', 'year of dx.', 'age', 'tumor_subdivision', 'stage',
     'lymph_stage', 'invasion', 'spread', 'drinker', 'smoker']

# Keep only the selected rows, then derive columns that use the same names
# and level labels as the first cohort's table.
clin = clin.ix[keepers]
clin['gender'] = clin['Gender']
clin['race'] = (clin['Race']
                .map(str.lower)
                .replace('black', 'black or african american'))
# The second-to-last character of the date string decides the era: '0' or
# '1' means post_2000 (presumably the decade digit of a trailing year --
# confirm the Date_Dx format).
clin['year of dx.'] = clin['Date_Dx'].map(
    lambda d: 'pre_2000' if d[-2] not in ('0', '1') else 'post_2000')
# Only two age groups are produced here: every age of 75 or more, including
# 85 and above, receives the 'between 75 and 85' label.
clin['age'] = clin['Age_Dx'].map(
    lambda a: 'between 75 and 85' if a >= 75 else 'younger than 75')
def subd(s):
    """Map a primary-site value to its tumour-subdivision label.

    'Oral cavity', 'Larynx' and 'Oropharynx' (exact, case-sensitive match)
    are returned lower-cased; any other value gives 'missing / other'.
    """
    if s in ('Oral cavity', 'Larynx', 'Oropharynx'):
        return s.lower()
    return 'missing / other'
clin['tumor_subdivision'] = clin['Primary_site'].map(subd)
# Stage columns keep only the first two characters of the source value; the
# placeholders 'Unstaged' and '.' become 'NX'.
# NOTE(review): with [:2] a three-character code such as 'III' would come out
# as 'II' -- confirm how clinical_stage / Nodal_stage are encoded.
clin['stage'] = clin['clinical_stage'].map(
    lambda s: 'NX' if s == 'Unstaged' else s[:2])
clin['lymph_stage'] = clin['Nodal_stage'].map(
    lambda s: 'NX' if s == '.' else s[:2])
# PNI and EPS supply the 'invasion' and 'spread' columns; their 'unknown'
# level is renamed to 'missing'.
clin['invasion'] = clin['PNI'].replace('unknown', 'missing')
clin['spread'] = clin['EPS'].replace('unknown', 'missing')
def alch(v):
    """Classify one record's alcohol use as 'yes', 'no' or 'missing / moderate'.

    `v` is indexed by 'Alcohol_amt' and 'Alcohol'. The result is:

    * 'no' when Alcohol is 'no', regardless of the amount;
    * 'missing / moderate' when the amount is null;
    * 'no' when int(amount) <= 6 and 'yes' when int(amount) > 13;
    * 'missing / moderate' for anything in between.
    """
    amount = v['Alcohol_amt']
    if v['Alcohol'] == 'no':
        return 'no'
    if pd.isnull(amount):
        return 'missing / moderate'
    # int() truncates, so e.g. 6.9 still counts as 6.
    amount = int(amount)
    if amount <= 6:
        return 'no'
    if amount > 13:
        return 'yes'
    return 'missing / moderate'
# Classify drinking row by row with alch (axis=1 hands it one row at a time).
clin['drinker'] = clin.apply(alch, axis=1)
# Smoking history is copied across unchanged.
clin['smoker'] = clin['Smoking_history']

# Run get_surv_fit_lr for each listed covariate that exists in `clin` and
# takes more than one distinct value.
clin_uni2 = pd.concat({
    variable: get_surv_fit_lr(surv, clin[variable])
    for variable in o
    if variable in clin and len(clin[variable].unique()) > 1
})
# Patient counts come straight from value_counts rather than from the fit's
# ('Stats', '# Patients') column; unlike the fits above, no "more than one
# distinct value" filter is applied here.
pat = pd.concat({
    variable: clin[variable].value_counts()
    for variable in o
    if variable in clin
})
med_surv = clin_uni2[('Median Survival', 'Median')]
surv_p = clin_uni2[('Log-Rank', 'p')]
tab2 = pd.concat([pat, med_surv, surv_p], axis=1,
                 keys=['# Patients', 'Median Surv.', 'Log-rank P'])

# Put the two cohorts side by side, with the second table looked up by the
# first table's row index.
tab_combo = pd.concat([tab, tab2.ix[tab.index]], axis=1,
                      keys=['TCGA', 'UPMC'])
# Show a blank instead of NaN where there is no UPMC patient count.
upmc_patients = ('UPMC', '# Patients')
tab_combo[upmc_patients] = tab_combo[upmc_patients].fillna('')

# Display order of the combined table: for every covariate, the row whose
# level is the empty string comes first, followed by the listed levels.
row_layout = [
    ('gender', ['male', 'female']),
    ('race', ['white', 'black or african american', 'missing', 'asian',
              'american indian or alaska native']),
    ('year of dx.', ['post_2000', 'pre_2000']),
    ('age', ['younger than 75', 'between 75 and 85']),
    ('tumor_subdivision', ['oral cavity', 'larynx', 'oropharynx',
                           'missing / other']),
    ('stage', ['IV', 'III', 'II', 'I', 'NX']),
    ('lymph_stage', ['N3', 'N2', 'N1', 'N0', 'NX']),
    ('invasion', ['yes', 'no', 'missing']),
    ('spread', ['yes', 'no', 'missing']),
    ('drinker', ['yes', 'no', 'missing / moderate']),
    ('smoker', ['yes', 'no', 'reformed / missing']),
]
o = [(variable, level)
     for variable, levels in row_layout
     for level in [''] + levels]
# Notebook display cell: the combined table in display order.
tab_combo.ix[o]