import NotebookImport
from Imports import *

# Table of boolean clinical covariates, one column per feature.  It is
# filled in feature-by-feature below; restricting it to `keepers_o` and
# saving it happen later in the script.
binary_df = pd.DataFrame()

# ---------------------------------------------------------------- gender
gender = clinical.clinical.gender.dropna() == 'male'
binary_df['gender_male'] = gender
survival_and_stats(gender.ix[keepers_o], surv)

# ------------------------------------------------------------------ race
race = clinical.clinical.race
survival_and_stats(race.ix[keepers_o], surv)

white = race.ix[keepers_o].dropna() == 'white'
binary_df['race_white'] = white

black = race.ix[keepers_o].dropna() == 'black or african american'
binary_df['race_black'] = black
survival_and_stats(black, surv)

# ------------------------------------------------------ pathologic stage
# Missing stages are labelled 'nx' so they form their own group in the
# survival analysis.
stage = clinical.stage.pathologicstage.ix[keepers_o].fillna('nx')
# Drop the 'a'/'b' sub-stage letters.  Removing every 'a' also turns the
# word 'stage' into 'stge', which the next line repairs.
stage = stage.dropna().map(lambda s: s.replace('a', '').replace('b', ''))
stage = stage.map(lambda s: s.replace('stge', 'Stage'))
survival_and_stats(stage, surv)
# Patients with unknown stage ('nx') are excluded rather than counted as
# "not stage iv".
binary_df['stage_iv'] = stage.replace('nx', nan).dropna() == 'Stage iv'

# ------------------------------------------------------ lymph node stage
lymph_stage = clinical.stage.pathologicn
# Keep only the first two characters of each value (e.g. 'n2b' -> 'n2').
lymph_stage = lymph_stage.map(lambda s: s[:2], na_action='ignore')
lymph_stage = lymph_stage.fillna('nx')
survival_and_stats(lymph_stage.ix[keepers_o], surv)
binary_df['lymph_n0'] = lymph_stage.replace('nx', nan).dropna() == 'n0'
binary_df['lymph_n2+'] = (lymph_stage.replace('nx', nan).dropna()
                          .isin(['n2', 'n3']))

# ------------------------------------------------- extracapsular spread
spread = clinical.clinical.presenceofpathologicalnodalextracapsularspread
spread = spread.map(str.lower, na_action='ignore')
# Values not listed in this mapping become NaN and are dropped.
spread = spread.map({'no extranodal extension': False,
                     'microscopic extension': True,
                     'gross extension': True}).dropna()
survival_and_stats(spread.ix[keepers_o], surv)
binary_df['spread'] = spread

# Three-level nodal status built from node involvement and spread.
# NOTE(review): relies on `combine` labelling rows 'neither' / <name of
# first series> / 'both'; labels outside this mapping become NaN --
# confirm against the helper's definition.
lymph = lymph_stage.ix[keepers_o] != 'n0'
lymph_status = combine(lymph, spread.ix[keepers_o])
lymph_status = lymph_status.map({'neither': 'n0',
                                 lymph.name: 'lymph_node',
                                 'both': 'extra_capsular_spread'})
survival_and_stats(lymph_status, surv)

# --------------------------------------------------- perineural invasion
# The literal string 'nan' is converted to a real missing value first.
invasion = clinical.clinical.perineuralinvasionpresent.replace('nan', nan)
invasion = invasion.str.lower()
survival_and_stats(invasion.ix[keepers_o], surv)
binary_df['invasion'] = invasion.dropna() == 'yes'

# ------------------------------------------------------------ tumor site
# `group[i]` lists the anatomic subdivisions that roll up into
# `groups[i]`.  A ['hypopharynx'] entry was commented out in the original,
# so subdivisions not listed here are left out of `tumor_subdivision`.
group = [['oral tongue', 'oral cavity', 'floor of mouth', 'buccal mucosa',
          'alveolar ridge', 'hard palate', 'lip'],
         ['oropharynx', 'tonsil', 'base of tongue'],
         ['larynx']]
groups = ['oral cavity', 'oropharynx', 'larynx']
tumor_subdivision = pd.Series(
    {idx: groups[i]
     for i, g in enumerate(group)
     for idx, j in clinical.clinical.anatomicneoplasmsubdivision.iteritems()
     if j.lower() in g})
survival_and_stats(tumor_subdivision.ix[keepers_o], surv)
binary_df['oral_cavity'] = tumor_subdivision.dropna() == 'oral cavity'
# Fixed: this used to compare against 'oral larynx', a label that never
# occurs in `groups`, so the column was False for every patient.
binary_df['larynx'] = tumor_subdivision.dropna() == 'larynx'
binary_df['oropharynx'] = tumor_subdivision.dropna() == 'oropharynx'

# ------------------------------------------------------------------- age
age = clinical.clinical.age.astype(float)
old_age = (age >= 75)

# Histogram of patient age.
fig, ax = subplots(figsize=(3, 3))
age.hist(color='grey')
prettify_ax(ax)
ax.set_ylabel('# of Patients')
ax.set_xlabel('Age in Years')
fig.tight_layout()
fig.savefig(FIGDIR + 'hpv_sup_d.pdf', transparent=True)

# Survival by age band: the sum is 0 for <75, 1 for 75-84 and 2 for >=85.
fig, ax = subplots(figsize=(5, 3))
draw_survival_curve(1.*(age >= 85) + 1.*(age >= 75), surv, ax=ax,
                    colors=[colors[2], colors[4], colors[0]])
ax.legend(title=False, frameon=False, loc='lower right')
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'hpv_sup_c.pdf', transparent=True)

survival_stat_plot(get_surv_fit(surv, 1.*(age >= 85) + 1.*(age >= 75)))

# --------------------------------------------------------------- smoking
smoking = clinical.clinical.tobaccosmokinghistory.ix[keepers_o]
# Shorten the two verbose "reformed smoker" labels.
smoking = smoking.replace(
    {'current reformed smoker for < or = 15 years': 'reformed (recent)',
     'current reformed smoker for > 15 years': 'reformed (distant)'})
smoking.value_counts()
survival_and_stats(smoking, surv)
# Missing smoking history is kept as its own 'M' category for this fit.
get_surv_fit_lr(surv, smoking.fillna('M'))

pack_years = py = clinical.clinical.numberpackyearssmoked.dropna().astype(float)

binary_df['current_smoker'] = smoking.dropna() == 'current smoker'
binary_df['non_smoker'] = smoking.dropna() == 'lifelong non-smoker'
binary_df['recent_smoker'] = smoking.dropna().isin(['reformed (recent)',
                                                    'current smoker'])

# --------------------------------------------------------------- alcohol
# Inputs for the drinking-status feature computed further down.  A missing
# amount per day is treated as 0; a missing frequency stays missing.
freq = clinical.clinical.frequencyofalcoholconsumption.astype(float)
count = clinical.clinical.amountofalcoholconsumptionperday.fillna(0).astype(float)
# ------------------------------------------------------- drinking status
# Product of the alcohol frequency and amount series, restricted to the
# `keepers_o` patients, with missing values removed.
_consumption = freq * count
drinks_per_week = _consumption.ix[keepers_o].dropna()
drinks_per_week.hist()

# Cutoff used to dichotomise `drinks_per_week`.
_DRINKS_CUTOFF = 10
survival_and_stats(drinks_per_week > _DRINKS_CUTOFF, surv)
binary_df['drinking_status'] = drinks_per_week > _DRINKS_CUTOFF

# Repeat the analysis with patients lacking a drinking status kept as a
# separate 'M' category instead of being dropped.
_status_column = binary_df['drinking_status']
survival_and_stats(_status_column.ix[keepers_o].fillna('M'), surv)
get_surv_fit_lr(surv, _status_column.ix[keepers_o].fillna('M'))

clinical.clinical.alcoholhistorydocumented.ix[keepers_o].value_counts()

# ----------------------------------------------------- year of diagnosis
_reported_year = clinical.clinical.yearofinitialpathologicdiagnosis.ix[keepers_o]
# '[Discrepancy]' placeholders become missing values so the column can be
# cast to float.
year = _reported_year.replace('[Discrepancy]', nan).astype(float)

pre_2000 = year < 2000
pre_2000.name = 'pre_2000'

year.hist()
survival_and_stats(pre_2000, surv)
binary_df['pre_2000'] = pre_2000

# ------------------------------------------------------------------ save
# Restrict the table to the `keepers_o` patients and transpose it, then
# attach it to the clinical object and save.
binary_df = binary_df.ix[keepers_o].T
clinical.binary_df = binary_df
clinical.save()