%pylab inline cd ../src import pandas as pd import pickle as pickle def get_run(firehose_dir, version='Latest'): ''' Helper to get a run from the file-system. ''' path = '{}/ucsd_analyses'.format(firehose_dir) if version is 'Latest': version = sorted(os.listdir(path))[-1] run = pickle.load(open('{}/{}/RunObject.p'.format(path, version), 'rb')) return run params = pd.read_table('../global_params.txt', header=None, squeeze=True, index_col=0) run_path = '{}/Firehose__{}/'.format(params.ix['OUT_PATH'], params.ix['RUN_DATE']) run = get_run(run_path, 'Run_' + params.ix['VERSION']) cancer = run.load_cancer(params.ix['CANCER']) clinical = cancer.load_clinical() mut = cancer.load_data('Mutation') mut.uncompress() cn = cancer.load_data('CN_broad') cn.uncompress() rna = cancer.load_data('mRNASeq') mirna = cancer.load_data('miRNASeq') surv = clinical.survival.survival_5y hpv_all = pd.read_csv('../Extra_Data/hpv_summary_3_20_13_distribute.csv', index_col=0) hpv = hpv_all.Molecular_HPV.map({0:'HPV-', 1:'HPV+'}) hpv.name = 'HPV' hpv_seq = hpv hpv_seq.value_counts() status = clinical.clinical[['hpvstatusbyishtesting','hpvstatusbyp16testing']] hpv_clin = (status.dropna() == 'positive').sum(1) hpv_clin = hpv_clin.map({2: 'HPV+', 0:'HPV-', 1:nan}).dropna() hpv_clin.value_counts() hpv_clin.ix[hpv_clin.index.diff(hpv_seq.index)].value_counts() hpv_new = pd.read_table('../Extra_Data/nationwidechildrens.org_auxiliary_hnsc.txt', skiprows=[1], index_col=0, na_values=['[Not Available]']) hpv_new = hpv_new['hpv_status'] hpv_combo = (hpv_seq.dropna() == 'HPV+').combine_first(hpv_new == 'Positive') hpv_combo.to_clipboard() clinical.hpv = hpv_combo clinical.save() #I keep the same object as there are no side effects keepers_o = hpv_combo[hpv_combo==0].index keepers_o = keepers_o.intersection(mut.features.columns) keepers_o = keepers_o.intersection(cn.features.columns) keepers_o = keepers_o.intersection(surv.unstack().index) keepers_o = keepers_o.intersection(rna.features.columns) keepers_o = keepers_o.intersection(mirna.features.columns) len(keepers_o) from Initialization.InitializeReal import RealDataset from Processing.Helpers import make_path_dump rna = RealDataset(run, cancer, 'mRNASeq', keepers_o) mirna = RealDataset(run, cancer, 'miRNASeq', keepers_o, create_meta_features=False) make_path_dump(rna, rna.path + '/store/no_hpv2.p') make_path_dump(mirna, mirna.path + '/store/no_hpv2.p')