Notebook

In [1]:

import numpy as np
import synapseclient
import os
import sys
import pandas
import pylab as pl

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pylab as pl
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc, precision_recall_curve

import warnings
warnings.filterwarnings('ignore')

In [2]:

# Log in to Synapse. The resulting client object `syn` is what the cells below
# use to fetch the truth data (syn.get) and the submissions (syn.getSubmission).
# NOTE(review): no credentials are passed here, so login() presumably relies on
# cached/configured credentials -- confirm before running on a fresh machine.
syn = synapseclient.login()

Welcome, Abhishek Pratap!

In [3]:

def get_truePhenotype_testSamples_RA_challenge(syn):
    """
    Return the true phenotype rows for the test-set samples of the RA challenge.

    Two space-delimited files are fetched through the Synapse client:
    the phenotype table ('syn2324939') and the partition table ('syn2351768').
    Samples whose 'test1' column equals 1 in the partition table are the test
    samples; only their phenotype rows are returned.

    Parameters
    ----------
    syn : object exposing ``get(synId)`` whose result has a ``path`` attribute
        (here: the logged-in Synapse client).

    Returns
    -------
    pandas.DataFrame
        Phenotype rows of the test samples. The original 'ID_1' column is
        exposed as 'ID' (appended as the last column). The frame is an
        independent copy, so callers can add columns to it safely.
    """
    def _read_space_delimited(path):
        # DataFrame.from_csv was removed in pandas 1.0. Per the pandas
        # deprecation note, read_csv(index_col=0, parse_dates=True) reproduces
        # its defaults; reset_index() then turns the first column back into
        # an ordinary column, exactly as the original code did.
        return pandas.read_csv(path, sep=" ", header=0,
                               index_col=0, parse_dates=True).reset_index()

    #get the phenotype data
    phenotype_data_synId = 'syn2324939'
    phenotype_df = _read_space_delimited(syn.get(phenotype_data_synId).path)
    #renaming column 'ID_1' to 'ID' (add the new column, then drop the old one)
    phenotype_df['ID'] = phenotype_df['ID_1']
    phenotype_df = phenotype_df.drop(['ID_1'], axis=1)

    #get the partitions and extract the testSamples ID  (test1 == 1)
    partition_data_synId = 'syn2351768'
    partitions_df = _read_space_delimited(syn.get(partition_data_synId).path)
    testSamples_ID = partitions_df.loc[partitions_df['test1'] == 1, 'ID']

    #return only the phenotypes for the testSamples set; .copy() detaches the
    #result from phenotype_df so later column assignments by the caller are
    #not writes into a slice of another frame
    testSamples_true_phenotype = phenotype_df[phenotype_df['ID'].isin(testSamples_ID)].copy()
    return testSamples_true_phenotype

In [4]:

# Load the true phenotype rows of the test-set samples via the Synapse client.
# truth_df is reused by every example cell below when merging with a submission.
truth_df = get_truePhenotype_testSamples_RA_challenge(syn)

Creating a binary class for the precision-recall calculation¶

1 = non-responder (Response is 'Non' or 'Supernon')
0 = responder (Response is 'Intermediate' or 'Good')

In [5]:

# Encode the Response categories as a binary class:
#   0 = responder ('Intermediate', 'Good'), 1 = non-responder ('Non', 'Supernon').
# Rows with any other Response value keep NaN.
# .loc is used instead of chained indexing (truth_df.true_class[mask] = ...):
# chained assignment writes through an intermediate object and is not
# guaranteed to update truth_df itself (it is a silent no-op under pandas
# copy-on-write).
truth_df['true_class'] = np.nan
truth_df.loc[truth_df['Response'].isin(['Intermediate', 'Good']), 'true_class'] = 0
truth_df.loc[truth_df['Response'].isin(['Non', 'Supernon']), 'true_class'] = 1

In [6]:

def __get_blockWise_stats(sub_stats):
    """
    Summarise every block of tied prediction values.

    Parameters
    ----------
    sub_stats : pandas.DataFrame
        Must contain the columns 'predict' and 'truth'. Blocks are numbered in
        the order in which each distinct 'predict' value first appears, so the
        caller is expected to have sorted the frame already.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'predict' value with the columns
        block (1-based), blockValue, block_numElements, block_truePos
        (rows with truth == 1), block_truePos_density
        (block_truePos / block_numElements), cum_numElements and cum_truePos
        (running totals over the blocks).
    """
    columns = ['block', 'blockValue', 'block_numElements',
               'block_truePos', 'block_truePos_density']

    # Build one record per block instead of pre-allocating a NaN frame and
    # filling it through chained assignment (result.col[idx] = ...), which is
    # not guaranteed to write into `result`.
    # Grouping on the column name (not a one-element list) keeps each group key
    # a plain scalar, which is stored as the block value.
    records = []
    grouped = sub_stats.groupby('predict', sort=False)
    for block, (blockValue, grp) in enumerate(grouped, 1):
        numElements = len(grp)
        truePositive = int((grp['truth'] == 1).sum())
        records.append((block,
                        blockValue,
                        float(numElements),
                        float(truePositive),
                        truePositive / float(numElements)))

    result = pandas.DataFrame(records, columns=columns)
    result['cum_numElements'] = result['block_numElements'].cumsum()
    result['cum_truePos'] = result['block_truePos'].cumsum()

    return result

In [7]:

def get_precision_recall_fpr(truth, pred):
    """
    Compute precision, recall and false-positive rate per observation, with
    tied prediction values interpolated inside their block.

    Observations are sorted by decreasing prediction, grouped into blocks of
    identical prediction value, and each block is expanded by
    __nonlinear_interpolated_evalStats.

    Parameters
    ----------
    truth : array-like
        True class per observation; a value of 1 counts as positive.
    pred : array-like
        Prediction score per observation (same length as `truth`).

    Returns
    -------
    (precision, recall, fpr, threshold) : tuple of numpy arrays
        precision, recall and fpr carry one extra leading point
        (recall 0 paired with the first precision / fpr value), so they are one
        element longer than threshold, which holds the sorted prediction values.

    Raises
    ------
    ValueError
        If there is no observation to score.
    """
    sub_stats = pandas.DataFrame.from_dict({'predict':pred, 'truth':truth}, dtype='float64')
    sub_stats = sub_stats.sort_values(by=['predict'],ascending=False)

    blockWise_stats = __get_blockWise_stats(sub_stats)

    # Walk the blocks explicitly rather than through groupby.apply: the helper
    # adds columns to each block and reads the grouping column, both of which
    # groupby.apply does not reliably support. Blocks are contiguous after the
    # sort, so concatenating them in first-appearance order keeps the sorted
    # row order.
    per_block = [__nonlinear_interpolated_evalStats(block_df, blockWise_stats)
                 for _, block_df in sub_stats.groupby('predict', sort=False)]
    if not per_block:
        raise ValueError("get_precision_recall_fpr needs at least one observation with a prediction")
    sub_stats = pandas.concat(per_block)

    precision = sub_stats.precision.values
    recall = sub_stats.recall.values
    fpr = sub_stats.fpr.values
    threshold = sub_stats.predict.values

    #YFG suggestion - for the case when Truth == Prediction
    # REF - https://github.com/Sage-Bionetworks/DARPA_Challenge/blob/master/challenge_config.py#L162
    #PR curve AUC (Fixes error when prediction == truth)
    ### Implementing the change using numpy style  // Abhishek Pratap - 08/31/2016
    recall_mod = np.insert(recall,0,0)  ## adding 0 at the beginning
    precision_mod = np.insert(precision,0,precision[0]) ## adding corresponding value at the beginning
    # NOTE(review): the ROC curve therefore starts at (fpr[0], 0) rather than
    # (0, 0) -- confirm this is intended before relying on the ROC AUC.
    fpr_mod = np.insert(fpr,0,fpr[0]) ## adding corresponding value at the beginning

    return(precision_mod, recall_mod, fpr_mod, threshold)


def __nonlinear_interpolated_evalStats(block_df, blockWise_stats):
    """
    Interpolate precision, recall and false-positive rate inside one block of
    tied prediction values.

    Every row of the block is treated as contributing the block's average
    true-positive density: at depth d (1-based position inside the block) the
    counts are
        truePos  = truePos seen in earlier blocks  + density * d
        falsePos = trueNeg seen in earlier blocks  + (1 - density) * d

    Parameters
    ----------
    block_df : pandas.DataFrame
        Rows sharing a single 'predict' value.
    blockWise_stats : pandas.DataFrame
        Per-block summary as produced by __get_blockWise_stats (columns block,
        blockValue, block_truePos_density, cum_numElements, cum_truePos).

    Returns
    -------
    pandas.DataFrame
        A copy of block_df with the added columns precision, recall, fpr,
        block_depth and block. The input frame is left untouched.

    Raises
    ------
    Exception
        If block_df holds more than one distinct 'predict' value, if that value
        does not match exactly one row of blockWise_stats, or if the matched
        block number is not >= 1.
    """
    blockValue = block_df.predict.unique()
    if len(blockValue) != 1:
        raise Exception("grouping by predict column doesnt yield unique predict vals per group..WIERD")
    blockValue = blockValue[0]

    matching = blockWise_stats[blockWise_stats.blockValue == blockValue]
    if len(matching) != 1:
        raise Exception("expected exactly one blockWise_stats row for predict value %r, found %d"
                        % (blockValue, len(matching)))
    blockStats = matching.iloc[0]  # single summary row as a Series

    total_elements = blockWise_stats.cum_numElements.max()
    total_truePos = blockWise_stats.cum_truePos.max()
    total_trueNeg = total_elements - total_truePos

    # Cumulative counts up to (and excluding) this block. They do not depend on
    # the row, so they are looked up once instead of once per row.
    if blockStats.block == 1: #no previous obviously
        cum_truePos_till_lastBlock = 0
        cum_numElements_till_lastBlock = 0
        cum_trueNeg_till_lastBlock = 0
    elif blockStats.block > 1:
        last_blockStats = blockWise_stats[blockWise_stats.block == (blockStats.block-1)].iloc[0]
        cum_truePos_till_lastBlock = last_blockStats['cum_truePos']
        cum_numElements_till_lastBlock = last_blockStats['cum_numElements']
        cum_trueNeg_till_lastBlock = cum_numElements_till_lastBlock - cum_truePos_till_lastBlock
    else:
        raise Exception("invalid block number %r for predict value %r" % (blockStats.block, blockValue))

    # 1-based position of each row inside the block
    block_depth = np.arange(1, block_df.shape[0] + 1)

    truePos = cum_truePos_till_lastBlock + (blockStats.block_truePos_density * block_depth)
    falsePos = cum_trueNeg_till_lastBlock + ((1 - blockStats.block_truePos_density) * block_depth)

    # Work on a copy so the caller's frame (typically a groupby group) is not mutated.
    result = block_df.copy()
    result['precision'] = truePos / np.asarray(cum_numElements_till_lastBlock + block_depth, dtype=float)
    #recall == true positive rate
    result['recall'] = truePos / float(total_truePos)
    #fpr == false positive rate
    result['fpr'] = falsePos / float(total_trueNeg)
    result['block_depth'] = block_depth
    result['block'] = blockStats.block
    return result

In [45]:

def __temp_plot(truth, pred, debug=False):
    """
    Compare scikit-learn's curves with the tie-aware curves of
    get_precision_recall_fpr for one set of predictions.

    Prints the precision-recall AUC of both variants, then draws a two-panel
    figure: precision-recall on top, ROC (with the TPR=FPR diagonal) below.

    Parameters
    ----------
    truth : array-like
        True binary class per observation.
    pred : array-like
        Prediction score per observation.
    debug : bool, optional
        When true, additionally print the block-wise summary and the
        per-observation interpolated statistics.
    """
    def _auc_reordered(x, y):
        # Same result as the former sklearn call auc(x, y, reorder=True), which
        # sorted the points by x and then by y before integrating. The
        # `reorder` argument no longer exists in scikit-learn, so the sort is
        # done here and the plain auc() is used.
        x = np.asarray(x)
        y = np.asarray(y)
        order = np.lexsort((y, x))  # primary key: x, secondary key: y
        return auc(x[order], y[order])

    fpr1, tpr1, threshold1 = roc_curve(truth, pred)
    precision1, recall1, thresholds = precision_recall_curve(truth, pred)
    pr_auc_linear = _auc_reordered(recall1, precision1)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print('linear interpolation (AUC: %0.3f)' % pr_auc_linear)

    precision2, recall2, fpr2, threshold2 = get_precision_recall_fpr(truth, pred)
    pr_auc_nonlinear = _auc_reordered(recall2, precision2)
    print('Non-linear interpolation (AUC: %0.3f)' % pr_auc_nonlinear)

    # No plt.clf() here: without a current figure it creates an extra, empty
    # figure that is then rendered alongside the real one.
    fig = plt.figure(figsize=(8,6))
    fig.subplots_adjust(hspace=.5)

    ax1 = fig.add_subplot(211)
    ax1.plot(recall1, precision1, label='linear (AUC: %0.3f)' % pr_auc_linear)
    ax1.plot(recall2, precision2, label='Non linear (AUC: %0.3f)' % pr_auc_nonlinear)
    ax1.set_xlabel('Recall')
    ax1.set_ylabel('Precision')
    ax1.set_ylim([0.0, 1.2])
    ax1.set_xlim([0.0, 1.2])
    ax1.set_title('Precision-Recall')
    ax1.legend(loc="upper right",fontsize='small' )

    ax2 = fig.add_subplot(212)
    ax2.plot(fpr1, tpr1, label='linear (AUC: %0.3f)' % _auc_reordered(fpr1, tpr1))
    ax2.plot(fpr2, recall2, label='Non linear (AUC: %0.3f)' % _auc_reordered(fpr2, recall2))
    ax2.plot(np.arange(0,1.1,.1), np.arange(0,1.1,.1),'--', label='TPR=FPR')
    ax2.set_xlabel('FPR')
    ax2.set_ylabel('TPR')
    ax2.set_ylim([0.0, 1.2])
    ax2.set_xlim([0.0, 1.2])
    ax2.set_title('ROC')
    ax2.legend(loc="lower right",fontsize='small' )
    plt.show()

    if debug:
        # Recompute the intermediate tables used by get_precision_recall_fpr
        # so they can be inspected.
        sub_stats = pandas.DataFrame.from_dict({'predict':pred, 'truth':truth}, dtype='float64')
        sub_stats = sub_stats.sort_values(by=['predict'],ascending=False)
        blockWise_stats = __get_blockWise_stats(sub_stats)

        print('-- Block wise true positive densities-----\n')
        print(blockWise_stats)

        print('\n\n --- Stats per observation---\n')
        sub_stats = pandas.concat(
            [__nonlinear_interpolated_evalStats(block_df, blockWise_stats)
             for _, block_df in sub_stats.groupby('predict', sort=False)])
        print(sub_stats)
        

Examples¶

Example 1 : single belief score¶

In [46]:

# Example 1: score submission 2363293 against the test-set truth.
sub = syn.getSubmission(2363293)
# DataFrame.from_csv was removed in pandas 1.0; read_csv(index_col=0,
# parse_dates=True) is its documented equivalent.
df = pandas.read_csv(sub.filePath, index_col=0, parse_dates=True).reset_index()
# Outer join on the sample ID: rows present on only one side are kept (with NaN).
df = df.merge(truth_df[['true_class','ID', 'Response']], on="ID", how="outer")
__temp_plot(df.true_class, df.belief_gen)

linear interpolation (AUC: 0.613)
Non-linear interpolation (AUC: 0.225)

<matplotlib.figure.Figure at 0x11738bb10>

Example 2 : binary predictions¶

In [17]:

# Example 2: score submission 2363303 (binary predictions) against the truth.
sub = syn.getSubmission(2363303)
# DataFrame.from_csv was removed in pandas 1.0; read_csv(index_col=0,
# parse_dates=True) is its documented equivalent.
df = pandas.read_csv(sub.filePath, index_col=0, parse_dates=True).reset_index()
# Outer join on the sample ID: rows present on only one side are kept (with NaN).
df = df.merge(truth_df[['true_class','ID', 'Response']], on="ID", how="outer")
__temp_plot(df.true_class, df.belief_gen)

linear interpolation (AUC: 0.457)
Non-linear interpolation (AUC: 0.275)

<matplotlib.figure.Figure at 0x117228f50>

Example 3 : less skewed scores¶

In [18]:

# Example 3: score submission 2364533 (less skewed scores) against the truth.
sub = syn.getSubmission(2364533)
# DataFrame.from_csv was removed in pandas 1.0; read_csv(index_col=0,
# parse_dates=True) is its documented equivalent.
df = pandas.read_csv(sub.filePath, index_col=0, parse_dates=True).reset_index()
# Outer join on the sample ID: rows present on only one side are kept (with NaN).
df = df.merge(truth_df[['true_class','ID', 'Response']], on="ID", how="outer")
__temp_plot(df.true_class, df.belief_gen)

linear interpolation (AUC: 0.261)
Non-linear interpolation (AUC: 0.258)

<matplotlib.figure.Figure at 0x117237a50>

Example 4 : 670 unique scores¶

In [19]:

# Example 4: score submission 2368533 (many distinct scores) against the truth.
sub = syn.getSubmission(2368533)
# DataFrame.from_csv was removed in pandas 1.0; read_csv(index_col=0,
# parse_dates=True) is its documented equivalent.
df = pandas.read_csv(sub.filePath, index_col=0, parse_dates=True).reset_index()
# Outer join on the sample ID: rows present on only one side are kept (with NaN).
df = df.merge(truth_df[['true_class','ID', 'Response']], on="ID", how="outer")
__temp_plot(df.true_class, df.belief_gen)

linear interpolation (AUC: 0.381)
Non-linear interpolation (AUC: 0.381)

<matplotlib.figure.Figure at 0x1021d2890>

Testing -- When truth == prediction (May 9, 2016)¶

Case reported by Tom

In [22]:

# Perfect-prediction case: 25 positives followed by 30 negatives, with the
# truth being the very same array as the prediction.
pred = np.repeat([1, 0], [25, 30])
truth = pred
__temp_plot(truth, pred)

linear interpolation (AUC: 1.000)
Non-linear interpolation (AUC: 0.989)

<matplotlib.figure.Figure at 0x11763b1d0>

8/9/2016 - Testing another case¶

Reported by Tom - possible bug

In [48]:

# Load the two submissions and the truth table for the reported case.
# DataFrame.from_csv was removed in pandas 1.0; read_csv(index_col=0,
# parse_dates=True) is its documented equivalent. Entities and data frames get
# separate names so no variable changes type half-way through the cell.
sub1_entity = syn.getSubmission(7180273)
sub1 = pandas.read_csv(sub1_entity.filePath, index_col=0, parse_dates=True).reset_index()
sub1['pred'] = sub1.SHEDDING_SC1

sub2_entity = syn.getSubmission(7154754)
sub2 = pandas.read_csv(sub2_entity.filePath, index_col=0, parse_dates=True).reset_index()
sub2['pred'] = sub2.SHEDDING_SC1

truth_entity = syn.get('syn5705153')
truth = pandas.read_csv(truth_entity.path, index_col=0, parse_dates=True).reset_index()
truth['true_class'] = truth.SHEDDING_SC1

In [49]:

# First submission vs. truth, joined on SUBJECTID (outer join keeps unmatched
# rows from both sides); debug=True also prints the intermediate tables.
data = sub1.merge(truth, on="SUBJECTID", how="outer")
__temp_plot(data.true_class, data.pred, debug=True)

linear interpolation (AUC: 0.873)
Non-linear interpolation (AUC: 0.852)

<matplotlib.figure.Figure at 0x117661b90>

-- Block wise true positive densities-----

   block  blockValue  block_numElements  block_truePos  block_truePos_density  \
0      1    1.000000                5.0            4.0               0.800000   
1      2    0.857143                1.0            1.0               1.000000   
2      3    0.750000                4.0            4.0               1.000000   
3      4    0.600000                2.0            2.0               1.000000   
4      5    0.571429                1.0            1.0               1.000000   
5      6    0.500000                4.0            3.0               0.750000   
6      7    0.375000                1.0            0.0               0.000000   
7      8    0.285714                1.0            0.0               0.000000   
8      9    0.250000                3.0            2.0               0.666667   
9     10    0.000000                1.0            0.0               0.000000   

   cum_numElements  cum_truePos  
0              5.0          4.0  
1              6.0          5.0  
2             10.0          9.0  
3             12.0         11.0  
4             13.0         12.0  
5             17.0         15.0  
6             18.0         15.0  
7             19.0         15.0  
8             22.0         17.0  
9             23.0         17.0  


 --- Stats per observation---

     predict  truth  precision    recall       fpr  block_depth  block
16  1.000000    1.0   0.800000  0.047059  0.033333            1    1.0
2   1.000000    1.0   0.800000  0.094118  0.066667            2    1.0
4   1.000000    1.0   0.800000  0.141176  0.100000            3    1.0
6   1.000000    1.0   0.800000  0.188235  0.133333            4    1.0
1   1.000000    0.0   0.800000  0.235294  0.166667            5    1.0
19  0.857143    1.0   0.833333  0.294118  0.166667            1    2.0
5   0.750000    1.0   0.857143  0.352941  0.166667            1    3.0
21  0.750000    1.0   0.875000  0.411765  0.166667            2    3.0
8   0.750000    1.0   0.888889  0.470588  0.166667            3    3.0
17  0.750000    1.0   0.900000  0.529412  0.166667            4    3.0
0   0.600000    1.0   0.909091  0.588235  0.166667            1    4.0
3   0.600000    1.0   0.916667  0.647059  0.166667            2    4.0
22  0.571429    1.0   0.923077  0.705882  0.166667            1    5.0
13  0.500000    1.0   0.910714  0.750000  0.208333            1    6.0
10  0.500000    0.0   0.900000  0.794118  0.250000            2    6.0
9   0.500000    1.0   0.890625  0.838235  0.291667            3    6.0
20  0.500000    1.0   0.882353  0.882353  0.333333            4    6.0
18  0.375000    0.0   0.833333  0.882353  0.500000            1    7.0
15  0.285714    0.0   0.789474  0.882353  0.666667            1    8.0
12  0.250000    0.0   0.783333  0.921569  0.722222            1    9.0
7   0.250000    1.0   0.777778  0.960784  0.777778            2    9.0
11  0.250000    1.0   0.772727  1.000000  0.833333            3    9.0
14  0.000000    0.0   0.739130  1.000000  1.000000            1   10.0

In [50]:

# Second submission vs. truth, joined on SUBJECTID (outer join keeps unmatched
# rows from both sides); debug=True also prints the intermediate tables.
data = sub2.merge(truth, on="SUBJECTID", how="outer")
__temp_plot(data.true_class, data.pred, debug=True)

linear interpolation (AUC: 0.887)
Non-linear interpolation (AUC: 0.867)

<matplotlib.figure.Figure at 0x1173c9310>

-- Block wise true positive densities-----

    block  blockValue  block_numElements  block_truePos  \
0       1    1.000000                6.0            5.0   
1       2    0.875000                1.0            1.0   
2       3    0.857143                1.0            1.0   
3       4    0.800000                1.0            1.0   
4       5    0.750000                2.0            2.0   
5       6    0.625000                1.0            1.0   
6       7    0.571429                1.0            1.0   
7       8    0.500000                3.0            3.0   
8       9    0.428571                1.0            0.0   
9      10    0.375000                1.0            0.0   
10     11    0.250000                2.0            1.0   
11     12    0.000000                3.0            1.0   

    block_truePos_density  cum_numElements  cum_truePos  
0                0.833333              6.0          5.0  
1                1.000000              7.0          6.0  
2                1.000000              8.0          7.0  
3                1.000000              9.0          8.0  
4                1.000000             11.0         10.0  
5                1.000000             12.0         11.0  
6                1.000000             13.0         12.0  
7                1.000000             16.0         15.0  
8                0.000000             17.0         15.0  
9                0.000000             18.0         15.0  
10               0.500000             20.0         16.0  
11               0.333333             23.0         17.0  


 --- Stats per observation---

     predict  truth  precision    recall       fpr  block_depth  block
0   1.000000    1.0   0.833333  0.049020  0.027778            1    1.0
16  1.000000    1.0   0.833333  0.098039  0.055556            2    1.0
2   1.000000    1.0   0.833333  0.147059  0.083333            3    1.0
4   1.000000    1.0   0.833333  0.196078  0.111111            4    1.0
6   1.000000    1.0   0.833333  0.245098  0.138889            5    1.0
1   1.000000    0.0   0.833333  0.294118  0.166667            6    1.0
17  0.875000    1.0   0.857143  0.352941  0.166667            1    2.0
19  0.857143    1.0   0.875000  0.411765  0.166667            1    3.0
3   0.800000    1.0   0.888889  0.470588  0.166667            1    4.0
5   0.750000    1.0   0.900000  0.529412  0.166667            1    5.0
9   0.750000    1.0   0.909091  0.588235  0.166667            2    5.0
21  0.625000    1.0   0.916667  0.647059  0.166667            1    6.0
22  0.571429    1.0   0.923077  0.705882  0.166667            1    7.0
13  0.500000    1.0   0.928571  0.764706  0.166667            1    8.0
8   0.500000    1.0   0.933333  0.823529  0.166667            2    8.0
20  0.500000    1.0   0.937500  0.882353  0.166667            3    8.0
15  0.428571    0.0   0.882353  0.882353  0.333333            1    9.0
18  0.375000    0.0   0.833333  0.882353  0.500000            1   10.0
12  0.250000    0.0   0.815789  0.911765  0.583333            1   11.0
7   0.250000    1.0   0.800000  0.941176  0.666667            2   11.0
14  0.000000    0.0   0.777778  0.960784  0.777778            1   12.0
10  0.000000    0.0   0.757576  0.980392  0.888889            2   12.0
11  0.000000    1.0   0.739130  1.000000  1.000000            3   12.0

In [ ]: