%matplotlib inline
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy.stats as sps
import seaborn as sns
from IPython.display import display
from textwrap import wrap

plt.rcParams['figure.figsize'] = (14.0, 6.0)


def expand_checkbox(checkbox_column, options):
    """
    checkbox_column = a series of lists of boxes checked in response to a question
    options = a list of the options available to check

    expand checkbox_column into a DataFrame of bools with
    index = respondents, columns = options
    """
    return pd.DataFrame({option: checkbox_column.apply(lambda x: option in x)
                         for option in options})


def tuple_normalize(counts, n_responses):
    """
    counts (tuple of ints): tuple of counts
    n_responses (int): total number of respondents

    convert a pair (or more) of counts to percentages
    """
    return tuple(float(x) / n_responses * 100 for x in counts)


def interval_to_error(confidence_interval, center):
    """
    confidence_interval (tuple): low, high relative to the origin
    center (int, float): measured value (e.g., mean)

    returns the ci as a tuple relative to the center (i.e., minus, plus the mean)
    """
    return tuple(abs(float(x) - center) for x in confidence_interval)


def split_interval(interval):
    """
    split a confidence interval tuple and return as a 2-element Series
    """
    return pd.Series(interval, index=['low', 'high'])


# set seaborn style
sns.set_style("white", {'font.sans-serif': ['Helvetica', 'Liberation Sans',
                                            'Bitstream Vera Sans', 'sans-serif'],
                        'axes.linewidth': 0,
                        'xtick.direction': 'in',
                        'xtick.major.size': 8.0})

# max characters per line in graph labels
LABEL_WIDTH = 25
# color of x-axis
AXIS_COLOR = '#808080'
# color of checkbox bar graph bars
BAR_COLOR = '#08519c'
# color of confidence intervals
INTERVAL_COLOR = '#949494'


def apply_cdl_style(fig, axis_color=AXIS_COLOR):
    fig.set_ylabel('')
    sns.despine(ax=fig, left=True)
    # get rid of weird dashed line
    fig.lines[0].set_visible(False)
    # set font sizes
    fig.tick_params(axis='x', width=2, labelsize=14, color=axis_color)
    fig.tick_params(axis='y', labelsize=16)
    return fig


def bootstrap_percentile_ci(data, n_samples=100000, alpha=0.05, stat_function=np.sum):
    """
    Calculates a confidence interval for True/False count data and returns a
    tuple (low, high). This is a straightforward percentile calculation.

    data (numpy array): of bools to resample
    n_samples (int): number of times to resample
    alpha (float): 1 - desired confidence interval (e.g., 0.05 for 95%)

    returns a tuple (low, high)
    """
    n_responses = len(data)
    # get n_samples resampled arrays (each of length n_responses) of valid indices into data
    indices = np.random.randint(0, n_responses, (n_samples, n_responses))
    # generate a sorted list of the desired stat in each resampled array
    stats = sorted(stat_function(data[x]) for x in indices)
    # return the stats at the alpha/2 and 1 - alpha/2 percentiles
    # (the 2.5th and 97.5th for a 95% interval)
    return (stats[int((alpha / 2.0) * n_samples)],
            stats[int((1 - alpha / 2.0) * n_samples)])


def bootstrap_basic_ci(data, n_samples=100000, alpha=0.05, stat_function=np.sum):
    """
    Calculates a confidence interval for True/False count data and returns a
    tuple (low, high). Calls bootstrap_percentile_ci and converts the result
    to a basic bootstrap interval.

    data (numpy array): of bools to resample
    n_samples (int): number of times to resample
    alpha (float): 1 - desired confidence interval (e.g., 0.05 for 95%)

    returns a tuple (low, high)
    """
    double_observed = 2 * stat_function(data)
    low, high = bootstrap_percentile_ci(data, n_samples=n_samples, alpha=alpha,
                                        stat_function=stat_function)
    # basic bootstrap: reflect the percentile interval around the observed value
    return (double_observed - high, double_observed - low)
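# A quick, illustrative sanity check of the helpers above on synthetic data
# (not part of the survey analysis; the `_demo` names are hypothetical).
# expand_checkbox turns raw checkbox responses into a bool DataFrame, and with
# p = 0.3 and n = 200 both bootstrap intervals should bracket a count near 60.
_demo_answers = ["Email / direct contact", "Database or repository"]
_demo_column = pd.Series(["Email / direct contact; Database or repository",
                          "Database or repository"])
display(expand_checkbox(_demo_column, _demo_answers))

_demo_bools = np.random.rand(200) < 0.3
print(bootstrap_percentile_ci(_demo_bools, n_samples=10000))
print(bootstrap_basic_ci(_demo_bools, n_samples=10000))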
def graph_checkbox(question, answers, bar_color=BAR_COLOR, interval_color=INTERVAL_COLOR):
    split_checkbox = responses[question].dropna()
    # checkbox_responses == DataFrame of bools where
    # index = individual respondents, columns = answer choices
    checkbox_responses = expand_checkbox(split_checkbox, answers)
    # sum checked boxes in each column;
    # response_counts == Series with values = sums, index = answer choices
    response_counts = checkbox_responses.sum()
    # resample and sum from each column to bootstrap a confidence interval;
    # count_confidence_intervals == Series with values = tuples (low, high),
    # index = answer choices
    count_confidence_intervals = checkbox_responses.apply(
        lambda x: bootstrap_basic_ci(np.array(x)))

    # normalize response_counts to percentage of total respondents to the question and sort
    response_counts = response_counts.apply(lambda x: float(x) / len(checkbox_responses) * 100)
    response_counts = response_counts.sort_values(ascending=True)

    # normalize confidence intervals to percentages and sort to match
    count_confidence_intervals = count_confidence_intervals.apply(
        tuple_normalize, args=(len(checkbox_responses),))
    count_confidence_intervals = count_confidence_intervals.reindex(index=response_counts.index)

    # convert absolute interval values to distance below and above the observed value
    for index in count_confidence_intervals.index.values:
        count_confidence_intervals.at[index] = interval_to_error(
            count_confidence_intervals.at[index], response_counts.at[index])
    # split interval tuples into a low/high DataFrame
    count_confidence_intervals = count_confidence_intervals.apply(split_interval)

    response_counts.index = ['\n'.join(wrap(i, LABEL_WIDTH)) for i in response_counts.index]
    fig = response_counts.plot(kind='barh', color=bar_color, edgecolor='w',
                               grid=False, xlim=(0, 100), fontsize=14)
    fig.errorbar(response_counts.to_numpy(), np.arange(len(response_counts)),
                 xerr=count_confidence_intervals.T.to_numpy(),
                 fmt='none', ecolor=interval_color, alpha=0.65,
                 elinewidth=2, capsize=12, capthick=2)
    apply_cdl_style(fig)
    fig.get_figure().set_size_inches(14., 2. * len(response_counts.index))
    return fig, response_counts
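# How the error bars above are assembled, shown on one made-up interval
# (hypothetical numbers): matplotlib's errorbar() wants the distances below
# and above each point, so an absolute interval (40, 52) around an observed
# value of 45 becomes (5, 7), then splits into a low/high Series.
print(interval_to_error((40, 52), 45))   # -> (5.0, 7.0)
print(split_interval((5.0, 7.0)))        # -> Series with index ['low', 'high']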
def graph_fisher_exact(question, answers, alpha=0.05, labels=None, edge_color=BAR_COLOR):
    checkbox_responses = expand_checkbox(question.dropna(), answers)
    graph = nx.Graph()
    graph.add_nodes_from(checkbox_responses.columns)
    pos = nx.circular_layout(graph, scale=10000)
    nx.draw_networkx_nodes(graph, pos, node_size=5000, node_color='#cdcdcd', linewidths=0)
    nx.draw_networkx_labels(graph, pos, labels=labels, font_size=10, font_family='serif')
    i = 0
    for a in checkbox_responses.columns:
        i += 1
        for b in checkbox_responses.columns[i:]:
            # build the 2x2 contingency table:
            # columns = whether a was checked, rows = whether b was checked
            square = pd.DataFrame(0, index=[True, False], columns=[True, False])
            square[True] = checkbox_responses[checkbox_responses[a]][b].value_counts()
            square[False] = checkbox_responses[~checkbox_responses[a]][b].value_counts()
            # a combination that never occurs is a count of zero
            square = square.fillna(0).astype(int)
            odds_ratio, p = sps.fisher_exact(square.to_numpy())
            e_color = edge_color
            if odds_ratio < 1:
                # negative association: draw in red with width = the inverted
                # odds ratio (the p-value is unchanged by inversion; guard
                # against a zero odds ratio)
                odds_ratio = 1.0 / odds_ratio if odds_ratio > 0 else 0.0
                e_color = 'r'
            how_significant = 0.5 if p > alpha else 1.0
            nx.draw_networkx_edges(graph, pos, edgelist=[(a, b)], width=odds_ratio,
                                   edge_color=e_color, alpha=how_significant)
    plt.axis('off')
    return graph


def graph_likert(questions, answers, filter_on_column=None, filter_value=None,
                 interval_color=INTERVAL_COLOR):
    if filter_on_column:
        responses_ft = responses[responses[filter_on_column] == filter_value]
    else:
        responses_ft = responses
    collected_counts = pd.DataFrame(index=answers)
    stats = pd.DataFrame(index=questions, columns=['mean', 'ci'])
    # set up dict for conversion from likert scale (e.g., 1-5) to 0-100%
    number_of_answers = len(answers)
    answer_to_value = dict(zip(answers,
                               np.arange(number_of_answers) / float(number_of_answers - 1) * 100))
    for column in questions:
        collected_counts[column] = responses_ft[column].value_counts().dropna()
        # scale responses (from the filtered set) to go from 0 to 100
        likert_values = responses_ft[column].dropna().map(answer_to_value)
        # calculate mean and 95% confidence interval
        stats.at[column, 'mean'] = likert_values.mean()
        stats.at[column, 'ci'] = bootstrap_basic_ci(np.array(likert_values),
                                                    stat_function=np.mean)

    # sort stats and collected_counts by the mean
    stats = stats.sort_values(by='mean', ascending=True)
    collected_counts = collected_counts.T.reindex(index=stats.index)
    collected_counts = collected_counts.div(collected_counts.sum(1).astype(float) / 100, axis=0)

    # convert absolute interval values to distance below and above the observed value
    for index in stats.index.values:
        stats.at[index, 'ci'] = interval_to_error(stats.at[index, 'ci'],
                                                  stats.at[index, 'mean'])
    # split interval tuples into a low/high DataFrame
    intervals = stats['ci'].apply(split_interval)

    collected_counts.index = ['\n'.join(wrap(i, LABEL_WIDTH)) for i in collected_counts.index]
    # plot percentages of each response
    fig = collected_counts.plot(kind='barh', stacked=True, grid=False,
                                color=sns.color_palette("Blues", len(collected_counts.columns)),
                                xlim=(0, 100), edgecolor='w', linewidth=2)
    # plot mean and 95% confidence interval
    fig.plot(stats['mean'].astype(float), np.arange(len(stats)), marker='o', color='w',
             markersize=25, markeredgewidth=0, linewidth=0)
    fig.errorbar(stats['mean'].astype(float).to_numpy(), np.arange(len(stats)),
                 xerr=intervals.T.to_numpy(), fmt='none', ecolor=interval_color,
                 alpha=0.65, elinewidth=2, capsize=12, capthick=2)
    fig.legend(bbox_to_anchor=(0., -0.02, 1., -0.03), loc='upper left',
               ncol=number_of_answers, mode="expand", borderaxespad=0., fontsize=14)
    apply_cdl_style(fig)
    fig.get_figure().set_size_inches(14., 2. * len(collected_counts.index))
    return fig
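# The Likert scaling used in graph_likert, spelled out for a 5-point scale
# (illustrative only): answers map onto 0-100 in equal steps, here 0, 25, 50, 75, 100.
_scale = ["No confidence", "Little confidence", "Some confidence",
          "High confidence", "Complete confidence"]
print(dict(zip(_scale, np.arange(len(_scale)) / float(len(_scale) - 1) * 100)))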
EXCLUDE = {'role': 'Librarian',
           'discipline': 'Information science',
           'highest_degree': 'Highschool',
           'generated_data': 'No'}

responses = pd.read_csv('DataPubSurvey_anon.csv')
for column, value in EXCLUDE.items():
    responses = responses[responses[column] != value]

DISCIPLINE_MAP = {'Anthropology': 'Social science',
                  'Archaeology': 'Archaeology',
                  'Area studies': 'Social science',
                  'Economics': 'Social science',
                  'Political science': 'Social science',
                  'Psychology': 'Social science',
                  'Sociology': 'Social science',
                  'Astronomy': 'Space science',
                  'Astrophysics': 'Space science',
                  'Environmental Science': 'Environmental science',
                  'Geology': 'Earth science',
                  'Oceanography': 'Environmental science',
                  'Planetary science': 'Earth science',
                  'Biochemistry': 'Biology',
                  'Bioinformatics': 'Biology',
                  'Biology': 'Biology',
                  'Evolutionary Biology': 'Biology',
                  'Neurobiology': 'Biology',
                  'Social science': 'Social science',
                  'Space science': 'Space science',
                  'Earth science': 'Earth science',
                  'Life science': 'Biology',
                  'Chemistry': 'Physical science',
                  'Physics': 'Physical science',
                  'Computer science': 'Computer science',
                  'Mathematics': 'Mathematics',
                  'Information science': 'Information science',
                  'Other': 'Other'}
responses.discipline = responses.discipline.map(DISCIPLINE_MAP)

DEMOGRAPHICS = ['discipline', 'highest_degree', 'role', 'institution']
for column in DEMOGRAPHICS:
    count = responses[column].value_counts()
    percentages = 100 * count.apply(lambda x: float(x) / count.sum())
    display(pd.DataFrame([count, percentages], index=['count', 'percent']).T)

AWARENESS_QUESTIONS = ['aware_ostp_policy', 'aware_nsf_dmp',
                       'aware_nih_data_sharing_policy']
AWARENESS_ANSWERS = ["Never heard of it", "Heard of it", "Read about it",
                     "Know all the details"]
graph_likert(AWARENESS_QUESTIONS, AWARENESS_ANSWERS,
             filter_on_column='united_states', filter_value=True)

SHARING_CHANNELS = ["Email / direct contact",
                    "Personal or lab website",
                    "Journal website (as supplemental material)",
                    "Database or repository"]
graph_checkbox('how_shared', SHARING_CHANNELS)
graph_checkbox('how_others_got', SHARING_CHANNELS)
graph_checkbox('how_you_got', SHARING_CHANNELS)

HOW_DOCUMENTED_ANSWERS = ["A traditional research paper based on the data (with analysis and conclusions)",
                          "A data paper describing the data (without analysis or conclusions)",
                          "Informal text describing the data",
                          "Formal metadata describing the data (e.g. as XML)",
                          "Computer code used to process or generate the data",
                          "Shared with no additional documentation"]
graph_checkbox('how_documented', HOW_DOCUMENTED_ANSWERS)

HOW_CREDITED_ANSWERS = ["Authorship on paper",
                        "Acknowledgement in the paper",
                        "Data cited in the reference list",
                        "Data cited informally in the text of the paper"]
graph_checkbox('data_sharing_credit', HOW_CREDITED_ANSWERS)
graph_checkbox('how_you_credited', HOW_CREDITED_ANSWERS)

DP_FEATURES = ["Openly available without contacting the author(s)",
               "Deposited in a database or repository",
               "Assigned a unique identifier such as a DOI",
               "A traditional research paper is based on the data",
               "A data paper (without conclusions) describes the data",
               "Packaged with a thorough description of the data",
               "Packaged with formal metadata describing the data (e.g. as XML)",
               "Dataset is \"peer reviewed\""]
fig = graph_checkbox('publish_definition', DP_FEATURES)
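# Why alpha is divided in the graph_fisher_exact calls below: every pair of
# answer options gets its own Fisher exact test, so the significance threshold
# is split across all pairs (a Bonferroni correction, inferred from the
# divisors used). 8 options give 8 * 7 / 2 = 28 tests; 6 options give 15.
from scipy.special import comb
print(comb(8, 2, exact=True), comb(6, 2, exact=True))  # -> 28 15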
PR_FEATURES = ["Collection and processing methods were evaluated",
               "Descriptive text is thorough enough to use or replicate the dataset",
               "Necessary metadata is standardized (e.g. in XML)",
               "Technical details have been checked (e.g. no missing files no missing values)",
               "Plausibility considered based on area expertise",
               "Novelty/impact considered"]
fig = graph_checkbox('peer_review_definition', PR_FEATURES)

dp_labels = {"Openly available without contacting the author(s)": "openly\navailable",
             "Deposited in a database or repository": "repository\ndeposit",
             "Assigned a unique identifier such as a DOI": "unique\nID",
             "A traditional research paper is based on the data": "traditional\npaper",
             "A data paper (without conclusions) describes the data": "data\npaper",
             "Packaged with a thorough description of the data": "thorough\nmetadata",
             "Packaged with formal metadata describing the data (e.g. as XML)": "formal\nmetadata",
             "Dataset is \"peer reviewed\"": "peer\nreview"}
graph_fisher_exact(responses.publish_definition, DP_FEATURES, labels=dp_labels,
                   alpha=0.05 / 28)

pr_labels = {"Collection and processing methods were evaluated": "methods\nappropriate",
             "Descriptive text is thorough enough to use or replicate the dataset": "thorough\nmetadata",
             "Necessary metadata is standardized (e.g. in XML)": "standard\nmetadata",
             "Technical details have been checked (e.g. no missing files no missing values)": "technical\ndetails",
             "Plausibility considered based on area expertise": "data\nplausible",
             "Novelty/impact considered": "novelty/\nimpact"}
graph_fisher_exact(responses.peer_review_definition, PR_FEATURES, labels=pr_labels,
                   alpha=0.05 / 15)

REVIEW_ACTIONS = ["reviewed a journal article",
                  "reviewed a grant proposal",
                  "reviewed an application to graduate school",
                  "reviewed a CV to hire someone for your lab",
                  "served on a hiring committee",
                  "served on a tenure & promotions committee"]
fig = graph_checkbox('researcher_review_experience', REVIEW_ACTIONS)

DATA_TRUST = ['traditional_paper_confidence', 'data_paper_confidence',
              'peer_review_confidence', 'reuse_confidence']
DATA_TRUST_SEQUENCE = ["No confidence", "Little confidence", "Some confidence",
                       "High confidence", "Complete confidence"]
graph_likert(DATA_TRUST, DATA_TRUST_SEQUENCE)

DATA_IMPACT = ['impact_citation', 'impact_downloads', 'impact_altmetrics',
               'impact_google_rank']
DATA_IMPACT_SEQUENCE = ["Not at all useful", "Slightly useful", "Somewhat useful",
                        "Highly useful", "Extremely useful"]
graph_likert(DATA_IMPACT, DATA_IMPACT_SEQUENCE)

PUBLICATION_VALUE = ["traditional_paper_value", "data_paper_pr_value",
                     "data_paper_npr_value", "dataset_pr_value", "dataset_npr_value"]
PUBLICATION_VALUE_SEQUENCE = ["None", "A small amount", "Some", "Significant",
                              "A great deal"]
graph_likert(PUBLICATION_VALUE, PUBLICATION_VALUE_SEQUENCE)
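# What the edges in graph_fisher_exact encode, on a made-up 2x2 table
# (hypothetical counts, illustrative only): scipy's fisher_exact returns
# (odds ratio, p-value), and an odds ratio above 1 means respondents who
# checked one option were more likely to also check the other.
print(sps.fisher_exact(np.array([[30, 10], [10, 30]])))  # odds ratio 9.0, small p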