%matplotlib inline
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy.stats as sps
import seaborn as sns
from IPython.display import display
from textwrap import wrap

plt.rcParams['figure.figsize'] = (14.0, 6.0)


def expand_checkbox(checkbox_column, options):
    """
    checkbox_column = a series of lists of boxes checked in response to a question
    options = a list of the options available to check

    expand checkbox_column into a DataFrame of bools with
    index = respondents, columns = options
    """
    return pd.DataFrame({option: checkbox_column.apply(lambda x: option in x)
                         for option in options})


def tuple_normalize(counts, n_responses):
    """
    counts (tuple of ints): tuple of counts
    n_responses (int): total number of respondents

    convert a pair (or more) of counts to percentages
    """
    return tuple(float(x) / n_responses * 100 for x in counts)


def interval_to_error(confidence_interval, center):
    """
    confidence_interval (tuple): low, high relative to the origin
    center (int, float): measured value (e.g., mean)

    returns the ci as a tuple relative to the center (i.e., minus, plus the mean)
    """
    return tuple(abs(float(x) - center) for x in confidence_interval)


def split_interval(interval):
    """
    split a confidence interval tuple and return as a 2-element Series
    """
    return pd.Series(interval, index=['low', 'high'])


# set seaborn style
sns.set_style("white", {'font.sans-serif': ['Helvetica', 'Liberation Sans',
                                            'Bitstream Vera Sans', 'sans-serif'],
                        'axes.linewidth': 0,
                        'xtick.direction': 'in',
                        'xtick.major.size': 8.0})

# max characters per line in graph labels
LABEL_WIDTH = 25
# color of x-axis
AXIS_COLOR = '#808080'
# color of checkbox bar graph bars
BAR_COLOR = '#08519c'
# color of confidence intervals
INTERVAL_COLOR = '#949494'


def apply_cdl_style(fig, axis_color=AXIS_COLOR):
    fig.set_ylabel('')
    sns.despine(ax=fig, left=True)
    # get rid of weird dashed line
    fig.lines[0].set_visible(False)
    # set font sizes
    fig.tick_params(axis='x', width=2, labelsize=14, color=axis_color)
    fig.tick_params(axis='y', labelsize=16)
    return fig


def bootstrap_percentile_ci(data, n_samples=100000, alpha=0.05, stat_function=np.sum):
    """
    Calculates a confidence interval for True/False count data and returns a
    tuple (low, high). This is a straightforward percentile calculation.

    data (numpy array): of bools to resample
    n_samples (int): number of times to resample
    alpha (float): 1 - desired confidence interval (e.g., 0.05 for 95%)

    returns a tuple (low, high)
    """
    n_responses = len(data)
    # get n_samples resampled arrays (each of length n_responses) of valid indices into data
    indices = np.random.randint(0, n_responses, (n_samples, n_responses))
    # generate a sorted list of the desired stat in each resampled array
    stats = sorted(stat_function(data[x]) for x in indices)
    # return the stats at the alpha/2 and 1 - alpha/2 percentiles
    # (the 2.5th and 97.5th for a 95% interval)
    return (stats[int((alpha / 2.0) * n_samples)],
            stats[int((1 - alpha / 2.0) * n_samples)])


def bootstrap_basic_ci(data, n_samples=100000, alpha=0.05, stat_function=np.sum):
    """
    Calculates a confidence interval for True/False count data and returns a
    tuple (low, high). Calls bootstrap_percentile_ci and converts the result
    to a basic bootstrap interval.

    data (numpy array): of bools to resample
    n_samples (int): number of times to resample
    alpha (float): 1 - desired confidence interval (e.g., 0.05 for 95%)

    returns a tuple (low, high)
    """
    double_observed = 2 * stat_function(data)
    low, high = bootstrap_percentile_ci(data, n_samples=n_samples, alpha=alpha,
                                        stat_function=stat_function)
    # basic bootstrap: reflect the percentile interval around the observed value
    return (double_observed - high, double_observed - low)
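# A quick, illustrative sanity check of the helpers above on synthetic data
# (not part of the survey analysis; the `_demo` names are hypothetical).
# expand_checkbox turns raw checkbox responses into a bool DataFrame, and with
# p = 0.3 and n = 200 both bootstrap intervals should bracket a count near 60.
_demo_answers = ["Email / direct contact", "Database or repository"]
_demo_column = pd.Series(["Email / direct contact; Database or repository",
                          "Database or repository"])
display(expand_checkbox(_demo_column, _demo_answers))

_demo_bools = np.random.rand(200) < 0.3
print(bootstrap_percentile_ci(_demo_bools, n_samples=10000))
print(bootstrap_basic_ci(_demo_bools, n_samples=10000))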
def graph_checkbox(question, answers, bar_color=BAR_COLOR, interval_color=INTERVAL_COLOR):
    split_checkbox = responses[question].dropna()
    # checkbox_responses == DataFrame of bools where
    # index = individual respondents, columns = answer choices
    checkbox_responses = expand_checkbox(split_checkbox, answers)
    # sum checked boxes in each column;
    # response_counts == Series with values = sums, index = answer choices
    response_counts = checkbox_responses.sum()
    # resample and sum from each column to bootstrap a confidence interval;
    # count_confidence_intervals == Series with values = tuples (low, high),
    # index = answer choices
    count_confidence_intervals = checkbox_responses.apply(
        lambda x: bootstrap_basic_ci(np.array(x)))

    # normalize response_counts to percentage of total respondents to the question and sort
    response_counts = response_counts.apply(lambda x: float(x) / len(checkbox_responses) * 100)
    response_counts = response_counts.sort_values(ascending=True)

    # normalize confidence intervals to percentages and sort to match
    count_confidence_intervals = count_confidence_intervals.apply(
        tuple_normalize, args=(len(checkbox_responses),))
    count_confidence_intervals = count_confidence_intervals.reindex(index=response_counts.index)

    # convert absolute interval values to distance below and above the observed value
    for index in count_confidence_intervals.index.values:
        count_confidence_intervals.at[index] = interval_to_error(
            count_confidence_intervals.at[index], response_counts.at[index])
    # split interval tuples into a low/high DataFrame
    count_confidence_intervals = count_confidence_intervals.apply(split_interval)

    response_counts.index = ['\n'.join(wrap(i, LABEL_WIDTH)) for i in response_counts.index]
    fig = response_counts.plot(kind='barh', color=bar_color, edgecolor='w',
                               grid=False, xlim=(0, 100), fontsize=14)
    fig.errorbar(response_counts.to_numpy(), np.arange(len(response_counts)),
                 xerr=count_confidence_intervals.T.to_numpy(),
                 fmt='none', ecolor=interval_color, alpha=0.65,
                 elinewidth=2, capsize=12, capthick=2)
    apply_cdl_style(fig)
    fig.get_figure().set_size_inches(14., 2. * len(response_counts.index))
    return fig, response_counts
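# How the error bars above are assembled, shown on one made-up interval
# (hypothetical numbers): matplotlib's errorbar() wants the distances below
# and above each point, so an absolute interval (40, 52) around an observed
# value of 45 becomes (5, 7), then splits into a low/high Series.
print(interval_to_error((40, 52), 45))   # -> (5.0, 7.0)
print(split_interval((5.0, 7.0)))        # -> Series with index ['low', 'high']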
def graph_fisher_exact(question, answers, alpha=0.05, labels=None, edge_color=BAR_COLOR):
    checkbox_responses = expand_checkbox(question.dropna(), answers)
    graph = nx.Graph()
    graph.add_nodes_from(checkbox_responses.columns)
    pos = nx.circular_layout(graph, scale=10000)
    nx.draw_networkx_nodes(graph, pos, node_size=5000, node_color='#cdcdcd', linewidths=0)
    nx.draw_networkx_labels(graph, pos, labels=labels, font_size=10, font_family='serif')
    i = 0
    for a in checkbox_responses.columns:
        i += 1
        for b in checkbox_responses.columns[i:]:
            # build the 2x2 contingency table:
            # columns = whether a was checked, rows = whether b was checked
            square = pd.DataFrame(0, index=[True, False], columns=[True, False])
            square[True] = checkbox_responses[checkbox_responses[a]][b].value_counts()
            square[False] = checkbox_responses[~checkbox_responses[a]][b].value_counts()
            # a combination that never occurs is a count of zero
            square = square.fillna(0).astype(int)
            odds_ratio, p = sps.fisher_exact(square.to_numpy())
            e_color = edge_color
            if odds_ratio < 1:
                # negative association: draw in red with width = the inverted
                # odds ratio (the p-value is unchanged by inversion; guard
                # against a zero odds ratio)
                odds_ratio = 1.0 / odds_ratio if odds_ratio > 0 else 0.0
                e_color = 'r'
            how_significant = 0.5 if p > alpha else 1.0
            nx.draw_networkx_edges(graph, pos, edgelist=[(a, b)], width=odds_ratio,
                                   edge_color=e_color, alpha=how_significant)
    plt.axis('off')
    return graph


def graph_likert(questions, answers, filter_on_column=None, filter_value=None,
                 interval_color=INTERVAL_COLOR):
    if filter_on_column:
        responses_ft = responses[responses[filter_on_column] == filter_value]
    else:
        responses_ft = responses
    collected_counts = pd.DataFrame(index=answers)
    stats = pd.DataFrame(index=questions, columns=['mean', 'ci'])
    # set up dict for conversion from likert scale (e.g., 1-5) to 0-100%
    number_of_answers = len(answers)
    answer_to_value = dict(zip(answers,
                               np.arange(number_of_answers) / float(number_of_answers - 1) * 100))
    for column in questions:
        collected_counts[column] = responses_ft[column].value_counts().dropna()
        # scale responses (from the filtered set) to go from 0 to 100
        likert_values = responses_ft[column].dropna().map(answer_to_value)
        # calculate mean and 95% confidence interval
        stats.at[column, 'mean'] = likert_values.mean()
        stats.at[column, 'ci'] = bootstrap_basic_ci(np.array(likert_values),
                                                    stat_function=np.mean)

    # sort stats and collected_counts by the mean
    stats = stats.sort_values(by='mean', ascending=True)
    collected_counts = collected_counts.T.reindex(index=stats.index)
    collected_counts = collected_counts.div(collected_counts.sum(1).astype(float) / 100, axis=0)

    # convert absolute interval values to distance below and above the observed value
    for index in stats.index.values:
        stats.at[index, 'ci'] = interval_to_error(stats.at[index, 'ci'],
                                                  stats.at[index, 'mean'])
    # split interval tuples into a low/high DataFrame
    intervals = stats['ci'].apply(split_interval)

    collected_counts.index = ['\n'.join(wrap(i, LABEL_WIDTH)) for i in collected_counts.index]
    # plot percentages of each response
    fig = collected_counts.plot(kind='barh', stacked=True, grid=False,
                                color=sns.color_palette("Blues", len(collected_counts.columns)),
                                xlim=(0, 100), edgecolor='w', linewidth=2)
    # plot mean and 95% confidence interval
    fig.plot(stats['mean'].astype(float), np.arange(len(stats)), marker='o', color='w',
             markersize=25, markeredgewidth=0, linewidth=0)
    fig.errorbar(stats['mean'].astype(float).to_numpy(), np.arange(len(stats)),
                 xerr=intervals.T.to_numpy(), fmt='none', ecolor=interval_color,
                 alpha=0.65, elinewidth=2, capsize=12, capthick=2)
    fig.legend(bbox_to_anchor=(0., -0.02, 1., -0.03), loc='upper left',
               ncol=number_of_answers, mode="expand", borderaxespad=0., fontsize=14)
    apply_cdl_style(fig)
    fig.get_figure().set_size_inches(14., 2. * len(collected_counts.index))
    return fig
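# The Likert scaling used in graph_likert, spelled out for a 5-point scale
# (illustrative only): answers map onto 0-100 in equal steps, here 0, 25, 50, 75, 100.
_scale = ["No confidence", "Little confidence", "Some confidence",
          "High confidence", "Complete confidence"]
print(dict(zip(_scale, np.arange(len(_scale)) / float(len(_scale) - 1) * 100)))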
EXCLUDE = {'role': 'Librarian',
           'discipline': 'Information science',
           'highest_degree': 'Highschool',
           'generated_data': 'No'}

responses = pd.read_csv('DataPubSurvey_anon.csv')
for column, value in EXCLUDE.items():
    responses = responses[responses[column] != value]

DISCIPLINE_MAP = {'Anthropology': 'Social science',
                  'Archaeology': 'Archaeology',
                  'Area studies': 'Social science',
                  'Economics': 'Social science',
                  'Political science': 'Social science',
                  'Psychology': 'Social science',
                  'Sociology': 'Social science',
                  'Astronomy': 'Space science',
                  'Astrophysics': 'Space science',
                  'Environmental Science': 'Environmental science',
                  'Geology': 'Earth science',
                  'Oceanography': 'Environmental science',
                  'Planetary science': 'Earth science',
                  'Biochemistry': 'Biology',
                  'Bioinformatics': 'Biology',
                  'Biology': 'Biology',
                  'Evolutionary Biology': 'Biology',
                  'Neurobiology': 'Biology',
                  'Social science': 'Social science',
                  'Space science': 'Space science',
                  'Earth science': 'Earth science',
                  'Life science': 'Biology',
                  'Chemistry': 'Physical science',
                  'Physics': 'Physical science',
                  'Computer science': 'Computer science',
                  'Mathematics': 'Mathematics',
                  'Information science': 'Information science',
                  'Other': 'Other'}
responses.discipline = responses.discipline.map(DISCIPLINE_MAP)

DEMOGRAPHICS = ['discipline', 'highest_degree', 'role', 'institution']
for column in DEMOGRAPHICS:
    count = responses[column].value_counts()
    percentages = 100 * count.apply(lambda x: float(x) / count.sum())
    display(pd.DataFrame([count, percentages], index=['count', 'percent']).T)

AWARENESS_QUESTIONS = ['aware_ostp_policy', 'aware_nsf_dmp',
                       'aware_nih_data_sharing_policy']
AWARENESS_ANSWERS = ["Never heard of it", "Heard of it", "Read about it",
                     "Know all the details"]
graph_likert(AWARENESS_QUESTIONS, AWARENESS_ANSWERS,
             filter_on_column='united_states', filter_value=True)

SHARING_CHANNELS = ["Email / direct contact",
                    "Personal or lab website",
                    "Journal website (as supplemental material)",
                    "Database or repository"]
graph_checkbox('how_shared', SHARING_CHANNELS)
graph_checkbox('how_others_got', SHARING_CHANNELS)
graph_checkbox('how_you_got', SHARING_CHANNELS)

HOW_DOCUMENTED_ANSWERS = ["A traditional research paper based on the data (with analysis and conclusions)",
                          "A data paper describing the data (without analysis or conclusions)",
                          "Informal text describing the data",
                          "Formal metadata describing the data (e.g. as XML)",
                          "Computer code used to process or generate the data",
                          "Shared with no additional documentation"]
graph_checkbox('how_documented', HOW_DOCUMENTED_ANSWERS)

HOW_CREDITED_ANSWERS = ["Authorship on paper",
                        "Acknowledgement in the paper",
                        "Data cited in the reference list",
                        "Data cited informally in the text of the paper"]
graph_checkbox('data_sharing_credit', HOW_CREDITED_ANSWERS)
graph_checkbox('how_you_credited', HOW_CREDITED_ANSWERS)

DP_FEATURES = ["Openly available without contacting the author(s)",
               "Deposited in a database or repository",
               "Assigned a unique identifier such as a DOI",
               "A traditional research paper is based on the data",
               "A data paper (without conclusions) describes the data",
               "Packaged with a thorough description of the data",
               "Packaged with formal metadata describing the data (e.g. as XML)",
               "Dataset is \"peer reviewed\""]
fig = graph_checkbox('publish_definition', DP_FEATURES)
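# Why alpha is divided in the graph_fisher_exact calls below: every pair of
# answer options gets its own Fisher exact test, so the significance threshold
# is split across all pairs (a Bonferroni correction, inferred from the
# divisors used). 8 options give 8 * 7 / 2 = 28 tests; 6 options give 15.
from scipy.special import comb
print(comb(8, 2, exact=True), comb(6, 2, exact=True))  # -> 28 15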
PR_FEATURES = ["Collection and processing methods were evaluated",
               "Descriptive text is thorough enough to use or replicate the dataset",
               "Necessary metadata is standardized (e.g. in XML)",
               "Technical details have been checked (e.g. no missing files no missing values)",
               "Plausibility considered based on area expertise",
               "Novelty/impact considered"]
fig = graph_checkbox('peer_review_definition', PR_FEATURES)

dp_labels = {"Openly available without contacting the author(s)": "openly\navailable",
             "Deposited in a database or repository": "repository\ndeposit",
             "Assigned a unique identifier such as a DOI": "unique\nID",
             "A traditional research paper is based on the data": "traditional\npaper",
             "A data paper (without conclusions) describes the data": "data\npaper",
             "Packaged with a thorough description of the data": "thorough\nmetadata",
             "Packaged with formal metadata describing the data (e.g. as XML)": "formal\nmetadata",
             "Dataset is \"peer reviewed\"": "peer\nreview"}
graph_fisher_exact(responses.publish_definition, DP_FEATURES, labels=dp_labels,
                   alpha=0.05 / 28)

pr_labels = {"Collection and processing methods were evaluated": "methods\nappropriate",
             "Descriptive text is thorough enough to use or replicate the dataset": "thorough\nmetadata",
             "Necessary metadata is standardized (e.g. in XML)": "standard\nmetadata",
             "Technical details have been checked (e.g. no missing files no missing values)": "technical\ndetails",
             "Plausibility considered based on area expertise": "data\nplausible",
             "Novelty/impact considered": "novelty/\nimpact"}
graph_fisher_exact(responses.peer_review_definition, PR_FEATURES, labels=pr_labels,
                   alpha=0.05 / 15)

REVIEW_ACTIONS = ["reviewed a journal article",
                  "reviewed a grant proposal",
                  "reviewed an application to graduate school",
                  "reviewed a CV to hire someone for your lab",
                  "served on a hiring committee",
                  "served on a tenure & promotions committee"]
fig = graph_checkbox('researcher_review_experience', REVIEW_ACTIONS)

DATA_TRUST = ['traditional_paper_confidence', 'data_paper_confidence',
              'peer_review_confidence', 'reuse_confidence']
DATA_TRUST_SEQUENCE = ["No confidence", "Little confidence", "Some confidence",
                       "High confidence", "Complete confidence"]
graph_likert(DATA_TRUST, DATA_TRUST_SEQUENCE)

DATA_IMPACT = ['impact_citation', 'impact_downloads', 'impact_altmetrics',
               'impact_google_rank']
DATA_IMPACT_SEQUENCE = ["Not at all useful", "Slightly useful", "Somewhat useful",
                        "Highly useful", "Extremely useful"]
graph_likert(DATA_IMPACT, DATA_IMPACT_SEQUENCE)

PUBLICATION_VALUE = ["traditional_paper_value", "data_paper_pr_value",
                     "data_paper_npr_value", "dataset_pr_value", "dataset_npr_value"]
PUBLICATION_VALUE_SEQUENCE = ["None", "A small amount", "Some", "Significant",
                              "A great deal"]
graph_likert(PUBLICATION_VALUE, PUBLICATION_VALUE_SEQUENCE)
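# What the edges in graph_fisher_exact encode, on a made-up 2x2 table
# (hypothetical counts, illustrative only): scipy's fisher_exact returns
# (odds ratio, p-value), and an odds ratio above 1 means respondents who
# checked one option were more likely to also check the other.
print(sps.fisher_exact(np.array([[30, 10], [10, 30]])))  # odds ratio 9.0, small p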