"""Analyse the "guys" survey and plot how often each usage is rated gender neutral.

The script loads the survey answers from a CSV file, computes for every
question the share of respondents who answered 'gender neutral' (overall,
and split by the answer to 'Do you identify as a woman?'), saves one row of
pie charts per question under ``images/`` and finally draws horizontal bar
charts of the percentages and of their standard errors.
"""
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# The 'display.mpl_style' option only exists in older pandas releases; skip it
# where it has been removed (pandas' OptionError is a KeyError subclass).
try:
    pd.set_option('display.mpl_style', 'default')
except KeyError:
    pass

font = {'family' : 'sans',
        'weight' : 'normal',
        'size'   : 11}
matplotlib.rc('font', **font)
matplotlib.rc('xtick', labelsize=12)
matplotlib.rc('ytick', labelsize=12)

# Characters stripped from a question before it is used as a file name.
_UNWANTED_CHARS_RE = re.compile(r"[!.']")


def get_questions(df):
    """Return the question columns of *df*: every column except the last two.

    NOTE(review): presumably the last two columns hold non-question data
    (e.g. the gender answer) -- confirm against the CSV layout.
    """
    return df.columns[:-2]


def get_answer_ratios(df):
    """Return, per question, the fraction of answers equal to 'gender neutral'.

    The result is a Series indexed by question. Missing answers are ignored
    by ``value_counts`` and therefore do not count towards the denominator.
    """
    questions = get_questions(df)
    counts = pd.concat([df[col].value_counts() for col in questions], axis=1)
    counts.columns = questions
    # Rows become questions, columns become the distinct answers. An answer
    # that never occurs for a question shows up as NaN after the concat;
    # treat it as a count of zero so the ratio is 0 rather than NaN.
    counts = counts.T.fillna(0)
    # Force float so the division is a true division on Python 2 as well.
    totals = counts.sum(axis=1).astype(float)
    return counts['gender neutral'] / totals


def create_summary(everyone):
    """Split respondents by gender answer and summarise the ratios per group.

    Respondents answering 'Yes' to 'Do you identify as a woman?' form the
    'Women' group and those answering 'No' the 'Men + Other' group; any other
    answer is only counted in 'Everyone'. The group sizes are printed.

    Returns a tuple ``(men, women, summary)`` where *summary* is a DataFrame
    indexed by question with the columns 'Everyone', 'Men + Other', 'Women'.
    """
    is_woman = everyone['Do you identify as a woman?']
    women = everyone[is_woman == 'Yes']
    men = everyone[is_woman == 'No']
    # Single-argument print calls behave identically on Python 2 and 3.
    print("Number of women: %d" % len(women))
    print("Number of men: %d" % len(men))
    summary = pd.concat([get_answer_ratios(everyone),
                         get_answer_ratios(men),
                         get_answer_ratios(women)], axis=1)
    summary.columns = ['Everyone', 'Men + Other', 'Women']
    return men, women, summary


everyone = pd.read_csv('./guys-guys-guys-no-email.csv', parse_dates=True,
                       index_col='Timestamp')
men, women, summary = create_summary(everyone)


def draw_figure(summary, question):
    """Draw one pie chart per *summary* column for *question*.

    Each pie shows the 'gender neutral' share (white) against the remainder
    (light blue), labelled with rounded percentages. Returns the figure.
    """
    # Create exactly one figure of the wanted size; squeeze=False keeps
    # ``axes`` two-dimensional even when there is a single column.
    fig, axes = plt.subplots(1, len(summary.columns), figsize=(9, 3),
                             squeeze=False)
    for ax, name in zip(axes[0], summary.columns):
        ax.set_title(name)
        percent = summary.loc[question, name]
        percent_label = int(round(percent * 100))
        labels = [str(percent_label) + '%', str(100 - percent_label) + '%']
        ax.pie([percent, 1 - percent], colors=['white', 'lightblue'],
               labels=labels)
    return fig


def make_filename(question, extension="png"):
    """Build the image path for *question*.

    The question is lower-cased, spaces become dashes and the characters
    ``!``, ``.`` and ``'`` are dropped; the result is placed under
    ``images/`` with the given *extension*.
    """
    fname = question.lower().replace(' ', '-')
    fname = _UNWANTED_CHARS_RE.sub("", fname)
    return "images/" + fname + "." + extension


everyone['Do you identify as a woman?'].value_counts().plot(
    kind='bar', rot=0, title="Do you identify as a woman?")

for question in summary.index:
    print(question)
    fig = draw_figure(summary, question)
    fig.savefig(make_filename(question))

# NOTE(review): this repeats the earlier create_summary call (and its
# prints); presumably a leftover from a separate notebook cell -- confirm
# before removing.
men, women, summary = create_summary(everyone)

# Standard error of a binomial proportion, sqrt(p * (1 - p) / n), using the
# size of each group as n (aligned with the summary columns).
lengths = pd.Series([len(everyone), len(men), len(women)],
                    index=summary.columns)
print(lengths)
error = np.sqrt(summary * (1 - summary) / lengths)

# Scale everything up for graphing
error = error * 100
summary = summary * 100

font = {'family' : 'sans',
        'weight' : 'normal',
        'size'   : 25}
matplotlib.rc('font', **font)
matplotlib.rc('xtick', labelsize=30)


def plot_histogram(summmary):
    """Plot the 'Women' and 'Men + Other' columns as horizontal bars.

    The rows of *summmary* are relabelled, in order, with line-wrapped
    versions of the seven survey sentences, so it must have exactly seven
    rows. The input is copied and left untouched. Returns whatever
    ``DataFrame.plot`` returns.
    """
    labelled = summmary.copy()
    questions = ["Hey guys! I \njust saw that a \npenguin escaped.",
                 "I'm going out with \nthe guys. You can deal \nwith the penguins.",
                 "We're going to need to \nhire a Python guy to \ndeal with our \npenguin problems.",
                 "This would never have \nhappened if the \nJava guys were here.",
                 "That penguin is \nsuch a good guy",
                 "Those guys dealt \nwith the penguin \nemergency so \nprofessionally.",
                 "I met a great Erlang guy \nthe other day who \nknows how to deal with\nthorny penguin issues"]
    labelled.index = questions
    # Reverse the row order so the first question ends up at the top of the
    # horizontal bar chart.
    labelled = labelled.iloc[::-1]
    return labelled[['Women', 'Men + Other']].plot(
        kind='barh', figsize=(20, 20), rot=0,
        title="What percentage of people think this usage is gender-neutral",
        color=['#466fd5', '#ff7400'], xlim=(0, 100))


fig = plot_histogram(summary)
plot_histogram(error)