import pandas as pd
import re
# Notebook-wide plotting defaults: pandas' matplotlib style, the base font,
# and the tick-label size on both axes.
# NOTE(review): `matplotlib` is not imported in the visible part of this file;
# presumably the notebook's pylab namespace supplies it -- confirm.
pd.set_option('display.mpl_style', 'default')
font = dict(family='sans', weight='normal', size=11)
matplotlib.rc('font', **font)
matplotlib.rc('xtick', labelsize=12)
matplotlib.rc('ytick', labelsize=12)
def get_questions(df, n_trailing=2):
    """Return the labels of the question columns of `df`.

    Every column except the last `n_trailing` ones is treated as a question.

    Args:
        df: DataFrame whose leading columns are questions and whose trailing
            columns are not.
        n_trailing: Number of trailing non-question columns to leave out.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        The column index restricted to the question columns, in their
        original order (empty when `df` has `n_trailing` columns or fewer).

    Raises:
        ValueError: If `n_trailing` is negative.
    """
    if n_trailing < 0:
        raise ValueError("n_trailing must be non-negative")
    # Slice up to an explicit end position: `[:-n_trailing]` would return
    # nothing at all when n_trailing is 0.
    n_questions = max(len(df.columns) - n_trailing, 0)
    return df.columns[:n_questions]
def get_answer_ratios(df):
    """Return, per question, the share of answers equal to 'gender neutral'.

    The denominator is the number of non-missing answers to that question.

    Args:
        df: DataFrame of responses; its question columns are selected with
            `get_questions`.

    Returns:
        A float Series named 'gender neutral', indexed by question label.
        A question that received answers but no 'gender neutral' one maps
        to 0.0 (this used to be NaN, or a KeyError when no question had
        such an answer). A question with no answers at all maps to NaN.
    """
    answers = df[get_questions(df)]
    # Count matches directly instead of looking the label up in a
    # value_counts table, so a label that never occurs simply counts as 0.
    neutral_counts = (answers == 'gender neutral').sum()
    # Float denominator keeps this a true division under Python 2 as well.
    answered_counts = answers.count().astype(float)
    ratios = neutral_counts / answered_counts
    ratios.name = 'gender neutral'
    return ratios
def create_summary(everyone):
    """Split respondents by the gender question and tabulate answer ratios.

    Prints the size of the two subsets, then builds a table with one row per
    question and one column per group, filled by `get_answer_ratios`.

    Args:
        everyone: DataFrame of all responses; must contain the column
            'Do you identify as a woman?'.

    Returns:
        A tuple `(men, women, summary)`: the rows answering 'No', the rows
        answering 'Yes', and the ratio table with columns
        'Everyone', 'Men + Other' and 'Women'.
    """
    gender_answer = everyone['Do you identify as a woman?']
    women = everyone[gender_answer == 'Yes']
    # NOTE(review): only rows answering exactly 'No' are kept here, although
    # the summary column is labelled 'Men + Other' -- confirm that no other
    # answer values occur in the data.
    men = everyone[gender_answer == 'No']
    # Single-argument print calls produce the same text on Python 2 and 3.
    print("Number of women: %d" % len(women))
    print("Number of men: %d" % len(men))
    groups = (everyone, men, women)
    summary = pd.concat([get_answer_ratios(group) for group in groups], axis=1)
    summary.columns = ['Everyone', 'Men + Other', 'Women']
    return men, women, summary
# Load the survey responses with the 'Timestamp' column as a parsed date
# index, then split them by the gender answer and build the per-question
# ratio table.
everyone = pd.read_csv('./guys-guys-guys-no-email.csv', parse_dates=True, index_col='Timestamp')
men, women, summary = create_summary(everyone)
# Output:
#   Number of women: 651
#   Number of men: 1672
def draw_figure(summary, question):
    """Draw one pie chart per column of `summary` for a single question.

    Each pie shows the ratio stored at `summary.loc[question, column]` as a
    white wedge and its complement as a light-blue wedge; both wedges are
    labelled with whole-number percentages that add up to 100.

    Args:
        summary: DataFrame of ratios in [0, 1], indexed by question, with
            one column per respondent group.
        question: Index label of the row to draw.

    Returns:
        The figure that holds the pies, one subplot per column.
    """
    n_groups = len(summary.columns)
    # Create the figure and its axes in a single call. The separate
    # `figure()` call used previously opened an additional, empty figure on
    # every invocation, and `figsize()` is an IPython-only helper.
    # NOTE(review): `subplots` is expected to come from the notebook's pylab
    # namespace, as before -- confirm.
    # squeeze=False keeps `axes` two-dimensional even for a single column.
    fig, axes = subplots(1, n_groups, figsize=(3 * n_groups, 3), squeeze=False)
    for ax, name in zip(axes[0], summary.columns):
        ax.set_title(name)
        percent = summary.loc[question, name]
        percent_label = int(round(percent * 100))
        # Derive the second label from the rounded first one so that the
        # two displayed numbers always sum to 100.
        labels = [str(percent_label) + '%', str(100 - percent_label) + '%']
        ax.pie([percent, 1 - percent], colors=['white', 'lightblue'], labels=labels)
    return fig
# Characters dropped from a question when turning it into a file name.
# Compiled once here rather than on every call.
_UNWANTED_FILENAME_CHARS = re.compile(r"[!.']")


def make_filename(question, extension="png", directory="images"):
    """Build the image path for a question.

    The question is lower-cased, spaces become hyphens, and the characters
    `!`, `.` and `'` are removed.

    Args:
        question: Question text to derive the file name from.
        extension: File extension, without the leading dot.
        directory: Directory prefix of the returned path. Defaults to
            "images", the value that used to be hard-coded.

    Returns:
        The path as "<directory>/<slug>.<extension>".
    """
    slug = question.lower().replace(' ', '-')
    slug = _UNWANTED_FILENAME_CHARS.sub("", slug)
    # Plain "/" concatenation keeps the result identical to the previous
    # output on every platform.
    return directory + "/" + slug + "." + extension
# Bar chart of how many respondents gave each answer to the gender question.
everyone['Do you identify as a woman?'].value_counts().plot(kind='bar',
rot=0,
title="Do you identify as a woman?")
# Output: <matplotlib.axes.AxesSubplot at 0x3e71a50>
# Draw and save one figure per question, echoing each question as it is
# processed.
# NOTE(review): the target directory in the path returned by make_filename
# is presumably expected to exist already -- confirm.
for question in summary.index:
    # Single-argument print call: same output on Python 2 and Python 3.
    print(question)
    fig = draw_figure(summary, question)
    fig.savefig(make_filename(question))
# Output:
#   Hey guys! I just saw that a penguin escaped.
#   I'm going out with the guys. You can deal with the penguins.
#   We're going to need to hire a Python guy to deal with our penguin problems.
#   This would never have happened if the Java guys were here.
#   That penguin is such a good guy
#   Those guys dealt with the penguin emergency so professionally.
#   I met a great Erlang guy the other day who knows how to deal with thorny penguin issues
#   <matplotlib.figure.Figure at 0x3ef2110>
#   <matplotlib.figure.Figure at 0x3e71310>
#   <matplotlib.figure.Figure at 0x443d850>
#   <matplotlib.figure.Figure at 0x45fc290>
#   <matplotlib.figure.Figure at 0x4991fd0>
#   <matplotlib.figure.Figure at 0x4bd4290>
#   <matplotlib.figure.Figure at 0x4f73d50>
# Rebuild the men/women subsets and the ratio table from `everyone`
# (the same call as made right after loading the data).
men, women, summary = create_summary(everyone)
# Output:
#   Number of women: 651
#   Number of men: 1672
# Standard error of each ratio under a binomial model: sqrt(p * (1 - p) / n),
# where n is the number of respondents in the group the column describes.
# (This is the standard error only; no confidence interval is derived here.)
lengths = pd.Series([len(everyone), len(men), len(women)], index=summary.columns)
# Single-argument print call: same output on Python 2 and Python 3.
print(lengths)
# `lengths` is indexed by the column labels of `summary`, so the division
# lines each group's ratios up with that group's size.
# NOTE(review): `np` is not imported in the visible part of this file;
# presumably the notebook's pylab namespace supplies it -- confirm.
error = np.sqrt(summary * (1 - summary) / lengths)
# Output:
#   Everyone       2323
#   Men + Other    1672
#   Women           651
#   dtype: int64
# Express the ratios and their errors as percentages (0-100) for graphing.
error, summary = error * 100, summary * 100
# Larger font and x tick labels than the earlier defaults.
font = dict(family='sans', weight='normal', size=25)
matplotlib.rc('font', **font)
matplotlib.rc('xtick', labelsize=30)
def plot_histogram(summmary, title="What percentage of people think this usage is gender-neutral"):
    """Plot the 'Women' and 'Men + Other' columns as horizontal bars.

    The rows are relabelled with hand-wrapped versions of the seven survey
    questions and drawn in reverse order, on an x axis fixed to 0-100.

    Args:
        summmary: DataFrame with exactly seven rows, in the order of the
            labels listed below, and with 'Women' and 'Men + Other'
            columns. It is copied, not modified.
        title: Chart title. Defaults to the title that used to be
            hard-coded.

    Returns:
        Whatever the pandas `.plot()` call returns (the matplotlib axes).

    Raises:
        ValueError: If the number of rows does not match the seven labels
            (raised by pandas when the index is assigned).
    """
    questions = ["Hey guys! I \njust saw that a \npenguin escaped.",
                 "I'm going out with \nthe guys. You can deal \nwith the penguins.",
                 "We're going to need to \nhire a Python guy to \ndeal with our \npenguin problems.",
                 "This would never have \nhappened if the \nJava guys were here.",
                 "That penguin is \nsuch a good guy",
                 "Those guys dealt \nwith the penguin \nemergency so \nprofessionally.",
                 "I met a great Erlang guy \nthe other day who \nknows how to deal with\nthorny penguin issues"]
    # Work on a copy so the caller's frame keeps its original index.
    summary = summmary.copy()
    summary.index = questions
    # Reverse the row order -- presumably so the first question ends up at
    # the top of the horizontal bar chart. `.loc` with a real list replaces
    # the removed `.ix` indexer and the one-shot `reversed` iterator.
    summary = summary.loc[questions[::-1]]
    # `color` is the keyword pandas plotting accepts; `colors` is not.
    return summary[['Women', 'Men + Other']].plot(kind='barh',
                                                  figsize=(20, 20),
                                                  rot=0,
                                                  title=title,
                                                  color=['#466fd5', '#ff7400'],
                                                  xlim=(0, 100))
# Chart the per-group percentages.
# NOTE(review): plot_histogram returns the result of a pandas `.plot()` call,
# so `fig` presumably holds matplotlib axes rather than a Figure -- confirm
# before calling Figure methods on it.
fig = plot_histogram(summary)
# Not sure how to plot them on the same graph yet, though.
# Chart the error values with the same layout as the percentages.
# NOTE(review): plot_histogram titles the chart "What percentage of people
# think this usage is gender-neutral", which describes the percentages
# rather than the errors drawn here.
plot_histogram(error)
# Output: <matplotlib.axes.AxesSubplot at 0x4371f10>