import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load seaborn's bundled "tips" example dataset into a DataFrame.
tips = sns.load_dataset("tips")
# Preview the first rows to check the columns before aggregating.
tips.head()
| | total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
# Count the rows for every (sex, time) combination. Naming the count column
# while resetting the index avoids the intermediate column labelled 0.
tips_counts = tips.groupby(['sex', 'time']).size().reset_index(name='n')
tips_counts.head()
# Share of each sex's rows that falls into each time slot, stored as a
# fraction (not multiplied by 100). transform('sum') broadcasts the per-sex
# total back onto the original rows, so the division is aligned row by row
# with tips_counts' own index; the previous groupby(...).apply(lambda ...)
# form returned a result whose shape depended on the groupby options.
tips_counts['percentage'] = tips_counts['n'] / tips_counts.groupby('sex')['n'].transform('sum')
tips_counts.head()
ERROR! Session/line number was not unique in database. History logging moved to new session 1443
| | sex | time | n | percentage |
---|---|---|---|---|
0 | Male | Lunch | 33 | 0.210191 |
1 | Male | Dinner | 124 | 0.789809 |
2 | Female | Lunch | 35 | 0.402299 |
3 | Female | Dinner | 52 | 0.597701 |
Force both grouping columns to an explicit categorical type. Otherwise the
category order used by factorplot and the order used by annotate_bars
can be inconsistent, since seaborn
creates a categorical type under the hood.
# Give each grouping column an explicit, ordered list of categories so that
# every later groupby iterates the levels in this fixed order.
category_order = {
    'sex': ['Male', 'Female'],
    'time': ['Lunch', 'Dinner'],
}
for column, levels in category_order.items():
    tips_counts[column] = pd.Categorical(tips_counts[column], categories=levels, ordered=True)
Make a factorplot and annotate it.
def annotate_bars(x, y, x_groupby, hue_groupby, height_col, count_col, **kwargs):
    """Write a text label just above every bar of a hue-dodged bar plot.

    Written to be passed to ``FacetGrid.map_dataframe``, which supplies the
    plotted frame through the ``data`` keyword. Labels are drawn on the
    current matplotlib axes (``plt.gca()``).

    Args:
        x: Unused; accepted so the function fits the positional arguments
            that ``map_dataframe`` forwards.
        y: Unused; see ``x``.
        x_groupby: Column whose groups correspond, in groupby order, to the
            x-axis positions 0, 1, 2, ...
        hue_groupby: Column whose groups correspond, in groupby order, to
            the side-by-side bars inside one x position.
        height_col: Column holding the bar height; its first value in each
            (x, hue) group is used as the label's y coordinate.
        count_col: Column whose first value in each (x, hue) group becomes
            the label text.
        **kwargs: Must contain ``data`` (the DataFrame to read); any other
            keys are ignored.

    Raises:
        KeyError: If ``data`` is not supplied in ``kwargs``.

    Note:
        Label placement relies on the groupby order matching the plotted
        bar order -- presumably both columns are ordered categoricals;
        confirm against the caller.
    """
    data = kwargs.pop('data')
    ax = plt.gca()
    n_hues = len(data.groupby(hue_groupby).size())
    if n_hues == 0:
        # Nothing was plotted, and dividing by zero below would raise.
        return
    # 0.8 is the total width taken by one group of dodged bars (seaborn's
    # default bar width); each hue level gets an equal slice of it.
    group_width = 0.8
    bar_width = group_width / n_hues
    for x_index, (_, x_frame) in enumerate(data.groupby(x_groupby)):
        # Groups sit at integer positions; the leftmost bar of the group
        # starts half a group width to the left of that position.
        left_edge = x_index - group_width / 2
        for hue_index, (_, bar_frame) in enumerate(x_frame.groupby(hue_groupby)):
            if bar_frame.empty:
                # Grouping by a categorical can yield empty groups; keep the
                # slot (hue_index still advances) but draw no label.
                continue
            # Exact horizontal centre of this bar.
            x_position = left_edge + bar_width * (hue_index + 0.5)
            # Scalar height, not the whole Series, so it is a valid coordinate.
            y_position = bar_frame[height_col].iloc[0]
            ax.annotate(str(bar_frame[count_col].values[0]), (x_position, y_position),
                        textcoords='offset points', xytext=(0, 2),
                        ha='center', va='bottom', fontsize=12)
# Grouped bar chart: one group per sex, one bar per time slot, bar height
# taken from the precomputed 'percentage' column. The legend is not drawn.
# NOTE(review): newer seaborn releases expose this function as catplot --
# confirm against the installed version.
g = sns.factorplot(
    data=tips_counts,
    kind='bar',
    x='sex',
    y='percentage',
    hue='time',
    ci=None,
    legend=False,
)
# Label each bar with its raw row count 'n', positioned at the bar height.
g.map_dataframe(
    annotate_bars,
    'sex',
    'percentage',
    x_groupby='sex',
    hue_groupby='time',
    height_col='percentage',
    count_col='n',
)
# g.add_legend();
Male Female