# Start with our normal batch of imports and settings
from __future__ import print_function, division
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
index_col
argument to set the index to the first column)pd.concat
into a single dataframefemales = pd.read_csv('../data/femaleVisitsToPhysician.csv')
females.head()
Unnamed: 0 | year | age | sex | consultations | population | perCapita | |
---|---|---|---|---|---|---|---|
0 | 4 | 2006 | 1 | f | 67194 | 28221 | 2.380993 |
1 | 7 | 2006 | 2 | f | 49802 | 28361 | 1.756003 |
2 | 10 | 2006 | 3 | f | 42741 | 27899 | 1.531990 |
3 | 13 | 2006 | 4 | f | 36555 | 28420 | 1.286242 |
4 | 16 | 2006 | 5 | f | 33319 | 29479 | 1.130262 |
It looks like this unnamed column is meant to be some sort of index. Let's use it as a numerical index for the data:
females = pd.read_csv('../data/femaleVisitsToPhysician.csv',
index_col=0)
females.head()
year | age | sex | consultations | population | perCapita | |
---|---|---|---|---|---|---|
4 | 2006 | 1 | f | 67194 | 28221 | 2.380993 |
7 | 2006 | 2 | f | 49802 | 28361 | 1.756003 |
10 | 2006 | 3 | f | 42741 | 27899 | 1.531990 |
13 | 2006 | 4 | f | 36555 | 28420 | 1.286242 |
16 | 2006 | 5 | f | 33319 | 29479 | 1.130262 |
Now we do the same for the males data:
males = pd.read_csv('../data/maleVisitsToPhysician.csv',
index_col=0)
males.head()
year | age | sex | consultations | population | perCapita | |
---|---|---|---|---|---|---|
5 | 2006 | 1 | m | 78700 | 29535 | 2.664635 |
8 | 2006 | 2 | m | 57272 | 29504 | 1.941161 |
11 | 2006 | 3 | m | 47374 | 28894 | 1.639579 |
14 | 2006 | 4 | m | 39366 | 29576 | 1.331012 |
17 | 2006 | 5 | m | 36272 | 30953 | 1.171841 |
Concatenating these together, we get the following:
data = pd.concat([males, females]).sort_index()
data.head()
year | age | sex | consultations | population | perCapita | |
---|---|---|---|---|---|---|
4 | 2006 | 1 | f | 67194 | 28221 | 2.380993 |
5 | 2006 | 1 | m | 78700 | 29535 | 2.664635 |
7 | 2006 | 2 | f | 49802 | 28361 | 1.756003 |
8 | 2006 | 2 | m | 57272 | 29504 | 1.941161 |
10 | 2006 | 3 | f | 42741 | 27899 | 1.531990 |
For each gender, the data shows the per capita consultations by age and year.
Use pd.pivot_table
and plot the data.
Also, as you create these plots, experiment with sns.set_palette
to get a color scheme which helps convey the information you're interested in.
by_age = data.pivot_table('perCapita', index=['sex', 'age'], columns='year')
by_age.head()
year | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | |
---|---|---|---|---|---|---|---|---|---|
sex | age | ||||||||
f | 1 | 2.380993 | 2.349824 | 2.425987 | 2.419597 | 2.630705 | 2.483097 | 2.482780 | 2.367074 |
2 | 1.756003 | 1.720274 | 1.776511 | 1.689157 | 1.783554 | 1.758548 | 1.807354 | 1.655557 | |
3 | 1.531990 | 1.479736 | 1.441581 | 1.455040 | 1.405735 | 1.465799 | 1.469925 | 1.367749 | |
4 | 1.286242 | 1.255660 | 1.242319 | 1.231516 | 1.229607 | 1.266953 | 1.272097 | 1.184196 | |
5 | 1.130262 | 1.101836 | 1.128775 | 1.111600 | 1.062639 | 1.183401 | 1.134739 | 1.091244 |
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
by_age.loc['f'].plot(ax=ax[0], title='Visits Per Capita: Females')
by_age.loc['m'].plot(ax=ax[1], title='Visits Per Capita: Males');
Because the color palette is not sequential, some of the trend is hidden. Let's choose an 8-step sequential palette that will show us more information:
sns.set_palette('winter', 8)
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
by_age.loc['f'].plot(ax=ax[0], title='Visits Per Capita: Females')
by_age.loc['m'].plot(ax=ax[1], title='Visits Per Capita: Males');
Now we can see that there seems to be an overall upward trend in visits as the years pass.
The copayment for GP visits was eliminated in 2010. Let's see whether there is any indication that this affected the rate of visits
with_copay
, which is True if the year is prior to 2010, and False otherwisedata['with_copay'] = (data['year'] < 2010)
data.head()
year | age | sex | consultations | population | perCapita | with_copay | |
---|---|---|---|---|---|---|---|
4 | 2006 | 1 | f | 67194 | 28221 | 2.380993 | True |
5 | 2006 | 1 | m | 78700 | 29535 | 2.664635 | True |
7 | 2006 | 2 | f | 49802 | 28361 | 1.756003 | True |
8 | 2006 | 2 | m | 57272 | 29504 | 1.941161 | True |
10 | 2006 | 3 | f | 42741 | 27899 | 1.531990 | True |
by_age_copay = data.pivot_table('perCapita', index=['sex', 'age'],
columns='with_copay', aggfunc='mean')
by_age_copay.head()
with_copay | False | True | |
---|---|---|---|
sex | age | ||
f | 1 | 2.490914 | 2.394100 |
2 | 1.751253 | 1.735486 | |
3 | 1.427302 | 1.477087 | |
4 | 1.238213 | 1.253934 | |
5 | 1.118006 | 1.118118 |
sns.set() # reset palette
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
by_age_copay.loc['f'].plot(ax=ax[0], title='Visits Per Capita: Females')
by_age_copay.loc['m'].plot(ax=ax[1], title='Visits Per Capita: Males');
Apparently the removal of the copayment increased the number of visits in much of the population.
Let's look at this percentage directly:
increase = 100 * (by_age_copay[False] / by_age_copay[True] - 1)
increase.loc['m'].plot(label='Males')
increase.loc['f'].plot(label='Females')
plt.legend();
plt.ylabel('increase (pct)')
plt.title('Percent Increase in per capita visits');
Here we see an interesting trend: the lack of copay increased visits by teenagers more than any other group!
Let's try to pull some information out of the data that's not obviously available.
Notice that the age
column and the year
column are intertwined... that is, by subtracting the age from the year, we can find the birth year of the group of people recorded.
data['birth_year'] = data['year'] - data['age']
data.head()
year | age | sex | consultations | population | perCapita | with_copay | birth_year | |
---|---|---|---|---|---|---|---|---|
4 | 2006 | 1 | f | 67194 | 28221 | 2.380993 | True | 2005 |
5 | 2006 | 1 | m | 78700 | 29535 | 2.664635 | True | 2005 |
7 | 2006 | 2 | f | 49802 | 28361 | 1.756003 | True | 2004 |
8 | 2006 | 2 | m | 57272 | 29504 | 1.941161 | True | 2004 |
10 | 2006 | 3 | f | 42741 | 27899 | 1.531990 | True | 2003 |
pop_by_birthyear = data.pivot_table('population', index=['sex', 'birth_year'],
columns='year', aggfunc='mean')
sns.set_palette('winter', 8)
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
pop_by_birthyear.loc['m'].plot(title='Male Population', ax=ax[0]);
pop_by_birthyear.loc['f'].plot(title='Female Population', ax=ax[1]);
percap_by_birthyear = data.pivot_table('perCapita', index=['sex', 'birth_year'],
columns='year', aggfunc='mean')
sns.set_palette('winter', 8)
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
percap_by_birthyear.loc['m'].plot(title='Male Visits per Capita', ax=ax[0]);
percap_by_birthyear.loc['f'].plot(title='Female Visits per Capita', ax=ax[1]);
If you finish the above tasks, try this more open-ended exploration on a different dataset.
Seaborn includes a dataset representing the individuals who were on-board the ill-fated maiden voyage of the Titanic. It has information about their age, gender, class, fare paid, the deck their quarters were on, whether they were traveling with someone, and whether they survived.
This is a fairly open-ended exploration, but try answering these questions:
See what sort of interesting relationships you can find between the various pieces of data.
titanic = sns.load_dataset('titanic')
titanic.head()
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
1 | 1 | 1 | female | 38 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
2 | 1 | 3 | female | 26 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
3 | 1 | 1 | female | 35 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
4 | 0 | 3 | male | 35 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |