Goal: demonstrate pandas groupby operations using 2010 Census state populations.
References:
Note the split-apply-combine framework of thinking about grouped computations.
# note version of pandas
import pandas
pandas.__version__
'0.13.0'
import us
import census
import settings
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from itertools import islice
# Census API client authenticated with the key stored in settings
c = census.Census(settings.CENSUS_KEY)
def states(variables='NAME', year=2010):
    """Yield Census SF1 records for the 50 states + DC.

    variables: comma-separated Census variable names to fetch.
    year: decennial census year (default 2010, matching the original use).

    The API's state:* query also returns territories, so results are
    filtered against the FIPS codes of actual states from the us package.
    """
    geo = {'for': 'state:*'}
    # need to filter out non-states (e.g. Puerto Rico)
    states_fips = {state.fips for state in us.states.STATES}
    for r in c.sf1.get(variables, geo=geo, year=year):
        if r['state'] in states_fips:
            yield r
# build a DataFrame of state names and total populations from the 2010 Census
df = DataFrame(states('NAME,P0010001'))
df['P0010001'] = df['P0010001'].astype('int')
df.head()
NAME | P0010001 | state | |
---|---|---|---|
0 | Alabama | 4779736 | 01 |
1 | Alaska | 710231 | 02 |
2 | Arizona | 6392017 | 04 |
3 | Arkansas | 2915918 | 05 |
4 | California | 37253956 | 06 |
5 rows × 3 columns
# check that we have the right total population
# 308,745,538 is the official 2010 US resident population
df.P0010001.sum() == 308745538
True
# add a column holding the initial letter of each state NAME;
# we'll use this as the grouping key below
df['first_letter'] = df['NAME'].map(lambda name: name[0])
df.head()
NAME | P0010001 | state | first_letter | |
---|---|---|---|---|
0 | Alabama | 4779736 | 01 | A |
1 | Alaska | 710231 | 02 | A |
2 | Arizona | 6392017 | 04 | A |
3 | Arkansas | 2915918 | 05 | A |
4 | California | 37253956 | 06 | C |
5 rows × 4 columns
It is possible to do grouping in many ways — "the mapping can be specified many different ways".
We can also group by columns (axis=1), but such functionality is not demonstrated in this notebook.
# group by the first_letter column, passed explicitly as a Series
grouped = df.groupby(df['first_letter'])
grouped
<pandas.core.groupby.DataFrameGroupBy object at 0x106212350>
# shorthand for df.first_letter
# same thing as df.groupby(df.first_letter)
# passing the column label lets pandas look the Series up itself
grouped = df.groupby('first_letter')
grouped
<pandas.core.groupby.DataFrameGroupBy object at 0x106203f10>
# count the number of states with each first letter -- look at top of the resulting DataFrame
# count() tallies non-null entries per column within each group
grouped.count().head()
NAME | P0010001 | state | first_letter | |
---|---|---|---|---|
first_letter | ||||
A | 4 | 4 | 4 | 4 |
C | 3 | 3 | 3 | 3 |
D | 2 | 2 | 2 | 2 |
F | 1 | 1 | 1 | 1 |
G | 1 | 1 | 1 | 1 |
5 rows × 4 columns
# count the number of states with each first letter -- look at bottom of the resulting DataFrame
grouped.count().tail()
NAME | P0010001 | state | first_letter | |
---|---|---|---|---|
first_letter | ||||
S | 2 | 2 | 2 | 2 |
T | 2 | 2 | 2 | 2 |
U | 1 | 1 | 1 | 1 |
V | 2 | 2 | 2 | 2 |
W | 4 | 4 | 4 | 4 |
5 rows × 4 columns
# we didn't have to explicitly create a new column -- we could groupby on a dynamically generated Series
# note the use of str operations on df.NAME:
# http://pandas.pydata.org/pandas-docs/stable/basics.html#vectorized-string-methods
# str.slice(0,1) extracts each NAME's first letter without adding a column
df.groupby(df.NAME.str.slice(0,1)).sum()
P0010001 | |
---|---|
NAME | |
A | 14797902 |
C | 45857249 |
D | 1499657 |
F | 18801310 |
G | 9687653 |
H | 1360301 |
I | 23928371 |
K | 7192485 |
L | 4533372 |
M | 38782746 |
N | 46280611 |
O | 19118929 |
P | 12702379 |
R | 1052567 |
S | 5439544 |
T | 31491666 |
U | 2763885 |
V | 8626765 |
W | 14828146 |
19 rows × 1 columns
# we can get groups of indexes
# .groups maps each group key to the list of row labels in that group
df.groupby('first_letter').groups
{u'A': [0, 1, 2, 3], u'C': [4, 5, 6], u'D': [7, 8], u'F': [9], u'G': [10], u'H': [11], u'I': [12, 13, 14, 15], u'K': [16, 17], u'L': [18], u'M': [19, 20, 21, 22, 23, 24, 25, 26], u'N': [27, 28, 29, 30, 31, 32, 33, 34], u'O': [35, 36, 37], u'P': [38], u'R': [39], u'S': [40, 41], u'T': [42, 43], u'U': [44], u'V': [45, 46], u'W': [47, 48, 49, 50]}
# loop through all groups --> here just the first one
# NOTE: the original mixed Python 2 print statements with print()-style
# calls; unified here to the parenthesized form (identical output on Py2)
for name, group in islice(df.groupby('first_letter'), 1):
    print(name)
    print(type(group))  # yes -- a DataFrame
    print(group.index)
    print(group),
A <class 'pandas.core.frame.DataFrame'> Int64Index([0, 1, 2, 3], dtype='int64') NAME P0010001 state first_letter 0 Alabama 4779736 01 A 1 Alaska 710231 02 A 2 Arizona 6392017 04 A 3 Arkansas 2915918 05 A [4 rows x 4 columns]
# how about accessing group 'C'?
# http://stackoverflow.com/a/14734627/7782
# http://stackoverflow.com/questions/19804282/in-pandas-is-there-something-like-a-groupby-get-group-but-with-an-optional-defa
# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.core.groupby.GroupBy.get_group.html
# get_group pulls out the sub-DataFrame for a single key
grouped = df.groupby('first_letter')
grouped.get_group('C')
NAME | P0010001 | state | first_letter | |
---|---|---|---|---|
4 | California | 37253956 | 06 | C |
5 | Colorado | 5029196 | 08 | C |
6 | Connecticut | 3574097 | 09 | C |
3 rows × 4 columns
# total population of states starting with 'C'
# sum the P0010001 column of just that group
grouped.get_group('C').P0010001.sum()
45857249
# generate a Series of total populations by first letter
# selecting the column first, then summing, gives a Series keyed by group
grouped = df.groupby('first_letter')
s = grouped['P0010001'].sum()
s
first_letter A 14797902 C 45857249 D 1499657 F 18801310 G 9687653 H 1360301 I 23928371 K 7192485 L 4533372 M 38782746 N 46280611 O 19118929 P 12702379 R 1052567 S 5439544 T 31491666 U 2763885 V 8626765 W 14828146 Name: P0010001, dtype: int64
# sort the list to get the most populous groups
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.sort.html
# note sorting in place
# NOTE(review): Series.sort mutates s in place (pandas 0.13 API; later
# pandas versions replace it with the non-mutating sort_values)
s.sort(ascending=False)
s
first_letter N 46280611 C 45857249 M 38782746 T 31491666 I 23928371 O 19118929 F 18801310 W 14828146 A 14797902 P 12702379 G 9687653 V 8626765 K 7192485 S 5439544 L 4533372 U 2763885 D 1499657 H 1360301 R 1052567 Name: P0010001, dtype: int64
# the first letters, ordered by descending total population after the sort
s.index
Index([u'N', u'C', u'M', u'T', u'I', u'O', u'F', u'W', u'A', u'P', u'G', u'V', u'K', u'S', u'L', u'U', u'D', u'H', u'R'], dtype='object')
# first pass at pulling together the letter, the total population, and the list of corresponding states
for k in s.index:
print k, s[k], list(grouped.get_group(k).NAME)
N 46280611 [u'Nebraska', u'Nevada', u'New Hampshire', u'New Jersey', u'New Mexico', u'New York', u'North Carolina', u'North Dakota'] C 45857249 [u'California', u'Colorado', u'Connecticut'] M 38782746 [u'Maine', u'Maryland', u'Massachusetts', u'Michigan', u'Minnesota', u'Mississippi', u'Missouri', u'Montana'] T 31491666 [u'Tennessee', u'Texas'] I 23928371 [u'Idaho', u'Illinois', u'Indiana', u'Iowa'] O 19118929 [u'Ohio', u'Oklahoma', u'Oregon'] F 18801310 [u'Florida'] W 14828146 [u'Washington', u'West Virginia', u'Wisconsin', u'Wyoming'] A 14797902 [u'Alabama', u'Alaska', u'Arizona', u'Arkansas'] P 12702379 [u'Pennsylvania'] G 9687653 [u'Georgia'] V 8626765 [u'Vermont', u'Virginia'] K 7192485 [u'Kansas', u'Kentucky'] S 5439544 [u'South Carolina', u'South Dakota'] L 4533372 [u'Louisiana'] U 2763885 [u'Utah'] D 1499657 [u'Delaware', u'District of Columbia'] H 1360301 [u'Hawaii'] R 1052567 [u'Rhode Island']
# let's try this again and try to do this in a more idiomatic Pandas fashion
# apply lets each group be reduced to an arbitrary Python object --
# here, the list of state NAMEs in the group
# http://pandas.pydata.org/pandas-docs/dev/groupby.html#flexible-apply
# http://stackoverflow.com/questions/19530568/can-pandas-groupby-aggregate-into-a-list-rather-than-sum-mean-etc
df.groupby('first_letter').apply(lambda grp: list(grp['NAME']))
first_letter A [Alabama, Alaska, Arizona, Arkansas] C [California, Colorado, Connecticut] D [Delaware, District of Columbia] F [Florida] G [Georgia] H [Hawaii] I [Idaho, Illinois, Indiana, Iowa] K [Kansas, Kentucky] L [Louisiana] M [Maine, Maryland, Massachusetts, Michigan, Min... N [Nebraska, Nevada, New Hampshire, New Jersey, ... O [Ohio, Oklahoma, Oregon] P [Pennsylvania] R [Rhode Island] S [South Carolina, South Dakota] T [Tennessee, Texas] U [Utah] V [Vermont, Virginia] W [Washington, West Virginia, Wisconsin, Wyoming] dtype: object
# apply can be used to add up the populations by group
df.groupby('first_letter').apply(lambda grp: np.sum(grp['P0010001']))
first_letter A 14797902 C 45857249 D 1499657 F 18801310 G 9687653 H 1360301 I 23928371 K 7192485 L 4533372 M 38782746 N 46280611 O 19118929 P 12702379 R 1052567 S 5439544 T 31491666 U 2763885 V 8626765 W 14828146 dtype: int64
# make a tuple out of the list of names and the population
df.groupby('first_letter').apply(
    lambda grp: (list(grp['NAME']), np.sum(grp['P0010001'])))
first_letter A ([Alabama, Alaska, Arizona, Arkansas], 14797902) C ([California, Colorado, Connecticut], 45857249) D ([Delaware, District of Columbia], 1499657) F ([Florida], 18801310) G ([Georgia], 9687653) H ([Hawaii], 1360301) I ([Idaho, Illinois, Indiana, Iowa], 23928371) K ([Kansas, Kentucky], 7192485) L ([Louisiana], 4533372) M ([Maine, Maryland, Massachusetts, Michigan, Mi... N ([Nebraska, Nevada, New Hampshire, New Jersey,... O ([Ohio, Oklahoma, Oregon], 19118929) P ([Pennsylvania], 12702379) R ([Rhode Island], 1052567) S ([South Carolina, South Dakota], 5439544) T ([Tennessee, Texas], 31491666) U ([Utah], 2763885) V ([Vermont, Virginia], 8626765) W ([Washington, West Virginia, Wisconsin, Wyomin... dtype: object
# remind ourselves on how to turn a tuple into a Series with a small example
# the tuple's elements become values, paired positionally with the index labels
Series(([1,2],2), index=['one','two'])
one [1, 2] two 2 dtype: object
# we're ready to make a new DataFrame: when apply returns a Series per
# group, pandas assembles the results into rows of a DataFrame
df.groupby('first_letter').apply(
    lambda grp: Series((list(grp['NAME']), np.sum(grp['P0010001'])),
                       index=['states', 'total_pop']))
states | total_pop | |
---|---|---|
first_letter | ||
A | [Alabama, Alaska, Arizona, Arkansas] | 14797902 |
C | [California, Colorado, Connecticut] | 45857249 |
D | [Delaware, District of Columbia] | 1499657 |
F | [Florida] | 18801310 |
G | [Georgia] | 9687653 |
H | [Hawaii] | 1360301 |
I | [Idaho, Illinois, Indiana, Iowa] | 23928371 |
K | [Kansas, Kentucky] | 7192485 |
L | [Louisiana] | 4533372 |
M | [Maine, Maryland, Massachusetts, Michigan, Min... | 38782746 |
N | [Nebraska, Nevada, New Hampshire, New Jersey, ... | 46280611 |
O | [Ohio, Oklahoma, Oregon] | 19118929 |
P | [Pennsylvania] | 12702379 |
R | [Rhode Island] | 1052567 |
S | [South Carolina, South Dakota] | 5439544 |
T | [Tennessee, Texas] | 31491666 |
U | [Utah] | 2763885 |
V | [Vermont, Virginia] | 8626765 |
W | [Washington, West Virginia, Wisconsin, Wyoming] | 14828146 |
19 rows × 2 columns
# NOTE(review): sort_index(by=...) sorts rows by the total_pop column --
# a pandas 0.13 idiom; later versions renamed this to sort_values
df2 = df.groupby("first_letter").apply(lambda x:Series((list(x['NAME']), np.sum(x['P0010001'])),
index=['states','total_pop'])).sort_index(by='total_pop',ascending=False)
# make sure you understand the syntax here:
# .ix: http://pandas.pydata.org/pandas-docs/dev/indexing.html#advanced-indexing-with-ix
# ability to grab columns by name to return a new DataFrame
# NOTE(review): .ix was deprecated in later pandas -- .loc is the replacement
df2.ix['C'][['states','total_pop']]
states [California, Colorado, Connecticut] total_pop 45857249 Name: C, dtype: object
# Python 2 print statement: the list of states starting with 'C'
print "states that start with 'C'", df2.ix['C']['states']
states that start with 'C' [u'California', u'Colorado', u'Connecticut']
# ...and their combined population
print "total population of states that start with 'C'", df2.ix['C']['total_pop']
total population of states that start with 'C' 45857249