# note version of pandas import pandas pandas.__version__ import us import census import settings import pandas as pd import numpy as np from pandas import DataFrame, Series from itertools import islice c = census.Census(settings.CENSUS_KEY) def states(variables='NAME'): geo={'for':'state:*'} states_fips = set([state.fips for state in us.states.STATES]) # need to filter out non-states for r in c.sf1.get(variables, geo=geo, year=2010): if r['state'] in states_fips: yield r # make a dataframe from the total populations of states in the 2010 Census df = DataFrame(states('NAME,P0010001')) df.P0010001 = df.P0010001.astype('int') df.head() # check that that we have the right total population df.P0010001.sum() == 308745538 # add a column with the first letter # we'll be grouping states based on the first letter of the state NAME df['first_letter'] = df.NAME.apply(lambda s:s[0]) df.head() # we can explicitly name df.first_letter grouped = df.groupby(df.first_letter) grouped # shorthand for df.first_letter # same thing as df.groupby(df.first_letter) grouped = df.groupby('first_letter') grouped # count the number of states with each first letter -- look at top of the resulting DataFrame grouped.count().head() # count the number of states with each first letter -- look at bottom of the resulting DataFrame grouped.count().tail() # we didn't have to explicitly create a new column -- we could groupby on a dynamically generated Series # note the use of str operations on df.NAME: # http://pandas.pydata.org/pandas-docs/stable/basics.html#vectorized-string-methods df.groupby(df.NAME.str.slice(0,1)).sum() # we can get groups of indexes df.groupby('first_letter').groups # loop through all groups --> here just the first one for name, group in islice(df.groupby('first_letter'),1): print(name) print type(group) # yes -- a DataFrame print group.index print(group), # how about accessing group 'C'? # http://stackoverflow.com/a/14734627/7782 # http://stackoverflow.com/questions/19804282/in-pandas-is-there-something-like-a-groupby-get-group-but-with-an-optional-defa # http://pandas.pydata.org/pandas-docs/dev/generated/pandas.core.groupby.GroupBy.get_group.html grouped = df.groupby('first_letter') grouped.get_group('C') # total population of states starting with 'C' grouped.get_group('C').P0010001.sum() # generate a Series of total populations by first letter grouped = df.groupby('first_letter') s = grouped['P0010001'].sum() s # sort the list to get the most populous groups # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.sort.html # note sorting in place s.sort(ascending=False) s s.index # first pass at pulling together the letter, the total population, and the list of corresponding states for k in s.index: print k, s[k], list(grouped.get_group(k).NAME) # let's try this again and try to do this in a more idiomatic Pandas fashion # ideally, generate a DataFrame with a NAME column that is a list of all states with the same first letter # It turns out that apply can come to the rescue # here' calculate the list of NAMEs # http://pandas.pydata.org/pandas-docs/dev/groupby.html#flexible-apply # http://stackoverflow.com/questions/19530568/can-pandas-groupby-aggregate-into-a-list-rather-than-sum-mean-etc df.groupby("first_letter").apply(lambda x: list(x['NAME'])) # apply can be used to add up the populations by group df.groupby("first_letter").apply(lambda x: np.sum(x['P0010001'])) # make a tuple out of the list of names and the population df.groupby("first_letter").apply(lambda x:( list(x['NAME']), np.sum(x['P0010001']))) # remind ourselves on how to turn a tuple into a Series with a small example Series(([1,2],2), index=['one','two']) # we're ready to make a new DataFrame df.groupby("first_letter").apply(lambda x:Series((list(x['NAME']), np.sum(x['P0010001'])), index=['states','total_pop'])) df2 = df.groupby("first_letter").apply(lambda x:Series((list(x['NAME']), np.sum(x['P0010001'])), index=['states','total_pop'])).sort_index(by='total_pop',ascending=False) # make sure you understand the syntax here: # .ix: http://pandas.pydata.org/pandas-docs/dev/indexing.html#advanced-indexing-with-ix # ability to grab columns by name to return a new DataFrame df2.ix['C'][['states','total_pop']] print "states that start with 'C'", df2.ix['C']['states'] print "total population of states that start with 'C'", df2.ix['C']['total_pop']