# our usual pylab import
# %pylab --no-import-all inline

# YouTube video I made on how to use the American Factfinder site to look up addresses
from IPython.display import YouTubeVideo
YouTubeVideo('HeXcliUx96Y')

# standard numpy, pandas, matplotlib imports
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd

# check that CENSUS_KEY is defined
import census
import us
import requests

import settings
assert settings.CENSUS_KEY is not None

c = census.Census(key=settings.CENSUS_KEY)

# call the API and instantiate `df` with one row per state-level geography
df = DataFrame(c.sf1.get('NAME,P0010001', geo={'for': 'state:*'}))
# convert the population to integer
# (use the builtin `int`, not `np.int` -- the alias was deprecated in
#  NumPy 1.20 and removed in 1.24)
df['P0010001'] = df['P0010001'].astype(int)
df.head()

# FIPS codes for the 50 states + DC, used to filter out PR and territories
states_fips = np.array([state.fips for state in us.states.STATES])
states_df = df[np.in1d(df.state, states_fips)]

# check that we have three columns
assert set(states_df.columns) == set((u'NAME', u'P0010001', u'state'))
# check that the total 2010 census population is correct
assert np.sum(states_df.P0010001) == 308745538
# check that the number of states+DC is 51
assert len(states_df) == 51

# Here's a way to translate
# http://api.census.gov/data/2010/sf1?get=P0010001&for=county:*
# into a call using the census.Census object
r = c.sf1.get('NAME,P0010001', geo={'for': 'county:*'})
# ask yourself what len(r) means and what it should be
len(r)

# let's try out one of the `census` object convenience methods
# instead of using `c.sf1.get`
r = c.sf1.state_county('NAME,P0010001', census.ALL, census.ALL)
r

# convert the json from the API into a DataFrame
# coerce to integer the P0010001 column
df = DataFrame(r)
df['P0010001'] = df['P0010001'].astype('int')

# display the first records
df.head()

# calculate the total population
# what happens when you google the number you get?
np.sum(df['P0010001'])

# often you can use dot notation to access a DataFrame column
df.P0010001.head()

# let's filter out PR -- what's the total population now
sum(df[np.in1d(df.state, states_fips)].P0010001)

# fall back to non-Pandas solution if you need to
np.sum([int(county['P0010001']) for county in r if county['state'] in states_fips])

# construct counties_df with only 50 states + DC
# .copy() so the FIPS column assignment below writes to an independent frame
# rather than a view of `df` (avoids SettingWithCopyWarning / silent no-op)
counties_df = df[np.in1d(df.state, states_fips)].copy()
len(counties_df)
set(counties_df.columns) == set(df.columns)

# number of counties
assert len(counties_df) == 3143  # 3143 county/county-equivs in US

# check that the total population by adding all counties == population by adding all states
assert np.sum(counties_df['P0010001']) == np.sum(states_df.P0010001)

# check we have same columns between counties_df and df
set(counties_df.columns) == set(df.columns)

# take a look at the current structure of counties_df
counties_df.head()

# reindex states_df by state FIPS
# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.set_index.html
states_df.set_index(keys='state', inplace=True)
states_df.head()

# display the result of using set_index
counties_df.head()

# build the 5-digit county FIPS (state FIPS + county FIPS):
# vectorized string concatenation is clearer and faster than a row-wise apply
# http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/#create
counties_df['FIPS'] = counties_df['state'] + counties_df['county']
counties_df.set_index('FIPS', inplace=True)
counties_df.head()

counties_df.groupby('state').sum().head()
states_df.P0010001.head()

# now we're ready to compare for each state: if you add all the counties,
# do you get the same population?
# note that you can do .agg('sum') instead of .sum()
# look at http://pandas.pydata.org/pandas-docs/dev/groupby.html to learn more about agg
np.all(states_df.P0010001 == counties_df.groupby('state').agg('sum').P0010001)

# boolean indexing to pull up California
states_df[states_df.NAME == 'California']

# use .ix -- most general indexing (NOTE: .ix is deprecated in later pandas;
# kept here to match the pandas version this notebook targets)
# http://pandas.pydata.org/pandas-docs/dev/indexing.html#different-choices-for-indexing-loc-iloc-and-ix
states_df.ix['06']

# California counties
counties_df[counties_df.state == '06']

counties_df[counties_df.NAME == 'Alameda County']
counties_df[counties_df.NAME == 'Alameda County']['P0010001']

# several equivalent ways to pull the scalar population out of the one-row Series
# (wrap dict.values() in list() so this also works on Python 3, where
#  dict views are not subscriptable)
list(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].to_dict().values())[0]
list(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].iteritems())[0][1]
int(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].values)

# this is like accessing a cell in a spreadsheet -- row, col
ALAMEDA_COUNTY_FIPS = '06001'
counties_df.ix[ALAMEDA_COUNTY_FIPS, 'P0010001']
counties_df.ix[ALAMEDA_COUNTY_FIPS, 'county']

# http://api.census.gov/data/2010/sf1/geo.html
# state-county-tract
geo = {'for': 'tract:*',
       'in': 'state:%s county:%s' % (us.states.CA.fips,
                                     counties_df.ix[ALAMEDA_COUNTY_FIPS, 'county'])}
r = c.sf1.get('NAME,P0010001', geo=geo)

# BUG FIX: the original invoked alameda_county_tracts_df.apply(...) here,
# before the DataFrame was created -> NameError.  Build the frame first,
# then use state+county+tract to make the full tract FIPS.
alameda_county_tracts_df = DataFrame(r)
alameda_county_tracts_df['P0010001'] = alameda_county_tracts_df['P0010001'].astype('int')
alameda_county_tracts_df['FIPS'] = alameda_county_tracts_df.apply(
    lambda s: s['state'] + s['county'] + s['tract'], axis=1)
alameda_county_tracts_df.head()
alameda_county_tracts_df.P0010001.sum()

# Cafe Milano is in tract 4228
MILANO_TRACT_ID = '422800'
alameda_county_tracts_df[alameda_county_tracts_df.tract == MILANO_TRACT_ID]

import time
import us
from itertools import islice


def census_tracts(variable=('NAME', 'P0010001'), sleep_time=1.0):
    """Lazily yield one record per census tract for every US state (+DC).

    variable   -- SF1 variable(s) to request for each tract
    sleep_time -- seconds to pause between API calls (one call per state),
                  so we don't hit the API more than once a second
    """
    for state in us.states.STATES:
        print(state)
        for tract in c.sf1.get(variable,
                               geo={'for': 'tract:*',
                                    'in': 'state:{state_fips}'.format(
                                        state_fips=state.fips)}):
            yield tract
        # throttle per API call (per state), not per yielded tract
        time.sleep(sleep_time)


# limit the number of tracts we crawl for until we're ready to get all of them
tracts_df = DataFrame(list(islice(census_tracts(), 100)))
tracts_df['P0010001'] = tracts_df['P0010001'].astype('int')
tracts_df.head()