# our usual pylab import (IPython magic -- not valid plain Python, so kept commented)
# %pylab --no-import-all inline

# YouTube video I made on how to use the American Factfinder site to look up addresses
from IPython.display import YouTubeVideo
YouTubeVideo('HeXcliUx96Y')

# standard numpy, pandas, matplotlib imports
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd

# check that CENSUS_KEY is defined
import census
import us
import requests

import settings
assert settings.CENSUS_KEY is not None

# the Census client object used for every API call in this notebook
c = census.Census(key=settings.CENSUS_KEY)

# call the API and instantiate `df` with name + total population for every state
df = DataFrame(c.sf1.get('NAME,P0010001', geo={'for': 'state:*'}))
# convert the population to integer
# NOTE: np.int was deprecated in NumPy 1.20 and removed in 1.24 -- use the builtin int
df['P0010001'] = df['P0010001'].astype(int)
df.head()

## FILL IN
## calculate states_fips so that PR not included

# check that we have three columns
assert set(states_df.columns) == set(('NAME', 'P0010001', 'state'))
# check that the total 2010 census population is correct
assert np.sum(states_df.P0010001) == 308745538
# check that the number of states+DC is 51
assert len(states_df) == 51

# Here's a way to translate
# http://api.census.gov/data/2010/sf1?get=P0010001&for=county:*
# into a call using the census.Census object
r = c.sf1.get('NAME,P0010001', geo={'for': 'county:*'})
# ask yourself what len(r) means and what it should be
len(r)

# let's try out one of the `census` object convenience methods
# instead of using `c.sf1.get`
r = c.sf1.state_county('NAME,P0010001', census.ALL, census.ALL)
r

# convert the json from the API into a DataFrame
# coerce to integer the P0010001 column
df = DataFrame(r)
df['P0010001'] = df['P0010001'].astype('int')
# display the first records
df.head()

# calculate the total population
# what happens when you google the number you get?
np.sum(df['P0010001'])

# often you can use dot notation to access a DataFrame column
df.P0010001.head()

## FILL IN
## compute counties_df
## counties_df should have same columns as df
## filter out PR -- what's the total population now

# number of counties
assert len(counties_df) == 3143  # 3143 county/county-equivs in US
# check that the total population by adding all counties == population by adding all states
assert np.sum(counties_df['P0010001']) == np.sum(states_df.P0010001)
# check we have same columns between counties_df and df
set(counties_df.columns) == set(df.columns)

# take a look at the current structure of counties_df
counties_df.head()

## FILL IN
# boolean indexing to pull up California
states_df[states_df.NAME == 'California']

# use .loc -- label-based indexing (DataFrame.ix was removed in pandas 1.0)
# http://pandas.pydata.org/pandas-docs/dev/indexing.html#different-choices-for-indexing-loc-iloc-and-ix
states_df.loc['06']

# California counties
counties_df[counties_df.state == '06']

# several equivalent ways to pull Alameda County's population out of the DataFrame
counties_df[counties_df.NAME == 'Alameda County']
counties_df[counties_df.NAME == 'Alameda County']['P0010001']
# dict.values() returns a non-indexable view in Python 3 -- wrap in list() before indexing
list(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].to_dict().values())[0]
# Series.iteritems was removed in pandas 2.0 -- use Series.items() instead
list(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].items())[0][1]
# int() on a length-1 ndarray is deprecated/removed -- index the single element explicitly
int(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].values[0])

# this is like accessing a cell in a spreadsheet -- row, col
ALAMEDA_COUNTY_FIPS = '06001'

counties_df.loc[ALAMEDA_COUNTY_FIPS, 'P0010001']
counties_df.loc[ALAMEDA_COUNTY_FIPS, 'county']

## FILL IN
## generate a DataFrame named alameda_county_tracts_df by
## calling the census api and the state-county-tract technique
## how many census tracts in Alameda County?
## if you add up the population, what do you get?
## generate the FIPS code for each tract

# confirm that you can find the census tract in which Cafe Milano is located
# Cafe Milano is in tract 4228
MILANO_TRACT_ID = '422800'
alameda_county_tracts_df[alameda_county_tracts_df.tract == MILANO_TRACT_ID]

## FILL IN
## try to reproduce the generator I show in class for all the census tracts
## start to think about how to do this for other geographical entities

import time
import us

from itertools import islice


def census_tracts(variable=('NAME', 'P0010001'), sleep_time=1.0):
    """Yield one record per census tract, crawling state by state.

    Makes one API call per state (via the module-level Census client `c`)
    and yields each tract dict in turn.

    Parameters
    ----------
    variable : tuple of str
        Census variables to request for each tract.
    sleep_time : float
        Seconds to pause between successive per-state API calls.
    """
    for state in us.states.STATES:
        # print() function call -- the original Python 2 `print state` statement
        # is a syntax error in Python 3
        print(state)
        for tract in c.sf1.get(variable,
                               geo={'for': 'tract:*',
                                    'in': 'state:{state_fips}'.format(
                                        state_fips=state.fips)}):
            yield tract
        # don't hit the API more than once a second
        # NOTE(review): sleep placed after the per-state call, per the comment
        # above -- confirm against the original notebook
        time.sleep(sleep_time)


# limit the number of tracts we crawl for until we're ready to get all of them
tracts_df = DataFrame(list(islice(census_tracts(), 100)))
tracts_df['P0010001'] = tracts_df['P0010001'].astype('int')
tracts_df.head()

## EXERCISE for next time
## write a generator all census places