# our usual pylab import
# %pylab --no-import-all inline

# YouTube video I made on how to use the American Factfinder site to look up addresses
from IPython.display import YouTubeVideo
YouTubeVideo('HeXcliUx96Y')

# standard numpy, pandas, matplotlib imports
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd

# check that CENSUS_KEY is defined
import census
import us
import requests

import settings
assert settings.CENSUS_KEY is not None

c = census.Census(key=settings.CENSUS_KEY)

# call the API and instantiate `df` with one row per state-level geography
df = DataFrame(c.sf1.get('NAME,P0010001', geo={'for': 'state:*'}))
# convert the population to integer
# (use the builtin `int`, not `np.int` -- the alias was deprecated in
#  NumPy 1.20 and removed in 1.24)
df['P0010001'] = df['P0010001'].astype(int)
df.head()

# FIPS codes for the 50 states + DC, used to filter out PR and territories
states_fips = np.array([state.fips for state in us.states.STATES])
states_df = df[np.in1d(df.state, states_fips)]

# check that we have three columns
assert set(states_df.columns) == set((u'NAME', u'P0010001', u'state'))
# check that the total 2010 census population is correct
assert np.sum(states_df.P0010001) == 308745538
# check that the number of states+DC is 51
assert len(states_df) == 51

# Here's a way to translate
# http://api.census.gov/data/2010/sf1?get=P0010001&for=county:*
# into a call using the census.Census object
r = c.sf1.get('NAME,P0010001', geo={'for': 'county:*'})
# ask yourself what len(r) means and what it should be
len(r)

# let's try out one of the `census` object convenience methods
# instead of using `c.sf1.get`
r = c.sf1.state_county('NAME,P0010001', census.ALL, census.ALL)
r

# convert the json from the API into a DataFrame
# coerce to integer the P0010001 column
df = DataFrame(r)
df['P0010001'] = df['P0010001'].astype('int')

# display the first records
df.head()

# calculate the total population
# what happens when you google the number you get?
np.sum(df['P0010001'])

# often you can use dot notation to access a DataFrame column
df.P0010001.head()

# let's filter out PR -- what's the total population now
sum(df[np.in1d(df.state, states_fips)].P0010001)

# fall back to non-Pandas solution if you need to
np.sum([int(county['P0010001']) for county in r if county['state'] in states_fips])

# construct counties_df with only 50 states + DC
# .copy() so the FIPS column assignment below writes to an independent frame
# rather than a view of `df` (avoids SettingWithCopyWarning / silent no-op)
counties_df = df[np.in1d(df.state, states_fips)].copy()
len(counties_df)
set(counties_df.columns) == set(df.columns)

# number of counties
assert len(counties_df) == 3143  # 3143 county/county-equivs in US

# check that the total population by adding all counties == population by adding all states
assert np.sum(counties_df['P0010001']) == np.sum(states_df.P0010001)

# check we have same columns between counties_df and df
set(counties_df.columns) == set(df.columns)

# take a look at the current structure of counties_df
counties_df.head()

# reindex states_df by state FIPS
# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.set_index.html
states_df.set_index(keys='state', inplace=True)
states_df.head()

# display the result of using set_index
counties_df.head()

# build the 5-digit county FIPS (state FIPS + county FIPS):
# vectorized string concatenation is clearer and faster than a row-wise apply
# http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/#create
counties_df['FIPS'] = counties_df['state'] + counties_df['county']
counties_df.set_index('FIPS', inplace=True)
counties_df.head()

counties_df.groupby('state').sum().head()
states_df.P0010001.head()

# now we're ready to compare for each state: if you add all the counties,
# do you get the same population?
# note that you can do .agg('sum') instead of .sum()
# look at http://pandas.pydata.org/pandas-docs/dev/groupby.html to learn more about agg
np.all(states_df.P0010001 == counties_df.groupby('state').agg('sum').P0010001)

# boolean indexing to pull up California
states_df[states_df.NAME == 'California']

# use .ix -- most general indexing (NOTE: .ix is deprecated in later pandas;
# kept here to match the pandas version this notebook targets)
# http://pandas.pydata.org/pandas-docs/dev/indexing.html#different-choices-for-indexing-loc-iloc-and-ix
states_df.ix['06']

# California counties
counties_df[counties_df.state == '06']

counties_df[counties_df.NAME == 'Alameda County']
counties_df[counties_df.NAME == 'Alameda County']['P0010001']

# several equivalent ways to pull the scalar population out of the one-row Series
# (wrap dict.values() in list() so this also works on Python 3, where
#  dict views are not subscriptable)
list(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].to_dict().values())[0]
list(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].iteritems())[0][1]
int(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].values)

# this is like accessing a cell in a spreadsheet -- row, col
ALAMEDA_COUNTY_FIPS = '06001'
counties_df.ix[ALAMEDA_COUNTY_FIPS, 'P0010001']
counties_df.ix[ALAMEDA_COUNTY_FIPS, 'county']

# http://api.census.gov/data/2010/sf1/geo.html
# state-county-tract
geo = {'for': 'tract:*',
       'in': 'state:%s county:%s' % (us.states.CA.fips,
                                     counties_df.ix[ALAMEDA_COUNTY_FIPS, 'county'])}
r = c.sf1.get('NAME,P0010001', geo=geo)

# BUG FIX: the original invoked alameda_county_tracts_df.apply(...) here,
# before the DataFrame was created -> NameError.  Build the frame first,
# then use state+county+tract to make the full tract FIPS.
alameda_county_tracts_df = DataFrame(r)
alameda_county_tracts_df['P0010001'] = alameda_county_tracts_df['P0010001'].astype('int')
alameda_county_tracts_df['FIPS'] = alameda_county_tracts_df.apply(
    lambda s: s['state'] + s['county'] + s['tract'], axis=1)
alameda_county_tracts_df.head()
alameda_county_tracts_df.P0010001.sum()

# Cafe Milano is in tract 4228
MILANO_TRACT_ID = '422800'
alameda_county_tracts_df[alameda_county_tracts_df.tract == MILANO_TRACT_ID]

import time
import us
from itertools import islice


def census_tracts(variable=('NAME', 'P0010001'), sleep_time=1.0):
    """Lazily yield one record per census tract for every US state (+DC).

    variable   -- SF1 variable(s) to request for each tract
    sleep_time -- seconds to pause between API calls (one call per state),
                  so we don't hit the API more than once a second
    """
    for state in us.states.STATES:
        print(state)
        for tract in c.sf1.get(variable,
                               geo={'for': 'tract:*',
                                    'in': 'state:{state_fips}'.format(
                                        state_fips=state.fips)}):
            yield tract
        # throttle per API call (per state), not per yielded tract
        time.sleep(sleep_time)


# limit the number of tracts we crawl for until we're ready to get all of them
tracts_df = DataFrame(list(islice(census_tracts(), 100)))
tracts_df['P0010001'] = tracts_df['P0010001'].astype('int')
tracts_df.head()