# our usual pylab import (IPython magic -- not valid plain Python, so kept commented)
# %pylab --no-import-all inline

# YouTube video I made on how to use the American Factfinder site to look up addresses
from IPython.display import YouTubeVideo
YouTubeVideo('HeXcliUx96Y')

# standard numpy, pandas, matplotlib imports
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd

# check that CENSUS_KEY is defined
import census
import us
import requests

import settings
assert settings.CENSUS_KEY is not None

# the Census client object used for every API call in this notebook
c = census.Census(key=settings.CENSUS_KEY)

# call the API and instantiate `df` with name + total population for every state
df = DataFrame(c.sf1.get('NAME,P0010001', geo={'for': 'state:*'}))
# convert the population to integer
# NOTE: np.int was deprecated in NumPy 1.20 and removed in 1.24 -- use the builtin int
df['P0010001'] = df['P0010001'].astype(int)
df.head()

## FILL IN
## calculate states_fips so that PR not included

# check that we have three columns
assert set(states_df.columns) == set(('NAME', 'P0010001', 'state'))
# check that the total 2010 census population is correct
assert np.sum(states_df.P0010001) == 308745538
# check that the number of states+DC is 51
assert len(states_df) == 51

# Here's a way to translate
# http://api.census.gov/data/2010/sf1?get=P0010001&for=county:*
# into a call using the census.Census object
r = c.sf1.get('NAME,P0010001', geo={'for': 'county:*'})
# ask yourself what len(r) means and what it should be
len(r)

# let's try out one of the `census` object convenience methods
# instead of using `c.sf1.get`
r = c.sf1.state_county('NAME,P0010001', census.ALL, census.ALL)
r

# convert the json from the API into a DataFrame
# coerce to integer the P0010001 column
df = DataFrame(r)
df['P0010001'] = df['P0010001'].astype('int')
# display the first records
df.head()

# calculate the total population
# what happens when you google the number you get?
np.sum(df['P0010001'])

# often you can use dot notation to access a DataFrame column
df.P0010001.head()

## FILL IN
## compute counties_df
## counties_df should have same columns as df
## filter out PR -- what's the total population now

# number of counties
assert len(counties_df) == 3143  # 3143 county/county-equivs in US
# check that the total population by adding all counties == population by adding all states
assert np.sum(counties_df['P0010001']) == np.sum(states_df.P0010001)
# check we have same columns between counties_df and df
set(counties_df.columns) == set(df.columns)

# take a look at the current structure of counties_df
counties_df.head()

## FILL IN
# boolean indexing to pull up California
states_df[states_df.NAME == 'California']

# use .loc -- label-based indexing (DataFrame.ix was removed in pandas 1.0)
# http://pandas.pydata.org/pandas-docs/dev/indexing.html#different-choices-for-indexing-loc-iloc-and-ix
states_df.loc['06']

# California counties
counties_df[counties_df.state == '06']

# several equivalent ways to pull Alameda County's population out of the DataFrame
counties_df[counties_df.NAME == 'Alameda County']
counties_df[counties_df.NAME == 'Alameda County']['P0010001']
# dict.values() returns a non-indexable view in Python 3 -- wrap in list() before indexing
list(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].to_dict().values())[0]
# Series.iteritems was removed in pandas 2.0 -- use Series.items() instead
list(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].items())[0][1]
# int() on a length-1 ndarray is deprecated/removed -- index the single element explicitly
int(counties_df[counties_df.NAME == 'Alameda County']['P0010001'].values[0])

# this is like accessing a cell in a spreadsheet -- row, col
ALAMEDA_COUNTY_FIPS = '06001'

counties_df.loc[ALAMEDA_COUNTY_FIPS, 'P0010001']
counties_df.loc[ALAMEDA_COUNTY_FIPS, 'county']

## FILL IN
## generate a DataFrame named alameda_county_tracts_df by
## calling the census api and the state-county-tract technique
## how many census tracts in Alameda County?
## if you add up the population, what do you get?
## generate the FIPS code for each tract

# confirm that you can find the census tract in which Cafe Milano is located
# Cafe Milano is in tract 4228
MILANO_TRACT_ID = '422800'
alameda_county_tracts_df[alameda_county_tracts_df.tract == MILANO_TRACT_ID]

## FILL IN
## try to reproduce the generator I show in class for all the census tracts
## start to think about how to do this for other geographical entities

import time
import us

from itertools import islice


def census_tracts(variable=('NAME', 'P0010001'), sleep_time=1.0):
    """Yield one record per census tract, crawling state by state.

    Makes one API call per state (via the module-level Census client `c`)
    and yields each tract dict in turn.

    Parameters
    ----------
    variable : tuple of str
        Census variables to request for each tract.
    sleep_time : float
        Seconds to pause between successive per-state API calls.
    """
    for state in us.states.STATES:
        # print() function call -- the original Python 2 `print state` statement
        # is a syntax error in Python 3
        print(state)
        for tract in c.sf1.get(variable,
                               geo={'for': 'tract:*',
                                    'in': 'state:{state_fips}'.format(
                                        state_fips=state.fips)}):
            yield tract
        # don't hit the API more than once a second
        # NOTE(review): sleep placed after the per-state call, per the comment
        # above -- confirm against the original notebook
        time.sleep(sleep_time)


# limit the number of tracts we crawl for until we're ready to get all of them
tracts_df = DataFrame(list(islice(census_tracts(), 100)))
tracts_df['P0010001'] = tracts_df['P0010001'].astype('int')
tracts_df.head()

## EXERCISE for next time
## write a generator all census places