# Turn on inline plotting when this code runs under IPython/Jupyter.
# The original notebook used the ``%pylab --no-import-all inline`` line magic,
# which is not valid syntax in a plain Python file; going through
# ``get_ipython()`` keeps the same effect inside IPython and is skipped when
# no IPython shell exists.
try:
    _ipython_shell = get_ipython()
except NameError:
    _ipython_shell = None
if _ipython_shell is not None:
    _ipython_shell.run_line_magic('pylab', '--no-import-all inline')

import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd

from itertools import islice

import census
import us

import settings

# Shared Census API client; every generator below queries its ``sf1`` endpoint.
c = census.Census(key=settings.CENSUS_KEY)


# generators for the various census geographic entities of interest

def _state_fips():
    """Return the set of FIPS codes of the entries in ``us.states.STATES``."""
    return {state.fips for state in us.states.STATES}


def _per_state(geography, variables):
    """Yield every ``geography`` row, querying one state at a time.

    geography -- the name used in the API ``for`` clause (e.g. ``'county'``)
    variables -- the variable name(s) passed through to ``c.sf1.get``
    """
    for state in us.states.STATES:
        geo = {'for': '{geography}:*'.format(geography=geography),
               'in': 'state:{fips}'.format(fips=state.fips)}
        for row in c.sf1.get(variables, geo=geo):
            yield row


def states(variables='NAME'):
    """Yield one row per state, skipping rows not in ``us.states.STATES``."""
    geo = {'for': 'state:*'}
    # need to filter out non-states
    states_fips = _state_fips()
    for row in c.sf1.get(variables, geo=geo):
        if row['state'] in states_fips:
            yield row


def counties(variables='NAME'):
    """Yield one row per county, asking for all the states in one call."""
    # tabulate a set of fips codes for the states
    states_fips = _state_fips()
    geo = {'for': 'county:*', 'in': 'state:*'}
    for county in c.sf1.get(variables, geo=geo):
        # eliminate counties whose state code isn't in us.states.STATES
        if county['state'] in states_fips:
            yield county


def counties2(variables='NAME'):
    """Yield one row per county, walking through the states one by one.

    Since all the counties can be fetched in one call (see ``counties``),
    this function only demonstrates the state-by-state approach.
    """
    for county in _per_state('county', variables):
        yield county


def tracts(variables='NAME'):
    """Yield one row per census tract, queried county by county."""
    # Only the state/county codes of each county are needed to build the
    # tract query, so the counties themselves are fetched with just 'NAME'.
    for county in _per_state('county', 'NAME'):
        geo = {'for': 'tract:*',
               'in': 'state:{s_fips} county:{c_fips}'.format(
                   s_fips=county['state'], c_fips=county['county'])}
        for tract in c.sf1.get(variables, geo=geo):
            yield tract


def block_groups(variables='NAME'):
    """Yield one row per block group, queried county by county."""
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=block+group:*&in=state:02+county:170
    # let's use the county generator; only the FIPS codes of each county are
    # used, so there is no need to request ``variables`` at county level.
    for county in counties('NAME'):
        geo = {'for': 'block group:*',
               'in': 'state:{state} county:{county}'.format(
                   state=county['state'], county=county['county'])}
        for block_group in c.sf1.get(variables, geo=geo):
            yield block_group


def blocks(variables='NAME'):
    """Yield one row per census block, queried tract by tract."""
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=block:*&in=state:02+county:290+tract:00100
    # make use of the tract generator; only the tract's geographic codes are
    # used, so the tracts are fetched with just 'NAME'.
    for tract in tracts('NAME'):
        geo = {'for': 'block:*',
               'in': 'state:{state} county:{county} tract:{tract}'.format(
                   state=tract['state'], county=tract['county'],
                   tract=tract['tract'])}
        for block in c.sf1.get(variables, geo=geo):
            yield block


# msa, csas, districts, zip_codes

def msas(variables="NAME"):
    """Yield metropolitan/micropolitan statistical area rows, state by state."""
    for msa in _per_state(
            'metropolitan statistical area/micropolitan statistical area',
            variables):
        yield msa


def csas(variables="NAME"):
    """Yield combined statistical area rows, state by state."""
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=combined+statistical+area:*&in=state:24
    for csa in _per_state('combined statistical area', variables):
        yield csa


def districts(variables="NAME"):
    """Yield congressional district rows, state by state."""
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=congressional+district:*&in=state:24
    for district in _per_state('congressional district', variables):
        yield district


def zip_code_tabulation_areas(variables="NAME"):
    """Yield ZIP code tabulation area rows, state by state."""
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=zip+code+tabulation+area:*&in=state:02
    for zip_code_tabulation_area in _per_state('zip code tabulation area',
                                               variables):
        yield zip_code_tabulation_area


# Smoke tests: pull the first row from each per-state generator.
list(islice(msas(), 1))

list(islice(csas(), 1))

districts_list = list(islice(districts(), 1))
districts_list

list(islice(zip_code_tabulation_areas(), 1))

# let's get the
# total population -- tabulated in two variables: P0010001, P0050001
# P0050002 Not Hispanic or Latino (total)
# P0050010 Hispanic or Latino

r = list(states(('NAME', 'P0010001', 'P0050001', 'P0050002', 'P0050010')))
r[:5]

# Hispanic/Latino origin vs not-Hispanic/Latino
# Compare with http://www.census.gov/prod/cen2010/briefs/c2010br-02.pdf Table 1
# Hispanic/Latino: 50477594
# non-Hispanic/Latino: 258267944

df = DataFrame(r)
# the API rows hold the counts as strings; convert before summing
df[['P0010001', 'P0050001', 'P0050002', 'P0050010']] = \
    df[['P0010001', 'P0050001', 'P0050002', 'P0050010']].astype('int')
df[['P0010001', 'P0050001', 'P0050002', 'P0050010']].sum()

# are the total Hispanic/Latino and non-Hispanic populations the same as
# reported in http://www.census.gov/prod/cen2010/briefs/c2010br-02.pdf Table 1?
(df['P0050010'].sum() == 50477594,
 df['P0050002'].sum() == 258267944)

# How about the non-Hispanic/Latino White only category?
# P0050003
# total should be 196817552
df = DataFrame(list(states('NAME,P0050003')))
df['P0050003'] = df['P0050003'].astype('int')
df.P0050003.sum()


# USE a little convenience function to calculate the variable names to be used

def P005_range(n0, n1):
    """Return the tuple of variable names 'P005NNNN' for NNNN in [n0, n1)."""
    # ``range`` (rather than ``xrange``) keeps this working on Python 2 and 3.
    return tuple('P005' + "{i:04d}".format(i=i) for i in range(n0, n1))


P005_vars = P005_range(1, 18)
P005_vars_str = ",".join(P005_vars)
P005_vars_with_name = ['NAME'] + list(P005_vars)
P005_vars_with_name

# HAVE YOU TRIED THE EXERCISE....IF NOT....TRY IT....HERE'S ONE POSSIBLE ANSWER#


def convert_P005_to_int(df):
    """Cast the P005 columns of ``df`` to int.

    The conversion is done in place on ``df``; the same frame is returned
    for convenience.
    """
    df[list(P005_vars)] = df[list(P005_vars)].astype('int')
    return df


# http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/#create
def convert_to_rdotmap(row):
    """Map a row of P005 variables to a Series with Name, Total, White,
    Black, Hispanic, Asian and Other.

    'Other' is the sum of P0050005, P0050007, P0050008 and P0050009.
    """
    return pd.Series({'Total': row['P0050001'],
                      'White': row['P0050003'],
                      'Black': row['P0050004'],
                      'Asian': row['P0050006'],
                      'Hispanic': row['P0050010'],
                      'Other': row['P0050005'] + row['P0050007'] +
                               row['P0050008'] + row['P0050009'],
                      'Name': row['NAME']},
                     index=['Name', 'Total', 'White', 'Black', 'Hispanic',
                            'Asian', 'Other'])


# Imports repeated from the original notebook cell; kept in place because
# they also bring ``Census``, ``CENSUS_KEY`` and ``time`` into scope.
from census import Census
import settings
from settings import CENSUS_KEY

import time
from itertools import islice

# NOTE: the original cell re-defined P005_range, P005_vars, P005_vars_str,
# convert_to_rdotmap and convert_P005_to_int here with identical bodies;
# the definitions above are the single source of truth.


def normalize(s):
    """Take a Series and divide each item by the sum so that the new series
    adds up to 1.0.

    No special handling is done for a series whose sum is 0.
    """
    total = np.sum(s)
    return s.astype('float') / total


def entropy(series):
    """Normalized Shannon Index.

    Zero entries are ignored when computing the entropy, but the
    normalisation uses the full length of ``series``, so a series in which
    all the entries are equal results in 1.0.  Returns 0.0 when ``series``
    has fewer than two entries or contains no non-zero entry.
    """
    if len(series) < 2:
        return 0.0

    # eliminate 0s (their contribution p*log(p) is taken to be 0)
    nonzero = series[series != 0]
    if len(nonzero) == 0:
        return 0.0

    # maximum possible entropy for the given length of the input series
    max_s = -np.log(1.0 / len(series))
    p = nonzero.astype('float') / float(nonzero.sum())
    return (-p * np.log(p)).sum() / max_s


def diversity(r):
    """Return a DataFrame with the following columns:

    Name, Total, White, Black, Hispanic, Asian, Other (see
    ``convert_to_rdotmap``), plus
    entropy5 -- normalized entropy over Asian, Black, Hispanic, White, Other
    entropy4 -- normalized entropy over Asian, Black, Hispanic, White

    r -- rows as returned by the census generators, containing NAME and the
         P005 variables.
    """
    df = convert_P005_to_int(DataFrame(r))
    df1 = df.apply(convert_to_rdotmap, axis=1)

    df1['entropy5'] = df1[['Asian', 'Black', 'Hispanic', 'White',
                           'Other']].apply(entropy, axis=1)
    df1['entropy4'] = df1[['Asian', 'Black', 'Hispanic',
                           'White']].apply(entropy, axis=1)
    return df1


# states
r = list(states(P005_vars_with_name))
diversity(r)

# counties
r = list(counties(P005_vars_with_name))
df2 = diversity(r)
# ``sort_index(by=...)`` is deprecated in pandas; ``sort_values`` is the
# supported way to sort by a column.
df2.sort_values(by='entropy5', ascending=False)

msas_list = list(msas('NAME,P0010001'))
len(msas_list)

df = DataFrame(msas_list)
df['P0010001'] = df['P0010001'].astype('int')
# an MSA can appear once per state it touches; add the pieces together
df.groupby(
    'metropolitan statistical area/micropolitan statistical area'
)['P0010001'].sum()