%pylab --no-import-all inline import numpy as np import matplotlib.pyplot as plt from pandas import DataFrame, Series, Index import pandas as pd from itertools import islice import census import us import settings c = census.Census(key=settings.CENSUS_KEY) # generators for the various census geographic entities of interest def states(variables='NAME'): geo={'for':'state:*'} states_fips = set([state.fips for state in us.states.STATES]) # need to filter out non-states for r in c.sf1.get(variables, geo=geo): if r['state'] in states_fips: yield r def counties(variables='NAME'): """ask for all the states in one call""" # tabulate a set of fips codes for the states states_fips = set([s.fips for s in us.states.STATES]) geo={'for':'county:*', 'in':'state:*'} for county in c.sf1.get(variables, geo=geo): # eliminate counties whose states aren't in a state or DC if county['state'] in states_fips: yield county def counties2(variables='NAME'): """generator for all counties""" # since we can get all the counties in one call, # this function is for demonstrating the use of walking through # the states to get at the counties for state in us.states.STATES: geo={'for':'county:*', 'in':'state:{fips}'.format(fips=state.fips)} for county in c.sf1.get(variables, geo=geo): yield county def tracts(variables='NAME'): for state in us.states.STATES: # handy to print out state to monitor progress # print state.fips, state counties_in_state={'for':'county:*', 'in':'state:{fips}'.format(fips=state.fips)} for county in c.sf1.get('NAME', geo=counties_in_state): # print county['state'], county['NAME'] tracts_in_county = {'for':'tract:*', 'in': 'state:{s_fips} county:{c_fips}'.format(s_fips=state.fips, c_fips=county['county'])} for tract in c.sf1.get(variables,geo=tracts_in_county): yield tract def msas(variables="NAME"): for state in us.STATES: geo = {'for':'metropolitan statistical area/micropolitan statistical area:*', 'in':'state:{state_fips}'.format(state_fips=state.fips) } for msa in c.sf1.get(variables, geo=geo): yield msa def block_groups(variables='NAME'): # http://api.census.gov/data/2010/sf1?get=P0010001&for=block+group:*&in=state:02+county:170 # let's use the county generator for county in counties(variables): geo = {'for':'block group:*', 'in':'state:{state} county:{county}'.format(state=county['state'], county=county['county']) } for block_group in c.sf1.get(variables, geo): yield block_group def blocks(variables='NAME'): # http://api.census.gov/data/2010/sf1?get=P0010001&for=block:*&in=state:02+county:290+tract:00100 # make use of the tract generator for tract in tracts(variables): geo={'for':'block:*', 'in':'state:{state} county:{county} tract:{tract}'.format(state=tract['state'], county=tract['county'], tract=tract['tract']) } for block in c.sf1.get(variables, geo): yield block def csas(variables="NAME"): # http://api.census.gov/data/2010/sf1?get=P0010001&for=combined+statistical+area:*&in=state:24 for state in us.STATES: geo = {'for':'combined statistical area:*', 'in':'state:{state_fips}'.format(state_fips=state.fips) } for csa in c.sf1.get(variables, geo=geo): yield csa def districts(variables="NAME"): # http://api.census.gov/data/2010/sf1?get=P0010001&for=congressional+district:*&in=state:24 for state in us.STATES: geo = {'for':'congressional district:*', 'in':'state:{state_fips}'.format(state_fips=state.fips) } for district in c.sf1.get(variables, geo=geo): yield district def zip_code_tabulation_areas(variables="NAME"): # http://api.census.gov/data/2010/sf1?get=P0010001&for=zip+code+tabulation+area:*&in=state:02 for state in us.STATES: geo = {'for':'zip code tabulation area:*', 'in':'state:{state_fips}'.format(state_fips=state.fips) } for zip_code_tabulation_area in c.sf1.get(variables, geo=geo): yield zip_code_tabulation_area def census_labels(prefix='P005', n0=1, n1=17, field_width=4, include_name=True, join=False): """convenience function to generate census labels""" label_format = "{i:0%dd}" % (field_width) variables = [prefix + label_format.format(i=i) for i in xrange(n0,n1+1)] if include_name: variables = ['NAME'] + variables if join: return ",".join(variables) else: return variables def rdot_labels(other=True): if other: return ['White', 'Black', 'Asian', 'Hispanic', 'Other'] else: return ['White', 'Black', 'Asian', 'Hispanic'] FINAL_LABELS = ['NAME', 'Total'] + rdot_labels() + ['p_White', 'p_Black', 'p_Asian', 'p_Hispanic', 'p_Other'] + ['entropy5', 'entropy4', 'entropy_rice', 'gini_simpson'] def convert_to_rdotmap(row): """takes the P005 variables and maps to a series with White, Black, Asian, Hispanic, Other Total""" return pd.Series({'Total':row['P0050001'], 'White':row['P0050003'], 'Black':row['P0050004'], 'Asian':row['P0050006'], 'Hispanic':row['P0050010'], 'Other': row['P0050005'] + row['P0050007'] + row['P0050008'] + row['P0050009'], }, index=['Total', 'White', 'Black', 'Hispanic', 'Asian', 'Other']) def normalize(s): """take a Series and divide each item by the sum so that the new series adds up to 1.0""" total = np.sum(s) return s.astype('float') / total def normalize_relabel(s): """take a Series and divide each item by the sum so that the new series adds up to 1.0 Also relabel the indices by adding p_ prefix""" total = np.sum(s) new_index = list(Series(s.index).apply(lambda x: "p_"+x)) return Series(list(s.astype('float') / total),new_index) def entropy(series): """Normalized Shannon Index""" # a series in which all the entries are equal should result in normalized entropy of 1.0 # eliminate 0s series1 = series[series!=0] # if len(series) < 2 (i.e., 0 or 1) then return 0 if len(series1) > 1: # calculate the maximum possible entropy for given length of input series max_s = -np.log(1.0/len(series)) total = float(sum(series1)) p = series1.astype('float')/float(total) return sum(-p*np.log(p))/max_s else: return 0.0 def gini_simpson(s): # https://en.wikipedia.org/wiki/Diversity_index#Gini.E2.80.93Simpson_index s1 = normalize(s) return 1-np.sum(s1*s1) def entropy_rice(series): """hard code how Rice U did calculation This function takes the entropy5 calculation and removes the contribution from 'Other' """ # pass in a Series with # 'Asian','Black','Hispanic','White','Other' # http://kinder.rice.edu/uploadedFiles/Urban_Research_Center/Media/Houston%20Region%20Grows%20More%20Ethnically%20Diverse%202-13.pdf s0 = normalize(series) s_other = s0['Other']*np.log(s0['Other']) if s0['Other'] > 0 else 0.0 return (np.log(0.2)*entropy(series) - s_other)/np.log(0.25) def diversity(df): """Takes a df with the P005 variables and does entropy calculation""" # convert populations to int df[census_labels(include_name=False)] = df[census_labels(include_name=False)].astype('int') df = pd.concat((df, df.apply(convert_to_rdotmap, axis=1)),axis=1) df = pd.concat((df,df[rdot_labels()].apply(normalize_relabel,axis=1)), axis=1) df['entropy5'] = df.apply(lambda x:entropy(x[rdot_labels()]), axis=1) df['entropy4'] = df.apply(lambda x:entropy(x[rdot_labels(other=False)]), axis=1) df['entropy_rice'] = df.apply(lambda x:entropy_rice(x[rdot_labels()]), axis=1) df['gini_simpson'] = df.apply(lambda x:gini_simpson(x[rdot_labels()]), axis=1) return df # grab states, convert populations to int states_df = DataFrame(list(states(census_labels()))) states_df = diversity(states_df) states_df[FINAL_LABELS].head() states_df.sort_index(by='entropy5', ascending=False)[FINAL_LABELS].head() r = list(counties(census_labels())) counties_df = DataFrame(r) counties_df = diversity(counties_df) counties_df[FINAL_LABELS].head() counties_df.sort_index(by='entropy5', ascending=False)[FINAL_LABELS].head() # msas r = list(msas(census_labels())) len(r) df=DataFrame(r) df[census_labels(include_name=False)] = df[census_labels(include_name=False)].astype('int') msas_grouped = df.groupby('metropolitan statistical area/micropolitan statistical area') #df1 = msas_grouped.apply(lambda x:Series((list(x['NAME']), sum(x['P0050001'])), index=['msas','total_pop'])).sort_index(by='total_pop', ascending=False) df1 = msas_grouped.apply(lambda x:Series((list(x['NAME']), ), index=['msas'])) df2 = msas_grouped.sum() df3 = pd.concat((df1,df2), axis=1) df3['NAME'] = df3.apply(lambda x: "; ".join(x['msas']), axis=1) msas_df = diversity(df3) # grab the ten most populous msas and sort by entropy_rice msas_df.sort_index(by='Total', ascending=False)[:10].sort_index(by='entropy_rice', ascending=False)[FINAL_LABELS] # Testing code def to_unicode(vals): return [unicode(v) for v in vals] def test_msas_df(msas_df): min_set_of_columns = set(['Asian','Black','Hispanic', 'Other', 'Total', 'White', 'entropy4', 'entropy5', 'entropy_rice', 'gini_simpson','p_Asian', 'p_Black', 'p_Hispanic', 'p_Other','p_White']) assert min_set_of_columns & set(msas_df.columns) == min_set_of_columns # https://www.census.gov/geo/maps-data/data/tallies/national_geo_tallies.html # 366 metro areas # 576 micropolitan areas assert len(msas_df) == 942 # total number of people in metro/micro areas assert msas_df.Total.sum() == 289261315 assert msas_df['White'].sum() == 180912856 assert msas_df['Other'].sum() == 8540181 # list of msas in descendng order by entropy_rice top_10_metros = msas_df.sort_index(by='Total', ascending=False)[:10] msa_codes_in_top_10_pop_sorted_by_entropy_rice = list(top_10_metros.sort_index(by='entropy_rice', ascending=False).index) assert to_unicode(msa_codes_in_top_10_pop_sorted_by_entropy_rice)== [u'26420', u'35620', u'47900', u'31100', u'19100', u'33100', u'16980', u'12060', u'37980', u'14460'] top_10_metro = msas_df.sort_index(by='Total', ascending=False)[:10] list(top_10_metro.sort_index(by='entropy_rice', ascending=False)['entropy5']) np.testing.assert_allclose(top_10_metro.sort_index(by='entropy_rice', ascending=False)['entropy5'], [0.79628076626851163, 0.80528601550164602, 0.80809418318973791, 0.7980698349711991, 0.75945930510650161, 0.74913610558765376, 0.73683277781032397, 0.72964862063970914, 0.64082509648457675, 0.55697288400004963]) np.testing.assert_allclose(top_10_metro.sort_index(by='entropy_rice', ascending=False)['entropy_rice'], [0.87361766576115552, 0.87272877244078051, 0.85931803868749834, 0.85508015237749468, 0.82169723530719896, 0.81953527301129059, 0.80589423784325431, 0.78602596561378812, 0.68611350427640316, 0.56978827050565117]) # you are on the right track if test_msas_df doesn't complain test_msas_df(msas_df) # code to save your dataframe to a CSV # upload the CSV to bCourses # uncomment to run # msas_df.to_csv("msas_2010.csv", encoding="UTF-8") # load back the CSV and test again # df = DataFrame.from_csv("msas_2010.csv", encoding="UTF-8") # test_msas_df(df) all_categories = census_labels('P005',2,10, include_name=False) + \ census_labels('P005',11,17, include_name=False) all_categories msas_df['entropy_all'] = msas_df.apply(lambda x:entropy(x[all_categories]), axis=1) msas_df.sort_index(by='entropy_all', ascending=False)[FINAL_LABELS + ['entropy_all']][:20] msas_df.sort_index(by='P0050001', ascending=False).head() top_10_metros = msas_df.sort_index(by='Total', ascending=False)[:10] top_10_metros['City'] = top_10_metros['NAME'].apply(lambda name: name.split('-')[0]) top_10_metros.sort(columns=['entropy_rice'], inplace=True, ascending=True) cities = pd.Series(top_10_metros['City']) diversity = pd.Series(top_10_metros['entropy_rice']) p_white = pd.Series(top_10_metros['p_White']) p_asian = pd.Series(top_10_metros['p_Asian']) p_black = pd.Series(top_10_metros['p_Black']) p_latino = pd.Series(top_10_metros['p_Hispanic']) import matplotlib.pyplot as plt fig = plt.figure(figsize=(10, 8)) ax = plt.subplot(111) # y axis locations for diversity and races y_div = np.arange(len(cities))*2 y_race = (np.arange(len(cities))*2)+1 # diversity bars pDiversity = ax.barh(y_div, diversity, alpha=0.4) # stacked horizontal bars pWhite = ax.barh(y_race, p_white, color='b') pLatino = ax.barh(y_race, p_latino, color='g', left=p_white) pBlack = ax.barh(y_race, p_black, color='r', left=p_white+p_latino) pAsian = ax.barh(y_race, p_asian, color='c', left=p_white+p_latino+p_black) plt.yticks(y_race, cities) # legend foo https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot # Shink current axis's height by 10% on the bottom box = ax.get_position() ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.85]) # Put a legend below current axis ax.legend((pWhite, pLatino, pBlack, pAsian, pDiversity), ('White', 'Latino', 'Black', 'Asian', 'Diversity'), loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5) plt.show() # If you want to save it