Goals¶

Learn about how to use the Census variables around Hispanic origin to calculate quantities around diversity (remembering the Racial Dot Map as our framing example)

In [1]:

%pylab --no-import-all inline

Populating the interactive namespace from numpy and matplotlib

In [2]:

import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd

from itertools import islice

In [3]:

import census
import us

import settings

The census documentation has example URLs but needs your API key to work. In this notebook, we'll use the IPython notebook HTML display mechanism to help out.

In [4]:

# API client used by the generators below; the key is read from the local settings module
c = census.Census(key=settings.CENSUS_KEY)

In [5]:

# generators for the various census geographic entities of interest

def states(variables='NAME'):
    """Yield one SF1 record for each entry of ``us.states.STATES``.

    ``variables`` is handed unchanged to ``c.sf1.get``. The query asks for
    ``state:*``; records whose ``state`` code is not the FIPS code of an
    entry in ``us.states.STATES`` are dropped.
    """
    known_fips = {s.fips for s in us.states.STATES}
    all_rows = c.sf1.get(variables, geo={'for': 'state:*'})
    for row in all_rows:
        # the wildcard query can return entries that are not in us.states.STATES
        if row['state'] not in known_fips:
            continue
        yield row
            
def counties(variables='NAME'):
    """Yield SF1 county records, fetched for all states with a single request.

    ``variables`` is handed unchanged to ``c.sf1.get``. Counties whose
    ``state`` code is not the FIPS code of an entry in ``us.states.STATES``
    are skipped.
    """
    wanted_states = {st.fips for st in us.states.STATES}
    everywhere = {'for': 'county:*', 'in': 'state:*'}
    for row in c.sf1.get(variables, geo=everywhere):
        # keep only counties that belong to one of the listed states
        if row['state'] in wanted_states:
            yield row
        

def counties2(variables='NAME'):
    """Yield SF1 county records by issuing one request per state.

    ``counties`` fetches every county in a single call; this variant exists
    to demonstrate walking through ``us.states.STATES`` to reach the
    counties of each state in turn.
    """
    for st in us.states.STATES:
        in_this_state = {'for': 'county:*',
                         'in': 'state:{0}'.format(st.fips)}
        for row in c.sf1.get(variables, geo=in_this_state):
            yield row

            
def tracts(variables='NAME'):
    """Yield SF1 census-tract records, walking state -> county -> tract.

    For each entry of ``us.states.STATES`` the counties are listed first
    (requesting only ``NAME``); ``variables`` is then requested for all
    tracts of each of those counties.
    """
    for st in us.states.STATES:
        county_query = {'for': 'county:*',
                        'in': 'state:{0}'.format(st.fips)}

        for county_row in c.sf1.get('NAME', geo=county_query):
            tract_query = {
                'for': 'tract:*',
                'in': 'state:{0} county:{1}'.format(st.fips, county_row['county']),
            }

            for tract_row in c.sf1.get(variables, geo=tract_query):
                yield tract_row

In [6]:

def block_groups(variables='NAME'):
    """Yield SF1 block-group records for every county produced by ``counties``.

    ``variables`` is handed to ``c.sf1.get`` for the block-group queries.
    The parent counties are only needed for their ``state`` and ``county``
    codes, so they are requested with just ``NAME`` (as ``tracts`` does)
    rather than with the caller's full variable list.
    """
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=block+group:*&in=state:02+county:170
    for county in counties('NAME'):
        geo = {'for': 'block group:*',
               'in': 'state:{state} county:{county}'.format(state=county['state'],
                                                            county=county['county'])
               }
        # pass geo by keyword, like the other generators in this notebook
        for block_group in c.sf1.get(variables, geo=geo):
            yield block_group
    
    
def blocks(variables='NAME'):
    """Yield SF1 block records for every tract produced by ``tracts``.

    ``variables`` is handed to ``c.sf1.get`` for the block queries. The
    parent tracts are only needed for their ``state``, ``county`` and
    ``tract`` codes, so they are requested with just ``NAME`` rather than
    with the caller's full variable list.
    """
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=block:*&in=state:02+county:290+tract:00100
    for tract in tracts('NAME'):
        geo = {'for': 'block:*',
               'in': 'state:{state} county:{county} tract:{tract}'.format(state=tract['state'],
                                                                         county=tract['county'],
                                                                         tract=tract['tract'])
               }
        # pass geo by keyword, like the other generators in this notebook
        for block in c.sf1.get(variables, geo=geo):
            yield block
        
       

In [7]:

# msa, csas, districts, zip_codes

def msas(variables="NAME"):
    """Yield SF1 records for metropolitan/micropolitan statistical areas.

    One request is made per entry of ``us.STATES``. Each record carries a
    ``state`` key, so the same area code may appear under more than one
    state.
    """
    for st in us.STATES:
        query = {
            'for': 'metropolitan statistical area/micropolitan statistical area:*',
            'in': 'state:{0}'.format(st.fips),
        }
        for row in c.sf1.get(variables, geo=query):
            yield row

def csas(variables="NAME"):
    """Yield SF1 records for combined statistical areas, one request per state.

    ``variables`` is handed unchanged to ``c.sf1.get``; the states walked
    are the entries of ``us.STATES``.
    """
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=combined+statistical+area:*&in=state:24
    for st in us.STATES:
        query = {
            'for': 'combined statistical area:*',
            'in': 'state:{0}'.format(st.fips),
        }
        for row in c.sf1.get(variables, geo=query):
            yield row

def districts(variables="NAME"):
    """Yield SF1 records for congressional districts, one request per state.

    ``variables`` is handed unchanged to ``c.sf1.get``; the states walked
    are the entries of ``us.STATES``.
    """
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=congressional+district:*&in=state:24
    for st in us.STATES:
        query = {
            'for': 'congressional district:*',
            'in': 'state:{0}'.format(st.fips),
        }
        for row in c.sf1.get(variables, geo=query):
            yield row
            
def zip_code_tabulation_areas(variables="NAME"):
    """Yield SF1 records for ZIP code tabulation areas, one request per state.

    ``variables`` is handed unchanged to ``c.sf1.get``; the states walked
    are the entries of ``us.STATES``.
    """
    # example query:
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=zip+code+tabulation+area:*&in=state:02
    for st in us.STATES:
        query = {
            'for': 'zip code tabulation area:*',
            'in': 'state:{0}'.format(st.fips),
        }
        for row in c.sf1.get(variables, geo=query):
            yield row

In [8]:

list(islice(msas(), 1))

Out[8]:

[{u'NAME': u'Albertville, AL Micro Area',
  u'metropolitan statistical area/micropolitan statistical area': u'10700',
  u'state': u'01'}]

In [9]:

list(islice(csas(), 1))

Out[9]:

[{u'NAME': u'Atlanta-Sandy Springs-Gainesville, GA-AL CSA (part)',
  u'combined statistical area': u'122',
  u'state': u'01'}]

In [10]:

districts_list = list(islice(districts(), 1))
districts_list

Out[10]:

[{u'NAME': u'Congressional District 1',
  u'congressional district': u'01',
  u'state': u'01'}]

In [11]:

list(islice(zip_code_tabulation_areas(), 1))

Out[11]:

[{u'NAME': u'ZCTA5 30165 (part)',
  u'state': u'01',
  u'zip code tabulation area': u'30165'}]

Note: There are definitely improvements to be made in these generators. One of the most important would be to limit the generators to specific geographies -- typically, we don't want all the blocks in the country but only the ones in a specific area. A good exercise would be to rewrite our generators to allow for a limited geography.

We can compare the total number of tracts we calculate to:

https://www.census.gov/geo/maps-data/data/tallies/tractblock.html

and

https://www.census.gov/geo/maps-data/data/docs/geo_tallies/Tract_Block2010.txt

Hispanic or Latino Origin and Racial Subcategories¶

http://www.census.gov/developers/data/sf1.xml

compare to http://www.census.gov/prod/cen2010/briefs/c2010br-02.pdf

I think the P0050001 might be the key category

P0010001 = P0050001
P0050001 = P0050002 + P0050010

P0050002 Not Hispanic or Latino (total) =

P0050003 Not Hispanic White only
P0050004 Not Hispanic Black only
P0050006 Not Hispanic Asian only
Not Hispanic Other (should also be P0050002 - (P0050003 + P0050004 + P0050006))
- P0050005 Not Hispanic: American Indian/ American Indian and Alaska Native alone
- P0050007 Not Hispanic: Native Hawaiian and Other Pacific Islander alone
- P0050008 Not Hispanic: Some Other Race alone
- P0050009 Not Hispanic: Two or More Races
P0050010 Hispanic or Latino

P0050010 = P0050011...P0050017

From Hispanic and Latino Americans (Wikipedia):

While the two terms are sometimes used interchangeably, Hispanic is a narrower term which mostly refers to persons of Spanish speaking origin or ancestry, while Latino is more frequently used to refer more generally to anyone of Latin American origin or ancestry, including Brazilians.

and

The Census Bureau's 2010 census does provide a definition of the terms Latino or Hispanic and is as follows: “Hispanic or Latino” refers to a person of Cuban, Mexican, Puerto Rican, South or Central American, or other Spanish culture or origin regardless of race. It allows respondents to self-define whether they were Latino or Hispanic and then identify their specific country or place of origin.[52] On its website, the Census Bureau defines "Hispanic" or "Latino" persons as being "persons who trace their origin [to]... Spanish speaking Central and South America countries, and other Spanish cultures".

In the Racial Dot Map: "Whites are coded as blue; African-Americans, green; Asians, red; Hispanics, orange; and all other racial categories are coded as brown."

In this notebook, we will relate the Racial Dot Map 5-category scheme to the P005* variables.

In [12]:

# let's get the total population -- tabulated in two variables: P0010001, P0050001
# P0050002 Not Hispanic or Latino (total) 
# P0050010 Hispanic or Latino

r = list(states(('NAME','P0010001','P0050001','P0050002','P0050010')))
r[:5]

Out[12]:

[{u'NAME': u'Alabama',
  u'P0010001': u'4779736',
  u'P0050001': u'4779736',
  u'P0050002': u'4594134',
  u'P0050010': u'185602',
  u'state': u'01'},
 {u'NAME': u'Alaska',
  u'P0010001': u'710231',
  u'P0050001': u'710231',
  u'P0050002': u'670982',
  u'P0050010': u'39249',
  u'state': u'02'},
 {u'NAME': u'Arizona',
  u'P0010001': u'6392017',
  u'P0050001': u'6392017',
  u'P0050002': u'4496868',
  u'P0050010': u'1895149',
  u'state': u'04'},
 {u'NAME': u'Arkansas',
  u'P0010001': u'2915918',
  u'P0050001': u'2915918',
  u'P0050002': u'2729868',
  u'P0050010': u'186050',
  u'state': u'05'},
 {u'NAME': u'California',
  u'P0010001': u'37253956',
  u'P0050001': u'37253956',
  u'P0050002': u'23240237',
  u'P0050010': u'14013719',
  u'state': u'06'}]

In [13]:

# Hispanic/Latino origin vs not-Hispanic/Latino
# Compare with http://www.census.gov/prod/cen2010/briefs/c2010br-02.pdf Table 1
# Hispanic/Latino: 50477594
# non-Hispanic/Latino: 258267944

df=DataFrame(r)
df[['P0010001', 'P0050001','P0050002','P0050010']] = \
    df[['P0010001', 'P0050001','P0050002','P0050010']].astype('int')
df[['P0010001', 'P0050001', 'P0050002', 'P0050010']].sum()

Out[13]:

P0010001    308745538
P0050001    308745538
P0050002    258267944
P0050010     50477594
dtype: int64

In [14]:

# is the total Hispanic/Latino population and non-Hispanic populations the same as reported in 
# http://www.census.gov/prod/cen2010/briefs/c2010br-02.pdf Table 1
(df['P0050010'].sum() == 50477594,
 df['P0050002'].sum() == 258267944)

Out[14]:

(True, True)

In [15]:

# How about the non-Hispanic/Latino White only category?
# P0050003
# total should be 196817552

df = DataFrame(list(states('NAME,P0050003')))
df['P0050003'] = df['P0050003'].astype('int')
df.P0050003.sum()

Out[15]:

196817552

Converting to Racial Dot Map Categories¶

SUGGESTED EXERCISE: write a function convert_to_rdotmap(row) that takes an input Python dict that has the keys: * NAME * P0050001, P0050002, ..., P0050016, P0050017

and that returns a Pandas Series with the following columns:

* Total
* White
* Black
* Asian
* Hispanic
* Other
* Name  (note lowercase)

that correspond to those used in the Racial Dot Map.

Also write a function convert_P005_to_int(df) that converts all the P005* columns to int

In [16]:

# Use a little convenience function to calculate the variable names to be used

def P005_range(n0,n1):
    """Return the SF1 variable names ``P005`` + zero-padded ``i`` for ``n0 <= i < n1``.

    Each number is padded to four digits, so ``P005_range(1, 3)`` gives
    ``('P0050001', 'P0050002')``. An empty range gives an empty tuple.
    """
    # range (rather than xrange) so the helper runs on both Python 2 and Python 3
    return tuple('P005' + "{i:04d}".format(i=i) for i in range(n0, n1))

# the end of the range is exclusive, so this covers P0050001 .. P0050017
P005_vars = P005_range(1,18)
# the same names joined into one comma-separated string
P005_vars_str = ",".join(P005_vars)
# NAME first, followed by the P005 variable names
P005_vars_with_name = ['NAME'] + list(P005_vars)

P005_vars_with_name

Out[16]:

['NAME',
 'P0050001',
 'P0050002',
 'P0050003',
 'P0050004',
 'P0050005',
 'P0050006',
 'P0050007',
 'P0050008',
 'P0050009',
 'P0050010',
 'P0050011',
 'P0050012',
 'P0050013',
 'P0050014',
 'P0050015',
 'P0050016',
 'P0050017']

In [17]:

# HAVE YOU TRIED THE EXERCISE....IF NOT....TRY IT....HERE'S ONE POSSIBLE ANSWER# 

# http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/#create

def convert_P005_to_int(df):
    """Cast every column named in ``P005_vars`` to int and return the frame.

    The columns are overwritten on ``df`` itself, so the caller's frame is
    modified; the same object is returned for convenience.
    """
    p005_columns = list(P005_vars)
    df[p005_columns] = df[p005_columns].astype('int')
    return df

def convert_to_rdotmap(row):
    """Collapse the P005 values of one record into the Racial Dot Map categories.

    ``row`` must support lookup by the keys ``NAME``, ``P0050001`` and
    ``P0050003`` .. ``P0050010``. The result is a Series indexed, in this
    order, by Name, Total, White, Black, Hispanic, Asian, Other, where
    Other adds up P0050005, P0050007, P0050008 and P0050009.
    """
    column_order = ['Name', 'Total', 'White', 'Black', 'Hispanic', 'Asian', 'Other']
    categories = {
        'Total': row['P0050001'],
        'White': row['P0050003'],
        'Black': row['P0050004'],
        'Asian': row['P0050006'],
        'Hispanic': row['P0050010'],
        # the non-Hispanic groups that get no category of their own
        'Other': (row['P0050005'] + row['P0050007']
                  + row['P0050008'] + row['P0050009']),
        'Name': row['NAME'],
    }
    return pd.Series(categories, index=column_order)

In [18]:

from census import Census

import settings
from settings import CENSUS_KEY

import time
from itertools import islice

def P005_range(n0,n1):
    """Return the SF1 variable names ``P005`` + zero-padded ``i`` for ``n0 <= i < n1``.

    Each number is padded to four digits, so ``P005_range(1, 3)`` gives
    ``('P0050001', 'P0050002')``. An empty range gives an empty tuple.
    """
    # range (rather than xrange) so the helper runs on both Python 2 and Python 3
    return tuple('P005' + "{i:04d}".format(i=i) for i in range(n0, n1))

# the end of the range is exclusive, so this covers P0050001 .. P0050017
P005_vars = P005_range(1,18)
# the same names joined into one comma-separated string
P005_vars_str = ",".join(P005_vars)


# http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/#create
def convert_to_rdotmap(row):
    """Collapse the P005 values of one record into the Racial Dot Map categories.

    ``row`` must support lookup by the keys ``NAME``, ``P0050001`` and
    ``P0050003`` .. ``P0050010``. The result is a Series indexed, in this
    order, by Name, Total, White, Black, Hispanic, Asian, Other, where
    Other adds up P0050005, P0050007, P0050008 and P0050009.
    """
    column_order = ['Name', 'Total', 'White', 'Black', 'Hispanic', 'Asian', 'Other']
    categories = {
        'Total': row['P0050001'],
        'White': row['P0050003'],
        'Black': row['P0050004'],
        'Asian': row['P0050006'],
        'Hispanic': row['P0050010'],
        # the non-Hispanic groups that get no category of their own
        'Other': (row['P0050005'] + row['P0050007']
                  + row['P0050008'] + row['P0050009']),
        'Name': row['NAME'],
    }
    return pd.Series(categories, index=column_order)


def normalize(s):
    """Return ``s`` as floats divided by its sum, so the entries add up to 1.0."""
    grand_total = np.sum(s)
    as_float = s.astype('float')
    return as_float / grand_total


def entropy(series):
    """Normalized Shannon index of the counts in ``series``.

    The entropy of the non-zero entries (taken as shares of their sum) is
    divided by the largest entropy possible for ``len(series)`` categories,
    so a series whose entries are all equal scores 1.0. Zero entries are
    left out of the entropy sum but still count towards ``len(series)``.
    A series with fewer than two entries scores 0.0.
    """
    # zero counts are dropped before taking logarithms
    nonzero = series[series != 0]

    n_categories = len(series)
    if n_categories <= 1:
        return 0.0

    # entropy of a perfectly even split over n_categories
    max_entropy = -np.log(1.0 / n_categories)

    shares = nonzero.astype('float') / float(sum(nonzero))
    return sum(-shares * np.log(shares)) / max_entropy

    
def convert_P005_to_int(df):
    """Cast every column named in ``P005_vars`` to int and return the frame.

    The columns are overwritten on ``df`` itself, so the caller's frame is
    modified; the same object is returned for convenience.
    """
    p005_columns = list(P005_vars)
    df[p005_columns] = df[p005_columns].astype('int')
    return df
    

def diversity(r):
    """Build a Racial Dot Map table with entropy scores from raw API records.

    ``r`` is whatever ``DataFrame(r)`` accepts (the notebook passes a list
    of record dicts) and must provide ``NAME`` plus the ``P005_vars``
    columns. Returns a DataFrame with the columns Name, Total, White, Black,
    Hispanic, Asian, Other, plus:

    * entropy5 -- ``entropy`` over Asian, Black, Hispanic, White and Other
    * entropy4 -- ``entropy`` over Asian, Black, Hispanic and White
    """
    counts = convert_P005_to_int(DataFrame(r))
    rdotmap = counts.apply(convert_to_rdotmap, axis=1)

    five_groups = ['Asian', 'Black', 'Hispanic', 'White', 'Other']
    four_groups = five_groups[:-1]  # same groups without 'Other'
    rdotmap['entropy5'] = rdotmap[five_groups].apply(entropy, axis=1)
    rdotmap['entropy4'] = rdotmap[four_groups].apply(entropy, axis=1)
    return rdotmap

In [19]:

# states

r=list(states(P005_vars_with_name))
diversity(r)

Out[19]:

	Name	Total	White	Black	Hispanic	Asian	Other	entropy5	entropy4
0	Alabama	4779736	3204402	1244437	185602	52937	92358	0.541001	0.570292
1	Alaska	710231	455320	21949	39249	37459	156254	0.646677	0.475235
2	Arizona	6392017	3695647	239101	1895149	170509	391611	0.663524	0.643529
3	Arkansas	2915918	2173469	447102	186050	35647	73650	0.515025	0.526205
4	California	37253956	14956253	2163804	14013719	4775070	1345110	0.796994	0.843670
5	Colorado	5029196	3520793	188778	1038687	135564	145374	0.558232	0.570130
6	Connecticut	3574097	2546262	335119	479087	134091	79538	0.584509	0.615330
7	Delaware	897934	586752	186782	73221	28308	22871	0.628490	0.660917
8	District of Columbia	601723	209464	301053	54749	20818	15639	0.710288	0.757369
9	Florida	18801310	10884722	2851100	4223806	445216	396466	0.688393	0.741076
10	Georgia	9687653	5413920	2910800	853689	311692	197552	0.677545	0.729666
11	Hawaii	1360301	309343	19904	120842	513294	396918	0.833108	0.750762
12	Idaho	1567582	1316243	8875	175901	18529	48034	0.360829	0.330227
13	Illinois	12830632	8167753	1832924	2027578	580586	221791	0.663131	0.719347
14	Indiana	6483802	5286453	582140	389707	101444	124058	0.430342	0.439752
15	Iowa	3046355	2701123	86906	151544	52597	54185	0.310137	0.300998
16	Kansas	2853118	2230539	162700	300042	66967	92870	0.492215	0.483675
17	Kentucky	4339367	3745655	333075	132836	48338	79463	0.344293	0.340010
18	Louisiana	4533372	2734884	1442420	192560	69327	94181	0.588919	0.623788
19	Maine	1328361	1254297	15154	16935	13442	28533	0.180061	0.137155
20	Maryland	5773552	3157958	1674229	470632	316694	154039	0.714090	0.760596
21	Massachusetts	6547629	4984800	391693	627654	347495	195987	0.535423	0.540767
22	Michigan	9883640	7569939	1383756	436358	236490	257097	0.498010	0.504299
23	Minnesota	5303925	4405142	269141	250258	212996	166388	0.427024	0.407947
24	Mississippi	2967297	1722287	1093512	81481	25477	44540	0.550642	0.591949
25	Missouri	5988927	4850748	687149	212470	97221	141339	0.430525	0.429356
26	Montana	989415	868628	3743	28565	6138	82341	0.295872	0.149198
27	Nebraska	1826341	1499753	80959	167405	31919	46305	0.424281	0.417907
28	Nevada	2700551	1462081	208058	716501	191047	122864	0.751622	0.774363
29	New Hampshire	1316470	1215050	13625	36704	28241	22850	0.232308	0.210183
30	New Jersey	8791894	5214878	1125401	1555144	719827	176644	0.722462	0.783517
31	New Mexico	2059179	833810	35462	953403	26305	210199	0.671781	0.603770
32	New York	19378102	11304247	2783857	3416922	1406194	466882	0.732727	0.787727
33	North Carolina	9535483	6223995	2019854	800120	206579	284935	0.623233	0.645955
34	North Dakota	672591	598007	7720	13467	6839	46558	0.289289	0.165826
35	Ohio	11536504	9359263	1389115	354674	190765	242687	0.422934	0.426370
36	Oklahoma	3751351	2575381	272071	332007	64154	507738	0.623426	0.506346
37	Oregon	3831074	3005848	64984	450062	139436	170744	0.478609	0.444008
38	Pennsylvania	12702379	10094652	1327091	719660	346288	214688	0.465015	0.486249
39	Rhode Island	1052567	803685	51560	130655	29988	36679	0.516377	0.508129
40	South Carolina	4625364	2962740	1279998	235682	58307	88637	0.573768	0.609445
41	South Dakota	814180	689502	9959	22119	7553	85047	0.355383	0.191061
42	Tennessee	6346105	4800782	1049391	290059	90311	115562	0.486619	0.508575
43	Texas	25145561	11397345	2886825	9460921	948426	452044	0.727466	0.793870
44	Utah	2763885	2221719	25951	358340	54176	103699	0.425283	0.393087
45	Vermont	625741	590223	5943	9208	7875	12492	0.183061	0.144800
46	Virginia	8001024	5186450	1523704	631825	436298	222747	0.655915	0.688954
47	Washington	6724540	4876804	229603	755790	475634	386709	0.587508	0.555274
48	West Virginia	1852994	1726256	62122	22268	12285	30063	0.206960	0.183409
49	Wisconsin	5686986	4738411	350898	336056	128052	133569	0.412929	0.408698
50	Wyoming	563626	483874	4351	50231	4279	20891	0.337501	0.288172

51 rows × 9 columns

In [20]:

# counties

r = list(counties(P005_vars_with_name))

In [21]:

df2 = diversity(r)

In [22]:

# rank the counties from highest to lowest five-category entropy
# NOTE(review): newer pandas releases spell this df2.sort_values('entropy5', ascending=False) -- confirm against the installed version
df2.sort_index(by='entropy5',ascending=False)

Out[22]:

	Name	Total	White	Black	Hispanic	Asian	Other	entropy5	entropy4
1868	Queens County	2230722	616727	395881	613750	508334	96030	0.925644	0.989171
68	Aleutians West Census Area	5561	1745	318	726	1575	1197	0.920216	0.882623
186	Alameda County	1510271	514559	184126	339889	390524	81173	0.910834	0.957875
233	Solano County	413344	168628	58743	99356	59027	27590	0.897416	0.926901
67	Aleutians East Borough	3141	425	212	385	1113	1006	0.896064	0.864996
2601	Fort Bend County	585375	211680	123267	138967	98762	12699	0.882673	0.970379
1851	Kings County	2504700	893306	799066	496285	260129	55914	0.853105	0.934130
219	Sacramento County	1418788	687166	139949	306196	198944	86533	0.842896	0.865689
453	Gwinnett County	805321	354316	184122	162035	84763	20085	0.838965	0.912596
550	Maui County	154834	49193	818	15710	43384	45729	0.833108	0.751354
546	Hawaii County	185079	57831	899	21383	39588	65378	0.830209	0.765941
192	Contra Costa County	1049025	500923	93604	255560	148881	50057	0.829415	0.865931
1207	Montgomery County	971777	478765	161689	165398	134677	31248	0.828220	0.887528
224	San Joaquin County	685306	245919	48540	266341	94547	29959	0.828052	0.869824
1782	Hudson County	634266	195510	71315	267853	83825	15763	0.827493	0.899094
1229	Suffolk County	722023	346979	142980	143455	58963	29646	0.826117	0.871302
549	Kauai County	67091	20611	258	6315	20296	19611	0.824890	0.737361
2936	Manassas Park city	14273	6070	1784	4645	1261	513	0.821891	0.873913
2892	Prince William County	402002	195656	78492	81460	29986	16408	0.818339	0.862245
547	Honolulu County	953207	181684	17929	77433	410019	266142	0.816249	0.722054
223	San Francisco County	805235	337451	46781	121774	265700	33529	0.816230	0.858482
226	San Mateo County	718451	303609	18763	182502	175934	37643	0.811677	0.837993
1967	Robeson County	134168	36160	32347	10932	971	53758	0.809458	0.757652
1936	Hoke County	46952	19142	15392	5823	467	6128	0.808919	0.758726
1858	New York County	1585873	761493	205340	403577	177624	37839	0.807452	0.877051
228	Santa Clara County	1781642	626909	42331	479210	565466	67726	0.806568	0.852239
1780	Essex County	783969	260177	308358	159117	35292	21025	0.803622	0.867143
1748	Clark County	1951269	935955	194821	568644	165121	86728	0.800982	0.835903
1785	Middlesex County	809858	398724	71557	148975	172534	18068	0.800874	0.872133
610	Cook County	5194675	2278358	1265778	1244762	318869	86908	0.800036	0.882240
2579	Dallas County	2368139	784693	518732	905940	117797	40977	0.798837	0.879632
204	Los Angeles County	9818605	2728321	815086	4687889	1325671	261638	0.796781	0.859287
2623	Harris County	4092459	1349646	754258	1671540	249853	67162	0.796176	0.878354
367	Orange County	1145956	526754	223200	308244	55541	32217	0.792964	0.852138
325	Broward County	1748066	760817	449677	438247	55692	43633	0.782936	0.845847
992	Wyandotte County	157505	68170	39046	41633	3886	4770	0.781122	0.834209
2848	Fairfax County	1081726	590622	96078	168482	188737	37807	0.780967	0.826231
1793	Union County	536499	243312	111705	146704	24496	10282	0.780779	0.854469
2294	Philadelphia County	1526006	562585	644287	187611	95521	36002	0.777585	0.842034
2935	Manassas city	37821	17994	4905	11876	1861	1185	0.769676	0.818704
2536	Bell County	310235	157289	63380	67010	8350	14206	0.769433	0.795595
2915	Alexandria city	139966	74878	29778	22524	8351	4435	0.767634	0.815651
1921	Durham County	267587	112697	100260	36077	12180	6373	0.765370	0.827083
222	San Diego County	3095313	1500047	146600	991348	328058	129260	0.764654	0.795817
1784	Mercer County	366513	199909	71378	55318	32545	7363	0.763097	0.831665
2645	Jefferson County	252273	112503	84500	42899	8525	3846	0.749335	0.825618
2742	Tarrant County	1809034	937135	262522	482977	83378	43022	0.748173	0.806698
215	Orange County	3010232	1328499	44000	1012973	532477	92283	0.747159	0.792896
80	Kodiak Island Borough	13592	7137	85	996	2620	2754	0.747039	0.631715
242	Yolo County	200849	100240	4752	60953	25640	9264	0.746835	0.767605
221	San Bernardino County	2035210	677598	170700	1001145	123978	61789	0.745345	0.791244
236	Sutter County	94737	47782	1713	27251	13442	4549	0.745009	0.762589
1949	Mecklenburg County	919628	465372	278042	111944	41991	22279	0.741724	0.798176
1915	Cumberland County	319431	150749	113939	30190	6885	17668	0.738064	0.743731
1789	Passaic County	501226	227144	55480	185677	24556	8369	0.737074	0.807972
2146	Comanche County	124098	73122	20794	13896	2663	13623	0.733880	0.676671
2185	Oklahoma County	718633	425791	108954	108543	21151	54194	0.733371	0.712225
195	Fresno County	930450	304522	45005	468070	86856	25997	0.732562	0.780302
2542	Brazoria County	313166	166674	36880	86643	17013	5956	0.731108	0.795982
1774	Atlantic County	274549	160871	40882	46241	20419	6136	0.730078	0.787988
	...	...	...	...	...	...	...	...	...

3143 rows × 9 columns

In [25]:

# islice(..., None) sets no limit, so every record msas() yields is collected;
# replace None with a small number to fetch just a sample
msas_list = list(islice(msas('NAME,P0010001'),None))

In [26]:

len(msas_list)

Out[26]:

In [27]:

df = DataFrame(msas_list)

In [29]:

# the API returns values as strings (see the quoted numbers in the earlier outputs), so cast before doing arithmetic
df.P0010001 = df.P0010001.astype('int')

In [34]:

# msas() queries state by state, so the same area code may occur in several rows;
# add those rows up to get one population total per area code
df.groupby('metropolitan statistical area/micropolitan statistical area').apply(lambda x:sum(x['P0010001']))

Out[34]:

metropolitan statistical area/micropolitan statistical area
10020                                                           57999
10100                                                           40602
10140                                                           72797
10180                                                          165252
10220                                                           37492
10300                                                           99892
10420                                                          703200
10460                                                           63797
10500                                                          157308
10540                                                          116672
10580                                                          870716
10620                                                           60585
10660                                                           31255
10700                                                           93019
10740                                                          887077
...
49060                                                           36311
49100                                                           51461
49180                                                          477717
49260                                                           20081
49300                                                          114520
49340                                                          798552
49380                                                           21378
49420                                                          243231
49460                                                           22438
49540                                                           28065
49620                                                          434972
49660                                                          565773
49700                                                          166892
49740                                                          195751
49780                                                           86074
Length: 942, dtype: int64

In [ ]: