%pylab --no-import-all inline

import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd

from itertools import islice

import census
import us

import settings

c = census.Census(key=settings.CENSUS_KEY)

# generators for the various census geographic entities of interest

def states(variables='NAME'):
    geo={'for':'state:*'}
    states_fips = set([state.fips for state in us.states.STATES])
    # need to filter out non-states
    for r in c.sf1.get(variables, geo=geo):
        if r['state'] in states_fips:
            yield r
            
def counties(variables='NAME'):
    """ask for all the states in one call"""
    
    # tabulate a set of fips codes for the states
    states_fips = set([s.fips for s in us.states.STATES])
    
    geo={'for':'county:*',
             'in':'state:*'}    
    for county in c.sf1.get(variables, geo=geo):
        # eliminate counties whose states aren't in a state or DC
        if county['state'] in states_fips:
            yield county
        

def counties2(variables='NAME'):
    """generator for all counties"""
    
    # since we can get all the counties in one call, 
    # this function is for demonstrating the use of walking through 
    # the states to get at the counties

    for state in us.states.STATES:
        geo={'for':'county:*',
             'in':'state:{fips}'.format(fips=state.fips)}
        for county in c.sf1.get(variables, geo=geo):
            yield county

            
def tracts(variables='NAME'):
    for state in us.states.STATES:
        
        # handy to print out state to monitor progress
        # print state.fips, state
        counties_in_state={'for':'county:*',
             'in':'state:{fips}'.format(fips=state.fips)}
        
        for county in c.sf1.get('NAME', geo=counties_in_state):
            
            # print county['state'], county['NAME']
            tracts_in_county = {'for':'tract:*',
              'in': 'state:{s_fips} county:{c_fips}'.format(s_fips=state.fips, 
                                                            c_fips=county['county'])}
            
            for tract in c.sf1.get(variables,geo=tracts_in_county):
                yield tract


def msas(variables="NAME"):
    
     for state in us.STATES:
        geo = {'for':'metropolitan statistical area/micropolitan statistical area:*', 
               'in':'state:{state_fips}'.format(state_fips=state.fips)
               }
    
        for msa in c.sf1.get(variables, geo=geo):
            yield msa
            
def block_groups(variables='NAME'):
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=block+group:*&in=state:02+county:170
    # let's use the county generator
    for county in counties(variables):
        geo = {'for':'block group:*',
               'in':'state:{state} county:{county}'.format(state=county['state'],
                                                county=county['county'])
               }
        for block_group in c.sf1.get(variables, geo):
            yield block_group
    
    
def blocks(variables='NAME'):
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=block:*&in=state:02+county:290+tract:00100
    
    # make use of the tract generator
    for tract in tracts(variables):
        geo={'for':'block:*',
             'in':'state:{state} county:{county} tract:{tract}'.format(state=tract['state'],
                                                                       county=tract['county'],
                                                                       tract=tract['tract'])
             }
        for block in c.sf1.get(variables, geo):
            yield block
        
def csas(variables="NAME"):
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=combined+statistical+area:*&in=state:24
    for state in us.STATES:
        geo = {'for':'combined statistical area:*', 
               'in':'state:{state_fips}'.format(state_fips=state.fips)
               }
    
        for csa in c.sf1.get(variables, geo=geo):
            yield csa

def districts(variables="NAME"):
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=congressional+district:*&in=state:24
    for state in us.STATES:
        geo = {'for':'congressional district:*', 
               'in':'state:{state_fips}'.format(state_fips=state.fips)
               }
    
        for district in c.sf1.get(variables, geo=geo):
            yield district    
            
def zip_code_tabulation_areas(variables="NAME"):
    # http://api.census.gov/data/2010/sf1?get=P0010001&for=zip+code+tabulation+area:*&in=state:02
    for state in us.STATES:
        geo = {'for':'zip code tabulation area:*', 
               'in':'state:{state_fips}'.format(state_fips=state.fips)
               }
    
        for zip_code_tabulation_area in c.sf1.get(variables, geo=geo):
            yield zip_code_tabulation_area           


def census_labels(prefix='P005', n0=1, n1=17, field_width=4, include_name=True, join=False):
    """convenience function to generate census labels"""
    
    label_format = "{i:0%dd}" % (field_width)
    
    variables = [prefix + label_format.format(i=i) for i in xrange(n0,n1+1)]
    if include_name:
        variables = ['NAME'] + variables

    if join:
        return ",".join(variables)
    else:
        return variables

def rdot_labels(other=True):
    if other:
        return ['White', 'Black', 'Asian', 'Hispanic', 'Other']
    else:
        return ['White', 'Black', 'Asian', 'Hispanic']
    
FINAL_LABELS = ['NAME', 'Total'] + rdot_labels() + ['p_White', 'p_Black', 'p_Asian', 'p_Hispanic', 'p_Other'] + ['entropy5', 'entropy4', 'entropy_rice', 'gini_simpson']
    
def convert_to_rdotmap(row):
    """takes the P005 variables and maps to a series with White, Black, Asian, Hispanic, Other
    Total"""
    return pd.Series({'Total':row['P0050001'],
                      'White':row['P0050003'],
                      'Black':row['P0050004'],
                      'Asian':row['P0050006'],
                      'Hispanic':row['P0050010'],
                      'Other': row['P0050005'] + row['P0050007'] + row['P0050008'] + row['P0050009'],
                      }, index=['Total', 'White', 'Black', 'Hispanic', 'Asian', 'Other'])


def normalize(s):
    """take a Series and divide each item by the sum so that the new series adds up to 1.0"""
    total = np.sum(s)
    return s.astype('float') / total
    
def normalize_relabel(s):
    """take a Series and divide each item by the sum so that the new series adds up to 1.0
    Also relabel the indices by adding p_ prefix"""
    total = np.sum(s)
    new_index = list(Series(s.index).apply(lambda x: "p_"+x))
    return Series(list(s.astype('float') / total),new_index)

def entropy(series):
    """Normalized Shannon Index"""
    # a series in which all the entries are equal should result in normalized entropy of 1.0
    
    # eliminate 0s
    series1 = series[series!=0]

    # if len(series) < 2 (i.e., 0 or 1) then return 0
    
    if len(series1) > 1:
        # calculate the maximum possible entropy for given length of input series
        max_s = -np.log(1.0/len(series))
    
        total = float(sum(series1))
        p = series1.astype('float')/float(total)
        return sum(-p*np.log(p))/max_s
    else:
        return 0.0

def gini_simpson(s):
    # https://en.wikipedia.org/wiki/Diversity_index#Gini.E2.80.93Simpson_index
    s1 = normalize(s)
    return 1-np.sum(s1*s1)

def entropy_rice(series):
    """hard code how Rice U did calculation 
    This function takes the entropy5 calculation and removes the contribution from 'Other'
    """
    # pass in a Series with 
    # 'Asian','Black','Hispanic','White','Other'
    # http://kinder.rice.edu/uploadedFiles/Urban_Research_Center/Media/Houston%20Region%20Grows%20More%20Ethnically%20Diverse%202-13.pdf

    s0 = normalize(series)
    s_other = s0['Other']*np.log(s0['Other']) if s0['Other'] > 0 else 0.0
    return (np.log(0.2)*entropy(series) - s_other)/np.log(0.25)

def diversity(df):
    """Takes a df with the P005 variables and does entropy calculation"""
    # convert populations to int
    df[census_labels(include_name=False)] = df[census_labels(include_name=False)].astype('int')
    df = pd.concat((df, df.apply(convert_to_rdotmap, axis=1)),axis=1)
    df = pd.concat((df,df[rdot_labels()].apply(normalize_relabel,axis=1)), axis=1)
    df['entropy5'] = df.apply(lambda x:entropy(x[rdot_labels()]), axis=1)
    df['entropy4'] = df.apply(lambda x:entropy(x[rdot_labels(other=False)]), axis=1)
    df['entropy_rice'] = df.apply(lambda x:entropy_rice(x[rdot_labels()]), axis=1)
    df['gini_simpson'] = df.apply(lambda x:gini_simpson(x[rdot_labels()]), axis=1)
    return df

# grab states, convert populations to int
states_df = DataFrame(list(states(census_labels())))
states_df = diversity(states_df)

states_df[FINAL_LABELS].head()

states_df.sort_index(by='entropy5', ascending=False)[FINAL_LABELS].head()

r = list(counties(census_labels()))

counties_df = DataFrame(r)
counties_df = diversity(counties_df)
counties_df[FINAL_LABELS].head()

counties_df.sort_index(by='entropy5', ascending=False)[FINAL_LABELS].head()

# msas

r = list(msas(census_labels()))

len(r)

df=DataFrame(r)
df[census_labels(include_name=False)] = df[census_labels(include_name=False)].astype('int')

msas_grouped = df.groupby('metropolitan statistical area/micropolitan statistical area')

#df1 = msas_grouped.apply(lambda x:Series((list(x['NAME']), sum(x['P0050001'])), index=['msas','total_pop'])).sort_index(by='total_pop', ascending=False)
df1 = msas_grouped.apply(lambda x:Series((list(x['NAME']), ), 
                                         index=['msas']))


df2 = msas_grouped.sum()
df3 = pd.concat((df1,df2), axis=1)
df3['NAME'] = df3.apply(lambda x: "; ".join(x['msas']), axis=1)


msas_df = diversity(df3)

# grab the ten most populous msas and sort by entropy_rice
msas_df.sort_index(by='Total', ascending=False)[:10].sort_index(by='entropy_rice', ascending=False)[FINAL_LABELS]

# Testing code

def to_unicode(vals):
    return [unicode(v) for v in vals]

def test_msas_df(msas_df):

    min_set_of_columns =  set(['Asian','Black','Hispanic', 'Other', 'Total', 'White',
     'entropy4', 'entropy5', 'entropy_rice', 'gini_simpson','p_Asian', 'p_Black',
     'p_Hispanic', 'p_Other','p_White'])  
    
    assert min_set_of_columns & set(msas_df.columns) == min_set_of_columns
    
    # https://www.census.gov/geo/maps-data/data/tallies/national_geo_tallies.html
    # 366 metro areas
    # 576 micropolitan areas
    
    assert len(msas_df) == 942  
    
    # total number of people in metro/micro areas
    
    assert msas_df.Total.sum() == 289261315
    assert msas_df['White'].sum() == 180912856
    assert msas_df['Other'].sum() == 8540181
    
    # list of msas in descendng order by entropy_rice 
    top_10_metros = msas_df.sort_index(by='Total', ascending=False)[:10]
    msa_codes_in_top_10_pop_sorted_by_entropy_rice = list(top_10_metros.sort_index(by='entropy_rice', 
                                                ascending=False).index) 
    
    assert to_unicode(msa_codes_in_top_10_pop_sorted_by_entropy_rice)== [u'26420', u'35620', u'47900', u'31100', u'19100', 
        u'33100', u'16980', u'12060', u'37980', u'14460']


    top_10_metro = msas_df.sort_index(by='Total', ascending=False)[:10]
    
    list(top_10_metro.sort_index(by='entropy_rice', ascending=False)['entropy5'])
    
    np.testing.assert_allclose(top_10_metro.sort_index(by='entropy_rice', ascending=False)['entropy5'], 
    [0.79628076626851163, 0.80528601550164602, 0.80809418318973791, 0.7980698349711991,
     0.75945930510650161, 0.74913610558765376, 0.73683277781032397, 0.72964862063970914,
     0.64082509648457675, 0.55697288400004963])
    
    np.testing.assert_allclose(top_10_metro.sort_index(by='entropy_rice', ascending=False)['entropy_rice'],
    [0.87361766576115552,
     0.87272877244078051,
     0.85931803868749834,
     0.85508015237749468,
     0.82169723530719896,
     0.81953527301129059,
     0.80589423784325431,
     0.78602596561378812,
     0.68611350427640316,
     0.56978827050565117])


# you are on the right track if test_msas_df doesn't complain
test_msas_df(msas_df)

# code to save your dataframe to a CSV
# upload the CSV to bCourses
# uncomment to run
# msas_df.to_csv("msas_2010.csv", encoding="UTF-8")

# load back the CSV and test again
# df = DataFrame.from_csv("msas_2010.csv", encoding="UTF-8")
# test_msas_df(df)

all_categories = census_labels('P005',2,10, include_name=False) + \
                 census_labels('P005',11,17, include_name=False)
all_categories

msas_df['entropy_all'] = msas_df.apply(lambda x:entropy(x[all_categories]), axis=1)

msas_df.sort_index(by='entropy_all', ascending=False)[FINAL_LABELS + ['entropy_all']][:20]

msas_df.sort_index(by='P0050001', ascending=False).head()

top_10_metros = msas_df.sort_index(by='Total', ascending=False)[:10]
top_10_metros['City'] = top_10_metros['NAME'].apply(lambda name: name.split('-')[0])
top_10_metros.sort(columns=['entropy_rice'], inplace=True, ascending=True)

cities = pd.Series(top_10_metros['City'])

diversity = pd.Series(top_10_metros['entropy_rice'])

p_white = pd.Series(top_10_metros['p_White'])
p_asian = pd.Series(top_10_metros['p_Asian'])
p_black = pd.Series(top_10_metros['p_Black'])
p_latino = pd.Series(top_10_metros['p_Hispanic'])

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 8))
ax = plt.subplot(111)

# y axis locations for diversity and races
y_div = np.arange(len(cities))*2
y_race = (np.arange(len(cities))*2)+1

# diversity bars
pDiversity = ax.barh(y_div, diversity, alpha=0.4)

# stacked horizontal bars
pWhite = ax.barh(y_race, p_white, color='b')
pLatino = ax.barh(y_race, p_latino, color='g', left=p_white)
pBlack = ax.barh(y_race, p_black, color='r', left=p_white+p_latino)
pAsian = ax.barh(y_race, p_asian, color='c', left=p_white+p_latino+p_black)

plt.yticks(y_race, cities)

# legend foo https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
# Shink current axis's height by 10% on the bottom
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1,
                 box.width, box.height * 0.85])

# Put a legend below current axis
ax.legend((pWhite, pLatino, pBlack, pAsian, pDiversity), ('White', 'Latino', 'Black', 'Asian', 'Diversity'),
          loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=5)

plt.show()

# If you want to save it