# note version of pandas
import pandas
pandas.__version__

import us
import census
import settings

import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from itertools import islice

c = census.Census(settings.CENSUS_KEY)

def states(variables='NAME'):
    geo={'for':'state:*'}
    
    states_fips = set([state.fips for state in us.states.STATES])
    # need to filter out non-states
    for r in c.sf1.get(variables, geo=geo, year=2010):
        if r['state'] in states_fips:
            yield r

# make a dataframe from the total populations of states in the 2010 Census

df = DataFrame(states('NAME,P0010001'))
df.P0010001 = df.P0010001.astype('int')
df.head()

# check that that we have the right total population

df.P0010001.sum() == 308745538 

# add a column with the first letter 
# we'll be grouping states based on the first letter of the state NAME

df['first_letter'] = df.NAME.apply(lambda s:s[0])
df.head()

# we can explicitly name df.first_letter

grouped = df.groupby(df.first_letter)
grouped

# shorthand for df.first_letter
# same thing as df.groupby(df.first_letter)

grouped = df.groupby('first_letter')
grouped

# count the number of states with each first letter -- look at top of the resulting DataFrame

grouped.count().head()

# count the number of states with each first letter -- look at bottom of the resulting DataFrame

grouped.count().tail()

# we didn't have to explicitly create a new column -- we could groupby on a dynamically generated Series
# note the use of str operations on df.NAME: 
# http://pandas.pydata.org/pandas-docs/stable/basics.html#vectorized-string-methods

df.groupby(df.NAME.str.slice(0,1)).sum()

# we can get groups of indexes

df.groupby('first_letter').groups

# loop through all groups --> here just the first one

for name, group in islice(df.groupby('first_letter'),1):
    print(name)
    print type(group) # yes -- a DataFrame
    print group.index
    print(group),

# how about accessing group 'C'?
# http://stackoverflow.com/a/14734627/7782
# http://stackoverflow.com/questions/19804282/in-pandas-is-there-something-like-a-groupby-get-group-but-with-an-optional-defa
# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.core.groupby.GroupBy.get_group.html

grouped = df.groupby('first_letter')
grouped.get_group('C')

# total population of states starting with 'C'

grouped.get_group('C').P0010001.sum()

# generate a Series of total populations by first letter

grouped = df.groupby('first_letter')
s = grouped['P0010001'].sum()
s

# sort the list to get the most populous groups
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.sort.html
# note sorting in place

s.sort(ascending=False)
s

s.index

# first pass at pulling together the letter, the total population, and the list of corresponding states

for k in s.index:
    print k, s[k], list(grouped.get_group(k).NAME)


# let's try this again and try to do this in a more idiomatic Pandas fashion
# ideally, generate a DataFrame with a NAME column that is a list of all states with the same first letter

# It turns out that apply can come to the rescue
# here' calculate the list of NAMEs

# http://pandas.pydata.org/pandas-docs/dev/groupby.html#flexible-apply
# http://stackoverflow.com/questions/19530568/can-pandas-groupby-aggregate-into-a-list-rather-than-sum-mean-etc

df.groupby("first_letter").apply(lambda x: list(x['NAME']))

# apply can be used to add up the populations by group

df.groupby("first_letter").apply(lambda x: np.sum(x['P0010001']))

# make a tuple out of the list of names and the population

df.groupby("first_letter").apply(lambda x:( list(x['NAME']), np.sum(x['P0010001'])))

# remind ourselves on how to turn a tuple into a Series with a small example

Series(([1,2],2), index=['one','two'])

# we're ready to make a new DataFrame

df.groupby("first_letter").apply(lambda x:Series((list(x['NAME']), np.sum(x['P0010001'])), 
                                                 index=['states','total_pop']))

df2 = df.groupby("first_letter").apply(lambda x:Series((list(x['NAME']), np.sum(x['P0010001'])), 
                                                 index=['states','total_pop'])).sort_index(by='total_pop',ascending=False)

# make sure you understand the syntax here:
# .ix: http://pandas.pydata.org/pandas-docs/dev/indexing.html#advanced-indexing-with-ix
# ability to grab columns by name to return a new DataFrame

df2.ix['C'][['states','total_pop']]

print "states that start with 'C'", df2.ix['C']['states']

print "total population of states that start with 'C'", df2.ix['C']['total_pop']