from pandas import *
import matplotlib as mpl
mpl.rc('figure', figsize=(12, 6))
set_printoptions(max_columns=18)

months = {'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
          'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12}

of_interest = ['Obama, Barack', 'Romney, Mitt', 'Santorum, Rick',
               'Paul, Ron', 'Gingrich, Newt']

parties = {'Bachmann, Michelle': 'Republican',
           'Romney, Mitt': 'Republican',
           'Obama, Barack': 'Democrat',
           "Roemer, Charles E. 'Buddy' III": 'Reform',
           'Pawlenty, Timothy': 'Republican',
           'Johnson, Gary Earl': 'Libertarian',
           'Paul, Ron': 'Republican',
           'Santorum, Rick': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Perry, Rick': 'Republican'}

fec = read_csv('P00000001-ALL.txt')
fec
parties  # a plain dictionary
fec.cand_nm
fec['party'] = fec.cand_nm.map(parties)  # map accepts a dictionary, a Series, ...
# Alternatively, use a list comprehension: [parties[x] for x in fec.cand_nm]
fec.ix[0]  # a single record
fec.party.value_counts()  # the high-level way to get a histogram of parties

import numpy as np
np.unique(fec.party)

fec.groupby('party')['contb_receipt_amt'].sum()
fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()

fec.contb_receipt_dt[0]  # not a Python date object yet
print months

from datetime import datetime

def convert_date(val):
    # Convert a 'DD-MMM-YY' string to a Python datetime object.
    d, m, y = val.split('-')
    m = months[m]
    return datetime(int('20' + y), m, int(d))

fec.contb_receipt_dt  # still a column of strings, not Python date objects
fec['contb_receipt_dt'] = fec.contb_receipt_dt.map(convert_date)
fec.contb_receipt_dt[0]

# Same aggregation as above, but now the dates are Python objects.
fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()
by_date = fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()
by_date.unstack('party')  # index: dates; columns: political parties
by_date.unstack('party').cumsum().plot()

s = by_date.unstack('party')[-5:].stack()
s  # The outer index level holds the dates; stack() makes the frame
   # one-dimensional, propagating the party labels down into the inner
   # index level. Note that not every date has all three parties.

set_printoptions(notebook_repr_html=False)  # turn off the HTML repr, i.e., no table
s.unstack()          # unstack(): back to two dimensions
s.unstack().stack()  # and two dimensions back to one

df = s.unstack('party')  # a DataFrame
s.unstack('contb_receipt_dt')

# Join two copies of the DataFrame: standard hierarchical-indexing stuff.
concat([df, df], axis=1, keys=['A', 'B'])
concat([df, df], axis=1, keys=['A', 'B'])['A']  # select columns by the outer key
                                                # 'A'; the 'B' group works the same way
concat([df, df], axis=1, keys=['A', 'B']).stack('party')
concat([df, df], axis=1, keys=['A', 'B']).stack(0)  # the group level has no name,
                                                    # so refer to it by position

# Top contributions to candidates by occupation.
result = fec.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()
result  # a Series with a hierarchical index
result[:, 'ZOMBIE SLAYER']

# Without selecting a column first, the result is a DataFrame instead of a
# Series. sum() tries every column; the ones that cannot be summed (the
# "nuisance" columns) are silently dropped.
result = fec.groupby(['cand_nm', 'contbr_occupation']).sum()

# reset_index() is the opposite of set_index(): it moves 'cand_nm' and
# 'contbr_occupation' back to ordinary columns, leaving a plain integer index.
result = result.reset_index()
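# To make the stack()/unstack() mechanics above concrete, here is a minimal,
# self-contained sketch on made-up data (the dates, parties, and amounts are
# invented for illustration, not from the FEC file; in a current pandas,
# set_printoptions is spelled pd.set_option, but the reshaping calls below
# are unchanged).
import pandas as pd

demo = pd.DataFrame({'Democrat': [10.0, 20.0],
                     'Republican': [7.0, None]},
                    index=pd.to_datetime(['2012-01-01', '2012-01-02']))
demo.index.name = 'contb_receipt_dt'
demo.columns.name = 'party'

stacked = demo.stack()  # 1-D Series with a (date, party) MultiIndex; the
                        # missing Republican value is dropped, which is why
                        # not every date shows every party after stacking
stacked.unstack('party')             # back to dates x parties
stacked.unstack('contb_receipt_dt')  # or pivot the other way: parties x dates

# Note: the hand-written convert_date above could likely be replaced by a
# single vectorized call such as
# pd.to_datetime(fec.contb_receipt_dt, format='%d-%b-%y'),
# since %b matches abbreviated month names like 'JUN' case-insensitively.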
result  # The groupby above aggregated the total amount donated to each
        # candidate by occupation. An additional groupby lets us split those
        # totals by candidate, sort each group ascending by total amount
        # donated, and take the last five rows: the top five occupations.
set_printoptions(notebook_repr_html=True)

def top5_donors(group):
    # Keep each candidate's five largest occupations by total amount donated.
    return group.sort_index(by='contb_receipt_amt')[-5:]

# top5_donors(result)
result.groupby('cand_nm').apply(top5_donors)  # The retired donated the most
                                              # to Barack Obama.

df._data  # the internal data structure (the BlockManager) backing the DataFrame

# Hierarchical indexing mitigates much of the need for higher-dimensional
# objects, since those tend to be very sparse.
# Time series support for financial data will be improved next.
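# The sort_index(by=...) spelling in top5_donors is from an older pandas; in
# current releases the same top-n-per-group pattern is written with
# sort_values(). A minimal sketch on invented data (the candidates,
# occupations, and amounts below are made up, not from the FEC file):
import pandas as pd

donations = pd.DataFrame({
    'cand_nm':           ['A', 'A', 'A', 'B', 'B'],
    'contbr_occupation': ['RETIRED', 'LAWYER', 'TEACHER', 'RETIRED', 'ENGINEER'],
    'contb_receipt_amt': [500.0, 300.0, 900.0, 250.0, 400.0]})

totals = donations.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()
totals = totals.reset_index()  # back to a flat DataFrame, as in the session above

def top_n(group, n=2, column='contb_receipt_amt'):
    # Sort ascending and keep the last n rows: each group's n largest totals.
    return group.sort_values(column)[-n:]

totals.groupby('cand_nm').apply(top_n)  # the two largest occupations per candidate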