from pandas import *
import matplotlib as mpl
mpl.rc('figure', figsize=(12, 6))
set_printoptions(max_columns=18)

months = {'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
          'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12}

of_interest = ['Obama, Barack', 'Romney, Mitt', 'Santorum, Rick',
               'Paul, Ron', 'Gingrich, Newt']

parties = {'Bachmann, Michelle': 'Republican',
           'Romney, Mitt': 'Republican',
           'Obama, Barack': 'Democrat',
           "Roemer, Charles E. 'Buddy' III": 'Reform',
           'Pawlenty, Timothy': 'Republican',
           'Johnson, Gary Earl': 'Libertarian',
           'Paul, Ron': 'Republican',
           'Santorum, Rick': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Perry, Rick': 'Republican'}

fec = read_csv('P00000001-ALL.txt')
fec
parties  # a plain dictionary
fec.cand_nm
fec['party'] = fec.cand_nm.map(parties)  # map accepts a dictionary, a Series, ...
# Alternatively, use a list comprehension: [parties[x] for x in fec.cand_nm]
fec.ix[0]  # a single record
fec.party.value_counts()  # the high-level way to get a histogram of parties

import numpy as np
np.unique(fec.party)

fec.groupby('party')['contb_receipt_amt'].sum()
fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()

fec.contb_receipt_dt[0]  # not a Python date object yet
print months

from datetime import datetime

def convert_date(val):
    # Convert a 'DD-MMM-YY' string to a Python datetime object.
    d, m, y = val.split('-')
    m = months[m]
    return datetime(int('20' + y), m, int(d))

fec.contb_receipt_dt  # still a column of strings, not Python date objects
fec['contb_receipt_dt'] = fec.contb_receipt_dt.map(convert_date)
fec.contb_receipt_dt[0]

# Same aggregation as above, but now the dates are Python objects.
fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()
by_date = fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()
by_date.unstack('party')  # index: dates; columns: political parties
by_date.unstack('party').cumsum().plot()

s = by_date.unstack('party')[-5:].stack()
s  # The outer index level holds the dates; stack() makes the frame
   # one-dimensional, propagating the party labels down into the inner
   # index level. Note that not every date has all three parties.

set_printoptions(notebook_repr_html=False)  # turn off the HTML repr, i.e., no table
s.unstack()          # unstack(): back to two dimensions
s.unstack().stack()  # and two dimensions back to one

df = s.unstack('party')  # a DataFrame
s.unstack('contb_receipt_dt')

# Join two copies of the DataFrame: standard hierarchical-indexing stuff.
concat([df, df], axis=1, keys=['A', 'B'])
concat([df, df], axis=1, keys=['A', 'B'])['A']  # select columns by the outer key
                                                # 'A'; the 'B' group works the same way
concat([df, df], axis=1, keys=['A', 'B']).stack('party')
concat([df, df], axis=1, keys=['A', 'B']).stack(0)  # the group level has no name,
                                                    # so refer to it by position

# Top contributions to candidates by occupation.
result = fec.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()
result  # a Series with a hierarchical index
result[:, 'ZOMBIE SLAYER']

# Without selecting a column first, the result is a DataFrame instead of a
# Series. sum() tries every column; the ones that cannot be summed (the
# "nuisance" columns) are silently dropped.
result = fec.groupby(['cand_nm', 'contbr_occupation']).sum()

# reset_index() is the opposite of set_index(): it moves 'cand_nm' and
# 'contbr_occupation' back to ordinary columns, leaving a plain integer index.
result = result.reset_index()
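# To make the stack()/unstack() mechanics above concrete, here is a minimal,
# self-contained sketch on made-up data (the dates, parties, and amounts are
# invented for illustration, not from the FEC file; in a current pandas,
# set_printoptions is spelled pd.set_option, but the reshaping calls below
# are unchanged).
import pandas as pd

demo = pd.DataFrame({'Democrat': [10.0, 20.0],
                     'Republican': [7.0, None]},
                    index=pd.to_datetime(['2012-01-01', '2012-01-02']))
demo.index.name = 'contb_receipt_dt'
demo.columns.name = 'party'

stacked = demo.stack()  # 1-D Series with a (date, party) MultiIndex; the
                        # missing Republican value is dropped, which is why
                        # not every date shows every party after stacking
stacked.unstack('party')             # back to dates x parties
stacked.unstack('contb_receipt_dt')  # or pivot the other way: parties x dates

# Note: the hand-written convert_date above could likely be replaced by a
# single vectorized call such as
# pd.to_datetime(fec.contb_receipt_dt, format='%d-%b-%y'),
# since %b matches abbreviated month names like 'JUN' case-insensitively.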
result  # The groupby above aggregated the total amount donated to each
        # candidate by occupation. An additional groupby lets us split those
        # totals by candidate, sort each group ascending by total amount
        # donated, and take the last five rows: the top five occupations.
set_printoptions(notebook_repr_html=True)

def top5_donors(group):
    # Keep each candidate's five largest occupations by total amount donated.
    return group.sort_index(by='contb_receipt_amt')[-5:]

# top5_donors(result)
result.groupby('cand_nm').apply(top5_donors)  # The retired donated the most
                                              # to Barack Obama.

df._data  # the internal data structure (the BlockManager) backing the DataFrame

# Hierarchical indexing mitigates much of the need for higher-dimensional
# objects, since those tend to be very sparse.
# Time series support for financial data will be improved next.
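# The sort_index(by=...) spelling in top5_donors is from an older pandas; in
# current releases the same top-n-per-group pattern is written with
# sort_values(). A minimal sketch on invented data (the candidates,
# occupations, and amounts below are made up, not from the FEC file):
import pandas as pd

donations = pd.DataFrame({
    'cand_nm':           ['A', 'A', 'A', 'B', 'B'],
    'contbr_occupation': ['RETIRED', 'LAWYER', 'TEACHER', 'RETIRED', 'ENGINEER'],
    'contb_receipt_amt': [500.0, 300.0, 900.0, 250.0, 400.0]})

totals = donations.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()
totals = totals.reset_index()  # back to a flat DataFrame, as in the session above

def top_n(group, n=2, column='contb_receipt_amt'):
    # Sort ascending and keep the last n rows: each group's n largest totals.
    return group.sort_values(column)[-n:]

totals.groupby('cand_nm').apply(top_n)  # the two largest occupations per candidate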