In [1]:

from pandas import *
import matplotlib as mpl
mpl.rc('figure', figsize=(12, 6))
set_printoptions(max_columns=18)
months = {'JAN' : 1, 'FEB' : 2, 'MAR' : 3, 'APR' : 4, 'MAY' : 5, 'JUN' : 6,
          'JUL' : 7, 'AUG' : 8, 'SEP' : 9, 'OCT': 10, 'NOV': 11, 'DEC' : 12}
of_interest = ['Obama, Barack', 'Romney, Mitt', 'Santorum, Rick', 
               'Paul, Ron', 'Gingrich, Newt']
parties = {
  'Bachmann, Michelle': 'Republican',
  'Romney, Mitt': 'Republican',
  'Obama, Barack': 'Democrat',
  "Roemer, Charles E. 'Buddy' III": 'Reform',
  'Pawlenty, Timothy': 'Republican',
  'Johnson, Gary Earl': 'Libertarian',
  'Paul, Ron': 'Republican',
  'Santorum, Rick': 'Republican',
  'Cain, Herman': 'Republican',
  'Gingrich, Newt': 'Republican',
  'McCotter, Thaddeus G': 'Republican',
  'Huntsman, Jon': 'Republican',
  'Perry, Rick': 'Republican'           
 }

In [2]:

fec = read_csv('P00000001-ALL.txt')

In [3]:

fec

Out[3]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 536041 entries, 0 to 536040
Data columns:
cmte_id              536041  non-null values
cand_id              536041  non-null values
cand_nm              536041  non-null values
contbr_nm            536041  non-null values
contbr_city          536026  non-null values
contbr_st            536040  non-null values
contbr_zip           535973  non-null values
contbr_employer      531417  non-null values
contbr_occupation    531448  non-null values
contb_receipt_amt    536041  non-null values
contb_receipt_dt     536041  non-null values
receipt_desc         8479  non-null values
memo_cd              49718  non-null values
memo_text            52740  non-null values
form_tp              536041  non-null values
file_num             536041  non-null values
dtypes: float64(1), int64(1), object(14)

In [4]:

parties    # dictionary

Out[4]:

{'Bachmann, Michelle': 'Republican',
 'Cain, Herman': 'Republican',
 'Gingrich, Newt': 'Republican',
 'Huntsman, Jon': 'Republican',
 'Johnson, Gary Earl': 'Libertarian',
 'McCotter, Thaddeus G': 'Republican',
 'Obama, Barack': 'Democrat',
 'Paul, Ron': 'Republican',
 'Pawlenty, Timothy': 'Republican',
 'Perry, Rick': 'Republican',
 "Roemer, Charles E. 'Buddy' III": 'Reform',
 'Romney, Mitt': 'Republican',
 'Santorum, Rick': 'Republican'}

In [5]:

fec.cand_nm

Out[5]:

0     Bachmann, Michelle
1     Bachmann, Michelle
2     Bachmann, Michelle
3     Bachmann, Michelle
4     Bachmann, Michelle
5     Bachmann, Michelle
6     Bachmann, Michelle
7     Bachmann, Michelle
8     Bachmann, Michelle
9     Bachmann, Michelle
10    Bachmann, Michelle
11    Bachmann, Michelle
12    Bachmann, Michelle
13    Bachmann, Michelle
14    Bachmann, Michelle
...
536026    Perry, Rick
536027    Perry, Rick
536028    Perry, Rick
536029    Perry, Rick
536030    Perry, Rick
536031    Perry, Rick
536032    Perry, Rick
536033    Perry, Rick
536034    Perry, Rick
536035    Perry, Rick
536036    Perry, Rick
536037    Perry, Rick
536038    Perry, Rick
536039    Perry, Rick
536040    Perry, Rick
Name: cand_nm, Length: 536041

In [6]:

fec['party'] = fec.cand_nm.map(parties)  # map can take dictionary, series... 
                                         # Alternatively, use list comprehension: [parties[x] for x in fec.cand_nm]

In [8]:

fec.ix[0]    # A single record.

Out[8]:

cmte_id                       C00410118
cand_id                       P20002978
cand_nm              Bachmann, Michelle
contbr_nm               HARVEY, WILLIAM
contbr_city                      MOBILE
contbr_st                            AL
contbr_zip                    366010290
contbr_employer                 RETIRED
contbr_occupation               RETIRED
contb_receipt_amt                   250
contb_receipt_dt              20-JUN-11
receipt_desc                        NaN
memo_cd                             NaN
memo_text                           NaN
form_tp                           SA17A
file_num                         736166
party                        Republican
Name: 0

In [9]:

fec.party.value_counts()   # Doing it high level way using histogram. 

Out[9]:

Democrat       292400
Republican     237575
Reform           5364
Libertarian       702

In [10]:

np.unique(fec.party)

Out[10]:

134010       Democrat
361727    Libertarian
356462         Reform
6067       Republican
Name: party

In [12]:

fec.groupby('party')['contb_receipt_amt'].sum()

Out[12]:

party
Democrat       8.105758e+07
Libertarian    4.132769e+05
Reform         3.390338e+05
Republican     1.192255e+08
Name: contb_receipt_amt

In [13]:

fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()   # Not a python date object.

Out[13]:

party     contb_receipt_dt
Democrat  01-AUG-11           175281.00
          01-DEC-11           651532.82
          01-JAN-12            58098.80
          01-JUL-11           165961.00
          01-JUN-11           145459.00
          01-MAY-11            82644.00
          01-NOV-11           122529.87
          01-OCT-11           148977.00
          01-SEP-11           403297.62
          02-AUG-11           164510.11
          02-DEC-11           216056.96
          02-JAN-12            89743.60
          02-JUL-11            17105.00
          02-JUN-11           422453.00
          02-MAY-11           396675.00
...
Republican  30-JUL-11             12249.04
            30-JUN-11           2744932.63
            30-MAR-11             50240.00
            30-MAY-11             17803.60
            30-NOV-11            809014.83
            30-OCT-11             43913.16
            30-SEP-11           4886331.76
            31-AUG-11           1017735.02
            31-DEC-11           1094376.72
            31-JAN-11              6000.00
            31-JAN-12            869890.41
            31-JUL-11             12781.02
            31-MAR-11             62475.00
            31-MAY-11            301339.80
            31-OCT-11            734601.83
Name: contb_receipt_amt, Length: 1183

In [14]:

fec.contb_receipt_dt[0]

Out[14]:

'20-JUN-11'

In [15]:

print months

{'FEB': 2, 'AUG': 8, 'JAN': 1, 'DEC': 12, 'OCT': 10, 'MAR': 3, 'SEP': 9, 'MAY': 5, 'JUN': 6, 'JUL': 7, 'APR': 4, 'NOV': 11}

In [16]:

def convert_date(val):    # function to convert date to python's date object.
    d, m, y = val.split('-')
    m = months[m]
    return datetime(int ('20' + y), m, int(d))

fec.contb_receipt_dt   # output a list of date (not yet in python objects).
fec['contb_receipt_dt'] = fec.contb_receipt_dt.map(convert_date)

In [17]:

fec.contb_receipt_dt[0]

Out[17]:

datetime.datetime(2011, 6, 20, 0, 0)

In [18]:

fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum() # Same as above, however, date converted to python object.

Out[18]:

party     contb_receipt_dt
Democrat  2011-04-04          640235.12
          2011-04-05          307641.00
          2011-04-06          247542.59
          2011-04-07          252336.00
          2011-04-08          295452.00
          2011-04-09          105879.00
          2011-04-10          100373.00
          2011-04-11          360728.00
          2011-04-12          405160.46
          2011-04-13          382836.33
          2011-04-14          306643.50
          2011-04-15          408377.00
          2011-04-16           82658.00
          2011-04-17           70802.00
          2011-04-18          501971.37
...
Republican  2012-01-17           625365.77
            2012-01-18           888681.17
            2012-01-19          1066250.23
            2012-01-20           401298.03
            2012-01-21           374261.81
            2012-01-22           507168.71
            2012-01-23           645477.15
            2012-01-24           462233.66
            2012-01-25           416931.39
            2012-01-26           256406.86
            2012-01-27           368441.82
            2012-01-28            82775.80
            2012-01-29            75220.02
            2012-01-30           255204.80
            2012-01-31           869890.41
Name: contb_receipt_amt, Length: 1183

In [19]:

by_date = fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()

In [22]:

by_date.unstack('party')   # Indexes are dates. Columns are political parties.

Out[22]:

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 376 entries, 2011-01-01 00:00:00 to 2012-01-31 00:00:00
Data columns:
Democrat       303  non-null values
Libertarian    190  non-null values
Reform         314  non-null values
Republican     376  non-null values
dtypes: float64(4)

In [23]:

by_date.unstack('party').cumsum().plot()

Out[23]:

<matplotlib.axes.AxesSubplot at 0x106e6ef90>

In [24]:

s = by_date.unstack('party')[-5:].stack()

In [25]:

s    # Note the inner level indexes. Not every date has 3 parties. Top level indexes are dates. stack() makes 1 dimensional.
     # Labels propagated down for all 3 observations: political parties.

Out[25]:

contb_receipt_dt  party     
2012-01-27        Democrat       305785.47
                  Reform           3176.37
                  Republican     368441.82
2012-01-28        Democrat       235492.85
                  Reform            175.00
                  Republican      82775.80
2012-01-29        Democrat        93177.00
                  Reform            200.00
                  Republican      75220.02
2012-01-30        Democrat       435921.72
                  Reform            130.00
                  Republican     255204.80
2012-01-31        Democrat      1418410.31
                  Reform            150.00
                  Republican     869890.41

In [30]:

set_printoptions(notebook_repr_html=False)  # turn off html, i.e., without table.
s.unstack()    # unstack() 2 dimensional. 

Out[30]:

party               Democrat   Reform  Republican
contb_receipt_dt                                 
2012-01-27         305785.47  3176.37   368441.82
2012-01-28         235492.85   175.00    82775.80
2012-01-29          93177.00   200.00    75220.02
2012-01-30         435921.72   130.00   255204.80
2012-01-31        1418410.31   150.00   869890.41

In [31]:

s.unstack().stack()  # 2 dimensional to 1 dimensional

Out[31]:

contb_receipt_dt  party     
2012-01-27        Democrat       305785.47
                  Reform           3176.37
                  Republican     368441.82
2012-01-28        Democrat       235492.85
                  Reform            175.00
                  Republican      82775.80
2012-01-29        Democrat        93177.00
                  Reform            200.00
                  Republican      75220.02
2012-01-30        Democrat       435921.72
                  Reform            130.00
                  Republican     255204.80
2012-01-31        Democrat      1418410.31
                  Reform            150.00
                  Republican     869890.41

In [34]:

df = s.unstack('party')   # data frame

In [33]:

s.unstack('contb_receipt_dt')

Out[33]:

contb_receipt_dt  2012-01-27  2012-01-28  2012-01-29  2012-01-30  2012-01-31
party                                                                       
Democrat           305785.47   235492.85    93177.00   435921.72  1418410.31
Reform               3176.37      175.00      200.00      130.00      150.00
Republican         368441.82    82775.80    75220.02   255204.80   869890.41

In [35]:

concat([df, df], axis=1, keys=['A', 'B'])  # join 2 copies of data frame.
                                           # standard hierarchical indexing stuff.

Out[35]:

                           A                                B                     
party               Democrat   Reform  Republican    Democrat   Reform  Republican
contb_receipt_dt                                                                  
2012-01-27         305785.47  3176.37   368441.82   305785.47  3176.37   368441.82
2012-01-28         235492.85   175.00    82775.80   235492.85   175.00    82775.80
2012-01-29          93177.00   200.00    75220.02    93177.00   200.00    75220.02
2012-01-30         435921.72   130.00   255204.80   435921.72   130.00   255204.80
2012-01-31        1418410.31   150.00   869890.41  1418410.31   150.00   869890.41

In [36]:

concat([df, df], axis=1, keys=['A', 'B'])['A']   # select out columns by the first key 'A'. Or select out the 'B' group.conc

Out[36]:

party               Democrat   Reform  Republican
contb_receipt_dt                                 
2012-01-27         305785.47  3176.37   368441.82
2012-01-28         235492.85   175.00    82775.80
2012-01-29          93177.00   200.00    75220.02
2012-01-30         435921.72   130.00   255204.80
2012-01-31        1418410.31   150.00   869890.41

In [37]:

concat([df, df], axis=1, keys=['A', 'B']).stack('party')

Out[37]:

                                      A           B
contb_receipt_dt party                             
2012-01-27       Democrat     305785.47   305785.47
                 Reform         3176.37     3176.37
                 Republican   368441.82   368441.82
2012-01-28       Democrat     235492.85   235492.85
                 Reform          175.00      175.00
                 Republican    82775.80    82775.80
2012-01-29       Democrat      93177.00    93177.00
                 Reform          200.00      200.00
                 Republican    75220.02    75220.02
2012-01-30       Democrat     435921.72   435921.72
                 Reform          130.00      130.00
                 Republican   255204.80   255204.80
2012-01-31       Democrat    1418410.31  1418410.31
                 Reform          150.00      150.00
                 Republican   869890.41   869890.41

In [38]:

concat([df, df], axis=1, keys=['A', 'B']).stack(0)  # The group indexes have no labels.

Out[38]:

party                 Democrat   Reform  Republican
contb_receipt_dt                                   
2012-01-27       A   305785.47  3176.37   368441.82
                 B   305785.47  3176.37   368441.82
2012-01-28       A   235492.85   175.00    82775.80
                 B   235492.85   175.00    82775.80
2012-01-29       A    93177.00   200.00    75220.02
                 B    93177.00   200.00    75220.02
2012-01-30       A   435921.72   130.00   255204.80
                 B   435921.72   130.00   255204.80
2012-01-31       A  1418410.31   150.00   869890.41
                 B  1418410.31   150.00   869890.41

In [40]:

result = fec.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()   # Top contributors to candidates by occupation.

In [41]:

result  # is a data frame instead of series.

Out[41]:

cand_nm             contbr_occupation             
Bachmann, Michelle  100% DISABLED VETERAN              1869.50
                    100% DISABLED VIETNAM VETERAN       236.25
                    A/C AND HEAT                        250.00
                    ACADEMIC EMPLOYMENT SPECIALIST       25.00
                    ACCOUNT                              50.00
                    ACCOUNT ANALYST                     200.00
                    ACCOUNT EXECUTIVE                   100.00
                    ACCOUNT MANAGER                     230.00
                    ACCOUNTANT                        12979.00
                    ACCOUNTANT TRAINER                  250.00
                    ACCOUNTING                          350.00
                    ACCOUNTING MANAGER                   50.00
                    ACCOUNTING SPECIALIST               250.00
                    ACCOUNTING SUPERVISOR               125.00
                    ACCOUNTING/AUDITING                 700.00
...
Santorum, Rick  WILDLIFE REFUGE MANAGER       200.00
                WIND FARM DEVELOPER           400.00
                WINDOW CLEANER                 85.00
                WINERY                        300.00
                WIRELESS NETWORK ENGINEER     220.15
                WITHHELD                     1000.00
                WNG                           700.00
                WORKER II                     300.00
                WRITER                       4894.55
                WRITER ACTIVIST              1112.00
                WRITER EDITOR                 250.00
                WRITER/ACCOUNT EXEC           500.00
                WRITER/JOURNALIST             250.00
                WRITER/LAWYER                 300.00
                YOUTH MINISTER                250.00
Name: contb_receipt_amt, Length: 40171

In [42]:

result[:, 'ZOMBIE SLAYER']

Out[42]:

cand_nm
Paul, Ron    1556
Name: contb_receipt_amt

In [43]:

result = fec.groupby(['cand_nm', 'contbr_occupation']).sum()  # When doing this aggregation, we don't need to select the columns as above.
# There are columns that are not possible to sum. The code actually tries to sum them all: nuisance columns. Whenever it fails, it just silently drops them out.

Out[43]:

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 40171 entries, ('Bachmann, Michelle', '100% DISABLED VETERAN') to ('Santorum, Rick', 'YOUTH MINISTER')
Data columns:
contb_receipt_amt    40171  non-null values
file_num             40171  non-null values
dtypes: float64(2)

In [44]:

result = result.reset_index()  # opposite of set_index(). Took 'cand_nm' and 'contbr_occupation' and set them as col indexes again, so that the indexes are just simple integer indexes.

In [45]:

result

Out[45]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40171 entries, 0 to 40170
Data columns:
cand_nm              40171  non-null values
contbr_occupation    40171  non-null values
contb_receipt_amt    40171  non-null values
dtypes: float64(1), object(2)

In [48]:

# We have so far aggregated total amount donated to each political party by occupation (the first groupby).
# Additional groupby let us split them up by candidate, sort them in descending order by total amount donated and take the top 5.
set_printoptions(notebook_repr_html=True)
def top5_donors(group):
    return group.sort_index(by='contb_receipt_amt')[-5:]
# top5_donors(result)
result.groupby('cand_nm').apply(top5_donors)   # The retired donated the most to Barack Obama.

Out[48]:

		cand_nm	contbr_occupation	contb_receipt_amt
cand_nm	None
Bachmann, Michelle	921	Bachmann, Michelle	PRESIDENT	68781.00
Bachmann, Michelle	893	Bachmann, Michelle	PHYSICIAN	76134.00
Bachmann, Michelle	555	Bachmann, Michelle	HOMEMAKER	102540.00
Bachmann, Michelle	594	Bachmann, Michelle	INFORMATION REQUESTED	147638.89
Bachmann, Michelle	1059	Bachmann, Michelle	RETIRED	824618.05
Cain, Herman	1537	Cain, Herman	CEO	125097.00
Cain, Herman	2227	Cain, Herman	PHYSICIAN	125790.96
Cain, Herman	1898	Cain, Herman	HOMEMAKER	153964.07
Cain, Herman	2391	Cain, Herman	RETIRED	779635.51
Cain, Herman	1929	Cain, Herman	INFORMATION REQUESTED	3838719.32
Gingrich, Newt	4068	Gingrich, Newt	OWNER	273310.00
Gingrich, Newt	4222	Gingrich, Newt	PRESIDENT	332403.00
Gingrich, Newt	3561	Gingrich, Newt	HOMEMAKER	426118.26
Gingrich, Newt	3611	Gingrich, Newt	INFORMATION REQUESTED PER BEST EFFORTS	910147.00
Gingrich, Newt	4484	Gingrich, Newt	RETIRED	2279602.27
Huntsman, Jon	5470	Huntsman, Jon	PRESIDENT	92251.00
Huntsman, Jon	4984	Huntsman, Jon	ATTORNEY	143532.50
Huntsman, Jon	5288	Huntsman, Jon	HOMEMAKER	241524.79
Huntsman, Jon	5563	Huntsman, Jon	RETIRED	352669.00
Huntsman, Jon	5301	Huntsman, Jon	INFORMATION REQUESTED PER BEST EFFORTS	354500.00
Johnson, Gary Earl	5773	Johnson, Gary Earl	HOUSEWIFE	11000.00
Johnson, Gary Earl	5855	Johnson, Gary Earl	SELF-EMPLOYED	14750.00
Johnson, Gary Earl	5735	Johnson, Gary Earl	CONSULTANT	16767.76
Johnson, Gary Earl	5853	Johnson, Gary Earl	SELF EMPLOYED	18781.00
Johnson, Gary Earl	5845	Johnson, Gary Earl	RETIRED	85763.96
McCotter, Thaddeus G	5906	McCotter, Thaddeus G	HOMEMAKER	2400.00
McCotter, Thaddeus G	5918	McCotter, Thaddeus G	RETIRED	2450.00
McCotter, Thaddeus G	5904	McCotter, Thaddeus G	GOVERNMENT	2500.00
McCotter, Thaddeus G	5912	McCotter, Thaddeus G	PHYSICIAN	3650.00
McCotter, Thaddeus G	5902	McCotter, Thaddeus G	EXECUTIVE	3750.00
Obama, Barack	16658	Obama, Barack	PHYSICIAN	2244445.92
Obama, Barack	13527	Obama, Barack	INFORMATION REQUESTED	2647247.90
Obama, Barack	13113	Obama, Barack	HOMEMAKER	2780203.60
Obama, Barack	7091	Obama, Barack	ATTORNEY	7112343.35
Obama, Barack	18862	Obama, Barack	RETIRED	15086766.92
Paul, Ron	25806	Paul, Ron	HOMEMAKER	310823.42
Paul, Ron	28349	Paul, Ron	REQUESTED	381440.72
Paul, Ron	27649	Paul, Ron	PHYSICIAN	509300.85
Paul, Ron	24957	Paul, Ron	ENGINEER	612693.61
Paul, Ron	28461	Paul, Ron	RETIRED	1799513.76
Pawlenty, Timothy	30758	Pawlenty, Timothy	INFORMATION REQUESTED PER BEST EFFORTS	197841.18
Pawlenty, Timothy	30902	Pawlenty, Timothy	PRESIDENT	211708.92
Pawlenty, Timothy	30517	Pawlenty, Timothy	ATTORNEY	238331.10
Pawlenty, Timothy	30978	Pawlenty, Timothy	RETIRED	584195.00
Pawlenty, Timothy	30750	Pawlenty, Timothy	HOMEMAKER	769514.95
Perry, Rick	31205	Perry, Rick	ATTORNEY	768778.80
Perry, Rick	32352	Perry, Rick	PRESIDENT	845150.00
Perry, Rick	32546	Perry, Rick	RETIRED	1091671.00
Perry, Rick	31880	Perry, Rick	HOMEMAKER	1818443.00
Perry, Rick	31916	Perry, Rick	INFORMATION REQUESTED PER BEST EFFORTS	1975508.76
Roemer, Charles E. 'Buddy' III	34273	Roemer, Charles E. 'Buddy' III	STUDENT	6733.34
Roemer, Charles E. 'Buddy' III	33538	Roemer, Charles E. 'Buddy' III	HOMEMAKER	8195.00
Roemer, Charles E. 'Buddy' III	33004	Roemer, Charles E. 'Buddy' III	ATTORNEY	14186.00
Roemer, Charles E. 'Buddy' III	33584	Roemer, Charles E. 'Buddy' III	INFORMATION REQUESTED	18707.37
Roemer, Charles E. 'Buddy' III	34125	Roemer, Charles E. 'Buddy' III	RETIRED	73000.31
Romney, Mitt	37415	Romney, Mitt	PRESIDENT	1522751.99
Romney, Mitt	34633	Romney, Mitt	ATTORNEY	3662610.21
Romney, Mitt	36325	Romney, Mitt	HOMEMAKER	5442013.48
Romney, Mitt	36419	Romney, Mitt	INFORMATION REQUESTED PER BEST EFFORTS	6394753.06
Romney, Mitt	37919	Romney, Mitt	RETIRED	6483596.24
Santorum, Rick	38865	Santorum, Rick	ATTORNEY	107130.00
Santorum, Rick	39189	Santorum, Rick	EXECUTIVE	131456.55
Santorum, Rick	39350	Santorum, Rick	INFORMATION REQUESTED (BEST EFFORTS)	284718.10
Santorum, Rick	39314	Santorum, Rick	HOMEMAKER	350597.75
Santorum, Rick	39822	Santorum, Rick	RETIRED	421952.98

In [49]:

df._data   # Internal data structure. The back data frame. Having hierarchical indexing medicates a lot of the need for higher dimensional objects (because they're very sparse).

Out[49]:

BlockManager
Items: array([Democrat, Reform, Republican], dtype=object)
Axis 1: <class 'pandas.tseries.index.DatetimeIndex'>
[2012-01-27 00:00:00, ..., 2012-01-31 00:00:00]
Length: 5, Freq: None, Timezone: None
FloatBlock: array([Democrat, Reform, Republican], dtype=object), 3 x 5, dtype float64

In [ ]:

# Time Series for financial data will be improved next.