from pandas import *
import matplotlib as mpl
mpl.rc('figure', figsize=(12, 6))
set_printoptions(max_columns=18)
months = {'JAN' : 1, 'FEB' : 2, 'MAR' : 3, 'APR' : 4, 'MAY' : 5, 'JUN' : 6,
'JUL' : 7, 'AUG' : 8, 'SEP' : 9, 'OCT': 10, 'NOV': 11, 'DEC' : 12}
of_interest = ['Obama, Barack', 'Romney, Mitt', 'Santorum, Rick',
'Paul, Ron', 'Gingrich, Newt']
parties = {
'Bachmann, Michelle': 'Republican',
'Romney, Mitt': 'Republican',
'Obama, Barack': 'Democrat',
"Roemer, Charles E. 'Buddy' III": 'Reform',
'Pawlenty, Timothy': 'Republican',
'Johnson, Gary Earl': 'Libertarian',
'Paul, Ron': 'Republican',
'Santorum, Rick': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Huntsman, Jon': 'Republican',
'Perry, Rick': 'Republican'
}
fec = read_csv('P00000001-ALL.txt')
fec
<class 'pandas.core.frame.DataFrame'> Int64Index: 536041 entries, 0 to 536040 Data columns: cmte_id 536041 non-null values cand_id 536041 non-null values cand_nm 536041 non-null values contbr_nm 536041 non-null values contbr_city 536026 non-null values contbr_st 536040 non-null values contbr_zip 535973 non-null values contbr_employer 531417 non-null values contbr_occupation 531448 non-null values contb_receipt_amt 536041 non-null values contb_receipt_dt 536041 non-null values receipt_desc 8479 non-null values memo_cd 49718 non-null values memo_text 52740 non-null values form_tp 536041 non-null values file_num 536041 non-null values dtypes: float64(1), int64(1), object(14)
parties # dictionary
{'Bachmann, Michelle': 'Republican', 'Cain, Herman': 'Republican', 'Gingrich, Newt': 'Republican', 'Huntsman, Jon': 'Republican', 'Johnson, Gary Earl': 'Libertarian', 'McCotter, Thaddeus G': 'Republican', 'Obama, Barack': 'Democrat', 'Paul, Ron': 'Republican', 'Pawlenty, Timothy': 'Republican', 'Perry, Rick': 'Republican', "Roemer, Charles E. 'Buddy' III": 'Reform', 'Romney, Mitt': 'Republican', 'Santorum, Rick': 'Republican'}
fec.cand_nm
0 Bachmann, Michelle 1 Bachmann, Michelle 2 Bachmann, Michelle 3 Bachmann, Michelle 4 Bachmann, Michelle 5 Bachmann, Michelle 6 Bachmann, Michelle 7 Bachmann, Michelle 8 Bachmann, Michelle 9 Bachmann, Michelle 10 Bachmann, Michelle 11 Bachmann, Michelle 12 Bachmann, Michelle 13 Bachmann, Michelle 14 Bachmann, Michelle ... 536026 Perry, Rick 536027 Perry, Rick 536028 Perry, Rick 536029 Perry, Rick 536030 Perry, Rick 536031 Perry, Rick 536032 Perry, Rick 536033 Perry, Rick 536034 Perry, Rick 536035 Perry, Rick 536036 Perry, Rick 536037 Perry, Rick 536038 Perry, Rick 536039 Perry, Rick 536040 Perry, Rick Name: cand_nm, Length: 536041
fec['party'] = fec.cand_nm.map(parties) # map can take dictionary, series...
# Alternatively, use list comprehension: [parties[x] for x in fec.cand_nm]
fec.ix[0] # A single record.
cmte_id C00410118 cand_id P20002978 cand_nm Bachmann, Michelle contbr_nm HARVEY, WILLIAM contbr_city MOBILE contbr_st AL contbr_zip 366010290 contbr_employer RETIRED contbr_occupation RETIRED contb_receipt_amt 250 contb_receipt_dt 20-JUN-11 receipt_desc NaN memo_cd NaN memo_text NaN form_tp SA17A file_num 736166 party Republican Name: 0
fec.party.value_counts() # Doing it high level way using histogram.
Democrat 292400 Republican 237575 Reform 5364 Libertarian 702
np.unique(fec.party)
134010 Democrat 361727 Libertarian 356462 Reform 6067 Republican Name: party
fec.groupby('party')['contb_receipt_amt'].sum()
party Democrat 8.105758e+07 Libertarian 4.132769e+05 Reform 3.390338e+05 Republican 1.192255e+08 Name: contb_receipt_amt
fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum() # Not a python date object.
party contb_receipt_dt Democrat 01-AUG-11 175281.00 01-DEC-11 651532.82 01-JAN-12 58098.80 01-JUL-11 165961.00 01-JUN-11 145459.00 01-MAY-11 82644.00 01-NOV-11 122529.87 01-OCT-11 148977.00 01-SEP-11 403297.62 02-AUG-11 164510.11 02-DEC-11 216056.96 02-JAN-12 89743.60 02-JUL-11 17105.00 02-JUN-11 422453.00 02-MAY-11 396675.00 ... Republican 30-JUL-11 12249.04 30-JUN-11 2744932.63 30-MAR-11 50240.00 30-MAY-11 17803.60 30-NOV-11 809014.83 30-OCT-11 43913.16 30-SEP-11 4886331.76 31-AUG-11 1017735.02 31-DEC-11 1094376.72 31-JAN-11 6000.00 31-JAN-12 869890.41 31-JUL-11 12781.02 31-MAR-11 62475.00 31-MAY-11 301339.80 31-OCT-11 734601.83 Name: contb_receipt_amt, Length: 1183
fec.contb_receipt_dt[0]
'20-JUN-11'
print months
{'FEB': 2, 'AUG': 8, 'JAN': 1, 'DEC': 12, 'OCT': 10, 'MAR': 3, 'SEP': 9, 'MAY': 5, 'JUN': 6, 'JUL': 7, 'APR': 4, 'NOV': 11}
def convert_date(val): # function to convert date to python's date object.
d, m, y = val.split('-')
m = months[m]
return datetime(int ('20' + y), m, int(d))
fec.contb_receipt_dt # output a list of date (not yet in python objects).
fec['contb_receipt_dt'] = fec.contb_receipt_dt.map(convert_date)
fec.contb_receipt_dt[0]
datetime.datetime(2011, 6, 20, 0, 0)
fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum() # Same as above, however, date converted to python object.
party contb_receipt_dt Democrat 2011-04-04 640235.12 2011-04-05 307641.00 2011-04-06 247542.59 2011-04-07 252336.00 2011-04-08 295452.00 2011-04-09 105879.00 2011-04-10 100373.00 2011-04-11 360728.00 2011-04-12 405160.46 2011-04-13 382836.33 2011-04-14 306643.50 2011-04-15 408377.00 2011-04-16 82658.00 2011-04-17 70802.00 2011-04-18 501971.37 ... Republican 2012-01-17 625365.77 2012-01-18 888681.17 2012-01-19 1066250.23 2012-01-20 401298.03 2012-01-21 374261.81 2012-01-22 507168.71 2012-01-23 645477.15 2012-01-24 462233.66 2012-01-25 416931.39 2012-01-26 256406.86 2012-01-27 368441.82 2012-01-28 82775.80 2012-01-29 75220.02 2012-01-30 255204.80 2012-01-31 869890.41 Name: contb_receipt_amt, Length: 1183
by_date = fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()
by_date.unstack('party') # Indexes are dates. Columns are political parties.
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 376 entries, 2011-01-01 00:00:00 to 2012-01-31 00:00:00 Data columns: Democrat 303 non-null values Libertarian 190 non-null values Reform 314 non-null values Republican 376 non-null values dtypes: float64(4)
by_date.unstack('party').cumsum().plot()
<matplotlib.axes.AxesSubplot at 0x106e6ef90>
s = by_date.unstack('party')[-5:].stack()
s # Note the inner level indexes. Not every date has 3 parties. Top level indexes are dates. stack() makes 1 dimensional.
# Labels propagated down for all 3 observations: political parties.
contb_receipt_dt party 2012-01-27 Democrat 305785.47 Reform 3176.37 Republican 368441.82 2012-01-28 Democrat 235492.85 Reform 175.00 Republican 82775.80 2012-01-29 Democrat 93177.00 Reform 200.00 Republican 75220.02 2012-01-30 Democrat 435921.72 Reform 130.00 Republican 255204.80 2012-01-31 Democrat 1418410.31 Reform 150.00 Republican 869890.41
set_printoptions(notebook_repr_html=False) # turn off html, i.e., without table.
s.unstack() # unstack() 2 dimensional.
party Democrat Reform Republican contb_receipt_dt 2012-01-27 305785.47 3176.37 368441.82 2012-01-28 235492.85 175.00 82775.80 2012-01-29 93177.00 200.00 75220.02 2012-01-30 435921.72 130.00 255204.80 2012-01-31 1418410.31 150.00 869890.41
s.unstack().stack() # 2 dimensional to 1 dimensional
contb_receipt_dt party 2012-01-27 Democrat 305785.47 Reform 3176.37 Republican 368441.82 2012-01-28 Democrat 235492.85 Reform 175.00 Republican 82775.80 2012-01-29 Democrat 93177.00 Reform 200.00 Republican 75220.02 2012-01-30 Democrat 435921.72 Reform 130.00 Republican 255204.80 2012-01-31 Democrat 1418410.31 Reform 150.00 Republican 869890.41
df = s.unstack('party') # data frame
s.unstack('contb_receipt_dt')
contb_receipt_dt 2012-01-27 2012-01-28 2012-01-29 2012-01-30 2012-01-31 party Democrat 305785.47 235492.85 93177.00 435921.72 1418410.31 Reform 3176.37 175.00 200.00 130.00 150.00 Republican 368441.82 82775.80 75220.02 255204.80 869890.41
concat([df, df], axis=1, keys=['A', 'B']) # join 2 copies of data frame.
# standard hierarchical indexing stuff.
A B party Democrat Reform Republican Democrat Reform Republican contb_receipt_dt 2012-01-27 305785.47 3176.37 368441.82 305785.47 3176.37 368441.82 2012-01-28 235492.85 175.00 82775.80 235492.85 175.00 82775.80 2012-01-29 93177.00 200.00 75220.02 93177.00 200.00 75220.02 2012-01-30 435921.72 130.00 255204.80 435921.72 130.00 255204.80 2012-01-31 1418410.31 150.00 869890.41 1418410.31 150.00 869890.41
concat([df, df], axis=1, keys=['A', 'B'])['A'] # select out columns by the first key 'A'. Or select out the 'B' group.conc
party Democrat Reform Republican contb_receipt_dt 2012-01-27 305785.47 3176.37 368441.82 2012-01-28 235492.85 175.00 82775.80 2012-01-29 93177.00 200.00 75220.02 2012-01-30 435921.72 130.00 255204.80 2012-01-31 1418410.31 150.00 869890.41
concat([df, df], axis=1, keys=['A', 'B']).stack('party')
A B contb_receipt_dt party 2012-01-27 Democrat 305785.47 305785.47 Reform 3176.37 3176.37 Republican 368441.82 368441.82 2012-01-28 Democrat 235492.85 235492.85 Reform 175.00 175.00 Republican 82775.80 82775.80 2012-01-29 Democrat 93177.00 93177.00 Reform 200.00 200.00 Republican 75220.02 75220.02 2012-01-30 Democrat 435921.72 435921.72 Reform 130.00 130.00 Republican 255204.80 255204.80 2012-01-31 Democrat 1418410.31 1418410.31 Reform 150.00 150.00 Republican 869890.41 869890.41
concat([df, df], axis=1, keys=['A', 'B']).stack(0) # The group indexes have no labels.
party Democrat Reform Republican contb_receipt_dt 2012-01-27 A 305785.47 3176.37 368441.82 B 305785.47 3176.37 368441.82 2012-01-28 A 235492.85 175.00 82775.80 B 235492.85 175.00 82775.80 2012-01-29 A 93177.00 200.00 75220.02 B 93177.00 200.00 75220.02 2012-01-30 A 435921.72 130.00 255204.80 B 435921.72 130.00 255204.80 2012-01-31 A 1418410.31 150.00 869890.41 B 1418410.31 150.00 869890.41
result = fec.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum() # Top contributors to candidates by occupation.
result # is a data frame instead of series.
cand_nm contbr_occupation Bachmann, Michelle 100% DISABLED VETERAN 1869.50 100% DISABLED VIETNAM VETERAN 236.25 A/C AND HEAT 250.00 ACADEMIC EMPLOYMENT SPECIALIST 25.00 ACCOUNT 50.00 ACCOUNT ANALYST 200.00 ACCOUNT EXECUTIVE 100.00 ACCOUNT MANAGER 230.00 ACCOUNTANT 12979.00 ACCOUNTANT TRAINER 250.00 ACCOUNTING 350.00 ACCOUNTING MANAGER 50.00 ACCOUNTING SPECIALIST 250.00 ACCOUNTING SUPERVISOR 125.00 ACCOUNTING/AUDITING 700.00 ... Santorum, Rick WILDLIFE REFUGE MANAGER 200.00 WIND FARM DEVELOPER 400.00 WINDOW CLEANER 85.00 WINERY 300.00 WIRELESS NETWORK ENGINEER 220.15 WITHHELD 1000.00 WNG 700.00 WORKER II 300.00 WRITER 4894.55 WRITER ACTIVIST 1112.00 WRITER EDITOR 250.00 WRITER/ACCOUNT EXEC 500.00 WRITER/JOURNALIST 250.00 WRITER/LAWYER 300.00 YOUTH MINISTER 250.00 Name: contb_receipt_amt, Length: 40171
result[:, 'ZOMBIE SLAYER']
cand_nm Paul, Ron 1556 Name: contb_receipt_amt
result = fec.groupby(['cand_nm', 'contbr_occupation']).sum() # When doing this aggregation, we don't need to select the columns as above.
# There are columns that are not possible to sum. The code actually tries to sum them all: nuisance columns. Whenever it fails, it just silently drops them out.
<class 'pandas.core.frame.DataFrame'> MultiIndex: 40171 entries, ('Bachmann, Michelle', '100% DISABLED VETERAN') to ('Santorum, Rick', 'YOUTH MINISTER') Data columns: contb_receipt_amt 40171 non-null values file_num 40171 non-null values dtypes: float64(2)
result = result.reset_index() # opposite of set_index(). Took 'cand_nm' and 'contbr_occupation' and set them as col indexes again, so that the indexes are just simple integer indexes.
result
<class 'pandas.core.frame.DataFrame'> Int64Index: 40171 entries, 0 to 40170 Data columns: cand_nm 40171 non-null values contbr_occupation 40171 non-null values contb_receipt_amt 40171 non-null values dtypes: float64(1), object(2)
# We have so far aggregated total amount donated to each political party by occupation (the first groupby).
# Additional groupby let us split them up by candidate, sort them in descending order by total amount donated and take the top 5.
set_printoptions(notebook_repr_html=True)
def top5_donors(group):
return group.sort_index(by='contb_receipt_amt')[-5:]
# top5_donors(result)
result.groupby('cand_nm').apply(top5_donors) # The retired donated the most to Barack Obama.
cand_nm | contbr_occupation | contb_receipt_amt | ||
---|---|---|---|---|
cand_nm | None | |||
Bachmann, Michelle | 921 | Bachmann, Michelle | PRESIDENT | 68781.00 |
Bachmann, Michelle | 893 | Bachmann, Michelle | PHYSICIAN | 76134.00 |
Bachmann, Michelle | 555 | Bachmann, Michelle | HOMEMAKER | 102540.00 |
Bachmann, Michelle | 594 | Bachmann, Michelle | INFORMATION REQUESTED | 147638.89 |
Bachmann, Michelle | 1059 | Bachmann, Michelle | RETIRED | 824618.05 |
Cain, Herman | 1537 | Cain, Herman | CEO | 125097.00 |
Cain, Herman | 2227 | Cain, Herman | PHYSICIAN | 125790.96 |
Cain, Herman | 1898 | Cain, Herman | HOMEMAKER | 153964.07 |
Cain, Herman | 2391 | Cain, Herman | RETIRED | 779635.51 |
Cain, Herman | 1929 | Cain, Herman | INFORMATION REQUESTED | 3838719.32 |
Gingrich, Newt | 4068 | Gingrich, Newt | OWNER | 273310.00 |
Gingrich, Newt | 4222 | Gingrich, Newt | PRESIDENT | 332403.00 |
Gingrich, Newt | 3561 | Gingrich, Newt | HOMEMAKER | 426118.26 |
Gingrich, Newt | 3611 | Gingrich, Newt | INFORMATION REQUESTED PER BEST EFFORTS | 910147.00 |
Gingrich, Newt | 4484 | Gingrich, Newt | RETIRED | 2279602.27 |
Huntsman, Jon | 5470 | Huntsman, Jon | PRESIDENT | 92251.00 |
Huntsman, Jon | 4984 | Huntsman, Jon | ATTORNEY | 143532.50 |
Huntsman, Jon | 5288 | Huntsman, Jon | HOMEMAKER | 241524.79 |
Huntsman, Jon | 5563 | Huntsman, Jon | RETIRED | 352669.00 |
Huntsman, Jon | 5301 | Huntsman, Jon | INFORMATION REQUESTED PER BEST EFFORTS | 354500.00 |
Johnson, Gary Earl | 5773 | Johnson, Gary Earl | HOUSEWIFE | 11000.00 |
Johnson, Gary Earl | 5855 | Johnson, Gary Earl | SELF-EMPLOYED | 14750.00 |
Johnson, Gary Earl | 5735 | Johnson, Gary Earl | CONSULTANT | 16767.76 |
Johnson, Gary Earl | 5853 | Johnson, Gary Earl | SELF EMPLOYED | 18781.00 |
Johnson, Gary Earl | 5845 | Johnson, Gary Earl | RETIRED | 85763.96 |
McCotter, Thaddeus G | 5906 | McCotter, Thaddeus G | HOMEMAKER | 2400.00 |
McCotter, Thaddeus G | 5918 | McCotter, Thaddeus G | RETIRED | 2450.00 |
McCotter, Thaddeus G | 5904 | McCotter, Thaddeus G | GOVERNMENT | 2500.00 |
McCotter, Thaddeus G | 5912 | McCotter, Thaddeus G | PHYSICIAN | 3650.00 |
McCotter, Thaddeus G | 5902 | McCotter, Thaddeus G | EXECUTIVE | 3750.00 |
Obama, Barack | 16658 | Obama, Barack | PHYSICIAN | 2244445.92 |
Obama, Barack | 13527 | Obama, Barack | INFORMATION REQUESTED | 2647247.90 |
Obama, Barack | 13113 | Obama, Barack | HOMEMAKER | 2780203.60 |
Obama, Barack | 7091 | Obama, Barack | ATTORNEY | 7112343.35 |
Obama, Barack | 18862 | Obama, Barack | RETIRED | 15086766.92 |
Paul, Ron | 25806 | Paul, Ron | HOMEMAKER | 310823.42 |
Paul, Ron | 28349 | Paul, Ron | REQUESTED | 381440.72 |
Paul, Ron | 27649 | Paul, Ron | PHYSICIAN | 509300.85 |
Paul, Ron | 24957 | Paul, Ron | ENGINEER | 612693.61 |
Paul, Ron | 28461 | Paul, Ron | RETIRED | 1799513.76 |
Pawlenty, Timothy | 30758 | Pawlenty, Timothy | INFORMATION REQUESTED PER BEST EFFORTS | 197841.18 |
Pawlenty, Timothy | 30902 | Pawlenty, Timothy | PRESIDENT | 211708.92 |
Pawlenty, Timothy | 30517 | Pawlenty, Timothy | ATTORNEY | 238331.10 |
Pawlenty, Timothy | 30978 | Pawlenty, Timothy | RETIRED | 584195.00 |
Pawlenty, Timothy | 30750 | Pawlenty, Timothy | HOMEMAKER | 769514.95 |
Perry, Rick | 31205 | Perry, Rick | ATTORNEY | 768778.80 |
Perry, Rick | 32352 | Perry, Rick | PRESIDENT | 845150.00 |
Perry, Rick | 32546 | Perry, Rick | RETIRED | 1091671.00 |
Perry, Rick | 31880 | Perry, Rick | HOMEMAKER | 1818443.00 |
Perry, Rick | 31916 | Perry, Rick | INFORMATION REQUESTED PER BEST EFFORTS | 1975508.76 |
Roemer, Charles E. 'Buddy' III | 34273 | Roemer, Charles E. 'Buddy' III | STUDENT | 6733.34 |
Roemer, Charles E. 'Buddy' III | 33538 | Roemer, Charles E. 'Buddy' III | HOMEMAKER | 8195.00 |
Roemer, Charles E. 'Buddy' III | 33004 | Roemer, Charles E. 'Buddy' III | ATTORNEY | 14186.00 |
Roemer, Charles E. 'Buddy' III | 33584 | Roemer, Charles E. 'Buddy' III | INFORMATION REQUESTED | 18707.37 |
Roemer, Charles E. 'Buddy' III | 34125 | Roemer, Charles E. 'Buddy' III | RETIRED | 73000.31 |
Romney, Mitt | 37415 | Romney, Mitt | PRESIDENT | 1522751.99 |
Romney, Mitt | 34633 | Romney, Mitt | ATTORNEY | 3662610.21 |
Romney, Mitt | 36325 | Romney, Mitt | HOMEMAKER | 5442013.48 |
Romney, Mitt | 36419 | Romney, Mitt | INFORMATION REQUESTED PER BEST EFFORTS | 6394753.06 |
Romney, Mitt | 37919 | Romney, Mitt | RETIRED | 6483596.24 |
Santorum, Rick | 38865 | Santorum, Rick | ATTORNEY | 107130.00 |
Santorum, Rick | 39189 | Santorum, Rick | EXECUTIVE | 131456.55 |
Santorum, Rick | 39350 | Santorum, Rick | INFORMATION REQUESTED (BEST EFFORTS) | 284718.10 |
Santorum, Rick | 39314 | Santorum, Rick | HOMEMAKER | 350597.75 |
Santorum, Rick | 39822 | Santorum, Rick | RETIRED | 421952.98 |
df._data # Internal data structure. The back data frame. Having hierarchical indexing medicates a lot of the need for higher dimensional objects (because they're very sparse).
BlockManager Items: array([Democrat, Reform, Republican], dtype=object) Axis 1: <class 'pandas.tseries.index.DatetimeIndex'> [2012-01-27 00:00:00, ..., 2012-01-31 00:00:00] Length: 5, Freq: None, Timezone: None FloatBlock: array([Democrat, Reform, Republican], dtype=object), 3 x 5, dtype float64
# Time Series for financial data will be improved next.