%pylab inline import numpy as np import matplotlib.pyplot as plt from pandas import * from sklearn.manifold import MDS from statsmodels.iolib.foreign import genfromdta import os from sklearn.metrics.pairwise import euclidean_distances import re import math datdir = os.path.abspath(os.path.join('data', 'roll_call')) csv_files = [os.path.join(datdir, f) for f in os.listdir(datdir) if f.endswith('.csv')] roll_call = {} for f in csv_files: s = re.search('sen(.*)kh', f) congnum = int(f[(s.start()+3):(s.end()-2)]) roll_call[congnum] = read_csv(f) def roll_call_simplified(df): vote_codes = {1: 1, 2: 1, 3: 1, 4: -1, 5: -1, 6: -1, 7: 0, 8: 0, 9: 0, 0: 0} # Remove the vice-president. no_pres = df[df.state < 99] # Find the columns with vote data. These are typically 'V1', 'V915', etc., # but some dataset appear to have them as 'var1', 'var919', etc. # Checking that the first letter is 'v' or 'V' will find these (and only) # these columns. vote_cols = [c for c in df.columns if c[0].upper() == 'V'] # Would like to just call replace on the whole df, but doesn't seem # to work, so we'll apply replace to each column. no_pres[vote_cols] = no_pres[vote_cols].apply(lambda x: x.replace(vote_codes)) return no_pres for cong in roll_call: roll_call[cong] = roll_call_simplified(roll_call[cong]) distance = {} for cong in roll_call: vote_cols = [c for c in roll_call[cong].columns if c[0].upper() == 'V' ] distance[cong] = euclidean_distances(roll_call[cong][vote_cols].values) rc110 = roll_call[110] dist110 = distance[110] dems = np.where(rc110.party == 100)[0] repubs = np.where(rc110.party == 200)[0] names = roll_call[110].name.values mds = MDS().fit_transform(dist110) plt.figure(figsize = (8, 5)) # Plot invisible points to annotate with names plt.plot(mds[:, 0], mds[:, 1], '.', alpha = 0) plt.title('MDS analysis of Senators in the 110th Congress') # Plot dem names for i in dems: plt.annotate(names[i], (mds[i, 0], mds[i, 1]), color = 'blue', alpha = 0.3, horizontalalignment = 'center', verticalalignment = 'center', family = 'sans-serif') # Plot repub names for i in repubs: plt.annotate(names[i], (mds[i, 0], mds[i, 1]), color = 'red', alpha = 0.5, horizontalalignment = 'center', verticalalignment = 'center', family = 'sans-serif') # Turn off axes labels. plt.setp(plt.gca().get_yaxis(), visible = False) plt.setp(plt.gca().get_xaxis(), visible = False) fig, ax = plt.subplots(nrows = 3, ncols = 4, figsize = (20, 10)) plt.subplots_adjust(hspace = 0, wspace = 0) for a, cong in zip(ax.ravel(), roll_call): rc = roll_call[cong] dist = distance[cong] dems = np.where(rc.party == 100)[0] repubs = np.where(rc.party == 200)[0] mds = MDS().fit_transform(dist) a.plot(mds[dems, 0], mds[dems, 1], '.b', mfc = 'white') a.plot(mds[repubs, 0], mds[repubs, 1], 'xr') # Label which Congress is being plotted a.text(0, .99, cong, transform = a.transAxes, verticalalignment = 'top', fontsize = 14) # Turn off ticklabels plt.setp(a.get_yaxis().get_ticklabels(), visible = False) plt.setp(a.get_xaxis().get_ticklabels(), visible = False) # Turn off the 12th subplot plt.setp(ax[2, 3], visible = False)