%pylab inline

import numpy as np
import matplotlib.pyplot as plt
from pandas import *
from sklearn.manifold import MDS
from statsmodels.iolib.foreign import genfromdta
import os
from sklearn.metrics.pairwise import euclidean_distances
import re
import math

# Directory holding the roll-call CSV files, resolved against the current
# working directory.
datdir = os.path.abspath(os.path.join('data', 'roll_call'))
# os.listdir() returns entries in arbitrary order; sort them so the files
# are always loaded (and the resulting dict is always populated) in the
# same order from run to run.
csv_files = [os.path.join(datdir, f) for f in sorted(os.listdir(datdir))
             if f.endswith('.csv')]

# Maps Congress number (int) -> DataFrame read from that Congress's CSV.
roll_call = {}
for f in csv_files:
    # Match against the file name only: searching the full path would let
    # a directory name containing 'sen' or 'kh' corrupt the match. The
    # Congress number is the run of digits between 'sen' and 'kh'.
    s = re.search(r'sen(\d+)kh', os.path.basename(f))
    if s is None:
        raise ValueError(
            "Cannot extract a Congress number from file name %r; "
            "expected a name containing 'sen<number>kh'." % f)
    congnum = int(s.group(1))
    roll_call[congnum] = read_csv(f)

def roll_call_simplified(df):
    """Return a copy of a roll-call table with vote codes collapsed.

    Rows whose ``state`` value is 99 or greater are dropped (this removes
    the vice-president), and in every vote column the detailed codes are
    recoded as follows:

    * 1, 2, 3 ->  1
    * 4, 5, 6 -> -1
    * 7, 8, 9, 0 -> 0

    (Presumably these stand for yea / nay / no vote cast -- confirm against
    the data set's codebook.) Values that are not one of the codes above
    are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Roll-call data with a ``state`` column and vote columns whose
        names start with 'v' or 'V'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the filtered rows and recoded votes. The
        input frame is not modified.
    """
    vote_codes = {1:  1,
                  2:  1,
                  3:  1,
                  4: -1,
                  5: -1,
                  6: -1,
                  7:  0,
                  8:  0,
                  9:  0,
                  0:  0}

    # Remove the vice-president. Take an explicit copy: the boolean-mask
    # selection is otherwise tracked by pandas as derived from `df`, and
    # assigning into it below triggers SettingWithCopyWarning.
    no_pres = df[df.state < 99].copy()

    # Find the columns with vote data. These are typically 'V1', 'V915',
    # etc., but some datasets appear to have them as 'var1', 'var919', etc.
    # Checking that the first letter is 'v' or 'V' will find these (and
    # only these) columns.
    vote_cols = [c for c in df.columns if c[0].upper() == 'V']

    # Would like to just call replace on the whole df, but doesn't seem
    # to work, so we'll apply replace to each column.
    no_pres[vote_cols] = no_pres[vote_cols].apply(
        lambda col: col.replace(vote_codes))
    return no_pres

# Replace each Congress's raw table with its simplified version. Only the
# values of existing keys are reassigned, so the dict can be updated while
# it is being iterated.
for cong, raw_votes in roll_call.items():
    roll_call[cong] = roll_call_simplified(raw_votes)

# Maps Congress number -> matrix of pairwise Euclidean distances between
# the rows of that Congress's vote columns.
distance = {}
for cong, simplified in roll_call.items():
    # Vote columns are the ones whose name starts with 'v' or 'V'.
    vote_cols = [c for c in simplified.columns if c[0].upper() == 'V']
    vote_matrix = simplified[vote_cols].values
    distance[cong] = euclidean_distances(vote_matrix)

# Name plot for a single Congress: the 110th.
rc110 = roll_call[110]
dist110 = distance[110]
# Positional (row-number) indices of the senators of each party. They are
# used below to index both `names` and the rows of `mds`.
dems = np.where(rc110.party == 100)[0]
repubs = np.where(rc110.party == 200)[0]
names = rc110.name.values

# dist110 is already a matrix of pairwise distances, so MDS has to be told
# to use it as-is. With the default setting MDS would treat each row of
# the matrix as a feature vector and compute Euclidean distances between
# those rows instead.
mds = MDS(dissimilarity='precomputed').fit_transform(dist110)

plt.figure(figsize=(8, 5))
# Plot invisible points so the axes are scaled to the data; the visible
# marks are the name annotations added below.
plt.plot(mds[:, 0], mds[:, 1], '.', alpha=0)
plt.title('MDS analysis of Senators in the 110th Congress')
# Write each senator's name at their MDS coordinates: dems in blue,
# repubs in red.
for members, color, alpha in ((dems, 'blue', 0.3), (repubs, 'red', 0.5)):
    for i in members:
        plt.annotate(names[i], (mds[i, 0], mds[i, 1]),
                     color=color,
                     alpha=alpha,
                     horizontalalignment='center',
                     verticalalignment='center',
                     family='sans-serif')

# Hide both axes.
plt.setp(plt.gca().get_yaxis(), visible=False)
plt.setp(plt.gca().get_xaxis(), visible=False)

# Grid of MDS plots, one panel per Congress.
fig, ax = plt.subplots(nrows=3, ncols=4, figsize=(20, 10))
plt.subplots_adjust(hspace=0, wspace=0)
panels = ax.ravel()
# Walk the Congresses in ascending numeric order so the panel layout does
# not depend on the order in which the files happened to be loaded.
congresses = sorted(roll_call)
for a, cong in zip(panels, congresses):
    rc = roll_call[cong]
    dist = distance[cong]
    # Positional row indices of each party's senators.
    dems = np.where(rc.party == 100)[0]
    repubs = np.where(rc.party == 200)[0]

    # `dist` is already a pairwise distance matrix; tell MDS to use it
    # directly rather than treating its rows as feature vectors.
    mds = MDS(dissimilarity='precomputed').fit_transform(dist)

    a.plot(mds[dems, 0], mds[dems, 1], '.b', mfc='white')
    a.plot(mds[repubs, 0], mds[repubs, 1], 'xr')
    # Label which Congress is being plotted
    a.text(0, .99, str(cong), transform=a.transAxes,
           verticalalignment='top', fontsize=14)
    # Turn off ticklabels
    plt.setp(a.get_yaxis().get_ticklabels(), visible=False)
    plt.setp(a.get_xaxis().get_ticklabels(), visible=False)

# Turn off every panel that received no data (with 11 Congresses this is
# just the 12th subplot).
for a in panels[len(congresses):]:
    plt.setp(a, visible=False)