%matplotlib inline
import json
import numpy as np
import networkx as nx
import requests
from pattern import web
import matplotlib.pyplot as plt
import ast
from itertools import combinations, permutations
import operator
# set some nicer defaults for matplotlib
from matplotlib import rcParams
# "Dark2" qualitative palette from colorbrewer2.org; each entry is an RGB
# triplet scaled to [0, 1].  Reused throughout the notebook for bar colors.
dark2_colors = [
    (0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
    (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
    (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
    (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
    (0.4, 0.6509803921568628, 0.11764705882352941),
    (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
    (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
    (0.4, 0.4, 0.4),
]

# Apply nicer global matplotlib defaults in one shot.
rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 150,
    'axes.color_cycle': dark2_colors,
    'lines.linewidth': 2,
    'axes.grid': False,
    'axes.facecolor': 'white',
    'font.size': 14,
    'patch.edgecolor': 'none',
})
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()

    # Show or hide each spine according to its keyword flag.
    for side, shown in (('top', top), ('right', right),
                        ('left', left), ('bottom', bottom)):
        ax.spines[side].set_visible(shown)

    # Wipe out every tick first...
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # ...then restore ticks only on the borders that stay visible
    # (same order as the original: a later call wins if both are set).
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
The website govtrack.us collects data on activities in the Senate and House of Representatives. It's a great source of information for making data-driven assessments about Congress.
The directories at http://www.govtrack.us/data/congress/113/votes/2013 contain JSON information about every vote cast for the current (113th) Congress. Subdirectories beginning with "S" correspond to Senate votes, while subdirectories beginning with "H" correspond to House votes.
Below are two functions. One that downloads and parses a single Senate vote page given the vote number, and another that repeatedly calls this function to build a full collection of Senate votes from the 113th Congress.
"""
Function
--------
get_senate_vote
Scrapes a single JSON page for a particular Senate vote, given by the vote number
Parameters
----------
vote : int
The vote number to fetch
Returns
-------
json : dict
The JSON-decoded dictionary for that vote
Examples
--------
>>> get_senate_vote(11)['bill']
{u'congress': 113,
u'number': 325,
u'title': u'A bill to ensure the complete and timely payment of the obligations of the United States Government until May 19, 2013, and for other purposes.',
u'type': u'hr'}
"""
def get_senate_vote(vote):
    """
    Scrape the JSON page for a single Senate vote.

    Parameters
    ----------
    vote : int
        The vote number to fetch

    Returns
    -------
    dict
        The JSON-decoded dictionary for that vote

    Raises
    ------
    ValueError
        If the page for `vote` is not valid JSON (i.e. the vote number
        does not exist).  ValueError subclasses Exception, so existing
        `except Exception` callers still work.
    """
    # NOTE: currently pointed at the 101st Congress (1989 session);
    # swap in the commented URL to scrape the 113th Congress instead.
    # url = "https://www.govtrack.us/data/congress/113/votes/2013/s{}/data.json".format(vote)
    url = "https://www.govtrack.us/data/congress/101/votes/1989/s{}/data.json".format(vote)
    # A timeout keeps one dead connection from hanging the whole scrape.
    page = requests.get(url, timeout=30).text
    try:
        return json.loads(page)
    except ValueError:
        # govtrack serves an HTML 404 page (not JSON) for missing votes.
        raise ValueError("Not a valid vote number: {}".format(vote))
"""
Function
--------
get_all_votes
Scrapes all the Senate votes from http://www.govtrack.us/data/congress/113/votes/2013,
and returns a list of dicts
Parameters
-----------
None
Returns
--------
vote_dicts : list of dicts
List of JSON-parsed dicts for each senate vote
"""
def get_all_votes():
    """
    Scrape every Senate vote, starting at vote 1 and stopping at the
    first vote number whose page fails to parse.

    Returns
    -------
    list of dicts
        One JSON-parsed dict per Senate vote
    """
    collected = []
    number = 1
    while True:
        try:
            collected.append(get_senate_vote(number))
        except Exception:
            # The first missing vote number marks the end of the data.
            return collected
        number += 1
# Download every Senate vote (slow: one HTTP request per vote).
vote_data = get_all_votes()
Now we turn these data into a NetworkX graph according to the spec below.
"""
Function
--------
vote_graph
Parameters
----------
data : list of dicts
The vote database returned from get_all_votes
Returns
-------
graph : NetworkX Graph object, with the following properties
1. Each node in the graph is labeled using the `display_name` of a Senator (e.g., 'Lee (R-UT)')
2. Each node has a `color` attribute set to 'r' for Republicans,
'b' for Democrats, and 'k' for Independent/other parties.
3. The edges between two nodes are weighted by the number of
times two senators have cast the same Yea or Nay vote
4. Each edge also has a `difference` attribute, which is set to `1 / weight`.
Examples
--------
>>> graph = vote_graph(vote_data)
>>> graph.node['Lee (R-UT)']
{'color': 'r'} # attributes for this senator
>>> len(graph['Lee (R-UT)']) # connections to other senators
101
>>> graph['Lee (R-UT)']['Baldwin (D-WI)'] # edge relationship between Lee and Baldwin
{'difference': 0.02, 'weight': 50}
"""
def vote_graph(data):
    """
    Build a NetworkX graph of senators from the vote database.

    Parameters
    ----------
    data : list of dicts
        The vote database returned from get_all_votes

    Returns
    -------
    graph : NetworkX Graph
        Nodes are senator `display_name`s with a `color` attribute
        ('blue' for Democrats, 'red' for Republicans, 'black' otherwise --
        color names Graphviz understands).  An edge between two senators
        is weighted by the number of times they cast the same Yea or Nay
        vote, and carries a `difference` attribute equal to 1 / weight.
    """
    graph = nx.Graph()

    all_senators = set()   # every display name seen in any roll call
    roll_calls = []        # one {vote_class: [names]} dict per vote
    for vote in data:
        roll_call = {}
        for key, value in vote['votes'].items():
            # The Vice President appears as the plain string 'VP'
            # (tie-breaking votes); skip it -- it isn't a senator dict.
            senators = [s['display_name'] for s in value if s != 'VP']
            roll_call[key] = senators
            all_senators.update(senators)
        roll_calls.append(roll_call)

    # Count shared Yea/Nay votes per unordered senator pair.  Keying on
    # the sorted tuple makes (A, B) and (B, A) the same entry, so no
    # try/except order-flipping is needed, and only pairs that actually
    # share a vote are ever stored.
    common_votes = {}
    for roll_call in roll_calls:
        # Some roll calls (e.g. voice or Guilty/Not Guilty votes) lack
        # 'Yea'/'Nay' classes entirely, hence the .get defaults.
        for vote_class in ('Yea', 'Nay'):
            for pair in combinations(roll_call.get(vote_class, []), 2):
                key = tuple(sorted(pair))
                common_votes[key] = common_votes.get(key, 0) + 1

    for senator in all_senators:
        # Party letter is the character after '(' in e.g. 'Lee (R-UT)'.
        party = senator.split()[1][1]
        if party == 'D':
            graph.add_node(senator, color='blue')
        elif party == 'R':
            graph.add_node(senator, color='red')
        else:
            graph.add_node(senator, color='black')

    # Every stored pair has a positive count, so no zero-weight edges
    # (and no division by zero) can occur here.
    for (a, b), votes in common_votes.items():
        graph.add_edge(a, b, weight=votes, difference=1.0 / votes)
    return graph
# Build the vote graph and export it for Gephi (101st Congress, 1989 session).
votes = vote_graph(vote_data)
nx.write_gexf(votes, 'votes-101-1989.gexf')
Network plots often look impressive, but creating sensible network plots is tricky. From Ben Fry, the author of the Processing program:
Usually a graph layout isn’t the best option for data sets larger than a few dozen nodes. You’re most likely to wind up with enormous spider webs or balls of string, and the mess seen so far is more often the case than not. Graphs can be a powerful way to represent relationships between data, but they are also a very abstract concept, which means that they run the danger of meaning something only to the creator of the graph. Often, simply showing the structure of the data says very little about what it actually means, even though it’s a perfectly accurate means of representing the data. Everything looks like a graph, but almost nothing should ever be drawn as one.
Here are bad and better ways of visualizing the senate vote network.
First, consider the "default" plot from NetworkX.
# this makes sure draw_spring results are the same at each call
np.random.seed(1)

# one color per node, taken from the senator's party attribute
node_colors = [votes.node[name]['color'] for name in votes.nodes()]

# determine position of each node using a spring layout
layout = nx.spring_layout(votes, iterations=200)

# draw edges (very faint), then nodes, then labels on top
nx.draw_networkx_edges(votes, layout, alpha=.05)
nx.draw_networkx_nodes(votes, layout, node_color=node_colors)
labels = nx.draw_networkx_labels(votes, layout, alpha=5, font_size=8)

# coordinate information is meaningless here, so let's remove it
plt.xticks([])
plt.yticks([])
remove_border(left=False, bottom=False)
The spring layout tries to group nodes with large edge-weights near to each other. In this context, that means it tries to organize the Senate into similarly-voting cliques. However, there's simply too much going on in this plot.
Here we compute the Minimum Spanning Tree of the graph, using the `difference` edge attribute as the weight to minimize.
# Minimum spanning tree of the vote graph, minimizing the 'difference'
# (1/weight) attribute so the strongest voting ties are kept.
mst = nx.minimum_spanning_tree(votes, weight='difference')

# this makes sure draw_spring results are the same at each call
np.random.seed(1)

# larger figure size makes graph easier to read
plt.figure(figsize=(25, 25))

node_colors = [mst.node[name]['color'] for name in mst.nodes()]

# use a Graphviz layout to determine node positions
graphviz_args = '-Goverlap=false -Gsep=+150,150'
pos = nx.graphviz_layout(mst, prog='neato', args=graphviz_args)

nx.draw_networkx_edges(mst, pos, alpha=.5)
nx.draw_networkx_nodes(mst, pos, node_color=node_colors)

# second layout call, shifted 95 points down, so labels are drawn below
# their nodes instead of directly on top of them
offset_pos = nx.graphviz_layout(mst, prog='neato', args=graphviz_args)
for name in offset_pos:
    offset_pos[name] = (offset_pos[name][0], offset_pos[name][1] - 95)
lbls = nx.draw_networkx_labels(mst, offset_pos, alpha=5, font_size=12)

# coordinate information is meaningless here, so let's remove it
plt.xticks([])
plt.yticks([])
remove_border(left=False, bottom=False)
# Closeness centrality over the full vote graph, using the 'difference'
# edge attribute as the distance between senators.
centralities = nx.closeness_centrality(votes, distance='difference')

# Rank (senator, score) pairs from most to least central.
get_score = operator.itemgetter(1)
centrality = sorted(centralities.items(), key=get_score, reverse=True)

senators = [pair[0] for pair in centrality]
scores = [round(pair[1], 4) for pair in centrality]
def visualize(title, senator_lst, score_lst, bar_color=dark2_colors[0], subplot=111, xmin=0, xmax=100):
    "For visualizing node/score data with a horizontal bar graph"
    bar_pos = np.arange(len(senator_lst))

    # Only open a fresh (tall) figure for the first subplot of a grid:
    # the third digit of e.g. 111 or 121 is the subplot index.
    if str(subplot)[2] == '1':
        plt.figure(figsize=(10, 30))
    plt.subplot(subplot)
    plt.title(title)
    plt.barh(bar_pos, score_lst, color=bar_color)

    # Write each score just past the end of its bar; tiny scores (e.g.
    # PageRank values) get a proportionally tiny offset.
    for p, senator, score in zip(bar_pos, senator_lst, score_lst):
        offset = 0.0001 if score < 0.5 else 1
        plt.annotate(str(score), xy=(score + offset, p + .5), va='center')

    # label each bar with its senator and blank out the x tick labels
    ticks = plt.yticks(bar_pos + .5, senator_lst)
    xt = plt.xticks()[0]
    plt.xticks(xt, [' '] * len(xt))

    # minimize chartjunk
    remove_border(left=False, bottom=False)
    plt.grid(axis='x', color='white', linestyle='-')

    # set plot limits (y is inverted so the top score sits at the top)
    plt.ylim(bar_pos.max() + 1, bar_pos.min())
    plt.xlim(xmin, xmax)
    plt.tight_layout()
# Plot the ranked centrality scores; xmax sits just above the top score.
visualize("Senator Centrality Scores - 113th Congress", senators, scores, xmax=scores[0]+1)
At the time of writing, the 5 senators with the highest centralities are Collins (R-ME), Manchin (D-WV), Pryor (D-AR), Hagan (D-NC), and Donnelly (D-IN). At the time of writing, the 5 senators with the lowest centralities are Kerry (D-MA), Booker (D-NJ), Lautenberg (D-NJ), Chiesa (R-NJ), and Markey (D-MA).
It appears that the majority of senators have a high to moderately high centrality score in the range of about 72.0 to 80.0. Kerry, Cowan, and Markey receive low centrality scores because we do not have as much information on these three senators as we do on senators who have served longer terms. Neither Kerry, Cowan, nor Markey has spent a significant amount of time in the 113th Congress: when Kerry left to join the Obama administration, Cowan acted as interim senator before Markey was elected to permanently replace Kerry. This means that the three men have not had the opportunity to cast many votes as part of this Congress, and we have not had much time to update our priors on their voting habits. On the relatively few occasions that Kerry, Cowan, and Markey have voted, they have voted with the Democratic party. As a result, these three senators appear more partisan than they may actually be, and are given low centrality scores. The same is true of Lautenberg (D-NJ), who died in office at the beginning of the 113th Congress, Chiesa (R-NJ), who was appointed interim senator after Lautenberg's death, and Booker (D-NJ), who was sworn in on October 31, 2013 to replace Lautenberg.
Centrality isn't a perfect proxy for bipartisanship, since it gauges how centralized a node is to the network as a whole, and not how similar a Democrat node is to the Republican sub-network (and vice versa).
# All-pairs shortest paths on the MST (paths on a tree are unique).
paths = nx.all_pairs_shortest_path(mst)
senators = mst.nodes()

# Partition by node color; independents (black) are grouped with the
# Democrats, mirroring how they caucus.
republicans, democrats = [], []
for senator in senators:
    if mst.node[senator]['color'] == 'red':
        republicans.append(senator)
    else:
        democrats.append(senator)

# A senator's score is the mean path length to every member of the
# opposite party: lower means closer to the other side of the aisle.
scores = []
for senator in senators:
    opposite = democrats if senator in republicans else republicans
    scores.append(np.mean([len(paths[senator][other]) for other in opposite]))

# Rank (senator, score) pairs from most to least bipartisan.
get_score = operator.itemgetter(1)
avg_lengths = sorted(zip(senators, scores), key=get_score)
senators = [pair[0] for pair in avg_lengths]
scores = [round(pair[1], 4) for pair in avg_lengths]

visualize(
    "Senator 'Shortest Paths' Scores - 113th Congress",
    senators,
    scores,
    bar_color=dark2_colors[1],
    xmax=scores[-1] + 1
)
My "shortest paths" metric is calculated as follows. Using the MST, compute all-pairs shortest paths for the graph $G$, then for each senator $s \in G$, compute the mean path length between $s$ and every senator in the opposite party by averaging the relevant shortest paths. The result is the "shortest paths" score for $s$. A senator with a low shortest paths score is closer to the opposing party, and therefore more bipartisan than a senator with a high shortest paths score, who is further from the opposing party. In my metric, independents are treated as members of the Democratic party. Since it operates on the MST, my procedure is forced to use the "difference" metric as the relevant unit of distance when calculating all-pairs shortest paths. As a result, my "shortest paths" metric is able to measure how similar a Democrat node is to the Republican sub-network (and vice versa), succeeding where closeness centrality fails. In so doing, my metric is able to identify Murkowski (R-AK) as one of the most bipartisan members of the Senate. Murkowski is the only current Republican senator from any West Coast state, and her voting record has become significantly more moderate since she won re-election. My model also identifies McCain (R-AZ) as one of the most moderate members of the Republican party.
There are many metrics used to quantify the leadership in the Senate.
Another approach uses the philosophy behind how Google ranks search results. The core idea behind Google's PageRank algorithm is:
The PageRank algorithm thus assigns scores to nodes in a graph based on how many neighbors a node has, as well as the score of those neighbors.
This technique can be adapted to rank Senate leadership. Here, nodes correspond to Senators, and edges correspond to a senator co-sponsoring a bill sponsored by another Senator. The weight of each edge from node A to B is the number of times Senator A has co-sponsored a bill whose primary sponsor is Senator B. If you interpret the PageRank scores of such a network to indicate Senate leadership, you are then assuming:
GovTrack stores information about each Senate bill in the current congress at http://www.govtrack.us/data/congress/113/bills/s/. As before, below are two functions used to scrape these data -- the first function downloads a single bill, and the second function calls the first to loop over all bills.
"""
Function
--------
get_senate_bill
Scrape the bill data from a single JSON page, given the bill number
Parameters
-----------
bill : int
Bill number to fetch
Returns
-------
A dict, parsed from the JSON
Examples
--------
>>> bill = get_senate_bill(10)
>>> bill['sponsor']
{u'district': None,
u'name': u'Reid, Harry',
u'state': u'NV',
u'thomas_id': u'00952',
u'title': u'Sen',
u'type': u'person'}
>>> bill['short_title']
u'Agriculture Reform, Food, and Jobs Act of 2013'
"""
def get_senate_bill(bill):
    """
    Scrape the data for a single Senate bill, given the bill number.

    Parameters
    ----------
    bill : int
        Bill number to fetch

    Returns
    -------
    dict
        The JSON-parsed bill data

    Raises
    ------
    ValueError
        If the page for `bill` is not valid JSON (the bill doesn't
        exist).  ValueError subclasses Exception, so existing
        `except Exception` callers still work.
    """
    url = "https://www.govtrack.us/data/congress/113/bills/s/s{}/data.json".format(bill)
    # A timeout keeps one dead connection from hanging the whole scrape.
    page = requests.get(url, timeout=30).text
    try:
        return json.loads(page)
    except ValueError:
        # govtrack serves an HTML 404 page (not JSON) for missing bills.
        raise ValueError("Not a valid bill number: {}".format(bill))
"""
Function
--------
get_all_bills
Scrape all Senate bills at http://www.govtrack.us/data/congress/113/bills/s
Parameters
----------
None
Returns
-------
A list of dicts, one for each bill
"""
def get_all_bills():
    """
    Scrape every Senate bill, starting at bill 1 and stopping at the
    first bill number whose page fails to parse.

    Returns
    -------
    list of dicts
        One JSON-parsed dict per bill
    """
    collected = []
    number = 1
    while True:
        try:
            collected.append(get_senate_bill(number))
        except Exception:
            # The first missing bill number marks the end of the data.
            return collected
        number += 1
# Download every Senate bill (slow: one HTTP request per bill).
bill_list = get_all_bills()
Below we construct a digraph from these data.
"""
Function
--------
bill_graph
Turn the bill graph data into a NetworkX Digraph
Parameters
----------
data : list of dicts
The data returned from get_all_bills
Returns
-------
graph : A NetworkX DiGraph, with the following properties
* Each node is a senator. For a label, use the 'name' field
from the 'sponsor' and 'cosponsors' dict items
* Each edge from A to B is assigned a weight equal to how many
bills are sponsored by B and co-sponsored by A
"""
def bill_graph(data):
    """
    Turn the bill data into a NetworkX DiGraph.

    Parameters
    ----------
    data : list of dicts
        The data returned from get_all_bills

    Returns
    -------
    graph : NetworkX DiGraph
        Each node is a senator, labeled with the 'name' field of the
        sponsor/cosponsor dicts.  An edge from A to B carries a `weight`
        equal to the number of bills sponsored by B and co-sponsored by A.
    """
    digraph = nx.DiGraph()
    senators = set()
    # (cosponsor, sponsor) -> number of co-sponsorships.  Built lazily,
    # so only pairs that actually co-occur are stored (the old version
    # pre-initialized every permutation and crashed with a KeyError if a
    # sponsor ever appeared in their own cosponsor list).
    coincident_sponsorships = {}

    for bill in data:
        sponsor = bill['sponsor']['name']
        cosponsors = [cosponsor['name'] for cosponsor in bill['cosponsors']]
        senators.add(sponsor)
        senators.update(cosponsors)
        for cosponsor in cosponsors:
            # Skip data glitches where a sponsor is listed among their
            # own cosponsors; a self-edge is meaningless here.
            if cosponsor == sponsor:
                continue
            key = (cosponsor, sponsor)
            coincident_sponsorships[key] = coincident_sponsorships.get(key, 0) + 1

    for senator in senators:
        digraph.add_node(senator)
    # Every stored pair has a positive count, so no zero-weight edges.
    for (cosponsor, sponsor), count in coincident_sponsorships.items():
        digraph.add_edge(cosponsor, sponsor, weight=count)
    return digraph
# Build the co-sponsorship digraph from the scraped bills.
bills = bill_graph(bill_list)
Using `nx.pagerank_numpy`, we compute the PageRank score for each senator in this graph.
# PageRank score for each senator in the co-sponsorship digraph.
pageranks = nx.pagerank_numpy(bills)

# visualize senator PageRank scores, ranked highest to lowest
get_score = operator.itemgetter(1)
pagerank = sorted(pageranks.items(), key=get_score, reverse=True)
senators = [pair[0] for pair in pagerank]
scores = [round(pair[1], 5) for pair in pagerank]

visualize("Senator PageRank Scores",
    senators,
    scores,
    bar_color=dark2_colors[2],
    subplot=121,
    xmax=scores[0] + 0.001
)
# visualize the degree of each senator node (computed in the same
# PageRank order as above, then re-ranked by degree)
degrees = [bills.degree(senator) for senator in senators]

get_degree = operator.itemgetter(1)
senator_degrees = sorted(zip(senators, degrees), key=get_degree, reverse=True)
senators = [pair[0] for pair in senator_degrees]
degrees = [pair[1] for pair in senator_degrees]

visualize("Senator Node Degrees",
    senators,
    degrees,
    bar_color=dark2_colors[3],
    subplot=122,
    xmax=degrees[0] + 1
)
# visualize correlation between PageRank score and degree
plt.figure(figsize=(13, 8))
# BUG FIX: at this point `degrees` is sorted by degree while `scores` is
# still sorted by PageRank, so zipping them paired values from *different*
# senators -- which fakes a monotone trend no matter what the data say.
# Rebuild both sequences per senator so each point is one senator's
# actual (degree, PageRank) pair.
aligned_degrees = [bills.degree(senator) for senator in senators]
aligned_scores = [pageranks[senator] for senator in senators]
plt.scatter(aligned_degrees, aligned_scores, color='g', alpha=0.7)
plt.title("Senator PageRank Score vs. Degree - 113th Congress")
plt.xlabel("Degree")
plt.ylabel("PageRank Score")
plt.xlim(xmin=0)
plt.ylim(ymin=0.0)
remove_border()
plt.tight_layout()
At the time of writing, the 5 senators with the highest PageRank scores are Harkin, Reid, Lautenberg, Brown, and Menendez. PageRank appears to be relatively effective at identifying Senate leaders. GovTrack identifies Harkin as a Democratic leader; he is a member of the Committee on Appropriations and chairman of the Committee on Health, Education, Labor, and Pensions. Reid is the Senate Majority Leader. Before his recent death, Lautenberg was also a Democratic leader; he too was a member of the Committee on Appropriations. Brown is less influential than these three, but he is a member of the Committee on Finance. Menendez is also identified as a Democratic leader; he is the chairman of the Committee on Foreign Relations. In spite of these successes, the PageRank method failed to identify McConnell, the Senate Minority Leader, likely because he has sponsored fewer bills than others in the Senate.
As illustrated in the plot above, PageRank rating is directly proportional to the degree of a node. So, the higher a node's degree is, the higher the PageRank score will be for the senator represented by that node.
# Export the full vote graph for external layout/styling in Gephi.
nx.write_gexf(votes, 'votes.gexf')
from IPython.display import Image
# Display the Gephi-rendered network image inline in the notebook.
path = 'votes-modularity-pagerank.png'
Image(path)
This network visualization and others like it which I made for the 101st Congress through the 113th Congress were featured on the front page of Yahoo!, The Huffington Post, and PolicyMic. The imgur album with all the visualizations I made can be found here.
The graphs are the product of applying a Force Atlas layout to the votes
graph filtered to include only those edges with weight $\geq$ 100. This means that an edge between 2 senators indicates that they have voted together on at least 100 occasions. PageRank score is encoded using node size. Node color designates Modularity Class; I manually colored the Independent senators green. I used Label Adjust to make the node labels readable, and Clockwise Rotate to orient the graph in a reasonable way, with Democrats on the left and Republicans on the right.
This network visualization is helpful! My visualization shows a clear separation between Democrats and Republicans. It correctly identifies Collins (R-ME) and Murkowski (R-AK) as the most moderate Republican senators, and also accurately shows that senators such as Warren (D-MA), Reed (D-RI), Cruz (R-TX) and Risch (R-ID) are some of the most partisan senators. More bipartisan senators are closer to the center of the graph, near the party divide, while less bipartisan senators are on the perimeter of the graph, furthest from the party divide. In addition, my visualization shows that the 2 Independent senators frequently vote with the Democrats.
css tweaks in this cell