%matplotlib inline import json import numpy as np import networkx as nx import requests from pattern import web import matplotlib.pyplot as plt import ast from itertools import combinations, permutations import operator # set some nicer defaults for matplotlib from matplotlib import rcParams #these colors come from colorbrewer2.org. Each is an RGB triplet dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), (0.4, 0.4, 0.4)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.grid'] = False rcParams['axes.facecolor'] = 'white' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'none' def remove_border(axes=None, top=False, right=False, left=True, bottom=True): """ Minimize chartjunk by stripping out unnecessary plot borders and axis ticks The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn """ ax = axes or plt.gca() ax.spines['top'].set_visible(top) ax.spines['right'].set_visible(right) ax.spines['left'].set_visible(left) ax.spines['bottom'].set_visible(bottom) #turn off all ticks ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') #now re-enable visibles if top: ax.xaxis.tick_top() if bottom: ax.xaxis.tick_bottom() if left: ax.yaxis.tick_left() if right: ax.yaxis.tick_right() """ Function -------- get_senate_vote Scrapes a single JSON page for a particular Senate vote, given by the vote number Parameters ---------- vote : int The vote number to fetch Returns ------- json : dict The JSON-decoded dictionary for that vote 
Examples -------- >>> get_senate_vote(11)['bill'] {u'congress': 113, u'number': 325, u'title': u'A bill to ensure the complete and timely payment of the obligations of the United States Government until May 19, 2013, and for other purposes.', u'type': u'hr'} """ def get_senate_vote(vote): # url = "https://www.govtrack.us/data/congress/113/votes/2013/s{}/data.json".format(vote) url = "https://www.govtrack.us/data/congress/101/votes/1989/s{}/data.json".format(vote) page = requests.get(url).text try: data = json.loads(page) return data except ValueError: raise Exception("Not a valid vote number.") """ Function -------- get_all_votes Scrapes all the Senate votes from http://www.govtrack.us/data/congress/113/votes/2013, and returns a list of dicts Parameters ----------- None Returns -------- vote_dicts : list of dicts List of JSON-parsed dicts for each senate vote """ def get_all_votes(): vote_num = 1 vote_dicts = [] while True: try: vote_dict = get_senate_vote(vote_num) vote_dicts.append(vote_dict) vote_num += 1 except Exception: break return vote_dicts vote_data = get_all_votes() """ Function -------- vote_graph Parameters ---------- data : list of dicts The vote database returned from get_all_votes Returns ------- graph : NetworkX Graph object, with the following properties 1. Each node in the graph is labeled using the `display_name` of a Senator (e.g., 'Lee (R-UT)') 2. Each node has a `color` attribute set to 'r' for Republicans, 'b' for Democrats, and 'k' for Independent/other parties. 3. The edges between two nodes are weighted by the number of times two senators have cast the same Yea or Nay vote 4. Each edge also has a `difference` attribute, which is set to `1 / weight`. 
Examples -------- >>> graph = vote_graph(vote_data) >>> graph.node['Lee (R-UT)'] {'color': 'r'} # attributes for this senator >>> len(graph['Lee (R-UT)']) # connections to other senators 101 >>> graph['Lee (R-UT)']['Baldwin (D-WI)'] # edge relationship between Lee and Baldwin {'difference': 0.02, 'weight': 50} """ def vote_graph(data): graph = nx.Graph() # set for all senator display names - these will be our nodes all_senators = set() # list for roll_call dicts, one for each vote roll_calls = [] for vote in data: # dict with keys for each vote class; values are lists of senator display names roll_call = {} for key, value in vote['votes'].iteritems(): senators = [] for senator in value: if senator == 'VP': continue senators.append(senator['display_name']) # senators = [senator['display_name'] for senator in value] roll_call[key] = senators # add any new senators to the set all_senators.update(senators) roll_calls.append(roll_call) # all combinations of 2 senator display names all_senator_pairs = combinations(all_senators, 2) common_votes = {} for pair in all_senator_pairs: common_votes[pair] = 0 for vote in roll_calls: yea_pairs = combinations(vote['Yea'], 2) for pair in yea_pairs: try: common_votes[pair] += 1 except KeyError: # flip senator names so we can find the pair in the common_votes db common_votes[(pair[1],pair[0])] += 1 nay_pairs = combinations(vote['Nay'], 2) for pair in nay_pairs: try: common_votes[pair] += 1 except KeyError: common_votes[(pair[1],pair[0])] += 1 for senator in all_senators: party = senator.split()[1][1] # use color names that Graphviz understands if party == 'D': graph.add_node(senator, color='blue') elif party == 'R': graph.add_node(senator, color='red') else: graph.add_node(senator, color='black') for pair, votes in common_votes.iteritems(): # don't draw an edge for senators with 0 votes in common if votes == 0: continue graph.add_edge(pair[0], pair[1], weight=votes, difference=1.0/votes) return graph votes = vote_graph(vote_data) 
nx.write_gexf(votes, 'votes-101-1989.gexf')

# this makes sure draw_spring results are the same at each call
np.random.seed(1)

color = [votes.node[senator]['color'] for senator in votes.nodes()]

# determine position of each node using a spring layout
pos = nx.spring_layout(votes, iterations=200)

# plot the edges
nx.draw_networkx_edges(votes, pos, alpha=.05)

# plot the nodes
nx.draw_networkx_nodes(votes, pos, node_color=color)

# draw the labels (alpha must lie in [0, 1]; 1 keeps them fully opaque)
lbls = nx.draw_networkx_labels(votes, pos, alpha=1, font_size=8)

# coordinate information is meaningless here, so let's remove it
plt.xticks([])
plt.yticks([])
remove_border(left=False, bottom=False)

mst = nx.minimum_spanning_tree(votes, weight='difference')

# this makes sure draw_spring results are the same at each call
np.random.seed(1)

# larger figure size makes graph easier to read
plt.figure(figsize=(25, 25))

color = [mst.node[senator]['color'] for senator in mst.nodes()]

# use a Graphviz layout to determine node positions
pos = nx.graphviz_layout(mst, prog='neato',
                         args='-Goverlap=false -Gsep=+150,150')

# plot the edges
nx.draw_networkx_edges(mst, pos, alpha=.5)

# plot the nodes
nx.draw_networkx_nodes(mst, pos, node_color=color)

# offset pos pulls the labels down so they're not drawn directly on nodes;
# it is derived from `pos` so labels are guaranteed to line up with the nodes
# without running the layout a second time
offset_pos = dict((node, (xy[0], xy[1] - 95)) for node, xy in pos.items())

# draw the labels (alpha must lie in [0, 1]; 1 keeps them fully opaque)
lbls = nx.draw_networkx_labels(mst, offset_pos, alpha=1, font_size=12)

# coordinate information is meaningless here, so let's remove it
plt.xticks([])
plt.yticks([])
remove_border(left=False, bottom=False)

# closeness centrality of every senator, highest first
centralities = nx.closeness_centrality(votes, distance='difference')
get_score = operator.itemgetter(1)
centrality = sorted(centralities.items(), key=get_score, reverse=True)
senators = [senator for senator, _ in centrality]
scores = [round(score, 4) for _, score in centrality]


def visualize(title, senator_lst, score_lst, bar_color=dark2_colors[0],
              subplot=111, xmin=0, xmax=100):
    """
    For visualizing node/score data with a horizontal bar graph

    Parameters
    ----------
    title : str
        Title of the (sub)plot.
    senator_lst : list
        Labels for the bars, drawn top to bottom in the given order.
    score_lst : list of numbers
        One bar length per entry of `senator_lst`.
    bar_color : matplotlib color, optional
        Fill color of the bars.
    subplot : int, optional
        Three-digit matplotlib subplot code. A new 10x30 figure is started
        whenever its last digit is 1 (i.e. for the first panel).
    xmin, xmax : number, optional
        Limits of the x axis.
    """
    pos = np.arange(len(senator_lst))

    # the first panel of a layout opens a fresh, tall figure
    if str(subplot)[2] == '1':
        plt.figure(figsize=(10, 30))
    plt.subplot(subplot)

    plt.title(title)
    plt.barh(pos, score_lst, color=bar_color)

    # add the numbers to the side of each bar; small scores get a tiny
    # offset so the label is not pushed off a narrow axis
    for p, score in zip(pos, score_lst):
        offset = 0.0001 if score < 0.5 else 1
        plt.annotate(str(score), xy=(score + offset, p + .5), va='center')

    # customize ticks: senator names on y, blank labels on x
    plt.yticks(pos + .5, senator_lst)
    xt = plt.xticks()[0]
    plt.xticks(xt, [' '] * len(xt))

    # minimize chartjunk
    remove_border(left=False, bottom=False)
    plt.grid(axis='x', color='white', linestyle='-')

    # set plot limits (y axis is inverted so the first entry is on top)
    plt.ylim(pos.max() + 1, pos.min())
    plt.xlim(xmin, xmax)
    plt.tight_layout()


# NOTE(review): the vote data above is written to 'votes-101-1989.gexf', yet
# the titles below say "113th Congress" - confirm which one is intended.
visualize("Senator Centrality Scores - 113th Congress", senators, scores,
          xmax=scores[0] + 1)

# dict() keeps this working whether networkx returns a dict or an iterator
paths = dict(nx.all_pairs_shortest_path(mst))
senators = mst.nodes()

# split by node color; everyone who is not 'red' is grouped with democrats
republicans, democrats = [], []
for senator in senators:
    color = mst.node[senator]['color']
    if color == 'red':
        republicans.append(senator)
    else:
        democrats.append(senator)

# average path length (in nodes) from each senator to the opposite group
republican_set = set(republicans)
scores = []
for senator in senators:
    others = democrats if senator in republican_set else republicans
    lengths = [len(paths[senator][other]) for other in others]
    scores.append(np.mean(lengths))

avg_lengths = sorted(zip(senators, scores), key=get_score)
senators = [senator for senator, _ in avg_lengths]
scores = [round(score, 4) for _, score in avg_lengths]

visualize(
    "Senator 'Shortest Paths' Scores - 113th Congress",
    senators,
    scores,
    bar_color=dark2_colors[1],
    xmax=scores[-1] + 1
)


def get_senate_bill(bill, congress=113):
    """
    Scrape the bill data from a single JSON page, given the bill number

    Parameters
    ----------
    bill : int
        Bill number to fetch
    congress : int, optional
        Number of the Congress the bill belongs to (default 113).

    Returns
    -------
    A dict, parsed from the JSON

    Raises
    ------
    ValueError
        If the fetched page is not valid JSON, which is how a non-existent
        bill number shows up.

    Examples
    --------
    >>> bill = get_senate_bill(10)
    >>> bill['sponsor']
    {u'district': None,
     u'name': u'Reid, Harry',
     u'state': u'NV',
     u'thomas_id': u'00952',
     u'title': u'Sen',
     u'type': u'person'}
    >>> bill['short_title']
    u'Agriculture Reform, Food, and Jobs Act of 2013'
    """
    url = ("https://www.govtrack.us/data/congress/"
           "{}/bills/s/s{}/data.json").format(congress, bill)
    page = requests.get(url).text
    try:
        return json.loads(page)
    except ValueError:
        # ValueError is still an Exception, so existing
        # `except Exception` callers keep working.
        raise ValueError("Not a valid bill number.")


def get_all_bills(congress=113):
    """
    Scrape all Senate bills at
    http://www.govtrack.us/data/congress/<congress>/bills/s

    Bill numbers are fetched in order starting at 1; the first number that
    does not decode as JSON ends the scrape.

    Parameters
    ----------
    congress : int, optional
        Number of the Congress to scrape (default 113).

    Returns
    -------
    A list of dicts, one for each bill
    """
    bill_dicts = []
    bill_num = 1
    while True:
        try:
            bill_dicts.append(get_senate_bill(bill_num, congress))
        except ValueError:
            # Only the "invalid bill number" signal stops the loop; network
            # errors propagate instead of silently truncating the data set.
            break
        bill_num += 1
    return bill_dicts


bill_list = get_all_bills()


def bill_graph(data):
    """
    Turn the bill graph data into a NetworkX Digraph

    Parameters
    ----------
    data : list of dicts
        The data returned from get_all_bills

    Returns
    -------
    graph : A NetworkX DiGraph, with the following properties
        * Each node is a senator. For a label, use the 'name' field
          from the 'sponsor' and 'cosponsors' dict items
        * Each edge from A to B is assigned a weight equal to how many
          bills are sponsored by B and co-sponsored by A

        Pairs with no shared bill get no edge. A senator listed as a
        cosponsor of their own bill does not produce a self-loop.
    """
    digraph = nx.DiGraph()
    senators = set()
    # (cosponsor, sponsor) -> number of bills; only observed pairs are stored
    coincident_sponsorships = {}

    for bill in data:
        sponsor = bill['sponsor']['name']
        senators.add(sponsor)
        for entry in bill['cosponsors']:
            cosponsor = entry['name']
            senators.add(cosponsor)
            if cosponsor == sponsor:
                continue
            pair = (cosponsor, sponsor)
            coincident_sponsorships[pair] = (
                coincident_sponsorships.get(pair, 0) + 1)

    # add every senator up front so those without any edge still appear
    for senator in senators:
        digraph.add_node(senator)

    for (cosponsor, sponsor), count in coincident_sponsorships.items():
        digraph.add_edge(cosponsor, sponsor, weight=count)

    return digraph


bills = bill_graph(bill_list)

pageranks = nx.pagerank_numpy(bills)

# visualize senator PageRank scores, highest first
pagerank = sorted(pageranks.items(), key=get_score, reverse=True)
senators = [senator for senator, _ in pagerank]
scores = [round(score, 5) for _, score in pagerank]

visualize("Senator PageRank Scores",
          senators,
          scores,
          bar_color=dark2_colors[2],
          subplot=121,
          xmax=scores[0] + 0.001
          )

# degree of each senator node, still aligned one-to-one with `scores`
# (PageRank order); kept separately for the scatter plot below
pagerank_order_degrees = [bills.degree(senator) for senator in senators]

# visualize degree of each senator node, highest first
get_degree = operator.itemgetter(1)
senator_degrees = sorted(zip(senators, pagerank_order_degrees),
                         key=get_degree, reverse=True)
senators = [senator for senator, _ in senator_degrees]
degrees = [degree for _, degree in senator_degrees]

visualize("Senator Node Degrees",
          senators,
          degrees,
          bar_color=dark2_colors[3],
          subplot=122,
          xmax=degrees[0] + 1
          )

# visualize correlation between PageRank score and degree; both sequences
# must be in the same senator order, so the degree-sorted `degrees` list
# cannot be used here
plt.figure(figsize=(13, 8))
plt.scatter(pagerank_order_degrees, scores, color='g', alpha=0.7)
plt.title("Senator PageRank Score vs. Degree - 113th Congress")
plt.xlabel("Degree")
plt.ylabel("PageRank Score")
plt.xlim(xmin=0)
plt.ylim(ymin=0.0)
remove_border()
plt.tight_layout()

nx.write_gexf(votes, 'votes.gexf')

from IPython.display import Image

path = 'votes-modularity-pagerank.png'
Image(path)