# Standard packages for data analysis
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# pandas handles tabular data
import pandas as pd

# networkx handles network data
import networkx as nx

# json handles reading and writing JSON data
import json

# To visualize webpages within this webpage
from IPython.display import HTML

# To run queries against MediaWiki APIs
from wikitools import wiki, api

# Some other helper functions
from collections import Counter
from operator import itemgetter

HTML('http://en.wikipedia.org/w/api.php?action=query&list=users&ususers=Madcoverboy|Jimbo_Wales&usprop=blockinfo|groups|editcount|registration|gender')

def wikipedia_query(query_params, site_url='http://en.wikipedia.org/w/api.php'):
    site = wiki.Wiki(url=site_url)
    request = api.APIRequest(site, query_params)
    result = request.query()
    return result[query_params['action']]

user_query = {'action': 'query',
              'list': 'users',
              'usprop': 'blockinfo|groups|editcount|registration|gender',
              'ususers': 'Madcoverboy|Jimbo Wales'}

query_results = wikipedia_query(user_query)
query_results

query_results['users'][1]['editcount']

def get_user_properties(user):
    result = wikipedia_query({'action': 'query',
                              'list': 'users',
                              'usprop': 'blockinfo|groups|editcount|registration|gender',
                              'ususers': user})
    return result

koavf_query_results = get_user_properties('Koavf')
koavf_query_results

with open('koavf_query_results.json','wb') as f:
    json.dump(koavf_query_results,f)

with open('koavf_query_results.json','rb') as f:
    loaded_koavf_query_results = json.load(f)

loaded_koavf_query_results

query_results['users']

df = pd.DataFrame(query_results['users'])
df.to_csv('query_results.csv',quotechar='"',index=False)
df

pd.read_csv('query_results.csv',quotechar='"')

outlink_query = {'action': 'query',
                 'prop': 'links',
                 'titles': 'Hillary Rodham Clinton',
                 'pllimit': '500',
                 'plnamespace': '0'}

hrc_outlink_data = wikipedia_query(outlink_query)
hrc_outlink_data['pages'][u'5043192']['links'][:5]

hrc_outlink_list = [link['title'] for link in hrc_outlink_data['pages'][u'5043192']['links']]
print "There are {0} links from the Hillary Rodham Clinton article".format(len(hrc_outlink_list))
hrc_outlink_list[:10]

outlink_query_hc = {'action': 'query',
                    'prop': 'links',
                    'titles': 'Hillary Clinton',
                    'pllimit': '500',
                    'plnamespace': '0'}

hc_outlink_data = wikipedia_query(outlink_query_hc)
hc_outlink_data

outlink_query_hc_redirect = {'action': 'query',
                             'prop': 'links',
                             'titles': 'Hillary Clinton',  # still "Hillary Clinton"
                             'pllimit': '500',
                             'plnamespace': '0',
                             'redirects': 'True'}  # redirects parameter added

hcr_outlink_data = wikipedia_query(outlink_query_hc_redirect)
hcr_outlink_list = [link['title'] for link in hcr_outlink_data['pages'][u'5043192']['links']]
print "There are {0} links from the Hillary Clinton article".format(len(hcr_outlink_list))
hcr_outlink_list[:10]

inlink_query_hrc = {'action': 'query',
                    'redirects': 'True',
                    'prop': 'linkshere',
                    'titles': 'Hillary Rodham Clinton',
                    'lhlimit': '500',
                    'lhnamespace': '0',
                    'lhshow': '!redirect',
                    'lhprop': 'title'}

hrc_inlink_data = wikipedia_query(inlink_query_hrc)
hrc_inlink_list = [link['title'] for link in hrc_inlink_data['pages'][u'5043192']['linkshere']]
print "There are {0} links to the Hillary Rodham Clinton article".format(len(hrc_inlink_list))
hrc_inlink_list[:10]
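# The queries above cap each request at 500 links, and heavily linked articles can return
# more than that. Whether the wrapper follows MediaWiki's continuation automatically depends
# on the client, so here is a manual version of the same paging written against the API's
# documented 'continue' convention. This is only a sketch using the requests library;
# get_all_links is a hypothetical helper that is not used elsewhere in this notebook.
import requests

def get_all_links(title, site_url='http://en.wikipedia.org/w/api.php'):
    params = {'action': 'query', 'format': 'json', 'prop': 'links',
              'titles': title, 'pllimit': '500', 'plnamespace': '0',
              'continue': ''}  # opt in to the simplified continuation style
    links, last_continue = [], {}
    while True:
        request_params = params.copy()
        request_params.update(last_continue)  # carry the continuation token forward
        result = requests.get(site_url, params=request_params).json()
        for page in result.get('query', {}).get('pages', {}).values():
            links += [l['title'] for l in page.get('links', [])]
        if 'continue' not in result:
            break
        last_continue = result['continue']
    return links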
alllinks_query_hrc = {'action': 'query',
                      'redirects': 'True',
                      'prop': 'links|linkshere',  # combined both prop calls with a pipe
                      'titles': 'Hillary Rodham Clinton',
                      'pllimit': '500',       # still need the "prop=links" "pl" parameters and below
                      'plnamespace': '0',
                      'lhlimit': '500',       # still need the "prop=linkshere" "lh" parameters and below
                      'lhnamespace': '0',
                      'lhshow': '!redirect',
                      'lhprop': 'title'}

hrc_alllink_data = wikipedia_query(alllinks_query_hrc)
hrc_alllink_outlist = [link['title'] for link in hrc_alllink_data['pages'][u'5043192']['links']]
hrc_alllink_inlist = [link['title'] for link in hrc_alllink_data['pages'][u'5043192']['linkshere']]
print "There are {0} out links from and {1} in links to the Hillary Rodham Clinton article".format(len(hrc_alllink_outlist),len(hrc_alllink_inlist))

def get_article_links(article):
    query = {'action': 'query',
             'redirects': 'True',
             'prop': 'links|linkshere',
             'titles': article,  # the article variable is passed into here
             'pllimit': '500',
             'plnamespace': '0',
             'lhlimit': '500',
             'lhnamespace': '0',
             'lhshow': '!redirect',
             'lhprop': 'title'}
    results = wikipedia_query(query)      # do the query
    page_id = results['pages'].keys()[0]  # get the page_id
    if 'links' in results['pages'][page_id].keys():  # sometimes there are no links
        outlist = [link['title'] for link in results['pages'][page_id]['links']]  # clean up outlinks
    else:
        outlist = []  # return empty list if no outlinks
    if 'linkshere' in results['pages'][page_id].keys():  # sometimes there are no links
        inlist = [link['title'] for link in results['pages'][page_id]['linkshere']]  # clean up inlinks
    else:
        inlist = []  # return empty list if no inlinks
    return outlist, inlist

bc_out, bc_in = get_article_links("Bill Clinton")
print "There are {0} out links from and {1} in links to the Bill Clinton article".format(len(bc_out),len(bc_in))

clinton_link_data = {"Hillary Rodham Clinton": {"In": hrc_alllink_inlist, "Out": hrc_alllink_outlist},
                     "Bill Clinton": {"In": bc_in, "Out": bc_out}}

with open('clinton_link_data.json','wb') as f:
    json.dump(clinton_link_data,f)

hrc_alllink_outlist[:5]

hrc_g = nx.DiGraph()

for article in hrc_alllink_outlist:
    hrc_g.add_edge("Hillary Rodham Clinton",article)

for article in hrc_alllink_inlist:
    hrc_g.add_edge(article,"Hillary Rodham Clinton")

len(hrc_alllink_outlist) + len(hrc_alllink_inlist)

hrc_g.number_of_nodes()

print "There are {0} edges and {1} nodes in the network".format(hrc_g.number_of_edges(), hrc_g.number_of_nodes())

reciprocal_edges = list()
for (i,j) in hrc_g.edges():
    if hrc_g.has_edge(j,i) and (j,i) not in reciprocal_edges:
        reciprocal_edges.append((i,j))

reciprocation_fraction = round(float(len(reciprocal_edges))/hrc_g.number_of_edges(),3)
print "There are {0} reciprocated edges out of {1} edges in the network, giving a reciprocation fraction of {2}.".format(len(reciprocal_edges),hrc_g.number_of_edges(),reciprocation_fraction)

bc_g = nx.DiGraph()

for article in bc_out:
    bc_g.add_edge("Bill Clinton",article)

for article in bc_in:
    bc_g.add_edge(article,"Bill Clinton")

bc_reciprocal_edges = list()
for (i,j) in bc_g.edges():
    if bc_g.has_edge(j,i) and (j,i) not in bc_reciprocal_edges:
        bc_reciprocal_edges.append((i,j))

bc_reciprocation_fraction = round(float(len(bc_reciprocal_edges))/bc_g.number_of_edges(),3)
print "There are {0} reciprocated edges out of {1} edges in the network, giving a reciprocation fraction of {2}.".format(len(bc_reciprocal_edges),bc_g.number_of_edges(),bc_reciprocation_fraction)

HTML('')
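# The reciprocal-edge bookkeeping above is written out twice, once per graph. A small
# helper capturing the same logic avoids the duplication; "reciprocation" is a hypothetical
# name, and the computation simply mirrors the loops above.
def reciprocation(g):
    reciprocated = []
    for (i,j) in g.edges():
        if g.has_edge(j,i) and (j,i) not in reciprocated:  # count each reciprocated pair once
            reciprocated.append((i,j))
    return round(float(len(reciprocated))/g.number_of_edges(),3)

print reciprocation(hrc_g), reciprocation(bc_g)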
oldest_outlinks_query_hrc = {'action': 'parse',  # query changes to parse
                             'prop': 'links',
                             'oldid': '256189'}

oldest_outlinks_data = wikipedia_query(oldest_outlinks_query_hrc)
oldest_outlinks_data

oldest_outlink_list = [link['*'] for link in oldest_outlinks_data['links']]
print "There are {0} out links from the Hillary Rodham Clinton article".format(len(oldest_outlink_list))
oldest_outlink_list

revisions_query_hrc = {'action': 'query',
                       'redirects': 'True',
                       'prop': 'revisions',
                       'titles': "Hillary Rodham Clinton",
                       'rvprop': 'ids|user|timestamp|userid|comment|size',
                       'rvlimit': '500',
                       'rvdir': 'newer'}

revisions_data_hrc = wikipedia_query(revisions_query_hrc)

# Extract and convert to DataFrame
hrc_rv_df = pd.DataFrame(revisions_data_hrc['pages']['5043192']['revisions'])

# Make it clear what's being edited
hrc_rv_df['page'] = [u'Hillary Rodham Clinton']*len(hrc_rv_df)

# Clean up timestamps
hrc_rv_df['timestamp'] = pd.to_datetime(hrc_rv_df['timestamp'],format="%Y-%m-%dT%H:%M:%SZ",unit='s')

# Clean up anon column
hrc_rv_df = hrc_rv_df.replace({'anon':{np.nan:False,u'':True}})

# Sort the data on timestamp and reset the index
hrc_rv_df = hrc_rv_df.sort('timestamp').reset_index(drop=True)
hrc_rv_df.index.name = 'revision'
hrc_rv_df = hrc_rv_df.reset_index()

# Set the index to a MultiIndex
hrc_rv_df.set_index(['page','revision'],inplace=True)

# Save the data to disk
hrc_rv_df.to_csv('hrc_revisions.csv',encoding='utf8')

# Show the first 5 rows
hrc_rv_df.head()

hrc_rv_gb_user = hrc_rv_df.groupby('user')
hrc_user_revisions = hrc_rv_gb_user['revid'].aggregate(len).sort(ascending=False,inplace=False)
print "There are {0} unique users who have made a contribution to the article.".format(len(hrc_user_revisions))
hrc_user_revisions.head(10)

revisions_counter = Counter(hrc_user_revisions.values)
plt.scatter(revisions_counter.keys(),revisions_counter.values(),s=50)
plt.ylabel('Number of users',fontsize=15)
plt.xlabel('Number of revisions',fontsize=15)
plt.yscale('log')
plt.xscale('log')

def count_unique_users(user_series):
    unique_users = []
    unique_count = []
    for user in user_series.values:
        if user not in unique_users:
            unique_users.append(user)
            unique_count.append(len(unique_users))
        else:
            unique_count.append(unique_count[-1])
    return unique_count

hrc_rv_df['unique_users'] = count_unique_users(hrc_rv_df['user'])
hrc_rv_df['date'] = hrc_rv_df['timestamp'].apply(lambda x:x.date())

activity_by_day = hrc_rv_df.groupby('date').aggregate({'unique_users':max,'revid':len})
ax = activity_by_day.plot(lw=1,secondary_y=['revid'])
ax.set_xlabel('Time',fontsize=15)

hrc_rv_df['diff'] = hrc_rv_df['size'].diff()

diff_counter = Counter(hrc_rv_df['diff'].values)
plt.scatter(diff_counter.keys(),diff_counter.values(),s=50,alpha=.1)
plt.xlabel('Difference (bytes)',fontsize=15)
plt.ylabel('Number of revisions',fontsize=15)
plt.yscale('log')
plt.xscale('symlog')

activity_by_day = hrc_rv_df.groupby('date').aggregate({'unique_users':max,
                                                       'revid':len,
                                                       'diff':np.median})

# Compute a 60-day rolling average to remove spikiness, plot
pd.rolling_mean(activity_by_day['diff'],60).plot()
plt.yscale('symlog')
plt.xlabel('Time',fontsize=15)
plt.ylabel('Difference (bytes)',fontsize=15)
plt.axhline(0,lw=2,c='k')

# The diff returns timedeltas, but dividing by a 1-second timedelta returns a float
# Round these numbers off to smooth out the distribution and add 1 second to everything to make the plot behave
hrc_rv_df['latency'] = [round(i/np.timedelta64(1,'s'),-1) + 1 for i in hrc_rv_df['timestamp'].diff().values]

diff_counter = Counter(hrc_rv_df['latency'].values)
plt.scatter(diff_counter.keys(),diff_counter.values(),s=50,alpha=.1)
plt.xlabel('Latency time (seconds)',fontsize=15)
plt.ylabel('Number of changes',fontsize=15)
plt.yscale('log')
plt.xscale('log')

hrc_rv_df['latency'].describe()
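# describe() gives the quartiles of the latency distribution; another quick summary is
# the share of revisions that arrive roughly within a minute of the previous one. This
# is a sketch built directly on the latency column computed above.
fast_revisions = (hrc_rv_df['latency'] <= 60).sum()
print "{0:.1%} of revisions came within about a minute of the previous revision.".format(float(fast_revisions)/len(hrc_rv_df))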
activity_by_day = hrc_rv_df.groupby('date').aggregate({'unique_users':max,
                                                       'revid':len,
                                                       'diff':np.median,
                                                       'latency':np.median})

# Compute a 60-day rolling average to remove spikiness, plot
pd.rolling_mean(activity_by_day['latency'],60).plot()
plt.yscale('symlog')
plt.xlabel('Time',fontsize=15)
plt.ylabel('Latency time (seconds)',fontsize=15)

hrc_bg = nx.DiGraph()
for user in hrc_rv_df['user'].values:
    if hrc_bg.has_edge(user,u'Hillary Rodham Clinton'):
        hrc_bg[user][u'Hillary Rodham Clinton']['weight'] += 1
    else:
        hrc_bg.add_edge(user,u'Hillary Rodham Clinton',weight=1)

print "There are {0} nodes and {1} edges in the network.".format(hrc_bg.number_of_nodes(),hrc_bg.number_of_edges())
hrc_bg.edges(data=True)[:5]

def get_revision_df(article):
    revisions_query = {'action': 'query',
                       'redirects': 'True',
                       'prop': 'revisions',
                       'titles': article,
                       'rvprop': 'ids|user|timestamp|userid|comment|size',
                       'rvlimit': '500',
                       'rvdir': 'newer'}
    revisions_data = wikipedia_query(revisions_query)
    page_id = revisions_data['pages'].keys()[0]
    # Extract and convert to DataFrame. Try/except for links to pages that don't exist
    try:
        df = pd.DataFrame(revisions_data['pages'][page_id]['revisions'])
    except KeyError:
        print u"{0} doesn't exist!".format(article)
        pass
    # Make it clear what's being edited
    df['page'] = [article]*len(df)
    # Clean up timestamps
    df['timestamp'] = pd.to_datetime(df['timestamp'],format="%Y-%m-%dT%H:%M:%SZ",unit='s')
    # Clean up anon column. If/else for articles that have all non-anon editors
    if 'anon' in df.columns:
        df = df.replace({'anon':{np.nan:False,u'':True}})
    else:
        df['anon'] = [False] * len(df)
    # Sort the data on timestamp and reset the index
    df = df.sort('timestamp').reset_index(drop=True)
    df.index.name = 'revision'
    df = df.reset_index()
    # Set the index to a MultiIndex
    df.set_index(['page','revision'],inplace=True)
    # Compute additional features
    df['date'] = df['timestamp'].apply(lambda x:x.date())
    df['diff'] = df['size'].diff()
    df['unique_users'] = count_unique_users(df['user'])
    df['latency'] = [round(i/np.timedelta64(1,'s'),-1) + 1 for i in df['timestamp'].diff().values]
    # Don't return random other columns
    df = df[[u'anon',u'comment',u'parentid',
             u'revid',u'size',u'timestamp',
             u'user',u'userid',u'unique_users',
             u'date', u'diff', u'latency']]
    return df

bc_rv_df = get_revision_df("Bill Clinton")
bc_rv_df.head()

clinton_df = pd.concat([bc_rv_df,hrc_rv_df])
print clinton_df.index.levels[0]
print "There are a total of {0} revisions across both the Hillary and Bill Clinton articles.".format(len(clinton_df))
clinton_df.to_csv('clinton_revisions.csv',encoding='utf8')

clinton_df = pd.read_csv('clinton_revisions.csv',
                         encoding='utf8',
                         index_col=['page','revision'],
                         parse_dates=['timestamp','date'])
clinton_df.head()

clinton_gb_edge = clinton_df.reset_index().groupby(['page','user'])
clinton_edgelist = clinton_gb_edge.agg({'revid':{'weight':len},
                                        'timestamp':{'ts_min':np.min,'ts_max':np.max},
                                        'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                                        'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                                        'revision':{'revision_min':np.min,'revision_max':np.max}
                                        })

# Drop the legacy/redundant column names
clinton_edgelist.columns = clinton_edgelist.columns.droplevel(0)

# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
clinton_edgelist['ts_min'] = (clinton_edgelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_edgelist['ts_max'] = (clinton_edgelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_edgelist.head()
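# A quick sanity check on the aggregation: sorting the edgelist by weight shows which
# editor-article pairs have the most revisions behind them. A sketch, using the same
# column-based DataFrame.sort call the notebook uses elsewhere.
clinton_edgelist.sort('weight',ascending=False).head(5)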
# Create the usernodelist by grouping on user and aggregating
clinton_gb_user = clinton_df.reset_index().groupby(['user'])
clinton_usernodelist = clinton_gb_user.agg({'revid':{'revisions':len},
                                            'timestamp':{'ts_min':np.min,'ts_max':np.max},
                                            'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                                            'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                                            'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max}
                                            })

# Clean up the columns and convert the timestamps to counts
clinton_usernodelist.columns = clinton_usernodelist.columns.droplevel(0)
clinton_usernodelist['ts_min'] = (clinton_usernodelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_usernodelist['ts_max'] = (clinton_usernodelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

# Create the pagenodelist by grouping on page and aggregating
clinton_gb_page = clinton_df.reset_index().groupby(['page'])
clinton_pagenodelist = clinton_gb_page.agg({'revid':{'revisions':len},
                                            'timestamp':{'ts_min':np.min,'ts_max':np.max},
                                            'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                                            'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                                            'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max}
                                            })

# Clean up the columns and convert the timestamps to counts
clinton_pagenodelist.columns = clinton_pagenodelist.columns.droplevel(0)
clinton_pagenodelist['ts_min'] = (clinton_pagenodelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_pagenodelist['ts_max'] = (clinton_pagenodelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_pagenodelist.head()

clinton_g = nx.DiGraph()

# Add the edges and edge attributes
for (article,editor) in iter(clinton_edgelist.index.values):
    edge_attributes = dict(clinton_edgelist.ix[(article,editor)])
    clinton_g.add_edge(editor,article,edge_attributes)

# Add the user nodes and attributes
for node in iter(clinton_usernodelist.index):
    node_attributes = dict(clinton_usernodelist.ix[node])
    clinton_g.add_node(node,node_attributes)

# Add the page nodes and attributes
for node in iter(clinton_pagenodelist.index):
    node_attributes = dict(clinton_pagenodelist.ix[node])
    clinton_g.add_node(node,node_attributes)

print "There are {0} nodes and {1} edges in the network.".format(clinton_g.number_of_nodes(),clinton_g.number_of_edges())
clinton_g.edges(data=True)[:3]

# List of DataFrames
dataframe_dict = {u'Bill Clinton': bc_rv_df, u'Hillary Rodham Clinton': hrc_rv_df}

# Set operations
all_links = list(set(hrc_alllink_outlist) | set(hrc_alllink_inlist))

# Start the scrape
errors = list()
for article in all_links:
    try:
        df = get_revision_df(article)
        dataframe_dict[article] = df
    except:
        errors.append(article)
        pass

gigantic_df = pd.concat(dataframe_dict.values())
gigantic_df.to_csv('gigantic_df.csv',encoding='utf8')
len(gigantic_df)
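# The bare except in the scrape silently skips any article whose history could not be
# fetched, so it is worth checking how much is missing before trusting the combined
# table. A quick look at the errors list built above:
print "{0} of {1} articles could not be scraped.".format(len(errors),len(all_links))
errors[:10]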
gigantic_df = pd.read_csv('gigantic_df.csv',
                          encoding='utf8',
                          index_col=['page','revision'],
                          parse_dates=['timestamp','date']
                          )

gigantic_df = gigantic_df.drop(("[[History of the United States]] at [[History of the United States#British colonization|British Colonization]]. ([[WP:TW|TW]])",589285361))
gigantic_df = gigantic_df.drop(("United States",32868))
gigantic_df['timestamp'] = pd.to_datetime(gigantic_df['timestamp'],unit='s')
gigantic_df['date'] = pd.to_datetime(gigantic_df['date'],unit='d')
gigantic_df.head()

edge_agg_function = {'revid':{'weight':len},
                     'timestamp':{'ts_min':np.min,'ts_max':np.max},
                     'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                     'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                     'revision':{'revision_min':np.min,'revision_max':np.max}
                     }

# Create the edgelist by grouping on both page and user
gigantic_gb_edge = gigantic_df.reset_index().groupby(['page','user'])
gigantic_edgelist = gigantic_gb_edge.agg(edge_agg_function)

# Drop the legacy/redundant column names
gigantic_edgelist.columns = gigantic_edgelist.columns.droplevel(0)

# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
gigantic_edgelist['ts_min'] = (gigantic_edgelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
gigantic_edgelist['ts_max'] = (gigantic_edgelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

print "There are {0} edges in the network.".format(len(gigantic_edgelist))

node_agg_function = {'revid':{'revisions':len},
                     'timestamp':{'ts_min':np.min,'ts_max':np.max},
                     'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                     'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                     'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max}
                     }

# Create the usernodelist by grouping on user and aggregating
gigantic_gb_user = gigantic_df.reset_index().groupby(['user'])
gigantic_usernodelist = gigantic_gb_user.agg(node_agg_function)

# Clean up the columns and convert the timestamps to counts
gigantic_usernodelist.columns = gigantic_usernodelist.columns.droplevel(0)
gigantic_usernodelist['ts_min'] = (gigantic_usernodelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
gigantic_usernodelist['ts_max'] = (gigantic_usernodelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

print "There are {0} editor nodes in the network.".format(len(gigantic_usernodelist))
gigantic_usernodelist.head()

# Create the pagenodelist by grouping on page and aggregating
gigantic_gb_page = gigantic_df.reset_index().groupby(['page'])
gigantic_pagenodelist = gigantic_gb_page.agg(node_agg_function)

# Clean up the columns and convert the timestamps to counts
gigantic_pagenodelist.columns = gigantic_pagenodelist.columns.droplevel(0)
gigantic_pagenodelist['ts_min'] = (gigantic_pagenodelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
gigantic_pagenodelist['ts_max'] = (gigantic_pagenodelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

print "There are {0} page nodes in the network.".format(len(gigantic_pagenodelist))
gigantic_pagenodelist.head()
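# The droplevel-and-convert cleanup is now repeated for every edgelist and nodelist. A small
# helper captures the timestamp conversion in one place; "days_since_founding" is a
# hypothetical name, and the arithmetic is exactly the ts_min/ts_max conversion used above.
def days_since_founding(timestamps):
    return (timestamps - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

# e.g. gigantic_pagenodelist['ts_min'] = days_since_founding(gigantic_pagenodelist['ts_min'])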
gigantic_g = nx.DiGraph()

# Add the edges and edge attributes
for (article,editor) in iter(gigantic_edgelist.index.values):
    edge_attributes = dict(gigantic_edgelist.ix[(article,editor)])
    edge_attributes = {k:float(v) for k,v in edge_attributes.iteritems()}
    gigantic_g.add_edge(editor,article,edge_attributes)

# Add the user nodes and attributes
for node in iter(gigantic_usernodelist.index):
    node_attributes = dict(gigantic_usernodelist.ix[node])
    node_attributes = {k:float(v) for k,v in node_attributes.iteritems()}
    gigantic_g.add_node(node,node_attributes)

# Add the page nodes and attributes
for node in iter(gigantic_pagenodelist.index):
    node_attributes = dict(gigantic_pagenodelist.ix[node])
    node_attributes = {k:float(v) for k,v in node_attributes.iteritems()}
    gigantic_g.add_node(node,node_attributes)

print "There are {0} nodes and {1} edges in the network.".format(gigantic_g.number_of_nodes(),gigantic_g.number_of_edges())
gigantic_g.edges(data=True)[:3]

nx.write_graphml(gigantic_g,'gigantic_g.graphml')

g_idc = nx.in_degree_centrality(gigantic_g)
g_odc = nx.out_degree_centrality(gigantic_g)

sorted(g_idc.iteritems(), key=itemgetter(1),reverse=True)[:10]
sorted(g_odc.iteritems(), key=itemgetter(1),reverse=True)[:10]

g_size = gigantic_g.number_of_nodes()
g_idc_counter = Counter([v*(g_size-1) for v in g_idc.itervalues() if v != 0])
g_odc_counter = Counter([v*(g_size-1) for v in g_odc.itervalues() if v != 0])

plt.scatter(g_idc_counter.keys(),g_idc_counter.values(),s=50,c='b',label='Articles')
plt.scatter(g_odc_counter.keys(),g_odc_counter.values(),s=50,c='r',label='Editors')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('Number of connections',fontsize=15)
plt.ylabel('Number of nodes',fontsize=15)
plt.legend(loc='upper right',scatterpoints=1)

weights = [attributes['weight'] for i,j,attributes in gigantic_g.edges_iter(data=True)]
weight_counter = Counter(weights)

plt.scatter(weight_counter.keys(),weight_counter.values(),s=50,c='b',label='Weights')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('Number of contributions',fontsize=15)
plt.ylabel('Number of edges',fontsize=15)

article_nn_degree = nx.assortativity.average_degree_connectivity(gigantic_g,source='in',target='out',nodes=gigantic_pagenodelist.index)
editor_nn_degree = nx.assortativity.average_degree_connectivity(gigantic_g,source='out',target='in',nodes=gigantic_usernodelist.index)

plt.scatter(article_nn_degree.keys(),article_nn_degree.values(),s=50,c='b',label='Articles',alpha=.5)
plt.scatter(editor_nn_degree.keys(),editor_nn_degree.values(),s=50,c='r',label='Editors',alpha=.5)
plt.yscale('log')
plt.xscale('log')
plt.xlabel('Degree',fontsize=15)
plt.ylabel('Average neighbor degree',fontsize=15)
plt.legend(loc='upper right',scatterpoints=1)

gigantic_g.edges(data=True)[1]

edge_weight_centrality = [list(),list()]
for (i,j,attributes) in gigantic_g.edges_iter(data=True):
    edge_weight_centrality[0].append(g_idc[j] - g_idc[i])
    edge_weight_centrality[1].append(attributes['weight'])

plt.scatter(edge_weight_centrality[0],edge_weight_centrality[1])
plt.yscale('log')

# Start up the data structure
link_data = {u'Hillary Rodham Clinton': {'Out':hrc_alllink_outlist, 'In':hrc_alllink_inlist}}

# Set operations
all_links = list(set(hrc_alllink_outlist) | set(hrc_alllink_inlist))

# Start the scrape
for article in all_links:
    try:
        _out_links,_in_links = get_article_links(article)
        link_data[article] = {'Out':_out_links, 'In':_in_links}
    except:
        print article
        pass

# Save the data
with open('link_data.json','wb') as f:
    json.dump(link_data,f)

# Column dtypes for the revision data, e.g. to pass as read_csv's dtype argument when re-loading the CSVs
dtype_dict = {'page':unicode,
              'revision':np.int64,
              'anon':bool,
              'comment':unicode,
              'parentid':np.int64,
              'size':np.int64,
              'timestamp':unicode,
              'user':unicode,
              'userid':np.int64,
              'unique_users':np.int64,
              'date':unicode,
              'diff':np.float64,
              'latency':np.float64
              }
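# Reloading the link structure saved above confirms the round trip worked;
# loaded_link_data is a hypothetical name for the re-read copy.
with open('link_data.json','rb') as f:
    loaded_link_data = json.load(f)
print "Link data collected for {0} articles.".format(len(loaded_link_data))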