# Standard packages for data analysis
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# pandas handles tabular data
import pandas as pd

# networkx handles network data
import networkx as nx

# json handles reading and writing JSON data
import json

# To visualize webpages within this webpage
from IPython.display import HTML

# To run queries against MediaWiki APIs
from wikitools import wiki, api

# Some other helper functions
from collections import Counter
from operator import itemgetter

HTML('http://en.wikipedia.org/w/api.php?action=query&list=users&ususers=Madcoverboy|Jimbo_Wales&usprop=blockinfo|groups|editcount|registration|gender')

def wikipedia_query(query_params, site_url='http://en.wikipedia.org/w/api.php'):
    site = wiki.Wiki(url=site_url)
    request = api.APIRequest(site, query_params)
    result = request.query()
    return result[query_params['action']]

user_query = {'action': 'query',
              'list': 'users',
              'usprop': 'blockinfo|groups|editcount|registration|gender',
              'ususers': 'Madcoverboy|Jimbo Wales'}

query_results = wikipedia_query(user_query)
query_results

query_results['users'][1]['editcount']

def get_user_properties(user):
    result = wikipedia_query({'action': 'query',
                              'list': 'users',
                              'usprop': 'blockinfo|groups|editcount|registration|gender',
                              'ususers': user})
    return result

koavf_query_results = get_user_properties('Koavf')
koavf_query_results

with open('koavf_query_results.json','wb') as f:
    json.dump(koavf_query_results,f)

with open('koavf_query_results.json','rb') as f:
    loaded_koavf_query_results = json.load(f)

loaded_koavf_query_results

query_results['users']

df = pd.DataFrame(query_results['users'])
df.to_csv('query_results.csv',quotechar='"',index=False)
df

pd.read_csv('query_results.csv',quotechar='"')

outlink_query = {'action': 'query',
                 'prop': 'links',
                 'titles': 'Hillary Rodham Clinton',
                 'pllimit': '500',
                 'plnamespace': '0'}

hrc_outlink_data = wikipedia_query(outlink_query)
hrc_outlink_data['pages'][u'5043192']['links'][:5]

hrc_outlink_list = [link['title'] for link in hrc_outlink_data['pages'][u'5043192']['links']]
print "There are {0} links from the Hillary Rodham Clinton article".format(len(hrc_outlink_list))
hrc_outlink_list[:10]

outlink_query_hc = {'action': 'query',
                    'prop': 'links',
                    'titles': 'Hillary Clinton',
                    'pllimit': '500',
                    'plnamespace': '0'}

hc_outlink_data = wikipedia_query(outlink_query_hc)
hc_outlink_data

outlink_query_hc_redirect = {'action': 'query',
                             'prop': 'links',
                             'titles': 'Hillary Clinton',  # still "Hillary Clinton"
                             'pllimit': '500',
                             'plnamespace': '0',
                             'redirects': 'True'}  # redirects parameter added

hcr_outlink_data = wikipedia_query(outlink_query_hc_redirect)
hcr_outlink_list = [link['title'] for link in hcr_outlink_data['pages'][u'5043192']['links']]
print "There are {0} links from the Hillary Clinton article".format(len(hcr_outlink_list))
hcr_outlink_list[:10]

inlink_query_hrc = {'action': 'query',
                    'redirects': 'True',
                    'prop': 'linkshere',
                    'titles': 'Hillary Rodham Clinton',
                    'lhlimit': '500',
                    'lhnamespace': '0',
                    'lhshow': '!redirect',
                    'lhprop': 'title'}

hrc_inlink_data = wikipedia_query(inlink_query_hrc)
hrc_inlink_list = [link['title'] for link in hrc_inlink_data['pages'][u'5043192']['linkshere']]
print "There are {0} links to the Hillary Rodham Clinton article".format(len(hrc_inlink_list))
hrc_inlink_list[:10]
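# The queries above cap each request at 500 links, and heavily linked articles can return
# more than that. Whether the wrapper follows MediaWiki's continuation automatically depends
# on the client, so here is a manual version of the same paging written against the API's
# documented 'continue' convention. This is only a sketch using the requests library;
# get_all_links is a hypothetical helper that is not used elsewhere in this notebook.
import requests

def get_all_links(title, site_url='http://en.wikipedia.org/w/api.php'):
    params = {'action': 'query', 'format': 'json', 'prop': 'links',
              'titles': title, 'pllimit': '500', 'plnamespace': '0',
              'continue': ''}  # opt in to the simplified continuation style
    links, last_continue = [], {}
    while True:
        request_params = params.copy()
        request_params.update(last_continue)  # carry the continuation token forward
        result = requests.get(site_url, params=request_params).json()
        for page in result.get('query', {}).get('pages', {}).values():
            links += [l['title'] for l in page.get('links', [])]
        if 'continue' not in result:
            break
        last_continue = result['continue']
    return links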
alllinks_query_hrc = {'action': 'query',
                      'redirects': 'True',
                      'prop': 'links|linkshere',  # combined both prop calls with a pipe
                      'titles': 'Hillary Rodham Clinton',
                      'pllimit': '500',       # still need the "prop=links" "pl" parameters and below
                      'plnamespace': '0',
                      'lhlimit': '500',       # still need the "prop=linkshere" "lh" parameters and below
                      'lhnamespace': '0',
                      'lhshow': '!redirect',
                      'lhprop': 'title'}

hrc_alllink_data = wikipedia_query(alllinks_query_hrc)
hrc_alllink_outlist = [link['title'] for link in hrc_alllink_data['pages'][u'5043192']['links']]
hrc_alllink_inlist = [link['title'] for link in hrc_alllink_data['pages'][u'5043192']['linkshere']]
print "There are {0} out links from and {1} in links to the Hillary Rodham Clinton article".format(len(hrc_alllink_outlist),len(hrc_alllink_inlist))

def get_article_links(article):
    query = {'action': 'query',
             'redirects': 'True',
             'prop': 'links|linkshere',
             'titles': article,  # the article variable is passed into here
             'pllimit': '500',
             'plnamespace': '0',
             'lhlimit': '500',
             'lhnamespace': '0',
             'lhshow': '!redirect',
             'lhprop': 'title'}
    results = wikipedia_query(query)      # do the query
    page_id = results['pages'].keys()[0]  # get the page_id
    if 'links' in results['pages'][page_id].keys():  # sometimes there are no links
        outlist = [link['title'] for link in results['pages'][page_id]['links']]  # clean up outlinks
    else:
        outlist = []  # return empty list if no outlinks
    if 'linkshere' in results['pages'][page_id].keys():  # sometimes there are no links
        inlist = [link['title'] for link in results['pages'][page_id]['linkshere']]  # clean up inlinks
    else:
        inlist = []  # return empty list if no inlinks
    return outlist, inlist

bc_out, bc_in = get_article_links("Bill Clinton")
print "There are {0} out links from and {1} in links to the Bill Clinton article".format(len(bc_out),len(bc_in))

clinton_link_data = {"Hillary Rodham Clinton": {"In": hrc_alllink_inlist, "Out": hrc_alllink_outlist},
                     "Bill Clinton": {"In": bc_in, "Out": bc_out}}

with open('clinton_link_data.json','wb') as f:
    json.dump(clinton_link_data,f)

hrc_alllink_outlist[:5]

hrc_g = nx.DiGraph()

for article in hrc_alllink_outlist:
    hrc_g.add_edge("Hillary Rodham Clinton",article)

for article in hrc_alllink_inlist:
    hrc_g.add_edge(article,"Hillary Rodham Clinton")

len(hrc_alllink_outlist) + len(hrc_alllink_inlist)

hrc_g.number_of_nodes()

print "There are {0} edges and {1} nodes in the network".format(hrc_g.number_of_edges(), hrc_g.number_of_nodes())

reciprocal_edges = list()
for (i,j) in hrc_g.edges():
    if hrc_g.has_edge(j,i) and (j,i) not in reciprocal_edges:
        reciprocal_edges.append((i,j))

reciprocation_fraction = round(float(len(reciprocal_edges))/hrc_g.number_of_edges(),3)
print "There are {0} reciprocated edges out of {1} edges in the network, giving a reciprocation fraction of {2}.".format(len(reciprocal_edges),hrc_g.number_of_edges(),reciprocation_fraction)

bc_g = nx.DiGraph()

for article in bc_out:
    bc_g.add_edge("Bill Clinton",article)

for article in bc_in:
    bc_g.add_edge(article,"Bill Clinton")

bc_reciprocal_edges = list()
for (i,j) in bc_g.edges():
    if bc_g.has_edge(j,i) and (j,i) not in bc_reciprocal_edges:
        bc_reciprocal_edges.append((i,j))

bc_reciprocation_fraction = round(float(len(bc_reciprocal_edges))/bc_g.number_of_edges(),3)
print "There are {0} reciprocated edges out of {1} edges in the network, giving a reciprocation fraction of {2}.".format(len(bc_reciprocal_edges),bc_g.number_of_edges(),bc_reciprocation_fraction)

HTML('')
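# The reciprocal-edge bookkeeping above is written out twice, once per graph. A small
# helper capturing the same logic avoids the duplication; "reciprocation" is a hypothetical
# name, and the computation simply mirrors the loops above.
def reciprocation(g):
    reciprocated = []
    for (i,j) in g.edges():
        if g.has_edge(j,i) and (j,i) not in reciprocated:  # count each reciprocated pair once
            reciprocated.append((i,j))
    return round(float(len(reciprocated))/g.number_of_edges(),3)

print reciprocation(hrc_g), reciprocation(bc_g)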
oldest_outlinks_query_hrc = {'action': 'parse',  # query changes to parse
                             'prop': 'links',
                             'oldid': '256189'}

oldest_outlinks_data = wikipedia_query(oldest_outlinks_query_hrc)
oldest_outlinks_data

oldest_outlink_list = [link['*'] for link in oldest_outlinks_data['links']]
print "There are {0} out links from the Hillary Rodham Clinton article".format(len(oldest_outlink_list))
oldest_outlink_list

revisions_query_hrc = {'action': 'query',
                       'redirects': 'True',
                       'prop': 'revisions',
                       'titles': "Hillary Rodham Clinton",
                       'rvprop': 'ids|user|timestamp|userid|comment|size',
                       'rvlimit': '500',
                       'rvdir': 'newer'}

revisions_data_hrc = wikipedia_query(revisions_query_hrc)

# Extract and convert to DataFrame
hrc_rv_df = pd.DataFrame(revisions_data_hrc['pages']['5043192']['revisions'])

# Make it clear what's being edited
hrc_rv_df['page'] = [u'Hillary Rodham Clinton']*len(hrc_rv_df)

# Clean up timestamps
hrc_rv_df['timestamp'] = pd.to_datetime(hrc_rv_df['timestamp'],format="%Y-%m-%dT%H:%M:%SZ",unit='s')

# Clean up anon column
hrc_rv_df = hrc_rv_df.replace({'anon':{np.nan:False,u'':True}})

# Sort the data on timestamp and reset the index
hrc_rv_df = hrc_rv_df.sort('timestamp').reset_index(drop=True)
hrc_rv_df.index.name = 'revision'
hrc_rv_df = hrc_rv_df.reset_index()

# Set the index to a MultiIndex
hrc_rv_df.set_index(['page','revision'],inplace=True)

# Save the data to disk
hrc_rv_df.to_csv('hrc_revisions.csv',encoding='utf8')

# Show the first 5 rows
hrc_rv_df.head()

hrc_rv_gb_user = hrc_rv_df.groupby('user')
hrc_user_revisions = hrc_rv_gb_user['revid'].aggregate(len).sort(ascending=False,inplace=False)
print "There are {0} unique users who have made a contribution to the article.".format(len(hrc_user_revisions))
hrc_user_revisions.head(10)

revisions_counter = Counter(hrc_user_revisions.values)
plt.scatter(revisions_counter.keys(),revisions_counter.values(),s=50)
plt.ylabel('Number of users',fontsize=15)
plt.xlabel('Number of revisions',fontsize=15)
plt.yscale('log')
plt.xscale('log')

def count_unique_users(user_series):
    unique_users = []
    unique_count = []
    for user in user_series.values:
        if user not in unique_users:
            unique_users.append(user)
            unique_count.append(len(unique_users))
        else:
            unique_count.append(unique_count[-1])
    return unique_count

hrc_rv_df['unique_users'] = count_unique_users(hrc_rv_df['user'])
hrc_rv_df['date'] = hrc_rv_df['timestamp'].apply(lambda x:x.date())

activity_by_day = hrc_rv_df.groupby('date').aggregate({'unique_users':max,'revid':len})
ax = activity_by_day.plot(lw=1,secondary_y=['revid'])
ax.set_xlabel('Time',fontsize=15)

hrc_rv_df['diff'] = hrc_rv_df['size'].diff()

diff_counter = Counter(hrc_rv_df['diff'].values)
plt.scatter(diff_counter.keys(),diff_counter.values(),s=50,alpha=.1)
plt.xlabel('Difference (bytes)',fontsize=15)
plt.ylabel('Number of revisions',fontsize=15)
plt.yscale('log')
plt.xscale('symlog')

activity_by_day = hrc_rv_df.groupby('date').aggregate({'unique_users':max,
                                                       'revid':len,
                                                       'diff':np.median})

# Compute a 60-day rolling average to remove spikiness, plot
pd.rolling_mean(activity_by_day['diff'],60).plot()
plt.yscale('symlog')
plt.xlabel('Time',fontsize=15)
plt.ylabel('Difference (bytes)',fontsize=15)
plt.axhline(0,lw=2,c='k')

# The diff returns timedeltas, but dividing by a 1-second timedelta returns a float
# Round these numbers off to smooth out the distribution and add 1 second to everything to make the plot behave
hrc_rv_df['latency'] = [round(i/np.timedelta64(1,'s'),-1) + 1 for i in hrc_rv_df['timestamp'].diff().values]

diff_counter = Counter(hrc_rv_df['latency'].values)
plt.scatter(diff_counter.keys(),diff_counter.values(),s=50,alpha=.1)
plt.xlabel('Latency time (seconds)',fontsize=15)
plt.ylabel('Number of changes',fontsize=15)
plt.yscale('log')
plt.xscale('log')

hrc_rv_df['latency'].describe()
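# describe() gives the quartiles of the latency distribution; another quick summary is
# the share of revisions that arrive roughly within a minute of the previous one. This
# is a sketch built directly on the latency column computed above.
fast_revisions = (hrc_rv_df['latency'] <= 60).sum()
print "{0:.1%} of revisions came within about a minute of the previous revision.".format(float(fast_revisions)/len(hrc_rv_df))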
activity_by_day = hrc_rv_df.groupby('date').aggregate({'unique_users':max,
                                                       'revid':len,
                                                       'diff':np.median,
                                                       'latency':np.median})

# Compute a 60-day rolling average to remove spikiness, plot
pd.rolling_mean(activity_by_day['latency'],60).plot()
plt.yscale('symlog')
plt.xlabel('Time',fontsize=15)
plt.ylabel('Latency time (seconds)',fontsize=15)

hrc_bg = nx.DiGraph()
for user in hrc_rv_df['user'].values:
    if hrc_bg.has_edge(user,u'Hillary Rodham Clinton'):
        hrc_bg[user][u'Hillary Rodham Clinton']['weight'] += 1
    else:
        hrc_bg.add_edge(user,u'Hillary Rodham Clinton',weight=1)

print "There are {0} nodes and {1} edges in the network.".format(hrc_bg.number_of_nodes(),hrc_bg.number_of_edges())
hrc_bg.edges(data=True)[:5]

def get_revision_df(article):
    revisions_query = {'action': 'query',
                       'redirects': 'True',
                       'prop': 'revisions',
                       'titles': article,
                       'rvprop': 'ids|user|timestamp|userid|comment|size',
                       'rvlimit': '500',
                       'rvdir': 'newer'}
    revisions_data = wikipedia_query(revisions_query)
    page_id = revisions_data['pages'].keys()[0]
    # Extract and convert to DataFrame. Try/except for links to pages that don't exist
    try:
        df = pd.DataFrame(revisions_data['pages'][page_id]['revisions'])
    except KeyError:
        print u"{0} doesn't exist!".format(article)
        pass
    # Make it clear what's being edited
    df['page'] = [article]*len(df)
    # Clean up timestamps
    df['timestamp'] = pd.to_datetime(df['timestamp'],format="%Y-%m-%dT%H:%M:%SZ",unit='s')
    # Clean up anon column. If/else for articles that have all non-anon editors
    if 'anon' in df.columns:
        df = df.replace({'anon':{np.nan:False,u'':True}})
    else:
        df['anon'] = [False] * len(df)
    # Sort the data on timestamp and reset the index
    df = df.sort('timestamp').reset_index(drop=True)
    df.index.name = 'revision'
    df = df.reset_index()
    # Set the index to a MultiIndex
    df.set_index(['page','revision'],inplace=True)
    # Compute additional features
    df['date'] = df['timestamp'].apply(lambda x:x.date())
    df['diff'] = df['size'].diff()
    df['unique_users'] = count_unique_users(df['user'])
    df['latency'] = [round(i/np.timedelta64(1,'s'),-1) + 1 for i in df['timestamp'].diff().values]
    # Don't return random other columns
    df = df[[u'anon',u'comment',u'parentid',
             u'revid',u'size',u'timestamp',
             u'user',u'userid',u'unique_users',
             u'date', u'diff', u'latency']]
    return df

bc_rv_df = get_revision_df("Bill Clinton")
bc_rv_df.head()

clinton_df = pd.concat([bc_rv_df,hrc_rv_df])
print clinton_df.index.levels[0]
print "There are a total of {0} revisions across both the Hillary and Bill Clinton articles.".format(len(clinton_df))
clinton_df.to_csv('clinton_revisions.csv',encoding='utf8')

clinton_df = pd.read_csv('clinton_revisions.csv',
                         encoding='utf8',
                         index_col=['page','revision'],
                         parse_dates=['timestamp','date'])
clinton_df.head()

clinton_gb_edge = clinton_df.reset_index().groupby(['page','user'])
clinton_edgelist = clinton_gb_edge.agg({'revid':{'weight':len},
                                        'timestamp':{'ts_min':np.min,'ts_max':np.max},
                                        'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                                        'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                                        'revision':{'revision_min':np.min,'revision_max':np.max}
                                        })

# Drop the legacy/redundant column names
clinton_edgelist.columns = clinton_edgelist.columns.droplevel(0)

# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
clinton_edgelist['ts_min'] = (clinton_edgelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_edgelist['ts_max'] = (clinton_edgelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_edgelist.head()
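# A quick sanity check on the aggregation: sorting the edgelist by weight shows which
# editor-article pairs have the most revisions behind them. A sketch, using the same
# column-based DataFrame.sort call the notebook uses elsewhere.
clinton_edgelist.sort('weight',ascending=False).head(5)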
# Create the usernodelist by grouping on user and aggregating
clinton_gb_user = clinton_df.reset_index().groupby(['user'])
clinton_usernodelist = clinton_gb_user.agg({'revid':{'revisions':len},
                                            'timestamp':{'ts_min':np.min,'ts_max':np.max},
                                            'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                                            'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                                            'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max}
                                            })

# Clean up the columns and convert the timestamps to counts
clinton_usernodelist.columns = clinton_usernodelist.columns.droplevel(0)
clinton_usernodelist['ts_min'] = (clinton_usernodelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_usernodelist['ts_max'] = (clinton_usernodelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

# Create the pagenodelist by grouping on page and aggregating
clinton_gb_page = clinton_df.reset_index().groupby(['page'])
clinton_pagenodelist = clinton_gb_page.agg({'revid':{'revisions':len},
                                            'timestamp':{'ts_min':np.min,'ts_max':np.max},
                                            'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                                            'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                                            'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max}
                                            })

# Clean up the columns and convert the timestamps to counts
clinton_pagenodelist.columns = clinton_pagenodelist.columns.droplevel(0)
clinton_pagenodelist['ts_min'] = (clinton_pagenodelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_pagenodelist['ts_max'] = (clinton_pagenodelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
clinton_pagenodelist.head()

clinton_g = nx.DiGraph()

# Add the edges and edge attributes
for (article,editor) in iter(clinton_edgelist.index.values):
    edge_attributes = dict(clinton_edgelist.ix[(article,editor)])
    clinton_g.add_edge(editor,article,edge_attributes)

# Add the user nodes and attributes
for node in iter(clinton_usernodelist.index):
    node_attributes = dict(clinton_usernodelist.ix[node])
    clinton_g.add_node(node,node_attributes)

# Add the page nodes and attributes
for node in iter(clinton_pagenodelist.index):
    node_attributes = dict(clinton_pagenodelist.ix[node])
    clinton_g.add_node(node,node_attributes)

print "There are {0} nodes and {1} edges in the network.".format(clinton_g.number_of_nodes(),clinton_g.number_of_edges())
clinton_g.edges(data=True)[:3]

# List of DataFrames
dataframe_dict = {u'Bill Clinton': bc_rv_df, u'Hillary Rodham Clinton': hrc_rv_df}

# Set operations
all_links = list(set(hrc_alllink_outlist) | set(hrc_alllink_inlist))

# Start the scrape
errors = list()
for article in all_links:
    try:
        df = get_revision_df(article)
        dataframe_dict[article] = df
    except:
        errors.append(article)
        pass

gigantic_df = pd.concat(dataframe_dict.values())
gigantic_df.to_csv('gigantic_df.csv',encoding='utf8')
len(gigantic_df)
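# The bare except in the scrape silently skips any article whose history could not be
# fetched, so it is worth checking how much is missing before trusting the combined
# table. A quick look at the errors list built above:
print "{0} of {1} articles could not be scraped.".format(len(errors),len(all_links))
errors[:10]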
gigantic_df = pd.read_csv('gigantic_df.csv',
                          encoding='utf8',
                          index_col=['page','revision'],
                          parse_dates=['timestamp','date']
                          )

gigantic_df = gigantic_df.drop(("[[History of the United States]] at [[History of the United States#British colonization|British Colonization]]. ([[WP:TW|TW]])",589285361))
gigantic_df = gigantic_df.drop(("United States",32868))
gigantic_df['timestamp'] = pd.to_datetime(gigantic_df['timestamp'],unit='s')
gigantic_df['date'] = pd.to_datetime(gigantic_df['date'],unit='d')
gigantic_df.head()

edge_agg_function = {'revid':{'weight':len},
                     'timestamp':{'ts_min':np.min,'ts_max':np.max},
                     'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                     'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                     'revision':{'revision_min':np.min,'revision_max':np.max}
                     }

# Create the edgelist by grouping on both page and user
gigantic_gb_edge = gigantic_df.reset_index().groupby(['page','user'])
gigantic_edgelist = gigantic_gb_edge.agg(edge_agg_function)

# Drop the legacy/redundant column names
gigantic_edgelist.columns = gigantic_edgelist.columns.droplevel(0)

# Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded
gigantic_edgelist['ts_min'] = (gigantic_edgelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
gigantic_edgelist['ts_max'] = (gigantic_edgelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

print "There are {0} edges in the network.".format(len(gigantic_edgelist))

node_agg_function = {'revid':{'revisions':len},
                     'timestamp':{'ts_min':np.min,'ts_max':np.max},
                     'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max},
                     'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max},
                     'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max}
                     }

# Create the usernodelist by grouping on user and aggregating
gigantic_gb_user = gigantic_df.reset_index().groupby(['user'])
gigantic_usernodelist = gigantic_gb_user.agg(node_agg_function)

# Clean up the columns and convert the timestamps to counts
gigantic_usernodelist.columns = gigantic_usernodelist.columns.droplevel(0)
gigantic_usernodelist['ts_min'] = (gigantic_usernodelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
gigantic_usernodelist['ts_max'] = (gigantic_usernodelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

print "There are {0} editor nodes in the network.".format(len(gigantic_usernodelist))
gigantic_usernodelist.head()

# Create the pagenodelist by grouping on page and aggregating
gigantic_gb_page = gigantic_df.reset_index().groupby(['page'])
gigantic_pagenodelist = gigantic_gb_page.agg(node_agg_function)

# Clean up the columns and convert the timestamps to counts
gigantic_pagenodelist.columns = gigantic_pagenodelist.columns.droplevel(0)
gigantic_pagenodelist['ts_min'] = (gigantic_pagenodelist['ts_min'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')
gigantic_pagenodelist['ts_max'] = (gigantic_pagenodelist['ts_max'] - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

print "There are {0} page nodes in the network.".format(len(gigantic_pagenodelist))
gigantic_pagenodelist.head()
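# The droplevel-and-convert cleanup is now repeated for every edgelist and nodelist. A small
# helper captures the timestamp conversion in one place; "days_since_founding" is a
# hypothetical name, and the arithmetic is exactly the ts_min/ts_max conversion used above.
def days_since_founding(timestamps):
    return (timestamps - pd.Timestamp('2001-1-16'))/np.timedelta64(1,'D')

# e.g. gigantic_pagenodelist['ts_min'] = days_since_founding(gigantic_pagenodelist['ts_min'])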
gigantic_g = nx.DiGraph()

# Add the edges and edge attributes
for (article,editor) in iter(gigantic_edgelist.index.values):
    edge_attributes = dict(gigantic_edgelist.ix[(article,editor)])
    edge_attributes = {k:float(v) for k,v in edge_attributes.iteritems()}
    gigantic_g.add_edge(editor,article,edge_attributes)

# Add the user nodes and attributes
for node in iter(gigantic_usernodelist.index):
    node_attributes = dict(gigantic_usernodelist.ix[node])
    node_attributes = {k:float(v) for k,v in node_attributes.iteritems()}
    gigantic_g.add_node(node,node_attributes)

# Add the page nodes and attributes
for node in iter(gigantic_pagenodelist.index):
    node_attributes = dict(gigantic_pagenodelist.ix[node])
    node_attributes = {k:float(v) for k,v in node_attributes.iteritems()}
    gigantic_g.add_node(node,node_attributes)

print "There are {0} nodes and {1} edges in the network.".format(gigantic_g.number_of_nodes(),gigantic_g.number_of_edges())
gigantic_g.edges(data=True)[:3]

nx.write_graphml(gigantic_g,'gigantic_g.graphml')

g_idc = nx.in_degree_centrality(gigantic_g)
g_odc = nx.out_degree_centrality(gigantic_g)

sorted(g_idc.iteritems(), key=itemgetter(1),reverse=True)[:10]
sorted(g_odc.iteritems(), key=itemgetter(1),reverse=True)[:10]

g_size = gigantic_g.number_of_nodes()
g_idc_counter = Counter([v*(g_size-1) for v in g_idc.itervalues() if v != 0])
g_odc_counter = Counter([v*(g_size-1) for v in g_odc.itervalues() if v != 0])

plt.scatter(g_idc_counter.keys(),g_idc_counter.values(),s=50,c='b',label='Articles')
plt.scatter(g_odc_counter.keys(),g_odc_counter.values(),s=50,c='r',label='Editors')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('Number of connections',fontsize=15)
plt.ylabel('Number of nodes',fontsize=15)
plt.legend(loc='upper right',scatterpoints=1)

weights = [attributes['weight'] for i,j,attributes in gigantic_g.edges_iter(data=True)]
weight_counter = Counter(weights)

plt.scatter(weight_counter.keys(),weight_counter.values(),s=50,c='b',label='Weights')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('Number of contributions',fontsize=15)
plt.ylabel('Number of edges',fontsize=15)

article_nn_degree = nx.assortativity.average_degree_connectivity(gigantic_g,source='in',target='out',nodes=gigantic_pagenodelist.index)
editor_nn_degree = nx.assortativity.average_degree_connectivity(gigantic_g,source='out',target='in',nodes=gigantic_usernodelist.index)

plt.scatter(article_nn_degree.keys(),article_nn_degree.values(),s=50,c='b',label='Articles',alpha=.5)
plt.scatter(editor_nn_degree.keys(),editor_nn_degree.values(),s=50,c='r',label='Editors',alpha=.5)
plt.yscale('log')
plt.xscale('log')
plt.xlabel('Degree',fontsize=15)
plt.ylabel('Average neighbor degree',fontsize=15)
plt.legend(loc='upper right',scatterpoints=1)

gigantic_g.edges(data=True)[1]

edge_weight_centrality = [list(),list()]
for (i,j,attributes) in gigantic_g.edges_iter(data=True):
    edge_weight_centrality[0].append(g_idc[j] - g_idc[i])
    edge_weight_centrality[1].append(attributes['weight'])

plt.scatter(edge_weight_centrality[0],edge_weight_centrality[1])
plt.yscale('log')

# Start up the data structure
link_data = {u'Hillary Rodham Clinton': {'Out':hrc_alllink_outlist, 'In':hrc_alllink_inlist}}

# Set operations
all_links = list(set(hrc_alllink_outlist) | set(hrc_alllink_inlist))

# Start the scrape
for article in all_links:
    try:
        _out_links,_in_links = get_article_links(article)
        link_data[article] = {'Out':_out_links, 'In':_in_links}
    except:
        print article
        pass

# Save the data
with open('link_data.json','wb') as f:
    json.dump(link_data,f)

# Column dtypes for the revision data, e.g. to pass as read_csv's dtype argument when re-loading the CSVs
dtype_dict = {'page':unicode,
              'revision':np.int64,
              'anon':bool,
              'comment':unicode,
              'parentid':np.int64,
              'size':np.int64,
              'timestamp':unicode,
              'user':unicode,
              'userid':np.int64,
              'unique_users':np.int64,
              'date':unicode,
              'diff':np.float64,
              'latency':np.float64
              }
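# Reloading the link structure saved above confirms the round trip worked;
# loaded_link_data is a hypothetical name for the re-read copy.
with open('link_data.json','rb') as f:
    loaded_link_data = json.load(f)
print "Link data collected for {0} articles.".format(len(loaded_link_data))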