%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, requests, re, itertools, urllib2, urlparse
import wikipedia_scraping as ws
import seaborn as sns
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup, element
from collections import Counter
from IPython.display import Image
from operator import itemgetter
from scipy import stats
from matplotlib.lines import Line2D

_start = pd.datetime(2001,1,1)
_end = pd.datetime(2015,1,1)
_filedir = u'C:/Users/bkeegan/Dropbox/Workspace/Wikipedia news events/2014 News/'

# http://yearinreview.fb.com/
facebook = ['World Cup','Ebola Outbreak','Elections in Brazil','Robin Williams','Ice Bucket Challenge','Conflict in Gaza','Malaysia Airlines disasters','Super Bowl','Ferguson','Sochi']

# http://www.google.com/trends/topcharts?hl=en#date=2014
google = ['Robin Williams','World Cup','Ebola','Malaysia Airlines','Flappy Bird','ALS Ice Bucket Challenge','ISIS','Ferguson','Frozen','Ukraine']

# https://2014.twitter.com/moments
twitter = ['Philip Seymour Hoffman','State of the Union','Carnaval','Malaysia Airlines','Bring Back Our Girls','India Election','Spanish Abdication','Maya Angelou','Ferguson','Robin Williams','Ice Bucket Challenge','Scottish referendum','Ebola','He for She','Hong Kong protests','Mars Orbiter','Malala Yousafzai','US elections','Berlin Wall','Philae']

# Editorial judgment, https://en.wikipedia.org/wiki/2014
wikipedia1 = ['2014 Winter Olympics','Ebola virus epidemic in West Africa','2014 Crimean crisis','Malaysia Airlines Flight 370','Chibok schoolgirl kidnapping','Sinking of the MV Sewol','Islamic State in Iraq and the Levant','2014 FIFA World Cup','Felipe VI','2014 Israel–Gaza conflict','Malaysia Airlines Flight 17','Rosetta spacecraft','Cuba-United States relations']

# Number of contributors, http://stats.wikimedia.org/EN/TablesWikipediaEN.htm#zeitgeist
# Excluding repeats like "Deaths in 2014"
wikipedia2 = ['2013–14 North American cold wave',]

def top_articles(lang):
    # Read the HTML from the web and convert to soup
    # The report directory in the URL is always EN; only the file name varies by language
    soup = BeautifulSoup(urllib2.urlopen('http://stats.wikimedia.org/EN/TablesWikipedia{0}.htm'.format(lang.upper())).read())
    # Look for all the paragraphs with 2014
    _p = soup.findAll('b',text=re.compile('2014'))
    # Select only those paragraph parents that have exactly 152 fields, corresponding to the top-25 lists
    _p2014 = [t.parent for t in _p if len(t.parent) == 152]
    # Get the text out of the children tags as a list of lists
    parsed = [[t.text for t in list(p.children) if type(t) != element.NavigableString] for p in _p2014]
    # Convert to a dictionary keyed by month abbreviation with values as the list of text fields
    parsed = {month[0].split(u'\xa0')[0]:month[1:] for month in parsed}
    # Do some crazy dictionary and list comprehensions with zips to convert the values in the list
    parsed = {k:[{'rank':int(a),'editors':int(b),'article':c} for a,b,c in zip(v[0::3],v[1::3],v[2::3])] for k,v in parsed.items()}
    # Convert each month into a DataFrame with month information in the index
    # and then concat all the dfs together, sorting on those with the most editors
    ranked = pd.concat([pd.DataFrame(parsed[i],index=[i]*len(parsed[i])) for i in parsed.keys()]).sort('editors',ascending=False).reset_index()
    # Rename the reset index to something meaningful
    ranked.rename(columns={'index':'month'},inplace=True)
    # Group the articles by name, compute aggregate statistics
    # Rank on the total number of editors and months in the top 25
    top_articles = ranked.groupby('article').agg({'month':len,'editors':np.sum,'rank':np.min})
    top_articles['editor-month'] = top_articles['month'] * top_articles['editors']
    top_articles.sort(['editor-month'],ascending=False,inplace=True)
    return top_articles

country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
                 'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portuguese','nl':'Dutch','tr':'Turkish',
                 'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
                 'uk':'Ukrainian'}

top_articles_by_country = {}
for country in country_codes.keys():
    try:
        top_articles_by_country[country] = top_articles(country)
    except urllib2.HTTPError:
        print "The '{0}' language does not have a stats page".format(country)
        pass

for _country,_df in top_articles_by_country.items():
    _df.to_csv(_filedir + '/Data/{0}.csv'.format(_country),encoding='utf8')

def langlink_translater(source_lang,target_lang,article_titles):
    chunks = ws.chunk_maker(article_titles,40)
    translation_dict = dict()
    for chunk in chunks:
        result = ws.wikipedia_query({'action':'query', 'prop': 'langlinks', 'lllang': source_lang, 'titles': '|'.join(chunk), 'lllimit': '500'},target_lang)
        if result and 'pages' in result.keys():
            translation_dict.update({_d['title'] : _d['langlinks'][0]['*'] for _d in result['pages'].values() if 'langlinks' in _d.keys()})
    return translation_dict

# This step takes a few minutes
translater_dict = {source_lang:{target_lang:langlink_translater(source_lang,target_lang,df.index) for target_lang,df in top_articles_by_country.items()} for source_lang in top_articles_by_country.keys()}

# Save the file
with open('translater_dict.json','wb') as f:
    json.dump(translater_dict,f)

_filedir

country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
                 'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portuguese','nl':'Dutch','tr':'Turkish',
                 'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
                 'uk':'Ukrainian'}

top_articles_by_country = dict()
for country in country_codes.keys():
    top_articles_by_country[country] = pd.read_csv(_filedir + '/Data/{0}.csv'.format(country),encoding='utf8',index_col=0)

with open('translater_dict.json','rb') as f:
    translater_dict = json.load(f)

lang_link_exists_dict = dict()
top_articles_df = pd.DataFrame()
for source_lang,target_dictionary in translater_dict.iteritems():
    langlink_exists_df = pd.DataFrame()
    for target_lang,d in target_dictionary.iteritems():
        top_articles_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index)
        langlink_exists_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index).isin(translater_dict[source_lang][target_lang].keys())
        if source_lang == target_lang:
            langlink_exists_df[target_lang] = [1]*len(langlink_exists_df[target_lang])
    langlink_exists_df = langlink_exists_df.reindex_axis(sorted(langlink_exists_df.columns), axis=1)
    lang_link_exists_dict[source_lang] = langlink_exists_df

_df = top_articles_df.ix[:2].T
_df.index = [country_codes[i] for i in _df.index]
_df.columns = range(1,4)
_df.sort()

_df.ix['id'].sum(axis=1)

_lang = 'en'
f, ax = plt.subplots(figsize=(10,5))
_df = lang_link_exists_dict[_lang].ix[:100].T.astype(float)
_df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index]
_y,_x = _df.shape
_ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=1)
ax.set_frame_on(False)
ax.set_xticks(np.arange(0.5,_x+.5,10),minor=False) ax.set_yticks(np.arange(_y)+.5,minor=False) ax.invert_yaxis() ax.set_xticklabels(_df.columns[::10],minor=False,fontsize=12) ax.set_yticklabels([country_codes[x] for x in _df.index],minor=False,fontsize=12) ax.tick_params(axis='x',direction='in',pad=-10) ax.set_xlabel('Article rank',fontsize=15) #f.subplots_adjust(right=0.8) #cbar_ax = f.add_axes([.95, 0.15, 0.025, .75]) #f.colorbar(_ax, cax=cbar_ax) f.tight_layout(); f.savefig('en_lang_link_exists.png',dpi=150) sum_lang_link = pd.DataFrame(np.zeros(lang_link_exists_dict['en'].shape),columns=lang_link_exists_dict['en'].columns) for lang,_df in lang_link_exists_dict.iteritems(): sum_lang_link = sum_lang_link + _df.values.astype(float) #frac_sum_lang_link = sum_lang_link.apply(lambda x:x/19) sum_lang_link.columns = [country_codes[i] for i in sum_lang_link.columns] f, ax = plt.subplots(figsize=(10,5)) _df = sum_lang_link.ix[:100].T.astype(float) _df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index] _y,_x = _df.shape _ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=19) ax.set_frame_on(False) ax.set_xticks(np.arange(0,_x,10),minor=False) ax.set_xticklabels(np.arange(0,_x,10),fontsize=12) ax.set_xlabel('Article Rank',fontsize=15) ax.set_title('Number of Languages with Article on Topic',fontsize=20) ax.tick_params(axis='x',direction='in',pad=-10) ax.set_yticks(np.arange(_y)+.5,minor=False) ax.set_yticklabels(_df.index,minor=False) ax.invert_yaxis() #f.subplots_adjust(right=0.8) cbar_ax = f.add_axes([.875, 0.15, 0.025, .75]) f.colorbar(_ax, cax=cbar_ax) f.tight_layout() f.savefig('sum_lang_link.png',dpi=200) _s = sum_lang_link.ix[:100].apply(np.average,axis=1) ax = plt.scatter(_s.index,_s.values,s=50,cmap='rainbow') ax.axes.set_title('Coverage for Top 100 Stories',fontsize=20) ax.axes.set_xlabel('Article Rank',fontsize=16) ax.axes.set_ylabel('Number of Languages Covered',fontsize=16) ax.axes.set_xlim((-1,101)) plt.tight_layout() plt.savefig('top100_coverage.png',dpi=200) article_language_graph = nx.DiGraph() article_language_mapper = dict() # This will be helpful later for source_lang,d in translater_dict.iteritems(): for target_lang,mapping in d.iteritems(): for target_lang_article,source_lang_article in mapping.iteritems(): article_language_graph.add_edge(target_lang_article,source_lang_article) article_language_graph.add_node(source_lang_article,lang=source_lang) article_language_graph.add_node(target_lang_article,lang=target_lang) # Populate the article_language_mapper if source_lang_article in article_language_mapper.keys(): article_language_mapper[source_lang_article].append(source_lang) else: article_language_mapper[source_lang_article] = [source_lang] if target_lang_article in article_language_mapper.keys(): article_language_mapper[target_lang_article].append(target_lang) else: article_language_mapper[target_lang_article] = [target_lang] nx.write_gexf(article_language_graph,'article_language_graph.gexf') article_language_mapper = {k:list(set(v)) for k,v in article_language_mapper.iteritems()} with open('article_language_mapper.json','wb') as f: json.dump(article_language_mapper,f) Image('article_language_links.png') topic_subgraphs = list(nx.components.connected_component_subgraphs(article_language_graph.to_undirected())) subgraph_properties = [{'edges':_subgraph.number_of_edges(),'nodes':_subgraph.number_of_nodes(),'density':nx.density(_subgraph)} for _subgraph in topic_subgraphs] # Uncomment to see what's in these subgraphs #for _subgraph in topic_subgraphs: # if 
_subgraph.number_of_nodes() > 19: # print _subgraph.nodes() subgraph_df = pd.DataFrame(subgraph_properties) subgraph_df = subgraph_df[subgraph_df['nodes'] > 2] f,ax = plt.subplots(1,1) _ax = subgraph_df.plot(x='nodes',y='edges',kind='scatter',label='Observed Topic',ax=ax) ax.plot([i*(i-1) for i in range(20)],label='Ideal Topic',lw=3,c='r',alpha=.5) ax.axvline(x=19.5,ls='--',lw=3,c='g',alpha=.5,label='Max Topics') ax.set_xlim((0,40)) ax.set_ylim((-1,400)) ax.legend(fontsize=12) ax.set_xlabel('Number of Nodes in Topic',fontsize=18) ax.set_ylabel('Number of Edges in Topic',fontsize=18) ax.set_title('Diagnosing Problems in Topic Subgraphs',fontsize=24) # Based on the results from commented part above, I'm applying three labels to the three outliers _outliers = zip(['China','Taiwan','Ebola'],subgraph_df[subgraph_df['nodes'] > 20][['nodes','edges']].values) for label,(x,y) in _outliers: ax.annotate(label,xy=(x, y),fontsize=12, xytext=(x+2, y+75), arrowprops=dict(arrowstyle="fancy", #linestyle="dashed", color="0.5",shrinkB=8,connectionstyle="arc3,rad=0.3")) plt.tight_layout(); def complete_subgraph_maker(node_list): return itertools.permutations(node_list,2) complete_topic_graph = nx.DiGraph() for _subgraph in topic_subgraphs: if _subgraph.number_of_nodes() < 20: _edgelist = complete_subgraph_maker(_subgraph.nodes()) complete_topic_graph.add_edges_from(_edgelist) # Add the language labels back in as node attributes so we can hopefully translate back for node in complete_topic_graph.nodes(): complete_topic_graph.add_node(node,lang=article_language_mapper[node]) complete_topic_subgraphs = list(nx.components.connected_component_subgraphs(complete_topic_graph.to_undirected())) english_label_subgraphs = [_subgraph for _subgraph in complete_topic_subgraphs for node,data in _subgraph.nodes_iter(data=True) if 'en' in data['lang']] print "Out of the initial {0} topical clusters, there are {1} subgraphs in the complete approach. 
{2} of these have an English label".format(len(topic_subgraphs), len(complete_topic_subgraphs), len(english_label_subgraphs)) english_topic_graph = nx.DiGraph() topic_translation_dict = dict() for _subgraph in english_label_subgraphs: english_topic_graph.add_edges_from(_subgraph.edges(data=True)) english_topic_graph.add_nodes_from(_subgraph.nodes(data=True)) #1 _english_nodes = [_node for _node,_data in _subgraph.nodes_iter(data=True) if 'en' in _data['lang']] if len(_english_nodes) == 1: topic_translation_dict.update({_node:_english_nodes[0] for _node in _subgraph.nodes_iter()}) else: # I really hope this is never the case, but just to be sure print _english_nodes # Graphs with lists for attributes cant be serialized into GEXF # Comment out #1 to make #2 work, or leave #1 uncommented and #2 commented #nx.write_gexf(english_topic_graph,'english_topic_graph.gexf') #2 translated_articles_by_country = pd.DataFrame() for country in country_codes.keys(): translated_articles_by_country[country] = pd.Series([topic_translation_dict.get(article,np.nan) for article in top_articles_by_country[country].index]) translated_articles_by_country.columns = [country_codes[i] for i in translated_articles_by_country.columns] translated_articles_by_country.index = range(1,len(translated_articles_by_country)+1) translated_articles_by_country.sort(axis=1).head(3).T pd.Series(top_articles_by_country['en'].index).ix[list(np.array(translated_articles_by_country['English'][translated_articles_by_country['English'].isnull()].index) - 1)] f,(ax1,ax2) = plt.subplots(2,1,sharex=True,figsize=(10,5)) # Plot on ax1 _df1 = lang_link_exists_dict['en'].T.astype(float) _df1 = _df1.ix[_df1.sum(axis=1).sort(inplace=False,ascending=False).index] _df1.index = [country_codes[i] for i in _df1.index] _y1,_x1 = _df1.shape _ax1 = ax1.pcolor(_df1,cmap='rainbow',vmin=0,vmax=1) ax1.set_frame_on(False) #ax1.set_xticks(np.arange(0,_x1,10),minor=False) ax1.set_yticks(np.arange(_y1)+.5,minor=False) ax1.invert_yaxis() ax1.set_yticklabels(_df1.index,minor=False,fontsize=8) ax1.set_title('Original',fontsize=18) # Plot on ax2 _df2 = translated_articles_by_country.T.notnull().astype(float) _df2 = _df2.ix[_df1.index] # Use the _df1 index _y2,_x2 = _df2.shape _ax2 = ax2.pcolor(_df2.values.astype(float),cmap='rainbow',vmin=0,vmax=1) ax2.set_frame_on(False) ax2.set_xticks(np.arange(0,_x2,10),minor=False) ax2.set_yticks(np.arange(_y2)+.5,minor=False) ax2.invert_yaxis() ax2.set_yticklabels(_df2.index,minor=False,fontsize=8) ax2.tick_params(axis='x',direction='in',pad=-4) ax2.set_title('Cleaned',fontsize=18) f.subplots_adjust(right=0.8) #cbar_ax = f.add_axes([.95, 0.15, 0.025, .75]) #f.colorbar(_ax, cax=cbar_ax) f.suptitle('Comparing results, English',fontsize=24) #f.subplots_adjust(top=0.5) f.tight_layout(rect=[0,0,1,.9]) top_stories_across_languages = pd.Series(Counter([_val for _array in translated_articles_by_country.values for _val in _array])) top_stories_across_languages = top_stories_across_languages.ix[1:] top_stories_across_languages_top5 = top_stories_across_languages[top_stories_across_languages >= 5].sort(inplace=False,ascending=True) f,ax = plt.subplots(1,1,figsize=(8,10)) _ax = top_stories_across_languages_top5.plot(kind='barh',ax=ax) ax.axes.set_title('Articles With Widest Coverage\n',fontsize=24) ax.axes.set_xlabel('Number of Languages',fontsize=18) f.tight_layout() f.savefig('widest_coverage.png',dpi=200) combined_top_articles_df = pd.concat(top_articles_by_country.values(),keys=top_articles_by_country.keys(),axis=0).reset_index() 
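# The concat above stacks the per-language rankings with the language code as the outer
# key, which reset_index() exposes as a 'level_0' column; the next cell renames it to
# 'lang'. The 'editors-lang-month' score computed below is just the product of the three
# aggregates, so (illustrative numbers only) an article with 1,500 editors summed across
# languages, appearing in 10 language rankings for an average of 2 months in the top 25,
# scores 1500 * 10 * 2 = 30000.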
combined_top_articles_df.rename(columns={'level_0':'lang'},inplace=True) combined_top_articles_df['article'] = combined_top_articles_df['article'].apply(lambda x:topic_translation_dict.get(x,np.nan)) combined_top_articles_agg_article = combined_top_articles_df.groupby('article').agg({'editors':np.sum,'month':np.average,'lang':len}) combined_top_articles_agg_article['editors per month'] = combined_top_articles_agg_article['editors']/combined_top_articles_agg_article['month'] combined_top_articles_agg_article['editors per lang'] = combined_top_articles_agg_article['editors']/combined_top_articles_agg_article['lang'] combined_top_articles_agg_article['editors-lang-month'] = combined_top_articles_agg_article['editors']*combined_top_articles_agg_article['lang']*combined_top_articles_agg_article['month'] combined_top_articles_agg_article.sort('editors-lang-month',ascending=True,inplace=True) f,ax = plt.subplots(1,1,figsize=(8,10)) _ax = combined_top_articles_agg_article['editors-lang-month'].ix[-50:].plot(kind='barh',ax=ax) ax.axes.set_xscale('log') ax.axes.set_xlabel('Editor-language-month score',fontsize=18) ax.axes.set_ylabel('') ax.axes.set_title('Articles with Highest Activity\n',fontsize=24) f.tight_layout(); f.savefig('highest_activity_ranking.png',dpi=200) _melted = pd.melt(translated_articles_by_country.reset_index(),id_vars=['index']) _pivoted = pd.pivot_table(data=_melted,index='value',columns='variable',values='index') top_by_language_pivoted = _pivoted.ix[top_stories_across_languages_top5.index].fillna(0) top_by_combined_pivoted = _pivoted.ix[combined_top_articles_agg_article.index[-50:]].fillna(0) language_cosine = dict() combined_cosine = dict() for _lang1 in country_codes.values(): language_cosine[_lang1] = dict() combined_cosine[_lang1] = dict() for _lang2 in country_codes.values(): if _lang1 != _lang2: language_cosine[_lang1][_lang2] = cosine_similarity(top_by_language_pivoted[_lang1],top_by_language_pivoted[_lang2])[0][0] combined_cosine[_lang1][_lang2] = cosine_similarity(top_by_combined_pivoted[_lang1],top_by_combined_pivoted[_lang2])[0][0] f,(ax1,ax2) = plt.subplots(1,2,figsize=(9,5),sharey=True) _df1 = pd.DataFrame(language_cosine) _order1 = _df1.mean(axis=1).sort(inplace=False,ascending=True).index _df1 = _df1[_order1].ix[_order1] _y1,_x1 = _df1.shape _ax1 = ax1.pcolor(_df1,cmap='rainbow',vmin=0,vmax=.75) ax1.set_title('Language coverage',fontsize=18) ax1.set_frame_on(False) ax1.set_xticks(np.arange(_y1)+.5,minor=False) ax1.set_yticks(np.arange(_y1)+.5,minor=False) ax1.invert_yaxis() ax1.set_xticklabels(_df1.columns,minor=False,fontsize=10,rotation=90) ax1.set_yticklabels(_df1.index,minor=False,fontsize=10) _df2 = pd.DataFrame(combined_cosine) _order2 = _df2.mean(axis=1).sort(inplace=False,ascending=True).index _df2 = _df2[_order1].ix[_order1] # Order the same way as _df1 _y2,_x2 = _df2.shape _ax2 = ax2.pcolor(_df2,cmap='rainbow',vmin=0,vmax=.75) ax2.set_title('Editor-language-month score',fontsize=18) ax2.set_frame_on(False) ax2.set_xticks(np.arange(_y1)+.5,minor=False) ax2.invert_yaxis() ax2.set_xticklabels(_df1.columns,minor=False,fontsize=10,rotation=90) f.subplots_adjust(right=0.8) cbar_ax = f.add_axes([1, 0.15, 0.05, .7]) cb = f.colorbar(_ax2, cax=cbar_ax, label='Cosine similarity') cb.ax.yaxis.label.set_fontsize(15) #f.suptitle('Cosine similarity of rankings across languages',fontsize=24) #f.subplots_adjust(top=0.5) f.tight_layout()#rect=[0,0,1,.9]) f.savefig('cosine_similarity.png',dpi=200) _df = 
pd.DataFrame(data=np.triu(_df2),index=_df2.index,columns=_df2.columns).replace({0:np.nan}) #del _df['id'] _df.reset_index(inplace=True) coverage = pd.melt(_df,id_vars=['index']).dropna(subset=['value']) coverage.columns = ['Language 1','Language 2','Cosine Similarity'] #coverage['Language 1'] = coverage['Language 1'].apply(lambda x:country_codes.get(x)) #coverage['Language 2'] = coverage['Language 2'].apply(lambda x:country_codes.get(x)) _highest = coverage.sort('Cosine Similarity',inplace=False,ascending=False).reset_index(drop=True).ix[:9] _lowest = coverage[coverage['Language 1'] != 'Indonesian'].sort('Cosine Similarity',inplace=False,ascending=True).reset_index(drop=True).ix[:9] pd.concat([_highest,_lowest],axis=1,keys=['Highest similarities','Lowest similarities']) top_news_articles = [u'2014 FIFA World Cup', u'Malaysia Airlines Flight 370', u'Malaysia Airlines Flight 17', u'2014 Winter Olympics', u'2014 Crimean crisis', u'Felipe VI of Spain', u'Islamic State of Iraq and the Levant',u'Ebola virus epidemic in West Africa',u'Eurovision Song Contest 2014', u'Ice Bucket Challenge', u'2014 Israel\u2013Gaza conflict', u'Minecraft', u'Scottish independence referendum, 2014',u'2014 Hong Kong protests', u'United States elections, 2014', u'Soma mine disaster', u'Indian general election, 2014', u'Gamergate controversy', u'2014 Ferguson unrest',u'Rosetta spacecraft', u'Cuba\u2013United States relations', u'Chibok schoolgirl kidnapping', u'Sinking of the MV Sewol'] revision_dict = dict() for article in top_news_articles: print article revision_dict[article] = ws.get_page_revisions(article,_start,_end,'en') revision_dict[article].to_csv(_filedir + u'Data/{0}.csv'.format(article),encoding='utf8') # http://planspace.org/2013/06/21/how-to-calculate-gini-coefficient-from-raw-data-in-python/ def gini(list_of_values): if len(list_of_values) > 1: sorted_list = sorted(list_of_values) height, area = 0, 0 for value in sorted_list: height += value area += height - value / 2. 
fair_area = height * len(list_of_values) / 2 gini_value = (fair_area - area) / fair_area else: gini_value = np.nan return gini_value for _df in revision_dict.values(): _df['gini'] = [gini(Counter(_df.ix[:i,'user']).values()) for i in iter(_df.index)] rev_df = pd.concat(revision_dict.values(),keys=revision_dict.keys(),axis=0) rev_df.reset_index(inplace=True,level=0) rev_df.rename(columns={'level_0':'title'},inplace=True) rev_df.reset_index(inplace=True,drop=True) rev_df['anon'] = rev_df['anon'].notnull() rev_df['userhidden'] = rev_df['userhidden'].notnull() rev_df['commenthidden'] = rev_df['commenthidden'].notnull() rev_df.to_csv('revisions.csv',encoding='utf8') revs2014_df = rev_df[rev_df['timestamp'] >= pd.datetime(2014,1,1,0,0,0)] revs2014_df.reset_index(drop=True,inplace=True) revs2014_df.to_csv('revisions_2014.csv',encoding='utf8') rev_df = pd.read_csv('revisions.csv',encoding='utf8',index_col=0,parse_dates=['date','timestamp']) revs2014_df = pd.read_csv('revisions_2014.csv',encoding='utf8',index_col=0,parse_dates=['date','timestamp']) revs2014_df.tail() _agg_function = {'revision':np.max,'unique_users':np.max} revs2014_agg_article = revs2014_df.groupby('title').agg(_agg_function) f,ax = plt.subplots(1,1,figsize=(8,10)) revs2014_agg_article.sort('revision',inplace=False,ascending=True).plot(kind='barh',ax=ax) ax.set_ylabel('') ax.set_xlabel('Count',fontsize=12) f.tight_layout() f.savefig('en_19_activity.png',dpi=200) daily_activity = revs2014_df.groupby(['title','date']).aggregate({'unique_users':max, 'revid':len, 'diff':np.sum, 'latency':np.mean, 'size':np.mean, 'gini':np.mean}) daily_activity = daily_activity.unstack(level=0) daily_activity.index = pd.to_datetime(daily_activity.index) daily_activity['unique_users'] = daily_activity['unique_users'].fillna(method='ffill').fillna(0) daily_activity['revid'] = daily_activity['revid'].fillna(method='ffill').fillna(0) daily_activity['gini'] = daily_activity['gini'].fillna(method='ffill').fillna(0) #daily_activity['link_count'] = daily_activity['link_count'].fillna(method='ffill').fillna(0) daily_activity['size'] = daily_activity['size'].fillna(method='ffill').fillna(0) daily_activity['diff'] = daily_activity['diff'].fillna(0) daily_activity['latency'] = daily_activity['latency'].fillna(0) #daily_activity = daily_activity.fillna(method='ffill').fillna(0) activity_2014 = daily_activity.ix['2014-1-1':] activity_2014.tail() normalized_unique_users = activity_2014['unique_users'] - activity_2014.ix['2014-1-1','unique_users'] ax = normalized_unique_users.plot(colormap='spectral') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('New unique users since Jan. 
1',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) users_rank_s = normalized_unique_users.ix['2014-12-21'].order(ascending=False) users_rank_s f,ax = plt.subplots(1,1,figsize=(10,6)) _ax = activity_2014['revid'].plot(colormap='spectral',lw=3,ax=ax) #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Revisions',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5],ncol=1) ax.set_title('Revisions over time',fontsize=18) f.tight_layout() f.savefig('revisions.png',dpi=200,bbox_inches='tight') revisions_rank_s = (activity_2014['revid'].cumsum().ix['2014-12-21'] - activity_2014['revid'].cumsum().ix['2014-1-1']).order(ascending=False) revisions_rank_s ax = activity_2014['gini'].plot(colormap='spectral',lw=3) #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Gini coefficient',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) gini_rank_s = activity_2014['gini'].ix['2014-12-21'].order(ascending=False) gini_rank_s ax = (activity_2014['size']/1000.).plot(colormap='spectral',lw=3) #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Article size (kB)',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) size_rank_s = (activity_2014['size']/1000.).ix['2014-12-21'].order(ascending=False) size_rank_s f,(ax1,ax2,ax3,ax4) = plt.subplots(4,1,figsize=(10,10),sharex=True) _ax1 = activity_2014['unique_users'].ix['1-7-2014':].diff().plot(colormap='spectral',lw=2,ax=ax1,legend=None) ax1.set_xlabel('') ax1.set_ylabel('Users') ax1.set_title('New users',fontsize=15) _ax2 = activity_2014['revid'].plot(colormap='spectral',lw=2,ax=ax2,legend=None) ax2.set_xlabel('') ax2.set_ylabel('Revisions') ax2.set_title('Revisions made',fontsize=15) _ax3 = activity_2014['gini'].diff().plot(colormap='spectral',lw=2,ax=ax3,legend=None) ax3.set_xlabel('') ax3.set_ylabel('Gini delta') ax3.set_title('Change in centralization',fontsize=15) _ax4 = (activity_2014['diff']/1000.).diff().plot(colormap='spectral',lw=2,ax=ax4) ax4.set_xlabel('') ax4.set_ylabel('Kilobytes (kB) delta') ax4.set_title('Change in article size',fontsize=15) ax4.set_ylim((-100,100)) #ax4.set_yscale('symlog') _colors = dict(zip(sorted(revs2014_df['title'].unique()),sns.color_palette('spectral', len(revs2014_df['title'].unique())))) handles, labels = _ax4.get_legend_handles_labels() ax4.legend_.remove() new_handles = [Line2D([0], [0], linestyle="none", marker="o", markersize=10, markerfacecolor=_colors[article]) for article in sorted(revs2014_df['title'].unique())] f.legend(new_handles,labels,loc='center left',bbox_to_anchor=[1,.5],fontsize=15) f.tight_layout() f.savefig('article_changes.png',dpi=200,bbox_inches='tight') _table = pd.concat([users_rank_s.round(2),revisions_rank_s.round(2),gini_rank_s.round(2),size_rank_s.round(2)], axis=1,keys=['Users','Revisions','Gini','Length']) _table[['Revisions','Users','Gini','Length']].sort(['Revisions','Users','Gini','Length'],ascending=False) ax = activity_2014['link_count'].plot(colormap='gist_rainbow') #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Links in article',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) activity_2014['link_count'].ix['2014-10-16'].order(ascending=False) links_per_byte = (activity_2014['link_count']/activity_2014['size']) ax = links_per_byte.plot(colormap='gist_rainbow') #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Links per byte',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) 
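# Note for the 'link_count' cells above and the ranking below: they assume revs2014_df has a
# 'link_count' column that is carried into daily_activity, but the groupby .aggregate({...})
# dictionary above (like the corresponding fillna line) leaves link_count commented out.
# A minimal sketch of the extra aggregation that would be needed, assuming the column exists:
# daily_activity = revs2014_df.groupby(['title','date']).aggregate({'unique_users':max,'revid':len,
#     'diff':np.sum,'latency':np.mean,'size':np.mean,'gini':np.mean,'link_count':np.mean})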
links_per_byte.ix['2014-10-16'].order(ascending=True) #ax = daily_activity.ix['2014-1-1':,'latency'].plot(colormap='gist_rainbow') ax = pd.rolling_mean(daily_activity.ix['2013-11-1':,'latency'],28).ix['2014-1-1':].plot(colormap='spectral') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Edit latency (seconds)',fontsize=15) ax.set_yscale('symlog') ax.legend(loc='center left',bbox_to_anchor=[1,.5]) daily_activity.ix['2014-1-1':,'latency'].mean().order(ascending=True) pv_df = ws.make_pageview_df(top_news_articles,'en','2013-12-1','2014-12-31') pv_df.to_csv('pageviews_dec.csv',encoding='utf8') pv_df = pv_df.ix[:'2014-12-21'].sort_index(axis=1) pv_df.tail() pv_df = pd.read_csv('pageviews_Dec.csv',encoding='utf8',index_col=0,parse_dates=[0]) del pv_df['Heartbleed'] pv_2014 = pv_df.ix['1-1-2014':] pv_df.tail() f,ax = plt.subplots(1,1,figsize=(10,5)) _ax = pv_df.ix['2014-1-1':].plot(colormap='spectral',lw=3,ax=ax) ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Pageviews',fontsize=15) ax.legend(loc='center right',bbox_to_anchor=[1.35,.5],fontsize=9,ncol=1) f.tight_layout() f.savefig('pageviews.png',dpi=200,bbox_inches='tight') pv_df.ix['2014-1-1':].sum().sort(ascending=False,inplace=False) ax = (pv_df.ix['2014-1-1':]/pv_df.max(axis=0)).plot(colormap='spectral',lw=3) #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Pageviews',fontsize=15) #ax.set_ylim((10**2,10**7)) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) f,ax = plt.subplots(1,1,figsize=(8,8)) pv_df.sum(axis=0).sort(inplace=False,ascending=True).plot(kind='barh',ax=ax) ax.set_title('Cumulative pageviews',fontsize=18) ax.set_xlabel('Total page views',fontsize=15) ax.set_xscale('symlog') f.tight_layout() f.savefig('cumulative_pageviews.png',dpi=200) pv_melted = pd.melt(pv_df.reset_index(),id_vars=['index']) pv_gb_page = pv_melted.groupby('variable') _idx = pv_melted.groupby('variable')['value'].agg(lambda col: col.idxmax()) pv_max = pv_melted.ix[_idx] pv_max.columns = ['date','article','pageviews'] pv_max = pv_max.set_index('article') pv_max pv_gb_page.groups.keys() f,ax = plt.subplots(1,1,figsize=(8,8)) pv_df.max().sort(inplace=False,ascending=True).plot(kind='barh',ax=ax) ax.set_title('Most Pageviews in a Day',fontsize=18) ax.set_xlabel('Pageviews',fontsize=15) ax.set_xscale('symlog') f.tight_layout() f.savefig('max_pageviews.png',dpi=200) f,ax = plt.subplots(1,1,figsize=(10,5)) _cmap = 'spectral' _topics = ['Malaysia Airlines Flight 17','2014 FIFA World Cup','Islamic State of Iraq and the Levant','Minecraft'] _data = pv_df[_topics].ix['1-1-2014':] #_ax = _data.plot(lw=3,ax=ax,cmap=_cmap) #ax.set_yscale('symlog') #ax.set_ylim((1e2,1e7)) ax.legend(fontsize=12,loc='upper left') _colors = dict(zip(pv_df.columns,sns.color_palette(_cmap, len(pv_df.columns)))) for d in _topics: ax.plot(pv_df.ix['1-1-2014':,d].index,pv_df.ix['1-1-2014':,d].values,c=_colors[d],lw=2,label=d) #ax.fill_between(_data[d].index, _data[d].values, _data['Minecraft'].values, facecolor=_colors[d], alpha=0.33) ax.set_xticklabels(['Jan 2014','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax.legend(loc='center right',bbox_to_anchor=[1.4,.5],fontsize=12) f.tight_layout() f.savefig('pageviews_shapes.png',dpi=200,bbox_inches='tight') f,ax = plt.subplots(1,1,figsize=(8,8)) (pv_df.max()/pv_df.sum()).sort(inplace=False,ascending=True).plot(kind='barh',ax=ax) ax.set_title('Fraction of Total Pageviews in Peak',fontsize=18) ax.set_xlabel('Fraction of Pageviews',fontsize=15) #ax.set_xscale('log') f.tight_layout() 
f.savefig('peak_fraction.png',dpi=200) pv_2014.columns information_conduced_df = pv_2014/(activity_2014['revid']+1) ax = pd.rolling_mean(information_conduced_df,7).ix['2014-1-1':].plot(colormap='spectral') ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Consumption/Production ratio',fontsize=15) #ax.set_ylim((0,20000)) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) _s = pd.melt(information_conduced_df.reset_index(),id_vars='index').replace({np.inf:np.nan,-np.inf:np.nan}).dropna() _s.columns = ['date','article','ratio'] _top = _s.sort('ratio',inplace=False,ascending=False).reset_index(drop=True).ix[:10] # Exclude uninteresting edge cases. Sorry Your Excellence, you're a boring fellow. _bottom = _s[(_s['ratio'] > 0) & (_s['article'] != 'Felipe VI of Spain')] _bottom = _bottom.sort('ratio',inplace=False,ascending=True).reset_index(drop=True).ix[:10] pd.concat([_top,_bottom],keys=['Greater consumption per production','Lesser consumption per production'],axis=1) _p = pd.melt(activity_2014['revid'].reset_index(),id_vars='date') _c = pd.melt(pv_2014.reset_index(),id_vars='index') _j = pd.merge(_p,_c,left_on=['date','title'],right_on=['index','variable'],copy=False) _j = _j[['date','title','value_x','value_y']] _j.columns = ['date','article','production','consumption'] _j_gb = _j.groupby('article') f,ax = plt.subplots(1,1,figsize=(10,10)) _colors = dict(zip(sorted(_j_gb.groups.keys()),sns.color_palette('spectral', len(_j_gb.groups.keys())))) for article in sorted(_j_gb.groups.keys()): _data = _j_gb.get_group(article)[['production','consumption']] _data['production_z'] = (_data['production'] - _data['production'].mean())/_data['production'].std() _data['consumption_z'] = (_data['consumption'] - _data['consumption'].mean())/_data['consumption'].std() _data['ratio'] = np.abs(_data['consumption_z'] + _data['production_z']) #_data = _data[(_data['production'] > 0) & (_data['consumption'] > 0)] sns.regplot(_data['production_z'],_data['consumption_z'], ci=None,color=_colors[article],label=article,ax=ax,lowess=True, scatter_kws={'s':50*np.abs(_data['ratio']),'alpha':.33},line_kws={'lw':5}) #ax.scatter(_data['production'].values,_data['consumption'].values,c=_colors[article],s=50,lw=0,alpha=.5,label=article) #ax.scatter(_data['production'].values,_data['consumption'].values,c=_colors[article],s=250*np.log(_data['ratio']),lw=0,alpha=.5,label=article) ax.set_xlabel('Revisions (Z-score)',fontsize=15) ax.set_ylabel('Pageviews (Z-score)',fontsize=15) ax.set_yscale('symlog') ax.set_ylim((-1,20)) ax.set_xscale('symlog') ax.set_xlim((-1,20)) ax.plot((-1,20),(-1,20),'--',lw=3,c='k') handles, labels = ax.get_legend_handles_labels() new_handles = [Line2D([0], [0], linestyle="none", marker="o", markersize=10, markerfacecolor=_colors[article]) for article in sorted(_j_gb.groups.keys())] ax.legend(new_handles,labels,loc='center left',bbox_to_anchor=[1,.5],fontsize=12) f.savefig('pv_vs_revision.png',dpi=200,bbox_inches='tight') handles[0] agg_function = {'revid':{'weight':len}, 'timestamp':{'ts_min':np.min,'ts_max':np.max}, 'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max,'total_changes':np.sum}, 'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max}, 'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max}, #'link_count':{'link_count_min':np.min,'link_count_median':np.median,'link_count_max':np.max} } revs_gb_edge = revs.groupby(['title','user']) revs_edgelist = revs_gb_edge.agg(agg_function) revs_edgelist.columns = 
revs_edgelist.columns.droplevel(0) # Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded revs_edgelist['ts_min'] = (revs_edgelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_edgelist['ts_max'] = (revs_edgelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_edgelist.head() revs_edgelist.ix[[i for i in revs_edgelist.index if i[0] == i[1]]] revs_gb_page = revs.groupby('title') revs_pagenodelist = revs_gb_page.agg(agg_function) revs_pagenodelist.columns = revs_pagenodelist.columns.droplevel(0) # Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded revs_pagenodelist['ts_min'] = (revs_pagenodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_pagenodelist['ts_max'] = (revs_pagenodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_pagenodelist['article'] = [1]*len(revs_pagenodelist) revs_pagenodelist.head() revs_gb_user = revs.groupby('user') revs_usernodelist = revs_gb_user.agg(agg_function) revs_usernodelist.columns = revs_usernodelist.columns.droplevel(0) # Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded revs_usernodelist['ts_min'] = (revs_usernodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_usernodelist['ts_max'] = (revs_usernodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_usernodelist['article'] = [0]*len(revs_usernodelist) revs_usernodelist.head() coauthorship_g = nx.DiGraph() # Add the edges and edge attributes for (article,editor) in iter(revs_edgelist.index.values): edge_attributes = {k:float(v) for k,v in dict(revs_edgelist.ix[(article,editor)]).items()} if article != editor: coauthorship_g.add_edge(article,editor,edge_attributes) # Add the user nodes and attributes for node in iter(revs_usernodelist.index): node_attributes = {k:float(v) for k,v in dict(revs_usernodelist.ix[node]).items()} coauthorship_g.add_node(node,node_attributes,type='user') # Add the page nodes and attributes for node in iter(revs_pagenodelist.index): node_attributes = {k:float(v) for k,v in dict(revs_pagenodelist.ix[node]).items()} coauthorship_g.add_node(node,node_attributes,type='page') print "There are {0} nodes and {1} edges in the network.".format(coauthorship_g.number_of_nodes(),coauthorship_g.number_of_edges()) coauthorship_g.edges(data=True)[:1] nx.write_gexf(coauthorship_g,'coauthorship_g.gexf') edges_wt1 = [(i,j) for (i,j,k) in coauthorship_g.edges_iter(data=True) if k['weight'] == 1] coauthorship_g_gt1 = coauthorship_g.copy() coauthorship_g_gt1.remove_edges_from(edges_wt1) isolates = nx.isolates(coauthorship_g_gt1) coauthorship_g_gt1.remove_nodes_from(isolates) nx.write_gexf(coauthorship_g_gt1,'coauthorship_g_gt1.gexf') print "There are {0} nodes and {1} edges in the network.".format(coauthorship_g_gt1.number_of_nodes(),coauthorship_g_gt1.number_of_edges()) Image('coauthorship_g_gt1.png') revs_edgelist.sort('weight',inplace=False,ascending=False)['weight'].reset_index().head(10) revs_edgelist[(revs_edgelist['weight'] > 10)]['total_changes'].abs().sort(inplace=False,ascending=False).head(10) _n = len(coauthorship_g_gt1) - 1 idc = {k:int(v*_n) for k,v in nx.in_degree_centrality(coauthorship_g_gt1).iteritems()} odc = {k:int(v*_n) for k,v in nx.out_degree_centrality(coauthorship_g_gt1).iteritems()} pd.Series(idc).sort(inplace=False,ascending=False).ix[:20] bp_g_gt1 = coauthorship_g_gt1.to_undirected() pages = list(revs_pagenodelist.index) users = 
list(set(coauthorship_g_gt1.nodes()) - set(pages)) clustering = nx.bipartite.clustering(bp_g_gt1,pages) pd.Series(clustering).sort(inplace=False,ascending=False) revs2014_gb_article = revs2014_df.groupby('title') aftermath_df_list = list() for _article in pv_max.index: _df = revs2014_gb_article.get_group(_article) _before = pv_max.ix[_article,'date'] - np.timedelta64(1,'D') _after = pv_max.ix[_article,'date'] + np.timedelta64(2,'D') _aftermath = _df[(_df['timestamp'] > _before) & (_df['timestamp'] < _after)] aftermath_df_list.append(_aftermath) aftermath_revs = pd.concat(aftermath_df_list) aftermath_revs_gb_edge = aftermath_revs.groupby(['title','user']) aftermath_revs_edgelist = aftermath_revs_gb_edge.agg(agg_function) aftermath_revs_edgelist.columns = aftermath_revs_edgelist.columns.droplevel(0) aftermath_revs_edgelist['ts_min'] = (aftermath_revs_edgelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_edgelist['ts_max'] = (aftermath_revs_edgelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_gb_page = aftermath_revs.groupby('title') aftermath_revs_pagenodelist = aftermath_revs_gb_page.agg(agg_function) aftermath_revs_pagenodelist.columns = aftermath_revs_pagenodelist.columns.droplevel(0) aftermath_revs_pagenodelist['ts_min'] = (aftermath_revs_pagenodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_pagenodelist['ts_max'] = (aftermath_revs_pagenodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_pagenodelist['article'] = [1]*len(aftermath_revs_pagenodelist) aftermath_revs_gb_user = aftermath_revs.groupby('user') aftermath_revs_usernodelist = aftermath_revs_gb_user.agg(agg_function) aftermath_revs_usernodelist.columns = aftermath_revs_usernodelist.columns.droplevel(0) aftermath_revs_usernodelist['ts_min'] = (aftermath_revs_usernodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_usernodelist['ts_max'] = (aftermath_revs_usernodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_usernodelist['article'] = [0]*len(aftermath_revs_usernodelist) aftermath_coauthorship_g = nx.DiGraph() # Add the edges and edge attributes for (article,editor) in iter(aftermath_revs_edgelist.index.values): edge_attributes = {k:float(v) for k,v in dict(aftermath_revs_edgelist.ix[(article,editor)]).items()} if article != editor: aftermath_coauthorship_g.add_edge(article,editor,edge_attributes) # Add the user nodes and attributes for node in iter(aftermath_revs_usernodelist.index): node_attributes = {k:float(v) for k,v in dict(aftermath_revs_usernodelist.ix[node]).items()} aftermath_coauthorship_g.add_node(node,node_attributes,type='user') # Add the page nodes and attributes for node in iter(aftermath_revs_pagenodelist.index): node_attributes = {k:float(v) for k,v in dict(aftermath_revs_pagenodelist.ix[node]).items()} aftermath_coauthorship_g.add_node(node,node_attributes,type='page') print "There are {0} nodes and {1} edges in the network.".format(aftermath_coauthorship_g.number_of_nodes(),aftermath_coauthorship_g.number_of_edges()) nx.write_gexf(aftermath_coauthorship_g,'aftermath_coauthorship_g.gexf') _df = revs2014_gb_article.get_group('Gamergate controversy') _df[(_df['date'] < pd.datetime(2014,10,25)) & (_df['date'] > pd.datetime(2014,10,22))] _pages1 = revs2014_gb_article.groups.keys() _pages2 = [_n for _n,_d in aftermath_coauthorship_g.nodes_iter(data=True) if 'page' in _d.values()] _pages3 = list(aftermath_revs['title'].unique()) 
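# The cells below compare how much the same editors work on pairs of articles using a
# directed overlap fraction: the share of article A's coauthors who also edited article B,
# i.e. |N(A) & N(B)| / |N(A)| over the neighbor sets in the coauthorship graphs (the measure
# is asymmetric because it normalizes by A's editors). A minimal sketch on a toy graph with
# hypothetical names, not part of the analysis data:
toy_g = nx.Graph()
toy_g.add_edges_from([('Article A','editor1'),('Article A','editor2'),('Article A','editor3'),
                      ('Article B','editor2'),('Article B','editor3'),('Article B','editor4')])
_shared = set(toy_g.neighbors('Article A')) & set(toy_g.neighbors('Article B'))
# 2 of A's 3 editors also edited B, so the overlap from A's perspective is 2/3
print len(_shared)/float(len(set(toy_g.neighbors('Article A'))))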
set(_pages1) - set(_pages3)

Image('aftermath_coauthorship_g.png')

_article_overlaps = dict()
_aftermath_overlaps = dict()
for _article1 in pages:
    _article_overlaps[_article1] = dict()
    _aftermath_overlaps[_article1] = dict()
    for _article2 in pages:
        if _article1 != _article2:
            try:
                _article_overlaps[_article1][_article2] = len(set(coauthorship_g.neighbors(_article1)) & set(coauthorship_g.neighbors(_article2)))/float(len(set(coauthorship_g.neighbors(_article1))))
            except nx.NetworkXError:
                _article_overlaps[_article1][_article2] = np.nan
            try:
                # Some articles have no editing activity in the aftermath window
                _aftermath_overlaps[_article1][_article2] = len(set(aftermath_coauthorship_g.neighbors(_article1)) & set(aftermath_coauthorship_g.neighbors(_article2)))/float(len(set(aftermath_coauthorship_g.neighbors(_article1))))
            except nx.NetworkXError:
                _aftermath_overlaps[_article1][_article2] = np.nan

_article_overlaps_df = pd.DataFrame(_article_overlaps)
_order = _article_overlaps_df.mean(axis=1).sort(inplace=False,ascending=True).index
_article_overlaps_df = _article_overlaps_df[_order].ix[_order]
_x1,_y1 = _article_overlaps_df.shape
_aftermath_overlaps_df = pd.DataFrame(_aftermath_overlaps)
_aftermath_overlaps_df = _aftermath_overlaps_df[_order].ix[_order]
_x2,_y2 = _aftermath_overlaps_df.shape

f,(ax1,ax2) = plt.subplots(1,2,figsize=(12,8),sharey=True)
_ax1 = ax1.pcolor(_article_overlaps_df.values,cmap='rainbow',vmin=0,vmax=.25)
ax1.set_frame_on(False)
ax1.set_xticks(np.arange(0.5,_x1+.5),minor=False)
ax1.set_yticks(np.arange(_y1)+.5,minor=False)
ax1.invert_yaxis()
ax1.set_xticklabels(_article_overlaps_df.columns,minor=False,fontsize=12,rotation=90)
ax1.set_yticklabels(_article_overlaps_df.index,minor=False,fontsize=12)
ax1.tick_params(axis='x',direction='in',pad=3)
ax1.set_title('Complete coauthorship',fontsize=15)
_ax2 = ax2.pcolor(_aftermath_overlaps_df.values,cmap='rainbow',vmin=0,vmax=.25)
ax2.set_frame_on(False)
ax2.set_xticks(np.arange(0.5,_x2+.5),minor=False)
#ax2.set_yticks(np.arange(_y2)+.5,minor=False)
ax2.invert_yaxis()
ax2.set_xticklabels(_article_overlaps_df.columns,minor=False,fontsize=12,rotation=90)
#ax2.set_yticklabels(_article_overlaps_df.index,minor=False,fontsize=12)
ax2.tick_params(axis='x',direction='in',pad=3)
ax2.set_title('Aftermath coauthorship',fontsize=15)
#ax.set_xlabel('Article rank',fontsize=15)
f.subplots_adjust(right=0.8)
cbar_ax = f.add_axes([1, 0.25, 0.05, 0.7])
f.colorbar(_ax1, cax=cbar_ax,label='Editor overlap')
f.tight_layout()
f.savefig('editor_overlap.png',dpi=200,bbox_inches='tight')

_idx = rev_df.groupby(['title','date']).agg({'revid':lambda x:x.idxmax()})
max_daily_revid = rev_df[['title','date','revid']].ix[_idx['revid']]
max_daily_revid = pd.pivot_table(data=max_daily_revid,columns='title',index='date',values='revid')
max_daily_revid.fillna(method='ffill',inplace=True)
max_daily_revid = max_daily_revid.ix[pd.date_range(start='1-1-2014',end='12-22-2014')]

# It turns out a bunch of revisions were deleted on the ISIS article
max_daily_revid.ix[pd.to_datetime('2014-09-03').date(),'Islamic State of Iraq and the Levant'] = np.nan #624150417

len([_revid for _article in max_daily_revid.columns for _revid in max_daily_revid[_article].ix[pd.datetime(2014,1,1).date():].dropna().unique()])

#parsed_revid_data = dict()
# Only re-parsing a single article here; widen the slice to cover all columns
for _article in max_daily_revid.columns[14:15]:
    print _article
    parsed_revid_data[_article] = dict()
    _unique_revids = max_daily_revid[_article].ix[pd.datetime(2014,1,1).date():].dropna().unique()
    for _revid in _unique_revids:
        try:
            parsed_revid_data[_article][_revid] = ws.wikipedia_query({'action':'parse', 'oldid': _revid, 'redirects': True, 'prop': 'revid|langlinks|categories|externallinks|iwlinks|templates|images'},'en')
        except:
            print "Revision {0} has an error".format(str(_revid))
            parsed_revid_data[_article][_revid] = np.nan
            pass

simlified_parsed_revid_data = {_rev:_payload for _article,_revs in parsed_revid_data.items() for _rev,_payload in _revs.items()}

with open('parsed_revid_data.json','wb') as f:
    json.dump(parsed_revid_data,f)

with open('simlified_parsed_revid_data.json','wb') as f:
    json.dump(simlified_parsed_revid_data,f)

parsed_revid_data['Islamic State of Iraq and the Levant'][624083851]

with open('parsed_revid_data.json','rb') as f:
    parsed_revid_data = json.load(f)

_final_revs = dict(max_daily_revid.ix[pd.to_datetime('2014-12-21').date()])
urls = list()
for _article, _rev in _final_revs.items():
    if 'externallinks' in parsed_revid_data[_article][_rev].keys():
        for _url in parsed_revid_data[_article][_rev]['externallinks']:
            urls.append(urlparse.urlparse(_url)[1])

_s = pd.Series(Counter(urls)).sort(ascending=False,inplace=False)
_s.head(10)

westerners = ['bbc','guardian','ft','telegraph','independent','nytimes','reuters','washingtonpost','cnn','wsj','abc','nbc','cbs','yahoo','bloomberg']

def western_link_fraction(_revid,_revdict):
    try:
        _urls = _revdict[_revid]['externallinks']
        _domains = [urlparse.urlparse(_url)[1] for _url in _urls]
        _western = [any(_w in _d for _w in westerners) for _d in _domains]
        if len(_western) > 0:
            return float(sum(_western))/len(_western)
        else:
            return 0
    except KeyError:
        return np.nan

western_link_df = pd.DataFrame(index=pd.date_range('1-1-2014','12-21-2014'))
for article in max_daily_revid.columns:
    western_link_df[article] = max_daily_revid[article].apply(lambda x:western_link_fraction(x,simlified_parsed_revid_data))

f,ax = plt.subplots(1,1,figsize=(10,6))
_ax = western_link_df.plot(colormap='spectral',ax=ax)
_ax.legend(loc='center left',bbox_to_anchor=[1,.5])
f.tight_layout()
f.savefig('western_links.png',dpi=200,bbox_inches='tight')

def chunk_maker(a_list,size):
    chunk_num = len(a_list)/size
    chunks = list()
    for c in range(chunk_num + 1):
        start = c * (size + 1)
        end = (c + 1) * (size + 1)
        elements = list(itertools.islice(a_list,start,end))
        if len(elements) > 0:
            chunks.append(elements)
    return chunks

# http://stackoverflow.com/a/319291/1574687
def valid_ip(address):
    try:
        parts = address.split(".")
        if len(parts) != 4:
            return False
        for item in parts:
            # Each octet must be at most three digits and in the range 0-255
            if len(item) > 3 or not 0 <= int(item) <= 255:
                return False
        return True
    except ValueError:
        return False

# bp_g_gt1_usernodelist is constructed below from revs_usernodelist and the gt1 user list
non_ipv4_users = list(bp_g_gt1_usernodelist[~bp_g_gt1_usernodelist.index.map(valid_ip)].index)
chunks = chunk_maker(non_ipv4_users,50)

user_properties = list()
for chunk in chunks:
    user_string = u'|'.join(chunk)
    props = ws.get_user_properties(user_string,'en')
    for prop in props['users']:
        user_properties.append(prop)

with open('user_properties.json','wb') as f:
    json.dump(user_properties,f)

with open('user_properties.json','rb') as f:
    user_properties2 = json.load(f)

user_props_df = pd.DataFrame(user_properties2).set_index('name')
user_props_df = user_props_df[user_props_df['userid'].notnull()]
user_props_df['registration'] = pd.to_datetime(user_props_df['registration'],format='%Y-%m-%dT%H:%M:%SZ')
user_props_df['blockedtimestamp'] = pd.to_datetime(user_props_df['blockedtimestamp'],format='%Y-%m-%dT%H:%M:%SZ')
user_props_df['account_age'] = (pd.datetime.today().date() - user_props_df['registration'])/np.timedelta64(1,'D')
user_props_df['blocked'] = user_props_df['blockexpiry'].notnull() user_props_df['blocked_account_age'] = (user_props_df['blockedtimestamp'] - user_props_df['registration'])/np.timedelta64(1,'D') user_props_df['editcount'] = user_props_df['editcount'].map(float) user_props_df['permissions'] = user_props_df['groups'].apply(len) - 2 user_props_df.drop(['invalid','blockedbyid','blockid','userid','blockedby','blockexpiry','blockreason'],inplace=True,axis=1) user_props_df.head() gender_count = user_props_df.groupby('gender').agg({'editcount':len}) print gender_count sns.barplot(gender_count.index,gender_count.values,palette='muted') plt.yscale('log') plt.ylabel('Number of users',fontsize=15) plt.xlabel('') sns.boxplot(user_props_df['editcount'],groupby=user_props_df['gender'],color='muted') plt.yscale('log') plt.ylabel('Total revisions',fontsize=15) plt.xlabel('') female_editcounts = user_props_df[user_props_df['gender'] == 'female']['editcount'].values male_editcounts = user_props_df[user_props_df['gender'] == 'male']['editcount'].values stats.mannwhitneyu(female_editcounts,male_editcounts) sns.boxplot(user_props_df['account_age'],groupby=user_props_df['gender'],color='muted') plt.yscale('log') plt.ylabel('Account age',fontsize=15) plt.xlabel('') female_account_ages = user_props_df[user_props_df['gender'] == 'female']['account_age'].values male_account_ages = user_props_df[user_props_df['gender'] == 'male']['account_age'].values stats.mannwhitneyu(female_account_ages,male_account_ages) blocked = user_props_df[user_props_df['blocked']] print blocked.groupby('gender').agg({'editcount':len}) print '\n' print blocked.groupby('gender').agg({'editcount':len})/gender_count bp_g_gt1_usernodelist.head() bp_g_gt1_usernodelist = revs_usernodelist[revs_usernodelist.index.isin(users)] bp_g_gt1_usernodelist = bp_g_gt1_usernodelist.join(user_props_df,how='left') bp_g_gt1_usernodelist['degree'] = pd.Series({k:v for k,v in idc.iteritems() if k in bp_g_gt1_usernodelist.index}) bp_g_gt1_edgelist = revs_edgelist[revs_edgelist.index.isin(coauthorship_g_gt1.edges())] bp_g_gt1_edgelist['article_degree'] = pd.Series({i:odc[i[0]] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['editor_degree'] = pd.Series({i:idc[i[1]] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['article_age'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'ts_min'] - revs_pagenodelist.ix[i[0],'ts_min'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['editor_age'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'account_age'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['gender'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'gender'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['permissions'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'permissions'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['editcount'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'editcount'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['persistence'] = bp_g_gt1_edgelist['revision_max'] - bp_g_gt1_edgelist['revision_min'] bp_g_gt1_edgelist['revision_min_frac'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'revision_min']/float(revs_pagenodelist.ix[i[0],'revision_max']) for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['revision_max_frac'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'revision_max']/float(revs_pagenodelist.ix[i[0],'revision_max']) for i in iter(bp_g_gt1_edgelist.index.values)}) ax = 
sns.boxplot(bp_g_gt1_usernodelist['weight']/bp_g_gt1_usernodelist['degree'],groupby=bp_g_gt1_usernodelist['degree'],color='coolwarm') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Revisions made',fontsize=15) ax = sns.boxplot(bp_g_gt1_usernodelist['weight']/bp_g_gt1_usernodelist['degree'],groupby=bp_g_gt1_usernodelist['gender'],color='muted') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Revisions made',fontsize=15) ax = sns.boxplot(bp_g_gt1_usernodelist['latency_median'],groupby=bp_g_gt1_usernodelist['degree'],color='coolwarm') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Latency (s)',fontsize=15) ax = sns.boxplot(bp_g_gt1_usernodelist['latency_median'],groupby=bp_g_gt1_usernodelist['gender'],color='muted') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Latency (s)',fontsize=15) bp_g_gt1_edgelist.head() plt.scatter(bp_g_gt1_edgelist['weight'],bp_g_gt1_edgelist['persistence'],alpha=.5) plt.plot((0,10**4),(0,10**4),color='k',linestyle='--',linewidth=2) plt.xscale('symlog') plt.yscale('symlog') plt.xlim((0,10**4)) plt.ylim((0,10**4)) plt.xlabel('Revisions made',fontsize=15) plt.ylabel('Persistence (days)',fontsize=15) plt.scatter(bp_g_gt1_edgelist['weight']*np.random.uniform(.9,1.1,size=len(bp_g_gt1_edgelist)),bp_g_gt1_edgelist['revision_min_frac'],alpha=.5) plt.xscale('symlog') plt.xlim((2,10**4)) plt.ylim((0,1)) plt.xlabel('Revisions made',fontsize=15) plt.ylabel('Latency',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['weight'],groupby=bp_g_gt1_edgelist['gender'],color='muted') ax.set_yscale('symlog') ax.set_xlabel('',fontsize=15) ax.set_ylabel('Revisions',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['weight'],groupby=bp_g_gt1_edgelist['permissions'],color='gist_rainbow') ax.set_yscale('symlog') ax.set_xlabel('Permissions',fontsize=15) ax.set_ylabel('Revisions',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Editor age (days)',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['article_age'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Time since first edit (days)',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['gender'],color='muted') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('First edit to article',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['permissions'],color='pastel') #ax.set_yscale('symlog') ax.set_xlabel('Permissions',fontsize=15) ax.set_ylabel('Editor age',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm') #ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('First edit to article',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['gender'],color='muted') #ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('First edit to article',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['permissions'],color='pastel') #ax.set_yscale('symlog') ax.set_xlabel('Permissions',fontsize=15) ax.set_ylabel('First edit to article',fontsize=15)
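# The boxplots above compare where in an article's history editors of different genders and
# permission levels make their first edit. A quick nonparametric check in the same style as
# the earlier editcount and account-age comparisons (a sketch using the columns built above;
# keep in mind that gender is self-reported in user preferences and defaults to 'unknown',
# so the male/female groups only include editors who disclosed it):
female_first_edits = bp_g_gt1_edgelist[bp_g_gt1_edgelist['gender'] == 'female']['revision_min_frac'].dropna().values
male_first_edits = bp_g_gt1_edgelist[bp_g_gt1_edgelist['gender'] == 'male']['revision_min_frac'].dropna().values
stats.mannwhitneyu(female_first_edits,male_first_edits)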