%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, requests, re, itertools, urllib2, urlparse
import wikipedia_scraping as ws
import seaborn as sns
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup, element
from collections import Counter
from IPython.display import Image
from operator import itemgetter
from scipy import stats
from matplotlib.lines import Line2D

_start = pd.datetime(2001,1,1)
_end = pd.datetime(2015,1,1)
_filedir = u'C:/Users/bkeegan/Dropbox/Workspace/Wikipedia news events/2014 News/'

# http://yearinreview.fb.com/
facebook = ['World Cup','Ebola Outbreak','Elections in Brazil','Robin Williams','Ice Bucket Challenge','Conflict in Gaza','Malaysia Airlines disasters','Super Bowl','Ferguson','Sochi']

# http://www.google.com/trends/topcharts?hl=en#date=2014
google = ['Robin Williams','World Cup','Ebola','Malaysia Airlines','Flappy Bird','ALS Ice Bucket Challenge','ISIS','Ferguson','Frozen','Ukraine']

# https://2014.twitter.com/moments
twitter = ['Philip Seymour Hoffman','State of the Union','Carnaval','Malaysia Airlines','Bring Back Our Girls','India Election','Spanish Abdication','Maya Angelou','Ferguson','Robin Williams','Ice Bucket Challenge','Scottish referendum','Ebola','He for She','Hong Kong protests','Mars Orbiter','Malala Yousafzai','US elections','Berlin Wall','Philae']

# Editorial judgment, https://en.wikipedia.org/wiki/2014
wikipedia1 = ['2014 Winter Olympics','Ebola virus epidemic in West Africa','2014 Crimean crisis','Malaysia Airlines Flight 370','Chibok schoolgirl kidnapping','Sinking of the MV Sewol','Islamic State in Iraq and the Levant','2014 FIFA World Cup','Felipe VI','2014 Israel–Gaza conflict','Malaysia Airlines Flight 17','Rosetta spacecraft','Cuba-United States relations']

# Number of contributors, http://stats.wikimedia.org/EN/TablesWikipediaEN.htm#zeitgeist
# Excluding repeats like "Deaths in 2014"
wikipedia2 = ['2013–14 North American cold wave',]

def top_articles(lang):
    # Read the HTML from the web and convert to soup
    # The report directory in the URL is always EN; only the file name varies by language
    soup = BeautifulSoup(urllib2.urlopen('http://stats.wikimedia.org/EN/TablesWikipedia{0}.htm'.format(lang.upper())).read())
    # Look for all the paragraphs with 2014
    _p = soup.findAll('b',text=re.compile('2014'))
    # Select only those paragraph parents that have exactly 152 fields, corresponding to the top-25 lists
    _p2014 = [t.parent for t in _p if len(t.parent) == 152]
    # Get the text out of the children tags as a list of lists
    parsed = [[t.text for t in list(p.children) if type(t) != element.NavigableString] for p in _p2014]
    # Convert to a dictionary keyed by month abbreviation with values as the list of text fields
    parsed = {month[0].split(u'\xa0')[0]:month[1:] for month in parsed}
    # Do some crazy dictionary and list comprehensions with zips to convert the values in the list
    parsed = {k:[{'rank':int(a),'editors':int(b),'article':c} for a,b,c in zip(v[0::3],v[1::3],v[2::3])] for k,v in parsed.items()}
    # Convert each month into a DataFrame with month information in the index
    # and then concat all the dfs together, sorting on those with the most editors
    ranked = pd.concat([pd.DataFrame(parsed[i],index=[i]*len(parsed[i])) for i in parsed.keys()]).sort('editors',ascending=False).reset_index()
    # Rename the reset index to something meaningful
    ranked.rename(columns={'index':'month'},inplace=True)
    # Group the articles by name, compute aggregate statistics
    # Rank on the total number of editors and months in the top 25
    top_articles = ranked.groupby('article').agg({'month':len,'editors':np.sum,'rank':np.min})
    top_articles['editor-month'] = top_articles['month'] * top_articles['editors']
    top_articles.sort(['editor-month'],ascending=False,inplace=True)
    return top_articles

country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
                 'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portuguese','nl':'Dutch','tr':'Turkish',
                 'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
                 'uk':'Ukrainian'}

top_articles_by_country = {}
for country in country_codes.keys():
    try:
        top_articles_by_country[country] = top_articles(country)
    except urllib2.HTTPError:
        print "The '{0}' language does not have a stats page".format(country)
        pass

for _country,_df in top_articles_by_country.items():
    _df.to_csv(_filedir + '/Data/{0}.csv'.format(_country),encoding='utf8')

def langlink_translater(source_lang,target_lang,article_titles):
    chunks = ws.chunk_maker(article_titles,40)
    translation_dict = dict()
    for chunk in chunks:
        result = ws.wikipedia_query({'action':'query', 'prop': 'langlinks', 'lllang': source_lang, 'titles': '|'.join(chunk), 'lllimit': '500'},target_lang)
        if result and 'pages' in result.keys():
            translation_dict.update({_d['title'] : _d['langlinks'][0]['*'] for _d in result['pages'].values() if 'langlinks' in _d.keys()})
    return translation_dict

# This step takes a few minutes
translater_dict = {source_lang:{target_lang:langlink_translater(source_lang,target_lang,df.index) for target_lang,df in top_articles_by_country.items()} for source_lang in top_articles_by_country.keys()}

# Save the file
with open('translater_dict.json','wb') as f:
    json.dump(translater_dict,f)

_filedir

country_codes = {'en':'English','ru':'Russian','es':'Spanish','de':'German','ja':'Japanese','fr':'French',
                 'zh':'Chinese','it':'Italian','pl':'Polish','pt':'Portuguese','nl':'Dutch','tr':'Turkish',
                 'ar':'Arabic','sv':'Swedish','id':'Indonesian','ko':'Korean','cs':'Czech','fa':'Farsi',
                 'uk':'Ukrainian'}

top_articles_by_country = dict()
for country in country_codes.keys():
    top_articles_by_country[country] = pd.read_csv(_filedir + '/Data/{0}.csv'.format(country),encoding='utf8',index_col=0)

with open('translater_dict.json','rb') as f:
    translater_dict = json.load(f)

lang_link_exists_dict = dict()
top_articles_df = pd.DataFrame()
for source_lang,target_dictionary in translater_dict.iteritems():
    langlink_exists_df = pd.DataFrame()
    for target_lang,d in target_dictionary.iteritems():
        top_articles_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index)
        langlink_exists_df[target_lang] = pd.Series(top_articles_by_country[target_lang].index).isin(translater_dict[source_lang][target_lang].keys())
        if source_lang == target_lang:
            langlink_exists_df[target_lang] = [1]*len(langlink_exists_df[target_lang])
    langlink_exists_df = langlink_exists_df.reindex_axis(sorted(langlink_exists_df.columns), axis=1)
    lang_link_exists_dict[source_lang] = langlink_exists_df

_df = top_articles_df.ix[:2].T
_df.index = [country_codes[i] for i in _df.index]
_df.columns = range(1,4)
_df.sort()

_df.ix['id'].sum(axis=1)

_lang = 'en'
f, ax = plt.subplots(figsize=(10,5))
_df = lang_link_exists_dict[_lang].ix[:100].T.astype(float)
_df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index]
_y,_x = _df.shape
_ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=1)
ax.set_frame_on(False)
ax.set_xticks(np.arange(0.5,_x+.5,10),minor=False) ax.set_yticks(np.arange(_y)+.5,minor=False) ax.invert_yaxis() ax.set_xticklabels(_df.columns[::10],minor=False,fontsize=12) ax.set_yticklabels([country_codes[x] for x in _df.index],minor=False,fontsize=12) ax.tick_params(axis='x',direction='in',pad=-10) ax.set_xlabel('Article rank',fontsize=15) #f.subplots_adjust(right=0.8) #cbar_ax = f.add_axes([.95, 0.15, 0.025, .75]) #f.colorbar(_ax, cax=cbar_ax) f.tight_layout(); f.savefig('en_lang_link_exists.png',dpi=150) sum_lang_link = pd.DataFrame(np.zeros(lang_link_exists_dict['en'].shape),columns=lang_link_exists_dict['en'].columns) for lang,_df in lang_link_exists_dict.iteritems(): sum_lang_link = sum_lang_link + _df.values.astype(float) #frac_sum_lang_link = sum_lang_link.apply(lambda x:x/19) sum_lang_link.columns = [country_codes[i] for i in sum_lang_link.columns] f, ax = plt.subplots(figsize=(10,5)) _df = sum_lang_link.ix[:100].T.astype(float) _df = _df.ix[_df.sum(axis=1).sort(inplace=False,ascending=False).index] _y,_x = _df.shape _ax = ax.pcolor(_df,cmap='rainbow',vmin=0,vmax=19) ax.set_frame_on(False) ax.set_xticks(np.arange(0,_x,10),minor=False) ax.set_xticklabels(np.arange(0,_x,10),fontsize=12) ax.set_xlabel('Article Rank',fontsize=15) ax.set_title('Number of Languages with Article on Topic',fontsize=20) ax.tick_params(axis='x',direction='in',pad=-10) ax.set_yticks(np.arange(_y)+.5,minor=False) ax.set_yticklabels(_df.index,minor=False) ax.invert_yaxis() #f.subplots_adjust(right=0.8) cbar_ax = f.add_axes([.875, 0.15, 0.025, .75]) f.colorbar(_ax, cax=cbar_ax) f.tight_layout() f.savefig('sum_lang_link.png',dpi=200) _s = sum_lang_link.ix[:100].apply(np.average,axis=1) ax = plt.scatter(_s.index,_s.values,s=50,cmap='rainbow') ax.axes.set_title('Coverage for Top 100 Stories',fontsize=20) ax.axes.set_xlabel('Article Rank',fontsize=16) ax.axes.set_ylabel('Number of Languages Covered',fontsize=16) ax.axes.set_xlim((-1,101)) plt.tight_layout() plt.savefig('top100_coverage.png',dpi=200) article_language_graph = nx.DiGraph() article_language_mapper = dict() # This will be helpful later for source_lang,d in translater_dict.iteritems(): for target_lang,mapping in d.iteritems(): for target_lang_article,source_lang_article in mapping.iteritems(): article_language_graph.add_edge(target_lang_article,source_lang_article) article_language_graph.add_node(source_lang_article,lang=source_lang) article_language_graph.add_node(target_lang_article,lang=target_lang) # Populate the article_language_mapper if source_lang_article in article_language_mapper.keys(): article_language_mapper[source_lang_article].append(source_lang) else: article_language_mapper[source_lang_article] = [source_lang] if target_lang_article in article_language_mapper.keys(): article_language_mapper[target_lang_article].append(target_lang) else: article_language_mapper[target_lang_article] = [target_lang] nx.write_gexf(article_language_graph,'article_language_graph.gexf') article_language_mapper = {k:list(set(v)) for k,v in article_language_mapper.iteritems()} with open('article_language_mapper.json','wb') as f: json.dump(article_language_mapper,f) Image('article_language_links.png') topic_subgraphs = list(nx.components.connected_component_subgraphs(article_language_graph.to_undirected())) subgraph_properties = [{'edges':_subgraph.number_of_edges(),'nodes':_subgraph.number_of_nodes(),'density':nx.density(_subgraph)} for _subgraph in topic_subgraphs] # Uncomment to see what's in these subgraphs #for _subgraph in topic_subgraphs: # if 
_subgraph.number_of_nodes() > 19: # print _subgraph.nodes() subgraph_df = pd.DataFrame(subgraph_properties) subgraph_df = subgraph_df[subgraph_df['nodes'] > 2] f,ax = plt.subplots(1,1) _ax = subgraph_df.plot(x='nodes',y='edges',kind='scatter',label='Observed Topic',ax=ax) ax.plot([i*(i-1) for i in range(20)],label='Ideal Topic',lw=3,c='r',alpha=.5) ax.axvline(x=19.5,ls='--',lw=3,c='g',alpha=.5,label='Max Topics') ax.set_xlim((0,40)) ax.set_ylim((-1,400)) ax.legend(fontsize=12) ax.set_xlabel('Number of Nodes in Topic',fontsize=18) ax.set_ylabel('Number of Edges in Topic',fontsize=18) ax.set_title('Diagnosing Problems in Topic Subgraphs',fontsize=24) # Based on the results from commented part above, I'm applying three labels to the three outliers _outliers = zip(['China','Taiwan','Ebola'],subgraph_df[subgraph_df['nodes'] > 20][['nodes','edges']].values) for label,(x,y) in _outliers: ax.annotate(label,xy=(x, y),fontsize=12, xytext=(x+2, y+75), arrowprops=dict(arrowstyle="fancy", #linestyle="dashed", color="0.5",shrinkB=8,connectionstyle="arc3,rad=0.3")) plt.tight_layout(); def complete_subgraph_maker(node_list): return itertools.permutations(node_list,2) complete_topic_graph = nx.DiGraph() for _subgraph in topic_subgraphs: if _subgraph.number_of_nodes() < 20: _edgelist = complete_subgraph_maker(_subgraph.nodes()) complete_topic_graph.add_edges_from(_edgelist) # Add the language labels back in as node attributes so we can hopefully translate back for node in complete_topic_graph.nodes(): complete_topic_graph.add_node(node,lang=article_language_mapper[node]) complete_topic_subgraphs = list(nx.components.connected_component_subgraphs(complete_topic_graph.to_undirected())) english_label_subgraphs = [_subgraph for _subgraph in complete_topic_subgraphs for node,data in _subgraph.nodes_iter(data=True) if 'en' in data['lang']] print "Out of the initial {0} topical clusters, there are {1} subgraphs in the complete approach. 
{2} of these have an English label".format(len(topic_subgraphs), len(complete_topic_subgraphs), len(english_label_subgraphs)) english_topic_graph = nx.DiGraph() topic_translation_dict = dict() for _subgraph in english_label_subgraphs: english_topic_graph.add_edges_from(_subgraph.edges(data=True)) english_topic_graph.add_nodes_from(_subgraph.nodes(data=True)) #1 _english_nodes = [_node for _node,_data in _subgraph.nodes_iter(data=True) if 'en' in _data['lang']] if len(_english_nodes) == 1: topic_translation_dict.update({_node:_english_nodes[0] for _node in _subgraph.nodes_iter()}) else: # I really hope this is never the case, but just to be sure print _english_nodes # Graphs with lists for attributes cant be serialized into GEXF # Comment out #1 to make #2 work, or leave #1 uncommented and #2 commented #nx.write_gexf(english_topic_graph,'english_topic_graph.gexf') #2 translated_articles_by_country = pd.DataFrame() for country in country_codes.keys(): translated_articles_by_country[country] = pd.Series([topic_translation_dict.get(article,np.nan) for article in top_articles_by_country[country].index]) translated_articles_by_country.columns = [country_codes[i] for i in translated_articles_by_country.columns] translated_articles_by_country.index = range(1,len(translated_articles_by_country)+1) translated_articles_by_country.sort(axis=1).head(3).T pd.Series(top_articles_by_country['en'].index).ix[list(np.array(translated_articles_by_country['English'][translated_articles_by_country['English'].isnull()].index) - 1)] f,(ax1,ax2) = plt.subplots(2,1,sharex=True,figsize=(10,5)) # Plot on ax1 _df1 = lang_link_exists_dict['en'].T.astype(float) _df1 = _df1.ix[_df1.sum(axis=1).sort(inplace=False,ascending=False).index] _df1.index = [country_codes[i] for i in _df1.index] _y1,_x1 = _df1.shape _ax1 = ax1.pcolor(_df1,cmap='rainbow',vmin=0,vmax=1) ax1.set_frame_on(False) #ax1.set_xticks(np.arange(0,_x1,10),minor=False) ax1.set_yticks(np.arange(_y1)+.5,minor=False) ax1.invert_yaxis() ax1.set_yticklabels(_df1.index,minor=False,fontsize=8) ax1.set_title('Original',fontsize=18) # Plot on ax2 _df2 = translated_articles_by_country.T.notnull().astype(float) _df2 = _df2.ix[_df1.index] # Use the _df1 index _y2,_x2 = _df2.shape _ax2 = ax2.pcolor(_df2.values.astype(float),cmap='rainbow',vmin=0,vmax=1) ax2.set_frame_on(False) ax2.set_xticks(np.arange(0,_x2,10),minor=False) ax2.set_yticks(np.arange(_y2)+.5,minor=False) ax2.invert_yaxis() ax2.set_yticklabels(_df2.index,minor=False,fontsize=8) ax2.tick_params(axis='x',direction='in',pad=-4) ax2.set_title('Cleaned',fontsize=18) f.subplots_adjust(right=0.8) #cbar_ax = f.add_axes([.95, 0.15, 0.025, .75]) #f.colorbar(_ax, cax=cbar_ax) f.suptitle('Comparing results, English',fontsize=24) #f.subplots_adjust(top=0.5) f.tight_layout(rect=[0,0,1,.9]) top_stories_across_languages = pd.Series(Counter([_val for _array in translated_articles_by_country.values for _val in _array])) top_stories_across_languages = top_stories_across_languages.ix[1:] top_stories_across_languages_top5 = top_stories_across_languages[top_stories_across_languages >= 5].sort(inplace=False,ascending=True) f,ax = plt.subplots(1,1,figsize=(8,10)) _ax = top_stories_across_languages_top5.plot(kind='barh',ax=ax) ax.axes.set_title('Articles With Widest Coverage\n',fontsize=24) ax.axes.set_xlabel('Number of Languages',fontsize=18) f.tight_layout() f.savefig('widest_coverage.png',dpi=200) combined_top_articles_df = pd.concat(top_articles_by_country.values(),keys=top_articles_by_country.keys(),axis=0).reset_index() 
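# The concat above stacks the per-language rankings with the language code as the outer
# key, which reset_index() exposes as a 'level_0' column; the next cell renames it to
# 'lang'. The 'editors-lang-month' score computed below is just the product of the three
# aggregates, so (illustrative numbers only) an article with 1,500 editors summed across
# languages, appearing in 10 language rankings for an average of 2 months in the top 25,
# scores 1500 * 10 * 2 = 30000.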
combined_top_articles_df.rename(columns={'level_0':'lang'},inplace=True) combined_top_articles_df['article'] = combined_top_articles_df['article'].apply(lambda x:topic_translation_dict.get(x,np.nan)) combined_top_articles_agg_article = combined_top_articles_df.groupby('article').agg({'editors':np.sum,'month':np.average,'lang':len}) combined_top_articles_agg_article['editors per month'] = combined_top_articles_agg_article['editors']/combined_top_articles_agg_article['month'] combined_top_articles_agg_article['editors per lang'] = combined_top_articles_agg_article['editors']/combined_top_articles_agg_article['lang'] combined_top_articles_agg_article['editors-lang-month'] = combined_top_articles_agg_article['editors']*combined_top_articles_agg_article['lang']*combined_top_articles_agg_article['month'] combined_top_articles_agg_article.sort('editors-lang-month',ascending=True,inplace=True) f,ax = plt.subplots(1,1,figsize=(8,10)) _ax = combined_top_articles_agg_article['editors-lang-month'].ix[-50:].plot(kind='barh',ax=ax) ax.axes.set_xscale('log') ax.axes.set_xlabel('Editor-language-month score',fontsize=18) ax.axes.set_ylabel('') ax.axes.set_title('Articles with Highest Activity\n',fontsize=24) f.tight_layout(); f.savefig('highest_activity_ranking.png',dpi=200) _melted = pd.melt(translated_articles_by_country.reset_index(),id_vars=['index']) _pivoted = pd.pivot_table(data=_melted,index='value',columns='variable',values='index') top_by_language_pivoted = _pivoted.ix[top_stories_across_languages_top5.index].fillna(0) top_by_combined_pivoted = _pivoted.ix[combined_top_articles_agg_article.index[-50:]].fillna(0) language_cosine = dict() combined_cosine = dict() for _lang1 in country_codes.values(): language_cosine[_lang1] = dict() combined_cosine[_lang1] = dict() for _lang2 in country_codes.values(): if _lang1 != _lang2: language_cosine[_lang1][_lang2] = cosine_similarity(top_by_language_pivoted[_lang1],top_by_language_pivoted[_lang2])[0][0] combined_cosine[_lang1][_lang2] = cosine_similarity(top_by_combined_pivoted[_lang1],top_by_combined_pivoted[_lang2])[0][0] f,(ax1,ax2) = plt.subplots(1,2,figsize=(9,5),sharey=True) _df1 = pd.DataFrame(language_cosine) _order1 = _df1.mean(axis=1).sort(inplace=False,ascending=True).index _df1 = _df1[_order1].ix[_order1] _y1,_x1 = _df1.shape _ax1 = ax1.pcolor(_df1,cmap='rainbow',vmin=0,vmax=.75) ax1.set_title('Language coverage',fontsize=18) ax1.set_frame_on(False) ax1.set_xticks(np.arange(_y1)+.5,minor=False) ax1.set_yticks(np.arange(_y1)+.5,minor=False) ax1.invert_yaxis() ax1.set_xticklabels(_df1.columns,minor=False,fontsize=10,rotation=90) ax1.set_yticklabels(_df1.index,minor=False,fontsize=10) _df2 = pd.DataFrame(combined_cosine) _order2 = _df2.mean(axis=1).sort(inplace=False,ascending=True).index _df2 = _df2[_order1].ix[_order1] # Order the same way as _df1 _y2,_x2 = _df2.shape _ax2 = ax2.pcolor(_df2,cmap='rainbow',vmin=0,vmax=.75) ax2.set_title('Editor-language-month score',fontsize=18) ax2.set_frame_on(False) ax2.set_xticks(np.arange(_y1)+.5,minor=False) ax2.invert_yaxis() ax2.set_xticklabels(_df1.columns,minor=False,fontsize=10,rotation=90) f.subplots_adjust(right=0.8) cbar_ax = f.add_axes([1, 0.15, 0.05, .7]) cb = f.colorbar(_ax2, cax=cbar_ax, label='Cosine similarity') cb.ax.yaxis.label.set_fontsize(15) #f.suptitle('Cosine similarity of rankings across languages',fontsize=24) #f.subplots_adjust(top=0.5) f.tight_layout()#rect=[0,0,1,.9]) f.savefig('cosine_similarity.png',dpi=200) _df = 
pd.DataFrame(data=np.triu(_df2),index=_df2.index,columns=_df2.columns).replace({0:np.nan}) #del _df['id'] _df.reset_index(inplace=True) coverage = pd.melt(_df,id_vars=['index']).dropna(subset=['value']) coverage.columns = ['Language 1','Language 2','Cosine Similarity'] #coverage['Language 1'] = coverage['Language 1'].apply(lambda x:country_codes.get(x)) #coverage['Language 2'] = coverage['Language 2'].apply(lambda x:country_codes.get(x)) _highest = coverage.sort('Cosine Similarity',inplace=False,ascending=False).reset_index(drop=True).ix[:9] _lowest = coverage[coverage['Language 1'] != 'Indonesian'].sort('Cosine Similarity',inplace=False,ascending=True).reset_index(drop=True).ix[:9] pd.concat([_highest,_lowest],axis=1,keys=['Highest similarities','Lowest similarities']) top_news_articles = [u'2014 FIFA World Cup', u'Malaysia Airlines Flight 370', u'Malaysia Airlines Flight 17', u'2014 Winter Olympics', u'2014 Crimean crisis', u'Felipe VI of Spain', u'Islamic State of Iraq and the Levant',u'Ebola virus epidemic in West Africa',u'Eurovision Song Contest 2014', u'Ice Bucket Challenge', u'2014 Israel\u2013Gaza conflict', u'Minecraft', u'Scottish independence referendum, 2014',u'2014 Hong Kong protests', u'United States elections, 2014', u'Soma mine disaster', u'Indian general election, 2014', u'Gamergate controversy', u'2014 Ferguson unrest',u'Rosetta spacecraft', u'Cuba\u2013United States relations', u'Chibok schoolgirl kidnapping', u'Sinking of the MV Sewol'] revision_dict = dict() for article in top_news_articles: print article revision_dict[article] = ws.get_page_revisions(article,_start,_end,'en') revision_dict[article].to_csv(_filedir + u'Data/{0}.csv'.format(article),encoding='utf8') # http://planspace.org/2013/06/21/how-to-calculate-gini-coefficient-from-raw-data-in-python/ def gini(list_of_values): if len(list_of_values) > 1: sorted_list = sorted(list_of_values) height, area = 0, 0 for value in sorted_list: height += value area += height - value / 2. 
fair_area = height * len(list_of_values) / 2 gini_value = (fair_area - area) / fair_area else: gini_value = np.nan return gini_value for _df in revision_dict.values(): _df['gini'] = [gini(Counter(_df.ix[:i,'user']).values()) for i in iter(_df.index)] rev_df = pd.concat(revision_dict.values(),keys=revision_dict.keys(),axis=0) rev_df.reset_index(inplace=True,level=0) rev_df.rename(columns={'level_0':'title'},inplace=True) rev_df.reset_index(inplace=True,drop=True) rev_df['anon'] = rev_df['anon'].notnull() rev_df['userhidden'] = rev_df['userhidden'].notnull() rev_df['commenthidden'] = rev_df['commenthidden'].notnull() rev_df.to_csv('revisions.csv',encoding='utf8') revs2014_df = rev_df[rev_df['timestamp'] >= pd.datetime(2014,1,1,0,0,0)] revs2014_df.reset_index(drop=True,inplace=True) revs2014_df.to_csv('revisions_2014.csv',encoding='utf8') rev_df = pd.read_csv('revisions.csv',encoding='utf8',index_col=0,parse_dates=['date','timestamp']) revs2014_df = pd.read_csv('revisions_2014.csv',encoding='utf8',index_col=0,parse_dates=['date','timestamp']) revs2014_df.tail() _agg_function = {'revision':np.max,'unique_users':np.max} revs2014_agg_article = revs2014_df.groupby('title').agg(_agg_function) f,ax = plt.subplots(1,1,figsize=(8,10)) revs2014_agg_article.sort('revision',inplace=False,ascending=True).plot(kind='barh',ax=ax) ax.set_ylabel('') ax.set_xlabel('Count',fontsize=12) f.tight_layout() f.savefig('en_19_activity.png',dpi=200) daily_activity = revs2014_df.groupby(['title','date']).aggregate({'unique_users':max, 'revid':len, 'diff':np.sum, 'latency':np.mean, 'size':np.mean, 'gini':np.mean}) daily_activity = daily_activity.unstack(level=0) daily_activity.index = pd.to_datetime(daily_activity.index) daily_activity['unique_users'] = daily_activity['unique_users'].fillna(method='ffill').fillna(0) daily_activity['revid'] = daily_activity['revid'].fillna(method='ffill').fillna(0) daily_activity['gini'] = daily_activity['gini'].fillna(method='ffill').fillna(0) #daily_activity['link_count'] = daily_activity['link_count'].fillna(method='ffill').fillna(0) daily_activity['size'] = daily_activity['size'].fillna(method='ffill').fillna(0) daily_activity['diff'] = daily_activity['diff'].fillna(0) daily_activity['latency'] = daily_activity['latency'].fillna(0) #daily_activity = daily_activity.fillna(method='ffill').fillna(0) activity_2014 = daily_activity.ix['2014-1-1':] activity_2014.tail() normalized_unique_users = activity_2014['unique_users'] - activity_2014.ix['2014-1-1','unique_users'] ax = normalized_unique_users.plot(colormap='spectral') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('New unique users since Jan. 
1',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) users_rank_s = normalized_unique_users.ix['2014-12-21'].order(ascending=False) users_rank_s f,ax = plt.subplots(1,1,figsize=(10,6)) _ax = activity_2014['revid'].plot(colormap='spectral',lw=3,ax=ax) #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Revisions',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5],ncol=1) ax.set_title('Revisions over time',fontsize=18) f.tight_layout() f.savefig('revisions.png',dpi=200,bbox_inches='tight') revisions_rank_s = (activity_2014['revid'].cumsum().ix['2014-12-21'] - activity_2014['revid'].cumsum().ix['2014-1-1']).order(ascending=False) revisions_rank_s ax = activity_2014['gini'].plot(colormap='spectral',lw=3) #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Gini coefficient',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) gini_rank_s = activity_2014['gini'].ix['2014-12-21'].order(ascending=False) gini_rank_s ax = (activity_2014['size']/1000.).plot(colormap='spectral',lw=3) #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Article size (kB)',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) size_rank_s = (activity_2014['size']/1000.).ix['2014-12-21'].order(ascending=False) size_rank_s f,(ax1,ax2,ax3,ax4) = plt.subplots(4,1,figsize=(10,10),sharex=True) _ax1 = activity_2014['unique_users'].ix['1-7-2014':].diff().plot(colormap='spectral',lw=2,ax=ax1,legend=None) ax1.set_xlabel('') ax1.set_ylabel('Users') ax1.set_title('New users',fontsize=15) _ax2 = activity_2014['revid'].plot(colormap='spectral',lw=2,ax=ax2,legend=None) ax2.set_xlabel('') ax2.set_ylabel('Revisions') ax2.set_title('Revisions made',fontsize=15) _ax3 = activity_2014['gini'].diff().plot(colormap='spectral',lw=2,ax=ax3,legend=None) ax3.set_xlabel('') ax3.set_ylabel('Gini delta') ax3.set_title('Change in centralization',fontsize=15) _ax4 = (activity_2014['diff']/1000.).diff().plot(colormap='spectral',lw=2,ax=ax4) ax4.set_xlabel('') ax4.set_ylabel('Kilobytes (kB) delta') ax4.set_title('Change in article size',fontsize=15) ax4.set_ylim((-100,100)) #ax4.set_yscale('symlog') _colors = dict(zip(sorted(revs2014_df['title'].unique()),sns.color_palette('spectral', len(revs2014_df['title'].unique())))) handles, labels = _ax4.get_legend_handles_labels() ax4.legend_.remove() new_handles = [Line2D([0], [0], linestyle="none", marker="o", markersize=10, markerfacecolor=_colors[article]) for article in sorted(revs2014_df['title'].unique())] f.legend(new_handles,labels,loc='center left',bbox_to_anchor=[1,.5],fontsize=15) f.tight_layout() f.savefig('article_changes.png',dpi=200,bbox_inches='tight') _table = pd.concat([users_rank_s.round(2),revisions_rank_s.round(2),gini_rank_s.round(2),size_rank_s.round(2)], axis=1,keys=['Users','Revisions','Gini','Length']) _table[['Revisions','Users','Gini','Length']].sort(['Revisions','Users','Gini','Length'],ascending=False) ax = activity_2014['link_count'].plot(colormap='gist_rainbow') #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Links in article',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) activity_2014['link_count'].ix['2014-10-16'].order(ascending=False) links_per_byte = (activity_2014['link_count']/activity_2014['size']) ax = links_per_byte.plot(colormap='gist_rainbow') #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Links per byte',fontsize=15) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) 
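# Note for the 'link_count' cells above and the ranking below: they assume revs2014_df has a
# 'link_count' column that is carried into daily_activity, but the groupby .aggregate({...})
# dictionary above (like the corresponding fillna line) leaves link_count commented out.
# A minimal sketch of the extra aggregation that would be needed, assuming the column exists:
# daily_activity = revs2014_df.groupby(['title','date']).aggregate({'unique_users':max,'revid':len,
#     'diff':np.sum,'latency':np.mean,'size':np.mean,'gini':np.mean,'link_count':np.mean})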
links_per_byte.ix['2014-10-16'].order(ascending=True) #ax = daily_activity.ix['2014-1-1':,'latency'].plot(colormap='gist_rainbow') ax = pd.rolling_mean(daily_activity.ix['2013-11-1':,'latency'],28).ix['2014-1-1':].plot(colormap='spectral') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Edit latency (seconds)',fontsize=15) ax.set_yscale('symlog') ax.legend(loc='center left',bbox_to_anchor=[1,.5]) daily_activity.ix['2014-1-1':,'latency'].mean().order(ascending=True) pv_df = ws.make_pageview_df(top_news_articles,'en','2013-12-1','2014-12-31') pv_df.to_csv('pageviews_dec.csv',encoding='utf8') pv_df = pv_df.ix[:'2014-12-21'].sort_index(axis=1) pv_df.tail() pv_df = pd.read_csv('pageviews_Dec.csv',encoding='utf8',index_col=0,parse_dates=[0]) del pv_df['Heartbleed'] pv_2014 = pv_df.ix['1-1-2014':] pv_df.tail() f,ax = plt.subplots(1,1,figsize=(10,5)) _ax = pv_df.ix['2014-1-1':].plot(colormap='spectral',lw=3,ax=ax) ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Pageviews',fontsize=15) ax.legend(loc='center right',bbox_to_anchor=[1.35,.5],fontsize=9,ncol=1) f.tight_layout() f.savefig('pageviews.png',dpi=200,bbox_inches='tight') pv_df.ix['2014-1-1':].sum().sort(ascending=False,inplace=False) ax = (pv_df.ix['2014-1-1':]/pv_df.max(axis=0)).plot(colormap='spectral',lw=3) #ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Pageviews',fontsize=15) #ax.set_ylim((10**2,10**7)) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) f,ax = plt.subplots(1,1,figsize=(8,8)) pv_df.sum(axis=0).sort(inplace=False,ascending=True).plot(kind='barh',ax=ax) ax.set_title('Cumulative pageviews',fontsize=18) ax.set_xlabel('Total page views',fontsize=15) ax.set_xscale('symlog') f.tight_layout() f.savefig('cumulative_pageviews.png',dpi=200) pv_melted = pd.melt(pv_df.reset_index(),id_vars=['index']) pv_gb_page = pv_melted.groupby('variable') _idx = pv_melted.groupby('variable')['value'].agg(lambda col: col.idxmax()) pv_max = pv_melted.ix[_idx] pv_max.columns = ['date','article','pageviews'] pv_max = pv_max.set_index('article') pv_max pv_gb_page.groups.keys() f,ax = plt.subplots(1,1,figsize=(8,8)) pv_df.max().sort(inplace=False,ascending=True).plot(kind='barh',ax=ax) ax.set_title('Most Pageviews in a Day',fontsize=18) ax.set_xlabel('Pageviews',fontsize=15) ax.set_xscale('symlog') f.tight_layout() f.savefig('max_pageviews.png',dpi=200) f,ax = plt.subplots(1,1,figsize=(10,5)) _cmap = 'spectral' _topics = ['Malaysia Airlines Flight 17','2014 FIFA World Cup','Islamic State of Iraq and the Levant','Minecraft'] _data = pv_df[_topics].ix['1-1-2014':] #_ax = _data.plot(lw=3,ax=ax,cmap=_cmap) #ax.set_yscale('symlog') #ax.set_ylim((1e2,1e7)) ax.legend(fontsize=12,loc='upper left') _colors = dict(zip(pv_df.columns,sns.color_palette(_cmap, len(pv_df.columns)))) for d in _topics: ax.plot(pv_df.ix['1-1-2014':,d].index,pv_df.ix['1-1-2014':,d].values,c=_colors[d],lw=2,label=d) #ax.fill_between(_data[d].index, _data[d].values, _data['Minecraft'].values, facecolor=_colors[d], alpha=0.33) ax.set_xticklabels(['Jan 2014','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax.legend(loc='center right',bbox_to_anchor=[1.4,.5],fontsize=12) f.tight_layout() f.savefig('pageviews_shapes.png',dpi=200,bbox_inches='tight') f,ax = plt.subplots(1,1,figsize=(8,8)) (pv_df.max()/pv_df.sum()).sort(inplace=False,ascending=True).plot(kind='barh',ax=ax) ax.set_title('Fraction of Total Pageviews in Peak',fontsize=18) ax.set_xlabel('Fraction of Pageviews',fontsize=15) #ax.set_xscale('log') f.tight_layout() 
f.savefig('peak_fraction.png',dpi=200) pv_2014.columns information_conduced_df = pv_2014/(activity_2014['revid']+1) ax = pd.rolling_mean(information_conduced_df,7).ix['2014-1-1':].plot(colormap='spectral') ax.set_yscale('symlog') ax.set_xlabel('Time',fontsize=15) ax.set_ylabel('Consumption/Production ratio',fontsize=15) #ax.set_ylim((0,20000)) ax.legend(loc='center left',bbox_to_anchor=[1,.5]) _s = pd.melt(information_conduced_df.reset_index(),id_vars='index').replace({np.inf:np.nan,-np.inf:np.nan}).dropna() _s.columns = ['date','article','ratio'] _top = _s.sort('ratio',inplace=False,ascending=False).reset_index(drop=True).ix[:10] # Exclude uninteresting edge cases. Sorry Your Excellence, you're a boring fellow. _bottom = _s[(_s['ratio'] > 0) & (_s['article'] != 'Felipe VI of Spain')] _bottom = _bottom.sort('ratio',inplace=False,ascending=True).reset_index(drop=True).ix[:10] pd.concat([_top,_bottom],keys=['Greater consumption per production','Lesser consumption per production'],axis=1) _p = pd.melt(activity_2014['revid'].reset_index(),id_vars='date') _c = pd.melt(pv_2014.reset_index(),id_vars='index') _j = pd.merge(_p,_c,left_on=['date','title'],right_on=['index','variable'],copy=False) _j = _j[['date','title','value_x','value_y']] _j.columns = ['date','article','production','consumption'] _j_gb = _j.groupby('article') f,ax = plt.subplots(1,1,figsize=(10,10)) _colors = dict(zip(sorted(_j_gb.groups.keys()),sns.color_palette('spectral', len(_j_gb.groups.keys())))) for article in sorted(_j_gb.groups.keys()): _data = _j_gb.get_group(article)[['production','consumption']] _data['production_z'] = (_data['production'] - _data['production'].mean())/_data['production'].std() _data['consumption_z'] = (_data['consumption'] - _data['consumption'].mean())/_data['consumption'].std() _data['ratio'] = np.abs(_data['consumption_z'] + _data['production_z']) #_data = _data[(_data['production'] > 0) & (_data['consumption'] > 0)] sns.regplot(_data['production_z'],_data['consumption_z'], ci=None,color=_colors[article],label=article,ax=ax,lowess=True, scatter_kws={'s':50*np.abs(_data['ratio']),'alpha':.33},line_kws={'lw':5}) #ax.scatter(_data['production'].values,_data['consumption'].values,c=_colors[article],s=50,lw=0,alpha=.5,label=article) #ax.scatter(_data['production'].values,_data['consumption'].values,c=_colors[article],s=250*np.log(_data['ratio']),lw=0,alpha=.5,label=article) ax.set_xlabel('Revisions (Z-score)',fontsize=15) ax.set_ylabel('Pageviews (Z-score)',fontsize=15) ax.set_yscale('symlog') ax.set_ylim((-1,20)) ax.set_xscale('symlog') ax.set_xlim((-1,20)) ax.plot((-1,20),(-1,20),'--',lw=3,c='k') handles, labels = ax.get_legend_handles_labels() new_handles = [Line2D([0], [0], linestyle="none", marker="o", markersize=10, markerfacecolor=_colors[article]) for article in sorted(_j_gb.groups.keys())] ax.legend(new_handles,labels,loc='center left',bbox_to_anchor=[1,.5],fontsize=12) f.savefig('pv_vs_revision.png',dpi=200,bbox_inches='tight') handles[0] agg_function = {'revid':{'weight':len}, 'timestamp':{'ts_min':np.min,'ts_max':np.max}, 'diff':{'diff_min':np.min,'diff_median':np.median,'diff_max':np.max,'total_changes':np.sum}, 'latency':{'latency_min':np.min,'latency_median':np.median,'latency_max':np.max}, 'revision':{'revision_min':np.min,'revision_median':np.median,'revision_max':np.max}, #'link_count':{'link_count_min':np.min,'link_count_median':np.median,'link_count_max':np.max} } revs_gb_edge = revs.groupby(['title','user']) revs_edgelist = revs_gb_edge.agg(agg_function) revs_edgelist.columns = 
revs_edgelist.columns.droplevel(0) # Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded revs_edgelist['ts_min'] = (revs_edgelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_edgelist['ts_max'] = (revs_edgelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_edgelist.head() revs_edgelist.ix[[i for i in revs_edgelist.index if i[0] == i[1]]] revs_gb_page = revs.groupby('title') revs_pagenodelist = revs_gb_page.agg(agg_function) revs_pagenodelist.columns = revs_pagenodelist.columns.droplevel(0) # Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded revs_pagenodelist['ts_min'] = (revs_pagenodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_pagenodelist['ts_max'] = (revs_pagenodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_pagenodelist['article'] = [1]*len(revs_pagenodelist) revs_pagenodelist.head() revs_gb_user = revs.groupby('user') revs_usernodelist = revs_gb_user.agg(agg_function) revs_usernodelist.columns = revs_usernodelist.columns.droplevel(0) # Convert the ts_min and ts_max to floats for the number of days since Wikipedia was founded revs_usernodelist['ts_min'] = (revs_usernodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_usernodelist['ts_max'] = (revs_usernodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') revs_usernodelist['article'] = [0]*len(revs_usernodelist) revs_usernodelist.head() coauthorship_g = nx.DiGraph() # Add the edges and edge attributes for (article,editor) in iter(revs_edgelist.index.values): edge_attributes = {k:float(v) for k,v in dict(revs_edgelist.ix[(article,editor)]).items()} if article != editor: coauthorship_g.add_edge(article,editor,edge_attributes) # Add the user nodes and attributes for node in iter(revs_usernodelist.index): node_attributes = {k:float(v) for k,v in dict(revs_usernodelist.ix[node]).items()} coauthorship_g.add_node(node,node_attributes,type='user') # Add the page nodes and attributes for node in iter(revs_pagenodelist.index): node_attributes = {k:float(v) for k,v in dict(revs_pagenodelist.ix[node]).items()} coauthorship_g.add_node(node,node_attributes,type='page') print "There are {0} nodes and {1} edges in the network.".format(coauthorship_g.number_of_nodes(),coauthorship_g.number_of_edges()) coauthorship_g.edges(data=True)[:1] nx.write_gexf(coauthorship_g,'coauthorship_g.gexf') edges_wt1 = [(i,j) for (i,j,k) in coauthorship_g.edges_iter(data=True) if k['weight'] == 1] coauthorship_g_gt1 = coauthorship_g.copy() coauthorship_g_gt1.remove_edges_from(edges_wt1) isolates = nx.isolates(coauthorship_g_gt1) coauthorship_g_gt1.remove_nodes_from(isolates) nx.write_gexf(coauthorship_g_gt1,'coauthorship_g_gt1.gexf') print "There are {0} nodes and {1} edges in the network.".format(coauthorship_g_gt1.number_of_nodes(),coauthorship_g_gt1.number_of_edges()) Image('coauthorship_g_gt1.png') revs_edgelist.sort('weight',inplace=False,ascending=False)['weight'].reset_index().head(10) revs_edgelist[(revs_edgelist['weight'] > 10)]['total_changes'].abs().sort(inplace=False,ascending=False).head(10) _n = len(coauthorship_g_gt1) - 1 idc = {k:int(v*_n) for k,v in nx.in_degree_centrality(coauthorship_g_gt1).iteritems()} odc = {k:int(v*_n) for k,v in nx.out_degree_centrality(coauthorship_g_gt1).iteritems()} pd.Series(idc).sort(inplace=False,ascending=False).ix[:20] bp_g_gt1 = coauthorship_g_gt1.to_undirected() pages = list(revs_pagenodelist.index) users = 
list(set(coauthorship_g_gt1.nodes()) - set(pages)) clustering = nx.bipartite.clustering(bp_g_gt1,pages) pd.Series(clustering).sort(inplace=False,ascending=False) revs2014_gb_article = revs2014_df.groupby('title') aftermath_df_list = list() for _article in pv_max.index: _df = revs2014_gb_article.get_group(_article) _before = pv_max.ix[_article,'date'] - np.timedelta64(1,'D') _after = pv_max.ix[_article,'date'] + np.timedelta64(2,'D') _aftermath = _df[(_df['timestamp'] > _before) & (_df['timestamp'] < _after)] aftermath_df_list.append(_aftermath) aftermath_revs = pd.concat(aftermath_df_list) aftermath_revs_gb_edge = aftermath_revs.groupby(['title','user']) aftermath_revs_edgelist = aftermath_revs_gb_edge.agg(agg_function) aftermath_revs_edgelist.columns = aftermath_revs_edgelist.columns.droplevel(0) aftermath_revs_edgelist['ts_min'] = (aftermath_revs_edgelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_edgelist['ts_max'] = (aftermath_revs_edgelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_gb_page = aftermath_revs.groupby('title') aftermath_revs_pagenodelist = aftermath_revs_gb_page.agg(agg_function) aftermath_revs_pagenodelist.columns = aftermath_revs_pagenodelist.columns.droplevel(0) aftermath_revs_pagenodelist['ts_min'] = (aftermath_revs_pagenodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_pagenodelist['ts_max'] = (aftermath_revs_pagenodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_pagenodelist['article'] = [1]*len(aftermath_revs_pagenodelist) aftermath_revs_gb_user = aftermath_revs.groupby('user') aftermath_revs_usernodelist = aftermath_revs_gb_user.agg(agg_function) aftermath_revs_usernodelist.columns = aftermath_revs_usernodelist.columns.droplevel(0) aftermath_revs_usernodelist['ts_min'] = (aftermath_revs_usernodelist['ts_min'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_usernodelist['ts_max'] = (aftermath_revs_usernodelist['ts_max'] - pd.Timestamp('2001-1-1'))/np.timedelta64(1,'D') aftermath_revs_usernodelist['article'] = [0]*len(aftermath_revs_usernodelist) aftermath_coauthorship_g = nx.DiGraph() # Add the edges and edge attributes for (article,editor) in iter(aftermath_revs_edgelist.index.values): edge_attributes = {k:float(v) for k,v in dict(aftermath_revs_edgelist.ix[(article,editor)]).items()} if article != editor: aftermath_coauthorship_g.add_edge(article,editor,edge_attributes) # Add the user nodes and attributes for node in iter(aftermath_revs_usernodelist.index): node_attributes = {k:float(v) for k,v in dict(aftermath_revs_usernodelist.ix[node]).items()} aftermath_coauthorship_g.add_node(node,node_attributes,type='user') # Add the page nodes and attributes for node in iter(aftermath_revs_pagenodelist.index): node_attributes = {k:float(v) for k,v in dict(aftermath_revs_pagenodelist.ix[node]).items()} aftermath_coauthorship_g.add_node(node,node_attributes,type='page') print "There are {0} nodes and {1} edges in the network.".format(aftermath_coauthorship_g.number_of_nodes(),aftermath_coauthorship_g.number_of_edges()) nx.write_gexf(aftermath_coauthorship_g,'aftermath_coauthorship_g.gexf') _df = revs2014_gb_article.get_group('Gamergate controversy') _df[(_df['date'] < pd.datetime(2014,10,25)) & (_df['date'] > pd.datetime(2014,10,22))] _pages1 = revs2014_gb_article.groups.keys() _pages2 = [_n for _n,_d in aftermath_coauthorship_g.nodes_iter(data=True) if 'page' in _d.values()] _pages3 = list(aftermath_revs['title'].unique()) 
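# The cells below compare how much the same editors work on pairs of articles using a
# directed overlap fraction: the share of article A's coauthors who also edited article B,
# i.e. |N(A) & N(B)| / |N(A)| over the neighbor sets in the coauthorship graphs (the measure
# is asymmetric because it normalizes by A's editors). A minimal sketch on a toy graph with
# hypothetical names, not part of the analysis data:
toy_g = nx.Graph()
toy_g.add_edges_from([('Article A','editor1'),('Article A','editor2'),('Article A','editor3'),
                      ('Article B','editor2'),('Article B','editor3'),('Article B','editor4')])
_shared = set(toy_g.neighbors('Article A')) & set(toy_g.neighbors('Article B'))
# 2 of A's 3 editors also edited B, so the overlap from A's perspective is 2/3
print len(_shared)/float(len(set(toy_g.neighbors('Article A'))))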
set(_pages1) - set(_pages3)

Image('aftermath_coauthorship_g.png')

_article_overlaps = dict()
_aftermath_overlaps = dict()
for _article1 in pages:
    _article_overlaps[_article1] = dict()
    _aftermath_overlaps[_article1] = dict()
    for _article2 in pages:
        if _article1 != _article2:
            try:
                _article_overlaps[_article1][_article2] = len(set(coauthorship_g.neighbors(_article1)) & set(coauthorship_g.neighbors(_article2)))/float(len(set(coauthorship_g.neighbors(_article1))))
            except nx.NetworkXError:
                _article_overlaps[_article1][_article2] = np.nan
            try:
                # Some articles have no editing activity in the aftermath window
                _aftermath_overlaps[_article1][_article2] = len(set(aftermath_coauthorship_g.neighbors(_article1)) & set(aftermath_coauthorship_g.neighbors(_article2)))/float(len(set(aftermath_coauthorship_g.neighbors(_article1))))
            except nx.NetworkXError:
                _aftermath_overlaps[_article1][_article2] = np.nan

_article_overlaps_df = pd.DataFrame(_article_overlaps)
_order = _article_overlaps_df.mean(axis=1).sort(inplace=False,ascending=True).index
_article_overlaps_df = _article_overlaps_df[_order].ix[_order]
_x1,_y1 = _article_overlaps_df.shape
_aftermath_overlaps_df = pd.DataFrame(_aftermath_overlaps)
_aftermath_overlaps_df = _aftermath_overlaps_df[_order].ix[_order]
_x2,_y2 = _aftermath_overlaps_df.shape

f,(ax1,ax2) = plt.subplots(1,2,figsize=(12,8),sharey=True)
_ax1 = ax1.pcolor(_article_overlaps_df.values,cmap='rainbow',vmin=0,vmax=.25)
ax1.set_frame_on(False)
ax1.set_xticks(np.arange(0.5,_x1+.5),minor=False)
ax1.set_yticks(np.arange(_y1)+.5,minor=False)
ax1.invert_yaxis()
ax1.set_xticklabels(_article_overlaps_df.columns,minor=False,fontsize=12,rotation=90)
ax1.set_yticklabels(_article_overlaps_df.index,minor=False,fontsize=12)
ax1.tick_params(axis='x',direction='in',pad=3)
ax1.set_title('Complete coauthorship',fontsize=15)
_ax2 = ax2.pcolor(_aftermath_overlaps_df.values,cmap='rainbow',vmin=0,vmax=.25)
ax2.set_frame_on(False)
ax2.set_xticks(np.arange(0.5,_x2+.5),minor=False)
#ax2.set_yticks(np.arange(_y2)+.5,minor=False)
ax2.invert_yaxis()
ax2.set_xticklabels(_article_overlaps_df.columns,minor=False,fontsize=12,rotation=90)
#ax2.set_yticklabels(_article_overlaps_df.index,minor=False,fontsize=12)
ax2.tick_params(axis='x',direction='in',pad=3)
ax2.set_title('Aftermath coauthorship',fontsize=15)
#ax.set_xlabel('Article rank',fontsize=15)
f.subplots_adjust(right=0.8)
cbar_ax = f.add_axes([1, 0.25, 0.05, 0.7])
f.colorbar(_ax1, cax=cbar_ax,label='Editor overlap')
f.tight_layout()
f.savefig('editor_overlap.png',dpi=200,bbox_inches='tight')

_idx = rev_df.groupby(['title','date']).agg({'revid':lambda x:x.idxmax()})
max_daily_revid = rev_df[['title','date','revid']].ix[_idx['revid']]
max_daily_revid = pd.pivot_table(data=max_daily_revid,columns='title',index='date',values='revid')
max_daily_revid.fillna(method='ffill',inplace=True)
max_daily_revid = max_daily_revid.ix[pd.date_range(start='1-1-2014',end='12-22-2014')]

# It turns out a bunch of revisions were deleted on the ISIS article
max_daily_revid.ix[pd.to_datetime('2014-09-03').date(),'Islamic State of Iraq and the Levant'] = np.nan #624150417

len([_revid for _article in max_daily_revid.columns for _revid in max_daily_revid[_article].ix[pd.datetime(2014,1,1).date():].dropna().unique()])

#parsed_revid_data = dict()
# Only re-parsing a single article here; widen the slice to cover all columns
for _article in max_daily_revid.columns[14:15]:
    print _article
    parsed_revid_data[_article] = dict()
    _unique_revids = max_daily_revid[_article].ix[pd.datetime(2014,1,1).date():].dropna().unique()
    for _revid in _unique_revids:
        try:
            parsed_revid_data[_article][_revid] = ws.wikipedia_query({'action':'parse', 'oldid': _revid, 'redirects': True, 'prop': 'revid|langlinks|categories|externallinks|iwlinks|templates|images'},'en')
        except:
            print "Revision {0} has an error".format(str(_revid))
            parsed_revid_data[_article][_revid] = np.nan
            pass

simlified_parsed_revid_data = {_rev:_payload for _article,_revs in parsed_revid_data.items() for _rev,_payload in _revs.items()}

with open('parsed_revid_data.json','wb') as f:
    json.dump(parsed_revid_data,f)

with open('simlified_parsed_revid_data.json','wb') as f:
    json.dump(simlified_parsed_revid_data,f)

parsed_revid_data['Islamic State of Iraq and the Levant'][624083851]

with open('parsed_revid_data.json','rb') as f:
    parsed_revid_data = json.load(f)

_final_revs = dict(max_daily_revid.ix[pd.to_datetime('2014-12-21').date()])
urls = list()
for _article, _rev in _final_revs.items():
    if 'externallinks' in parsed_revid_data[_article][_rev].keys():
        for _url in parsed_revid_data[_article][_rev]['externallinks']:
            urls.append(urlparse.urlparse(_url)[1])

_s = pd.Series(Counter(urls)).sort(ascending=False,inplace=False)
_s.head(10)

westerners = ['bbc','guardian','ft','telegraph','independent','nytimes','reuters','washingtonpost','cnn','wsj','abc','nbc','cbs','yahoo','bloomberg']

def western_link_fraction(_revid,_revdict):
    try:
        _urls = _revdict[_revid]['externallinks']
        _domains = [urlparse.urlparse(_url)[1] for _url in _urls]
        _western = [any(_w in _d for _w in westerners) for _d in _domains]
        if len(_western) > 0:
            return float(sum(_western))/len(_western)
        else:
            return 0
    except KeyError:
        return np.nan

western_link_df = pd.DataFrame(index=pd.date_range('1-1-2014','12-21-2014'))
for article in max_daily_revid.columns:
    western_link_df[article] = max_daily_revid[article].apply(lambda x:western_link_fraction(x,simlified_parsed_revid_data))

f,ax = plt.subplots(1,1,figsize=(10,6))
_ax = western_link_df.plot(colormap='spectral',ax=ax)
_ax.legend(loc='center left',bbox_to_anchor=[1,.5])
f.tight_layout()
f.savefig('western_links.png',dpi=200,bbox_inches='tight')

def chunk_maker(a_list,size):
    chunk_num = len(a_list)/size
    chunks = list()
    for c in range(chunk_num + 1):
        start = c * (size + 1)
        end = (c + 1) * (size + 1)
        elements = list(itertools.islice(a_list,start,end))
        if len(elements) > 0:
            chunks.append(elements)
    return chunks

# http://stackoverflow.com/a/319291/1574687
def valid_ip(address):
    try:
        parts = address.split(".")
        if len(parts) != 4:
            return False
        for item in parts:
            # Each octet must be at most three digits and in the range 0-255
            if len(item) > 3 or not 0 <= int(item) <= 255:
                return False
        return True
    except ValueError:
        return False

# bp_g_gt1_usernodelist is constructed below from revs_usernodelist and the gt1 user list
non_ipv4_users = list(bp_g_gt1_usernodelist[~bp_g_gt1_usernodelist.index.map(valid_ip)].index)
chunks = chunk_maker(non_ipv4_users,50)

user_properties = list()
for chunk in chunks:
    user_string = u'|'.join(chunk)
    props = ws.get_user_properties(user_string,'en')
    for prop in props['users']:
        user_properties.append(prop)

with open('user_properties.json','wb') as f:
    json.dump(user_properties,f)

with open('user_properties.json','rb') as f:
    user_properties2 = json.load(f)

user_props_df = pd.DataFrame(user_properties2).set_index('name')
user_props_df = user_props_df[user_props_df['userid'].notnull()]
user_props_df['registration'] = pd.to_datetime(user_props_df['registration'],format='%Y-%m-%dT%H:%M:%SZ')
user_props_df['blockedtimestamp'] = pd.to_datetime(user_props_df['blockedtimestamp'],format='%Y-%m-%dT%H:%M:%SZ')
user_props_df['account_age'] = (pd.datetime.today().date() - user_props_df['registration'])/np.timedelta64(1,'D')
user_props_df['blocked'] = user_props_df['blockexpiry'].notnull() user_props_df['blocked_account_age'] = (user_props_df['blockedtimestamp'] - user_props_df['registration'])/np.timedelta64(1,'D') user_props_df['editcount'] = user_props_df['editcount'].map(float) user_props_df['permissions'] = user_props_df['groups'].apply(len) - 2 user_props_df.drop(['invalid','blockedbyid','blockid','userid','blockedby','blockexpiry','blockreason'],inplace=True,axis=1) user_props_df.head() gender_count = user_props_df.groupby('gender').agg({'editcount':len}) print gender_count sns.barplot(gender_count.index,gender_count.values,palette='muted') plt.yscale('log') plt.ylabel('Number of users',fontsize=15) plt.xlabel('') sns.boxplot(user_props_df['editcount'],groupby=user_props_df['gender'],color='muted') plt.yscale('log') plt.ylabel('Total revisions',fontsize=15) plt.xlabel('') female_editcounts = user_props_df[user_props_df['gender'] == 'female']['editcount'].values male_editcounts = user_props_df[user_props_df['gender'] == 'male']['editcount'].values stats.mannwhitneyu(female_editcounts,male_editcounts) sns.boxplot(user_props_df['account_age'],groupby=user_props_df['gender'],color='muted') plt.yscale('log') plt.ylabel('Account age',fontsize=15) plt.xlabel('') female_account_ages = user_props_df[user_props_df['gender'] == 'female']['account_age'].values male_account_ages = user_props_df[user_props_df['gender'] == 'male']['account_age'].values stats.mannwhitneyu(female_account_ages,male_account_ages) blocked = user_props_df[user_props_df['blocked']] print blocked.groupby('gender').agg({'editcount':len}) print '\n' print blocked.groupby('gender').agg({'editcount':len})/gender_count bp_g_gt1_usernodelist.head() bp_g_gt1_usernodelist = revs_usernodelist[revs_usernodelist.index.isin(users)] bp_g_gt1_usernodelist = bp_g_gt1_usernodelist.join(user_props_df,how='left') bp_g_gt1_usernodelist['degree'] = pd.Series({k:v for k,v in idc.iteritems() if k in bp_g_gt1_usernodelist.index}) bp_g_gt1_edgelist = revs_edgelist[revs_edgelist.index.isin(coauthorship_g_gt1.edges())] bp_g_gt1_edgelist['article_degree'] = pd.Series({i:odc[i[0]] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['editor_degree'] = pd.Series({i:idc[i[1]] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['article_age'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'ts_min'] - revs_pagenodelist.ix[i[0],'ts_min'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['editor_age'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'account_age'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['gender'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'gender'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['permissions'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'permissions'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['editcount'] = pd.Series({i:bp_g_gt1_usernodelist.ix[i[1],'editcount'] for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['persistence'] = bp_g_gt1_edgelist['revision_max'] - bp_g_gt1_edgelist['revision_min'] bp_g_gt1_edgelist['revision_min_frac'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'revision_min']/float(revs_pagenodelist.ix[i[0],'revision_max']) for i in iter(bp_g_gt1_edgelist.index.values)}) bp_g_gt1_edgelist['revision_max_frac'] = pd.Series({i:bp_g_gt1_edgelist.ix[i,'revision_max']/float(revs_pagenodelist.ix[i[0],'revision_max']) for i in iter(bp_g_gt1_edgelist.index.values)}) ax = 
sns.boxplot(bp_g_gt1_usernodelist['weight']/bp_g_gt1_usernodelist['degree'],groupby=bp_g_gt1_usernodelist['degree'],color='coolwarm') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Revisions made',fontsize=15) ax = sns.boxplot(bp_g_gt1_usernodelist['weight']/bp_g_gt1_usernodelist['degree'],groupby=bp_g_gt1_usernodelist['gender'],color='muted') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Revisions made',fontsize=15) ax = sns.boxplot(bp_g_gt1_usernodelist['latency_median'],groupby=bp_g_gt1_usernodelist['degree'],color='coolwarm') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Latency (s)',fontsize=15) ax = sns.boxplot(bp_g_gt1_usernodelist['latency_median'],groupby=bp_g_gt1_usernodelist['gender'],color='muted') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Latency (s)',fontsize=15) bp_g_gt1_edgelist.head() plt.scatter(bp_g_gt1_edgelist['weight'],bp_g_gt1_edgelist['persistence'],alpha=.5) plt.plot((0,10**4),(0,10**4),color='k',linestyle='--',linewidth=2) plt.xscale('symlog') plt.yscale('symlog') plt.xlim((0,10**4)) plt.ylim((0,10**4)) plt.xlabel('Revisions made',fontsize=15) plt.ylabel('Persistence (days)',fontsize=15) plt.scatter(bp_g_gt1_edgelist['weight']*np.random.uniform(.9,1.1,size=len(bp_g_gt1_edgelist)),bp_g_gt1_edgelist['revision_min_frac'],alpha=.5) plt.xscale('symlog') plt.xlim((2,10**4)) plt.ylim((0,1)) plt.xlabel('Revisions made',fontsize=15) plt.ylabel('Latency',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['weight'],groupby=bp_g_gt1_edgelist['gender'],color='muted') ax.set_yscale('symlog') ax.set_xlabel('',fontsize=15) ax.set_ylabel('Revisions',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['weight'],groupby=bp_g_gt1_edgelist['permissions'],color='gist_rainbow') ax.set_yscale('symlog') ax.set_xlabel('Permissions',fontsize=15) ax.set_ylabel('Revisions',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Editor age (days)',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['article_age'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('Time since first edit (days)',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['gender'],color='muted') ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('First edit to article',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['editor_age'],groupby=bp_g_gt1_edgelist['permissions'],color='pastel') #ax.set_yscale('symlog') ax.set_xlabel('Permissions',fontsize=15) ax.set_ylabel('Editor age',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['editor_degree'],color='coolwarm') #ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('First edit to article',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['gender'],color='muted') #ax.set_yscale('symlog') ax.set_xlabel('Articles edited',fontsize=15) ax.set_ylabel('First edit to article',fontsize=15) ax = sns.boxplot(bp_g_gt1_edgelist['revision_min_frac'],groupby=bp_g_gt1_edgelist['permissions'],color='pastel') #ax.set_yscale('symlog') ax.set_xlabel('Permissions',fontsize=15) ax.set_ylabel('First edit to article',fontsize=15)
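# The boxplots above compare where in an article's history editors of different genders and
# permission levels make their first edit. A quick nonparametric check in the same style as
# the earlier editcount and account-age comparisons (a sketch using the columns built above;
# keep in mind that gender is self-reported in user preferences and defaults to 'unknown',
# so the male/female groups only include editors who disclosed it):
female_first_edits = bp_g_gt1_edgelist[bp_g_gt1_edgelist['gender'] == 'female']['revision_min_frac'].dropna().values
male_first_edits = bp_g_gt1_edgelist[bp_g_gt1_edgelist['gender'] == 'male']['revision_min_frac'].dropna().values
stats.mannwhitneyu(female_first_edits,male_first_edits)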