import pandas as pd
import numpy
from collections import defaultdict
from matplotlib.pylab import style
import json
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below.
style.use('fivethirtyeight')
# IPython magic: only valid inside IPython/Jupyter, not in plain Python.
# NOTE(review): later code uses `plt` and `matplotlib` without importing them,
# so they presumably come from this line -- confirm before removing it.
%pylab inline
# -2**31, i.e. Java's Integer.MIN_VALUE. Passed to read_csv below so that
# cells holding this value are parsed as NaN (presumably the exporter's
# "missing" sentinel -- confirm against the snapshot producer).
java_min_int = -2147483648

# Load the 2014-10-13 gender-index snapshot, one row per record.
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    """Split a pipe-delimited cell into a list of its values.

    Args:
        q_str: The raw cell value. Expected to be either a string such as
            ``'Q1|Q2|'`` (each value followed by ``'|'``) or a float NaN
            for a missing cell.

    Returns:
        * the NaN itself when ``q_str`` is a float NaN, so missing cells
          stay missing;
        * a list of the values when ``q_str`` is a string;
        * ``None`` for any other input (including non-NaN floats).
    """
    if isinstance(q_str, float) and numpy.isnan(q_str):
        return q_str
    if isinstance(q_str, str):
        values = q_str.split('|')
        # The format normally ends with a '|', which leaves one empty
        # trailing element. Drop only that element, so a cell without the
        # terminator does not silently lose its last real value.
        if values and values[-1] == '':
            values.pop()
        return values
    return None

# Turn the pipe-delimited 'gender' and 'site_links' cells into Python lists
# (cells that split_column does not recognise as strings are left as
# NaN/None).
for col in ['gender', 'site_links']:
    allrecs[col] = allrecs[col].apply(split_column)

# Notebook display: preview the first five parsed records.
allrecs.head(5)

# Lookup table indexed by its first column; agg_culture reads the 'culture'
# column from it by language code.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 is the replacement. from_csv additionally tried
# to parse the index as dates, which is not wanted for a label index.
lang_culture_map = pd.read_csv('helpers/aggregation_maps/lang_culture.csv', index_col=0)

def agg_culture(wikiname_list):
    """Return the distinct cultures of the Wikipedias in a site-link list.

    A site name is treated as a Wikipedia when the text following its first
    ``'wiki'`` is empty (e.g. ``'enwiki'``, as opposed to ``'enwikiquote'``).
    The text before ``'wiki'`` is then used as the language code and looked
    up as an index label in the module-level ``lang_culture_map``; the value
    of its ``'culture'`` column is collected.

    Args:
        wikiname_list: List of site-name strings. Any non-list value
            (for example NaN for a record without site links) yields ``[]``.

    Returns:
        A list of the distinct cultures found, in no particular order.
        Names without ``'wiki'``, non-Wikipedia sites and language codes
        missing from ``lang_culture_map`` are skipped.
    """
    if not isinstance(wikiname_list, list):
        return []

    cultures = set()
    for wikiname in wikiname_list:
        parts = wikiname.split('wiki')
        # A name that does not contain 'wiki' splits into a single part;
        # skip it instead of raising IndexError on parts[1].
        if len(parts) < 2 or parts[1] != '':
            continue
        lang_code = parts[0]
        try:
            # .loc is the label-based replacement for the removed .ix.
            cultures.add(lang_culture_map.loc[lang_code, 'culture'])
        except KeyError:
            # Language code has no entry in the map.
            continue
    return list(cultures)


# Notebook display: number of records loaded.
len(allrecs)

# For every record, derive the list of cultures from its 'site_links' value.
# agg_culture returns [] when that value is not a list.
allrecs['cultures'] = allrecs['site_links'].apply(agg_culture)

def dofd():
    """Create a fresh ``defaultdict(int)``, i.e. a counter starting at 0.

    Serves as the default factory of ``culture_gender_dict`` so that each
    new outer key gets its own inner counting dict.
    """
    counts = defaultdict(int)
    return counts

# Nested counter: culture -> gender -> number of records.
culture_gender_dict = defaultdict(dofd)

for _, record in allrecs.iterrows():
    # Columns are addressed by position. .iloc makes that explicit instead
    # of relying on the integer-key fallback of a label-indexed Series.
    # NOTE(review): positions 3 and 8 are presumably the 'gender' and
    # 'cultures' columns -- confirm against the CSV header.
    gender_list = record.iloc[3]
    # Only the first listed gender is counted; records without a gender
    # list (or with an empty one) are counted under None.
    if isinstance(gender_list, list) and gender_list:
        gender = gender_list[0]
    else:
        gender = None
    cultures = record.iloc[8]
    for culture in cultures:
        culture_gender_dict[culture][gender] += 1

# One row per culture, one column per gender key.
lang_cultures = pd.DataFrame.from_dict(culture_gender_dict, orient='index')

# Persist the counts, then reload them from disk.
# NOTE(review): the 'null' column used further down presumably comes from
# the None gender key being serialised by this JSON round trip -- confirm.
lang_cultures.to_json('helpers/lang_cultures.json')

# Use a context manager so the file handle is closed deterministically.
with open('helpers/lang_cultures.json', 'r') as json_file:
    lang_cultures = pd.DataFrame.from_dict(json.load(json_file))

# Exclude this row from the analysis (raises KeyError if it is absent).
lang_cultures = lang_cultures.drop('Northern Sotho Wikipedia', axis=0)

import pywikibot
# Transforming QIDs into English labels.
# Site object for English Wikipedia; its data repository is the item source
# that english_label passes to pywikibot.ItemPage.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

# Cache of labels already resolved, keyed by QID; filled by english_label.
retrieved = dict()

def english_label(qid):
    """Resolve a QID to its English label, caching results in ``retrieved``.

    Args:
        qid: Item identifier to look up. Falsy values and float NaN are
            accepted and are not looked up.

    Returns:
        * ``qid`` unchanged when it is falsy (``None``, ``''``, ``0``);
        * ``None`` when ``qid`` is a float NaN;
        * the cached value when ``qid`` is already in ``retrieved``;
        * the item's ``data['labels']['en']`` value when the lookup works;
        * ``qid`` itself when the item has no such label (this fallback is
          cached) or when the lookup fails for any other reason (not
          cached, so a later call tries again).
    """
    # Imported locally: the file has no explicit top-level `import math`.
    import math

    if not qid:
        return qid
    if isinstance(qid, float) and math.isnan(qid):
        return None

    # First see if we've done it.
    if qid in retrieved:
        return retrieved[qid]

    try:
        page = pywikibot.ItemPage(wikidata, qid)
        label = page.get()['labels']['en']
    except KeyError:
        # No English label available: fall back to the QID and remember it.
        label = qid
    except Exception:
        # Best effort: any other lookup failure returns the QID uncached.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return qid

    retrieved[qid] = label
    return label

# Relabel the gender columns from QIDs to English labels. A concrete list is
# assigned because a lazy map object (Python 3) is not a reliable value for
# DataFrame.columns across pandas versions.
lang_cultures.columns = [english_label(column) for column in lang_cultures.columns]

# Row-wise sum over all gender columns, including 'null'.
lang_cultures['human_total'] = lang_cultures.sum(axis=1)
# Records that have a gender value at all.
lang_cultures['gendered_total'] = lang_cultures['human_total'] - lang_cultures['null']
# Gendered records that are neither 'female' nor 'male'.
lang_cultures['nonbin_total'] = lang_cultures['gendered_total'] - lang_cultures['female'] - lang_cultures['male']
# Shares relative to the gendered records only.
lang_cultures['fem_per'] = lang_cultures['female'] / lang_cultures['gendered_total']
lang_cultures['nonbin_per'] = lang_cultures['nonbin_total'] / lang_cultures['gendered_total']

# Replace missing counts and undefined ratios (NaN) with 0.
lang_cultures.fillna(0,inplace=True)

# Notebook display of the finished table.
lang_cultures

# Scatter plot of female share against number of gendered biographies,
# one labelled point per row of lang_cultures, with a log-scaled x axis.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

totals = lang_cultures['gendered_total']
fem_share = lang_cultures['fem_per']

lang_cultures[['gendered_total', 'fem_per']].plot(
    kind='scatter',
    x='gendered_total',
    y='fem_per',
    logx=True,
    ax=ax,
    c='#74bc3a')


def _as_percent(value, _position):
    """Format a tick value as a whole-number percentage."""
    return '{:.0%}'.format(value)


def _with_thousands(value, _position):
    """Format a tick value as an integer with thousands separators."""
    return '{:,.0f}'.format(value)


ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(_as_percent))
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(_with_thousands))

# Pad the axis limits so the point labels are not clipped at the edges.
ax.set_xlim(min(totals) * 0.6, max(totals) * 3)
ax.set_ylim(min(fem_share) * 0.85, max(fem_share) * 1.15)

# Label every point with its index value, nudged up and to the right.
for label, x, y in zip(lang_cultures.index, totals, fem_share):
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='left',
        va='bottom')

plt.annotate("", xy=(10000, 0.5), xytext=(0, 0))
plt.title('Female % vs. Total Biographies \nLanguage Aggregated By Culture', fontsize=24)
plt.xlabel('Number of Biographies Recorded')
plt.ylabel('Composition of Biographies Which Are Female')