# Number of language wikis using each sex value, keyed by the value's English label.
dict((english_label(sex_qid), n_langs) for sex_qid, n_langs in used_sexes_count.iteritems())

# Render the stacked per-language composition chart.
show_by_lang_plot()

# Ten wikis with the smallest 'change%' (most negative first).
diffdf[['female_may2013', 'female_march2014', 'change%']].sort(columns='change%', ascending=True).head(10)

# Ten wikis with the largest 'change%'.
diffdf[['female_may2013', 'female_march2014', 'change%']].sort(columns='change%', ascending=False).head(10)

# Top wiki per non-male/female value, highest share first.
top_non_MF.sort(columns='non_MF%', ascending=False)

# Sex values ordered by properties per item, ascending.
sex_props_df.sort(columns='props_per_item')

import json
from collections import defaultdict
import pandas as pd
import pywikibot
import decimal
# Quantize targets for Decimal.quantize(): exponent 0 rounds to a whole
# number, exponent -2 rounds to two decimal places.
NOPLACES = decimal.Decimal('1')
TWOPLACES = decimal.Decimal('0.01')
%pylab inline

# Ten wikis with the highest combined non-male/female share, restricted to
# rows whose sexdf 'total' exceeds 1000.
(norm_sex[sexdf['total'] > 1000]
 .sort(columns='non_MF', ascending=False)
 .head(10))

# Load the flat {"<lang>--<sex>": count} mapping. The context manager closes
# the file handle, which a bare open() left dangling.
with open('lang_sex.json', 'r') as jsonfile:
    bigdict = json.load(jsonfile)

# Regroup into lang_sex[lang][sex] -> count by splitting each composite key.
# Every key must contain exactly one '--', otherwise the unpacking raises
# ValueError. (.items() is used so the cell runs on both Python 2 and 3.)
lang_sex = defaultdict(dict)
for keystring, count in bigdict.items():
    lang, sex = keystring.split('--')
    lang_sex[lang][sex] = count

# used_sexes[sex] -> list of languages that have an entry for that sex value.
used_sexes = defaultdict(list)
for lang, sex_dict in lang_sex.items():
    for sex in sex_dict:
        used_sexes[sex].append(lang)

# Number of languages in which each sex value occurs.
used_sexes_count = {sex: len(lang_list) for sex, lang_list in used_sexes.items()}

# One row per language, one column per sex value; language/sex pairs that
# never occur are counted as 0.
sexdf = pd.DataFrame.from_dict(lang_sex, orient='index').fillna(value=0)
# norm_sex is not "normal" sex: it is the sex data normalised row by row, so
# every language's values become fractions of that language's own total.
norm_sex = sexdf.apply(lambda row: row / float(row.sum()), axis=1)

# Transforming QIDs into English labels.
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()

def english_label(qid):
    """Return the English label of the Wikidata item identified by ``qid``.

    Builds a ``pywikibot.ItemPage`` on the module-level ``wikidata``
    repository, fetches it with ``get()`` and looks up
    ``['labels']['en']`` in the result. A missing ``'labels'`` or ``'en'``
    key propagates as ``KeyError``.
    """
    item = pywikibot.ItemPage(wikidata, qid)
    return item.get()['labels']['en']

# Replace norm_sex's column names (sex QIDs) with their English labels.
sex_qs = list(map(str, norm_sex.columns))
sex_labels = list(map(english_label, sex_qs))
norm_sex.columns = sex_labels

#norm_sex.index = [label.replace('wiki','') for label in norm_sex.index]
# Filtering norm_sex by a total taken from sexdf only works while norm_sex
# still has exactly the rows it had when it was derived from sexdf.
sexdf['total'] = sexdf.sum(axis=1)
# NOTE(review): the cut-off here is 10000, while the variable name and the
# plot title both say 1,000 -- confirm which one is intended.
fs1000 = norm_sex[sexdf['total'] > 10000].sort(columns='female')

def show_by_lang_plot():
    """Draw the stacked-bar chart of sex/gender shares per language wiki.

    Plots the module-level ``fs1000`` frame (one bar per row), labels the
    y axis in whole percentages, and prefixes every x tick label with that
    wiki's "female" share from ``norm_sex``, rounded to two decimals.
    Relies on ``plt`` and ``linspace`` being in scope via ``%pylab``.
    Returns nothing; the figure is the only output.

    NOTE(review): the title says "over 1,000" but ``fs1000`` is filtered
    with ``total > 10000`` where it is built -- confirm which is intended.
    """
    # Title typos fixed ("Comoposition" -> "Composition",
    # "Prorerty" -> "Property"); the rest of the text is unchanged.
    fs1000.plot(kind='bar', stacked=True, legend=True, figsize=(13,8), alpha=0.9, ylim=(0,1),
                title='''Composition of Wikidata Property:P21 "Sex or Gender" by Language 
    (Languages with over 1,000 associated P21)''',
                colormap='Set1')

    # y axis: a tick every 10%, labelled "0%" ... "100%". The labels are
    # derived from the tick positions themselves so the two cannot drift.
    yticks = linspace(0, 1, num=11)
    ylabels = [str(decimal.Decimal(tick * 100).quantize(NOPLACES)) + '%' for tick in yticks]
    plt.yticks(yticks, ylabels)

    # x axis: keep the tick positions, but prepend each wiki's female share.
    ticklocs, langs = plt.xticks()
    langstrs = []
    for lang in langs:
        wiki = lang.get_text()
        female_pct = decimal.Decimal(norm_sex.loc[wiki]['female'] * 100).quantize(TWOPLACES)
        langstrs.append(str(female_pct) + '%  ' + wiki)
    plt.xticks(ticklocs, langstrs)
    plt.xlabel('Language-Wiki percentage "female"')

# Show the filtered, female-sorted frame that feeds the plot.
fs1000

# Baseline figures read from may2013.csv; the first CSV column is the index.
# 'perc' is rescaled from a 0-100 value to a 0-1 fraction and stored as
# 'female' (presumably the female percentage -- verify against the CSV).
maydf = pd.read_csv('may2013.csv', index_col=0)
maydf['female'] = maydf['perc'] / 100.0

# Inner join on the index: only wikis present in both frames survive. Columns
# that exist on both sides get the _may2013 / _march2014 suffixes.
diffdf = maydf.join(norm_sex, how='inner', lsuffix='_may2013', rsuffix='_march2014')

# Relative change of the female share between the two snapshots ...
diffdf['change%'] = (
    (diffdf['female_march2014'] - diffdf['female_may2013'])
    / diffdf['female_may2013']
)
# ... expressed as a percentage quantized to two decimal places.
diffdf['change%'] = diffdf['change%'].apply(
    lambda frac: decimal.Decimal(frac * 100).quantize(TWOPLACES)
)

# Every sex value other than male/female. The 'non_MF' aggregate is excluded
# too, so re-running this cell (when the column already exists) does not add
# the aggregate into itself or list it as a category of its own.
non_MF_cols = [col for col in norm_sex.columns if col not in ('male', 'female', 'non_MF')]
norm_sex['non_MF'] = norm_sex[non_MF_cols].sum(axis=1)

# The row filter does not depend on the loop variable, so apply it once:
# only wikis whose sexdf 'total' exceeds 1000 are considered.
big_wikis = norm_sex[sexdf['total'] > 1000]

# For each non-male/female value, record the wiki with the highest share and
# that share as a percentage. idxmax()/max() replace a full sort followed by
# head(1) and a positional t[0] lookup on a label-indexed Series; on an exact
# tie idxmax() reports the first such wiki.
top_non_MF_dict = dict()
for s in non_MF_cols:
    shares = big_wikis[s]
    top_non_MF_dict[s] = {'wiki': shares.idxmax(), 'non_MF%': shares.max() * 100}
top_non_MF = pd.DataFrame.from_dict(data=top_non_MF_dict, orient='index')

# Load the flat {"<sex>_<prop>": count} mapping. The context manager closes
# the file handle, which a bare open() left dangling.
with open('sex_propcount.json', 'r') as jsonfile:
    sex_props_json = json.load(jsonfile)

# Regroup into sex_props[sex][prop] -> count. Every key must contain exactly
# one '_', otherwise the unpacking raises ValueError.
# (.items() is used so the cell runs on both Python 2 and 3.)
sex_props = defaultdict(dict)
for keystring, count in sex_props_json.items():
    sex, prop = keystring.split('_')
    sex_props[sex][prop] = count

# One row per sex value, one column per prop key.
sex_props_df = pd.DataFrame.from_dict(sex_props, orient='index')

# Resolve the row index (sex QIDs) to English labels.
sex_qs = [str(q) for q in sex_props_df.index]
sex_labels = [english_label(sex_q) for sex_q in sex_qs]

# NOTE(review): the names are assigned by position, so this assumes the frame
# has exactly two columns arriving in (item count, total props) order --
# confirm against the keys in the JSON.
sex_props_df.columns = ['item_count', 'total_props']

sex_props_df.index = sex_labels

# Cast the numerator to float before dividing: with two integer columns,
# Python 2's classic '/' truncates the ratio to a whole number.
sex_props_df['props_per_item'] = sex_props_df['total_props'].astype(float) / sex_props_df['item_count']