"""Plot the composition of Wikidata P21 ("sex or gender") values per language.

Reads ``lang_sex.json``, a flat mapping whose keys look like
``"<lang>--<sex QID>"`` and whose values are counts, pivots it into a
language x sex table, normalises each row to proportions, replaces the QID
column names with their English Wikidata labels and draws a stacked bar
chart for the languages that have more than ``MIN_ITEMS`` counted items.

The bare expressions (``sex_df``, ``norm_sex``) are presumably there for
notebook display; they have no effect when run as a plain script.
"""
import decimal
import json
from collections import defaultdict

import pandas as pd
import pywikibot

NOPLACES = decimal.Decimal(10) ** 0
TWOPLACES = decimal.Decimal(10) ** -2

# A language is plotted only if its total count is strictly above this.
# The chart title is built from the same value so the two cannot drift apart.
MIN_ITEMS = 10000

# Plain-Python spelling of the IPython magic ``%pylab inline``; requires an
# IPython/Jupyter session (``get_ipython`` is not defined elsewhere).
get_ipython().run_line_magic('pylab', 'inline')

# Context manager so the file handle is closed even if parsing fails.
with open('lang_sex.json', 'r') as jsonfile:
    bigdict = json.load(jsonfile)

# Pivot {"lang--sex": count} into {lang: {sex: count}}.
lang_sex = defaultdict(dict)
for keystring, count in bigdict.items():  # .items() works on Python 2 and 3
    lang, sex = keystring.split('--')
    lang_sex[lang][sex] = count

# One row per language, one column per sex value; absent combinations -> 0.
sex_df = pd.DataFrame.from_dict(lang_sex, orient='index')
sex_df = sex_df.fillna(value=0.0)
sex_df

# The name norm_sex is a joke on heteronormativity.
# Divide every row by its own total so each row sums to 1.
norm_sex = sex_df.div(sex_df.sum(axis=1), axis=0)
norm_sex

# Transforming QIDs into English labels.
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()


def english_label(qid):
    """Return the English label of the Wikidata item identified by *qid*.

    Raises ``KeyError`` if the item has no English label.
    """
    page = pywikibot.ItemPage(wikidata, qid)
    data = page.get()
    return data['labels']['en']


sex_qs = [str(q) for q in norm_sex.columns]
sex_labels = [english_label(sex_q) for sex_q in sex_qs]
norm_sex.columns = sex_labels
norm_sex

# Row totals are added only after normalising, so 'total' never becomes a
# column of norm_sex.
sex_df['total'] = sex_df.sum(axis=1)

# DataFrame.sort() was removed from pandas; sort_values() is its replacement
# (available since pandas 0.17).
female_sorted_10000_items = norm_sex[sex_df['total'] > MIN_ITEMS].sort_values(
    by='female', ascending=True)

female_sorted_10000_items.plot(
    kind='bar',
    stacked=True,
    legend=True,
    figsize=(13, 8),
    ylim=(0, 1),
    title=('Composition of Wikidata Property:P21 "Sex or Gender" by Language '
           '(Languages with over {:,} associated P21)').format(MIN_ITEMS))