{english_label(qid): language_count for qid, language_count in used_sexes_count.iteritems()} show_by_lang_plot() diffdf.sort(columns='change%', ascending=True)[['female_may2013','female_march2014','change%']].head(10) diffdf.sort(columns='change%', ascending=False)[['female_may2013','female_march2014','change%']].head(10) top_non_MF.sort('non_MF%', ascending=False) sex_props_df.sort('props_per_item') import json from collections import defaultdict import pandas as pd import pywikibot import decimal NOPLACES = decimal.Decimal(10) ** 0 TWOPLACES = decimal.Decimal(10) ** -2 %pylab inline norm_sex[sexdf['total']>1000].sort(columns='non_MF', ascending=False).head(10) jsonfile = open('lang_sex.json','r') bigdict = json.load(jsonfile) lang_sex = defaultdict(dict) for keystring, count in bigdict.iteritems(): lang, sex = keystring.split('--') lang_sex[lang][sex] = count used_sexes = defaultdict(list) for lang, sex_dict in lang_sex.iteritems(): for sex in sex_dict.iterkeys(): used_sexes[sex].append(lang) used_sexes_count = {sex: len(lang_list) for sex, lang_list in used_sexes.iteritems()} sexdf = pd.DataFrame.from_dict(lang_sex, orient='index') sexdf = sexdf.fillna(value=0) #sexdf.plot(kind='bar', stacked=True, figsize=(10,10)) #Norm_sex is not "normal" sex, but rather the Sex-data normed into percentages. norm_sex = sexdf.apply(lambda col: col / float(col.sum()), axis=1) #Tranforming QIDs into English labels. enwp = pywikibot.Site('en','wikipedia') wikidata = enwp.data_repository() def english_label(qid): page = pywikibot.ItemPage(wikidata, qid) data = page.get() return data['labels']['en'] sex_qs = [str(q) for q in norm_sex.columns] sex_labels = [english_label(sex_q) for sex_q in sex_qs] norm_sex.columns = sex_labels #norm_sex.index = [label.replace('wiki','') for label in norm_sex.index] #comparing by total between two different dataframes requires #that norm_sex has not had any rows modified since it was created from sexdf sexdf['total'] = sexdf.sum(axis=1) fs1000 = norm_sex[sexdf['total']>10000].sort('female', ascending=True) def show_by_lang_plot(): fsplot = fs1000.plot(kind='bar', stacked=True, legend=True, figsize=(13,8), alpha=0.9, ylim=(0,1), title= '''Comoposition of Wikidata Prorerty:P21 "Sex or Gender" by Language (Languages with over 1,000 associated P21)''', colormap='Set1') plt.yticks(linspace(0, 1, num=11), [str(decimal.Decimal(x * 100).quantize(NOPLACES))+'%' for x in arange(0,1.1,0.1)]) ticklocs, langs = plt.xticks() langstrs = [str(decimal.Decimal(norm_sex.loc[lang.get_text()]['female']* 100).quantize(TWOPLACES))+'% '+ lang.get_text() for lang in langs] plt.xticks(ticklocs, langstrs) plt.xlabel('Language-Wiki percentage "female"') fs1000 maydf = pd.read_table('may2013.csv',sep=',', index_col=0) maydf['female'] = maydf['perc'] / 100.0 diffdf = maydf.join(other=norm_sex,how='inner',lsuffix='_may2013', rsuffix='_march2014') diffdf['change%'] = (diffdf['female_march2014'] - diffdf['female_may2013']) / diffdf['female_may2013'] diffdf['change%'] = diffdf['change%'].apply(lambda x: decimal.Decimal(x * 100).quantize(TWOPLACES) ) non_MF_cols = [col for col in norm_sex.columns if col not in ['male','female']] norm_sex['non_MF'] = norm_sex[non_MF_cols].sum(axis=1) top_non_MF_dict = dict() for s in non_MF_cols: t = norm_sex[sexdf['total']>1000].sort(columns=s, ascending=False)[s].head(1) top_non_MF_dict[s] = {'wiki':t.index[0],'non_MF%':t[0]*100} top_non_MF = pd.DataFrame.from_dict(data=top_non_MF_dict, orient='index') jsonfile = open('sex_propcount.json','r') sex_props_json = json.load(jsonfile) sex_props = defaultdict(dict) for keystring, count in sex_props_json.iteritems(): sex, prop = keystring.split('_') sex_props[sex][prop] = count sex_props_df = pd.DataFrame.from_dict(sex_props, orient='index') sex_qs = [str(q) for q in sex_props_df.index] sex_labels = [english_label(sex_q) for sex_q in sex_qs] sex_props_df.columns = ['item_count', 'total_props'] sex_props_df.index = sex_labels sex_props_df['props_per_item'] = sex_props_df['total_props'] / sex_props_df['item_count']