"""Gender breakdown of Wikidata biographies by world culture.

Loads pre-aggregated records from ``helpers/world_cultures_shortcut.json``,
resolves gender QIDs to English labels through Wikidata, computes the share
of female and non-male/non-female biographies per culture, and draws several
bar charts of the result.
"""
import json
import math
from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy
import pandas as pd
import pywikibot
from matplotlib.pylab import style

# Equivalent of the notebook magic ``%pylab inline`` written as valid Python.
# It is kept so that any other cells relying on the pylab namespace still
# work; the code below uses explicit ``plt`` / ``fig`` calls and does not
# depend on it.
try:
    get_ipython().run_line_magic('pylab', 'inline')
except NameError:
    # Not running under IPython/Jupyter: figures are created but not
    # displayed inline.
    pass

# NOTE(review): not referenced anywhere in the visible code; presumably a
# sentinel used by the upstream Java export -- confirm before removing.
java_min_int = -2147483648

# Transforming QIDs into English labels.
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()

# Cache of qid -> label so each Wikidata item is fetched at most once.
retrieved = dict()


def english_label(qid):
    """Return the English Wikidata label for *qid*.

    Args:
        qid: A Wikidata item id, or a float NaN marking a missing value.

    Returns:
        ``None`` when *qid* is NaN; the English label when the item has one;
        otherwise *qid* itself (the fallback is cached as well, so the item
        is not fetched again).
    """
    # isinstance (rather than an exact type check) also catches numpy.float64
    # NaNs, which is how pandas usually represents missing values.
    if isinstance(qid, float) and math.isnan(qid):
        return None

    # First see if we've done it.
    try:
        return retrieved[qid]
    except KeyError:
        pass

    try:
        page = pywikibot.ItemPage(wikidata, qid)
        data = page.get()
        label = data['labels']['en']
    except KeyError:
        # No English label available: fall back to the raw id.
        label = qid
    retrieved[qid] = label
    return label


def _plot_share_with_totals(plotdf, share_column, share_format, title):
    """Bar-plot one share column with the 'total' column on a secondary axis.

    Args:
        plotdf: DataFrame indexed by culture, holding a 'total' column and
            *share_column*.
        share_column: Name of the fraction column drawn on the left axis.
        share_format: ``str.format`` template for the left-axis tick labels.
        title: Figure title.

    Returns:
        The ``(fig, ax)`` pair of the new figure.
    """
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    plotdf[['total', share_column]].plot(
        kind='bar', secondary_y=['total'], ax=ax, grid=False)
    ax.right_ax.set_ylabel('total biographies')
    ax.set_ylabel(share_column)
    ax.legend(loc=2)
    ax.right_ax.legend(loc=1)
    ax.set_xlabel('culture')
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
        lambda value, _pos: share_format.format(value)))
    ax.right_ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
        lambda value, _pos: '{:,}'.format(int(value))))
    fig.suptitle(title, fontsize=24)
    fig.subplots_adjust(top=0.88)
    return fig, ax


# --- Load and label the records ---------------------------------------------

with open('helpers/world_cultures_shortcut.json', 'r') as infile:
    allrecs = pd.DataFrame.from_dict(json.load(infile))

# Number of records per culture (shown as cell output in a notebook).
pd.crosstab(index=allrecs['culture'], columns='count')

# Copy the filtered frame so that adding a column does not write into a view
# of ``allrecs``.
genrecs = allrecs[allrecs['gender'].apply(lambda x: x is not None)].copy()
genrecs['en_gender'] = genrecs['gender'].apply(english_label)

cultures = genrecs[['culture', 'en_gender']].groupby(by='culture')

# Counts of each gender label per culture.
cultpiv = pd.crosstab(genrecs['culture'], genrecs['en_gender'])

# --- Per-culture percentages ------------------------------------------------

# NOTE(review): 'nonbinary %' is everything that is neither 'female' nor
# 'male', which includes rows whose label resolved to None -- confirm that
# is intended.
perc_dict = defaultdict(dict)
for group, df in cultures:
    total = float(len(df))
    fem = len(df[df['en_gender'] == 'female'])
    mal = len(df[df['en_gender'] == 'male'])
    perc_dict[group]['total'] = total
    perc_dict[group]['female %'] = fem / total
    perc_dict[group]['nonbinary %'] = (total - (fem + mal)) / total

# DataFrame.sort() no longer exists in pandas; sort_values is its replacement.
cultplotdf = pd.DataFrame.from_dict(
    perc_dict, orient='index').sort_values('female %')

# --- Plots ------------------------------------------------------------------

style.use('fivethirtyeight')

fig, ax = _plot_share_with_totals(
    cultplotdf, 'female %', '{:.0%}',
    'Female Percentage of Biographies by Culture')

fig, ax = _plot_share_with_totals(
    cultplotdf, 'nonbinary %', '{:.4%}',
    'Nonbinary Percentage of Biographies by Culture')

# Three stacked panels, one per measure, sharing the culture axis.
fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(10, 10))
# These must match the column names created in perc_dict above.
measures = ['total', 'female %', 'nonbinary %']
for meas, ax in zip(measures, axes):
    cultplotdf[meas].plot(kind='bar', ax=ax, legend=False)
    ax.set_ylabel(meas)
# Do not assign the result to ``fig.legend``: that would overwrite the
# Figure.legend method.
plt.legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0)
# The previous title described an unrelated birth/death-dates figure.
fig.suptitle('Total Biographies, Female % and Nonbinary % \n by Culture',
             fontsize=24)
fig.tight_layout()
fig.subplots_adjust(hspace=0.1, top=0.82)

# Totals and female share side by side, each on its own y-axis.
fig = plt.figure()
ax = fig.add_subplot(111)
ax2 = ax.twinx()
width = 0.4
cultplotdf['total'].plot(
    kind='bar', ax=ax, width=width, position=1, color='C0')
# The share goes on the twin axis; on the totals axis it would be invisible.
cultplotdf['female %'].plot(
    kind='bar', ax=ax2, width=width, position=0, color='C1')
ax.set_ylabel('Total Biographies')
ax2.set_ylabel('Female Percentage')