import pandas as pd
import numpy
import json
from collections import defaultdict
%pylab inline
# Smallest 32-bit signed integer (the value of Java's Integer.MIN_VALUE).
# NOTE(review): none of the visible cells read this constant -- presumably a
# sentinel for missing values coming from an upstream Java export; confirm.
java_min_int = -(2 ** 31)
Populating the interactive namespace from numpy and matplotlib
import math
import pywikibot
# Transforming QIDs into English labels.
# Site handle for English Wikipedia and the data repository attached to it;
# the repository is what item pages are looked up against.
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()
# Cache of values already resolved for a QID, keyed by the QID itself.
retrieved = {}
def english_label(qid):
    """Return the English label of the Wikidata item *qid*.

    Results are memoised in the module-level ``retrieved`` dict so each item
    is fetched at most once.

    Args:
        qid: A Wikidata item identifier, or a float NaN standing in for a
            missing value.

    Returns:
        ``None`` when *qid* is a NaN float; the item's ``'en'`` label when the
        item has one; otherwise *qid* itself (that fallback is cached too).

    Raises:
        Whatever ``pywikibot`` raises other than ``KeyError``; only a missing
        ``'labels'``/``'en'`` key is treated as "no English label".
    """
    # isinstance (not ``type(...) is float``) so float subclasses such as
    # numpy.float64 NaNs are recognised as missing values as well.
    if isinstance(qid, float) and math.isnan(qid):
        return None
    # First see if we've already resolved this one.
    if qid in retrieved:
        return retrieved[qid]
    try:
        label = pywikibot.ItemPage(wikidata, qid).get()['labels']['en']
    except KeyError:
        # No English label available: fall back to the raw identifier.
        label = qid
    retrieved[qid] = label
    return label
VERBOSE:pywiki:Starting 1 threads...
# Load the pre-extracted records. The context manager closes the file handle
# that a bare open() call would leave dangling.
with open('helpers/world_cultures_shortcut.json', 'r') as records_file:
    allrecs = pd.DataFrame.from_dict(json.load(records_file))
# Tabulate how many records fall under each culture. pd.crosstab is a function
# (it needs both an index and a columns argument), so subscripting it raised
# TypeError; value_counts() yields the one-way frequency table directly.
allrecs['culture'].value_counts()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-167-c9d92ccf2fb2> in <module>() ----> 1 pd.crosstab[allrecs['culture']] TypeError: 'function' object has no attribute '__getitem__'
# Keep only the records whose gender value is not None. The explicit copy
# makes genrecs an independent frame, so adding a column below is a plain
# assignment instead of a write to a slice of allrecs (the cause of pandas'
# SettingWithCopyWarning).
genrecs = allrecs[allrecs['gender'].apply(lambda x: x is not None)].copy()
# Resolve every gender identifier to its English label via english_label.
genrecs['en_gender'] = genrecs['gender'].apply(english_label)
VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one. -c:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Group the labelled records by culture.
cultures = genrecs[['culture', 'en_gender']].groupby(by='culture')
# NOTE: it should probably be done with a pivot table instead, e.g.
#   cultpiv = pd.DataFrame.pivot_table(pd.DataFrame(cultures), index='culture', values='en_gender')

# Per culture: record count, female share, and share that is neither
# 'female' nor 'male'.
perc_dict = defaultdict(dict)
for group, df in cultures:
    total = float(len(df))  # float so the ratios below never use integer division
    fem = int((df['en_gender'] == 'female').sum())
    mal = int((df['en_gender'] == 'male').sum())
    perc_dict[group]['total'] = total
    perc_dict[group]['female %'] = fem / total
    perc_dict[group]['nonbinary %'] = (total - (fem + mal)) / total

# One row per culture, ordered by ascending female share. DataFrame.sort was
# deprecated in pandas 0.17 and later removed; sort_values is its replacement.
cultplotdf = pd.DataFrame.from_dict(perc_dict, orient='index').sort_values('female %')
from matplotlib.pylab import style

style.use('fivethirtyeight')

# Bars for the female share per culture on the left axis, with the total
# number of biographies on a secondary (right-hand) axis.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))
female_cols = cultplotdf[['total', 'female %']]
female_cols.plot(kind='bar', secondary_y=['total'], ax=ax, grid=False)

# Axis titles.
ax.right_ax.set_ylabel('total biographies')
ax.set_ylabel('female %')

# One legend per y-axis: upper left for the share, upper right for the totals.
ax.legend(loc=2)
ax.right_ax.legend(loc=1)
ax.set_xlabel('culture')

# Tick labels: whole percentages on the left, thousands-separated counts on the right.
percent_ticks = matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x))
count_ticks = matplotlib.ticker.FuncFormatter(lambda x, y: '{:,}'.format(int(x)))
ax.yaxis.set_major_formatter(percent_ticks)
ax.right_ax.yaxis.set_major_formatter(count_ticks)

fig.suptitle('Female Percentage of Biographies by Culture', fontsize=24)
fig.subplots_adjust(top=0.88)  # leave headroom for the title
# Bars for the nonbinary share per culture on the left axis, with the total
# number of biographies on a secondary (right-hand) axis.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))
nonbinary_cols = cultplotdf[['total', 'nonbinary %']]
nonbinary_cols.plot(kind='bar', secondary_y=['total'], ax=ax, grid=False)

# Axis titles.
ax.right_ax.set_ylabel('total biographies')
ax.set_ylabel('nonbinary %')

# One legend per y-axis: upper left for the share, upper right for the totals.
ax.legend(loc=2)
ax.right_ax.legend(loc=1)
ax.set_xlabel('culture')

# Tick labels: four-decimal percentages on the left (the shares are tiny),
# thousands-separated counts on the right.
percent_ticks = matplotlib.ticker.FuncFormatter(lambda x, y: '{:.4%}'.format(x))
count_ticks = matplotlib.ticker.FuncFormatter(lambda x, y: '{:,}'.format(int(x)))
ax.yaxis.set_major_formatter(percent_ticks)
ax.right_ax.yaxis.set_major_formatter(count_ticks)

fig.suptitle('Nonbinary Percentage of Biographies by Culture', fontsize=24)
fig.subplots_adjust(top=0.88)  # leave headroom for the title
# Three stacked bar charts sharing the culture x-axis, one per measure.
fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(10, 10))
# These must be columns that cultplotdf actually has. It is built with the
# keys 'total', 'female %' and 'nonbinary %', so the previous 'fem_per' /
# 'nonbin_per' names raised KeyError.
measures = ['total', 'female %', 'nonbinary %']
for meas, ax in zip(measures, axes):
    cultplotdf[meas].plot(kind='bar', ax=ax, legend=False)
# Place the legend on the last subplot (the current axes after plt.subplots).
# The result is deliberately not assigned to fig.legend: doing so replaced
# the Figure.legend method with a Legend object.
axes[-1].legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0)
# NOTE(review): this title looks copied from a different figure (it mentions
# dates of birth and death, not cultures) -- confirm the intended wording.
fig.suptitle('Volumes of Dates of Birth and Death in Wikidata \n by Gender \n Total and Modern Timeframes', fontsize=24)
fig.tight_layout()
fig.subplots_adjust(hspace=0.1, top=0.82)
# Side-by-side bars per culture: totals on the left y-axis, female share on a
# twin right y-axis.
fig = plt.figure()
ax = fig.add_subplot(111)
ax2 = ax.twinx()  # second y-axis sharing the same x-axis
width = 0.4
# position=1 / position=0 shift the two bar sets to either side of each tick.
cultplotdf['total'].plot(kind='bar', ax=ax, width=width, position=1)
# The column is named 'female %' in cultplotdf ('fem_per' raised KeyError).
# It is drawn on ax2, the axis that carries the 'Female Percentage' label,
# so the share is not squashed onto the totals' scale.
cultplotdf['female %'].plot(kind='bar', ax=ax2, width=width, position=0)
ax.set_ylabel('Total Biographies')
ax2.set_ylabel('Female Percentage')
<matplotlib.text.Text at 0x7ff981638210>