import pandas as pd
import numpy
from collections import defaultdict
from matplotlib.pylab import style
import json
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn in this notebook.
style.use('fivethirtyeight')
# IPython magic: populates the interactive namespace from numpy and matplotlib.
%pylab inline
# Sentinel handed to read_csv(na_values=...) when the snapshot is loaded, so cells
# holding this value are read as NaN. NOTE(review): the name suggests this is Java's
# Integer.MIN_VALUE written by the exporter -- confirm against the export code.
java_min_int = -2147483648
Populating the interactive namespace from numpy and matplotlib
# Load the snapshot; any cell equal to java_min_int is treated as missing (NaN).
allrecs = pd.read_csv(
    'snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',
    na_values=[java_min_int]
)
def split_column(q_str):
    """Split a '|'-terminated cell such as ``'Q1|Q2|'`` into ``['Q1', 'Q2']``.

    Parameters:
        q_str: the raw cell value. Strings are split on '|'; a float NaN
            (pandas' missing-value marker) is handed back unchanged.

    Returns:
        A list of the '|'-separated pieces (without the trailing empty piece)
        for a string, the NaN itself for a NaN float, and None for anything
        else (including floats that are not NaN).
    """
    if isinstance(q_str, float):
        # isinstance rather than ``type(...) is float`` so that numpy.float64
        # NaNs are passed through too instead of silently becoming None.
        return q_str if numpy.isnan(q_str) else None
    if isinstance(q_str, str):
        # The format always ends with a '|', so the last piece is empty: drop it.
        return q_str.split('|')[:-1]
    return None
# Replace each '|'-delimited cell of these two columns with a Python list,
# then preview the first five rows.
for col in ('gender', 'site_links'):
    as_lists = allrecs[col].apply(split_column)
    allrecs[col] = as_lists
allrecs.head(5)
qid | dob | dod | gender | ethnic_group | citizenship | place_of_birth | site_links | |
---|---|---|---|---|---|---|---|---|
0 | Q23 | 1732 | 1799 | [Q6581097] | NaN | Q30| | Q494413| | [zhwiki, kywiki, euwiki, plwiki, bswiki, angwi... |
1 | Q42 | 1952 | 2001 | [Q6581097] | NaN | Q145| | Q350| | [zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik... |
2 | Q207 | 1946 | NaN | [Q6581097] | NaN | Q30| | Q49145| | [uzwiki, eswiki, kowikiquote, huwiki, liwikiqu... |
3 | Q297 | NaN | 1660 | [Q6581097] | NaN | Q29| | Q8717| | [zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik... |
4 | Q326 | 1942 | NaN | [Q6581097] | NaN | Q298|Q39| | Q2887| | [zhwiki, plwiki, euwiki, kowiki, frwiki, eswik... |
# Lookup table indexed by its first column; agg_culture reads the 'culture'
# column of the row whose index label is a Wikipedia language code.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv(..., index_col=0) is the documented replacement. (from_csv also
# tried to parse the index as dates, which this lookup does not rely on.)
lang_culture_map = pd.read_csv('helpers/aggregation_maps/lang_culture.csv', index_col=0)
def agg_culture(wikiname_list):
    """Return the distinct cultures of the Wikipedias named in ``wikiname_list``.

    Parameters:
        wikiname_list: a list of site names such as ``'enwiki'`` or
            ``'kowikiquote'``. Any non-list value (e.g. NaN for a record
            without site links) yields an empty result.

    Returns:
        A list (in no particular order) of the ``'culture'`` values that
        ``lang_culture_map`` holds for the language codes of the Wikipedias in
        the input. Sister projects, names without ``'wiki'`` and language
        codes missing from the map are skipped.
    """
    if not isinstance(wikiname_list, list):
        return []

    cultures = set()
    for wikiname in wikiname_list:
        parts = wikiname.split('wiki')
        # A Wikipedia is '<lang>wiki' with nothing after 'wiki'; sister
        # projects look like '<lang>wikiquote'. A name without 'wiki' splits
        # into a single piece and is skipped rather than raising IndexError.
        if len(parts) < 2 or parts[1] != '':
            continue
        lang_code = parts[0]
        try:
            # .at is the scalar accessor. The former .ix[lang_code]['culture']
            # built a whole row Series per lookup (very slow over millions of
            # records) and .ix no longer exists in pandas >= 1.0.
            cultures.add(lang_culture_map.at[lang_code, 'culture'])
        except KeyError:
            # Language code not present in the map.
            continue
    return list(cultures)
# Number of records in allrecs.
len(allrecs)
2561999
# Add a 'cultures' column by running agg_culture over each record's site_links value.
# The traceback recorded after this cell shows the run being stopped with a KeyboardInterrupt.
allrecs['cultures'] = allrecs['site_links'].apply(agg_culture)
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-6-e25fe682b4b1> in <module>() ----> 1 allrecs['cultures'] = allrecs['site_links'].apply(agg_culture) /usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds) 2056 values = lib.map_infer(values, lib.Timestamp) 2057 -> 2058 mapped = lib.map_infer(values, f, convert=convert_dtype) 2059 if len(mapped) and isinstance(mapped[0], Series): 2060 from pandas.core.frame import DataFrame /usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:57158)() <ipython-input-4-ad100d1b5480> in agg_culture(wikiname_list) 9 lang_code = parts[0] 10 try: ---> 11 culture = lang_culture_map.ix[lang_code]['culture'] 12 cultures.add(culture) 13 except KeyError: /usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in __getitem__(self, key) 70 return self._getitem_tuple(key) 71 else: ---> 72 return self._getitem_axis(key, axis=0) 73 74 def _get_label(self, label, axis=0): /usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _getitem_axis(self, key, axis) 926 return self._get_loc(key, axis=axis) 927 --> 928 return self._get_label(key, axis=axis) 929 930 def _getitem_iterable(self, key, axis=0): /usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _get_label(self, label, axis) 86 raise IndexingError('no slices here, handle elsewhere') 87 ---> 88 return self.obj._xs(label, axis=axis) 89 90 def _get_loc(self, key, axis=0): /usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in xs(self, key, axis, level, copy, drop_level) 1469 1470 result = Series(new_values, index=self.columns, -> 1471 name=self.index[loc]) 1472 1473 else: /usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __init__(self, data, index, dtype, name, copy, fastpath) 216 217 object.__setattr__(self, 'name', name) --> 218 self._set_axis(0, index, 
fastpath=True) 219 220 @classmethod /usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in _set_axis(self, axis, labels, fastpath) 259 self._set_subtyp(is_all_dates) 260 --> 261 object.__setattr__(self, '_index', labels) 262 if not fastpath: 263 self._data.set_axis(axis, labels) KeyboardInterrupt:
def dofd():
    """Build a new, empty ``defaultdict(int)``: missing keys read as 0."""
    counts = defaultdict(int)
    return counts
# Tally biographies per culture and gender: culture_gender_dict[culture][gender] -> count.
culture_gender_dict = defaultdict(dofd)
# Walk the two needed columns by name. The original went through iterrows()
# and positional look-ups (colbit[3], colbit[8]), which silently depend on the
# column order and build a Series for every row.
for gender_list, cultures in zip(allrecs['gender'], allrecs['cultures']):
    # Only the first listed gender is counted; a record with no gender list
    # (or an empty one, which used to raise IndexError) is tallied under None.
    if isinstance(gender_list, list) and gender_list:
        gender = gender_list[0]
    else:
        gender = None
    # A record counts once for every culture it is linked to.
    for culture in cultures:
        culture_gender_dict[culture][gender] += 1
# One row per culture, one column per gender key; written to disk as JSON.
lang_cultures = pd.DataFrame.from_dict(culture_gender_dict, orient='index')
lang_cultures.to_json('helpers/lang_cultures.json')
# Reload the table from the saved file. The context manager closes the handle,
# which the previous bare open() call left open.
with open('helpers/lang_cultures.json', 'r') as json_file:
    lang_cultures = pd.DataFrame.from_dict(json.load(json_file))
# Remove the row labelled 'Northern Sotho Wikipedia' from the table.
lang_cultures = lang_cultures.drop('Northern Sotho Wikipedia', axis=0)
import pywikibot
# Transforming QIDs into English labels.
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()
# Cache of labels already looked up, keyed by QID (filled by english_label).
retrieved = {}
def english_label(qid):
    """Return the English Wikidata label for ``qid``, caching results in ``retrieved``.

    Parameters:
        qid: a Wikidata item id such as ``'Q6581097'``. Falsy values are
            returned unchanged and a float NaN yields None.

    Returns:
        The English label when the item has one. If the item has no English
        label, ``qid`` itself is returned and cached. If the look-up fails for
        any other reason, ``qid`` is returned without being cached.
    """
    # Local import: math is not among the imports visible at the top of the
    # file, so the NaN check used to depend on it being in scope by accident.
    import math

    if not qid:
        return qid
    if isinstance(qid, float) and math.isnan(qid):
        return None

    # First see if we've done it.
    try:
        return retrieved[qid]
    except KeyError:
        pass

    try:
        lab = pywikibot.ItemPage(wikidata, qid).get()['labels']['en']
    except KeyError:
        # No English label on the item: fall back to the QID and remember that.
        retrieved[qid] = qid
        return qid
    except Exception:
        # Best effort: any other failure leaves the value as it was, uncached.
        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still propagate.
        return qid
    retrieved[qid] = lab
    return lab
# Swap each column header for the value english_label returns for it.
lang_cultures.columns = [english_label(column) for column in lang_cultures.columns]
VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.
# Zero out missing counts BEFORE deriving anything. Previously NaN was only
# filled at the end, so a culture lacking a 'null', 'female' or 'male' count
# got NaN derived totals that were then turned into a misleading 0.
lang_cultures.fillna(0, inplace=True)
# All biographies linked to the culture, whatever the gender key (including 'null').
lang_cultures['human_total'] = lang_cultures.sum(axis=1)
# Biographies with a recorded gender.
lang_cultures['gendered_total'] = lang_cultures['human_total'] - lang_cultures['null']
# Biographies whose gender is neither 'female' nor 'male'.
lang_cultures['nonbin_total'] = lang_cultures['gendered_total'] - lang_cultures['female'] - lang_cultures['male']
# Shares relative to the biographies with a recorded gender.
lang_cultures['fem_per'] = lang_cultures['female'] / lang_cultures['gendered_total']
lang_cultures['nonbin_per'] = lang_cultures['nonbin_total'] / lang_cultures['gendered_total']
# A culture with no gendered biographies gives 0/0 = NaN above; report it as 0.
lang_cultures.fillna(0, inplace=True)
lang_cultures
transgender female | intersex | fa'afafine | transgender male | female animal | male animal | woman | genderqueer | female | male | kathoey | null | human_total | gendered_total | nonbin_total | fem_per | nonbin_per | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Africa | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1888 | 12824 | 0 | 306 | 15018 | 14712 | 0 | 0.128331 | 0.000000 |
Catholic european | 39 | 3 | 1 | 7 | 0 | 6 | 0 | 3 | 122048 | 712655 | 0 | 33480 | 868242 | 834762 | 59 | 0.146207 | 0.000071 |
Confucian | 24 | 2 | 1 | 1 | 0 | 0 | 0 | 2 | 47355 | 133508 | 1 | 163710 | 344604 | 180894 | 31 | 0.261783 | 0.000171 |
Constructed | 6 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 4321 | 27085 | 1 | 2252 | 33667 | 31415 | 9 | 0.137546 | 0.000286 |
English-speaking | 99 | 17 | 1 | 19 | 0 | 0 | 0 | 7 | 186455 | 1016500 | 1 | 3259 | 1206358 | 1203099 | 144 | 0.154979 | 0.000120 |
Islamic | 29 | 1 | 0 | 2 | 0 | 0 | 0 | 1 | 20931 | 100099 | 1 | 19529 | 140593 | 121064 | 34 | 0.172892 | 0.000281 |
Latin America | 25 | 3 | 1 | 4 | 0 | 0 | 0 | 1 | 38195 | 202735 | 0 | 1979 | 242943 | 240964 | 34 | 0.158509 | 0.000141 |
Orthodox | 23 | 1 | 0 | 1 | 1 | 2 | 1 | 1 | 53843 | 324884 | 1 | 15316 | 394074 | 378758 | 31 | 0.142157 | 0.000082 |
Protestant European | 26 | 2 | 1 | 7 | 0 | 0 | 0 | 3 | 132250 | 683435 | 1 | 17611 | 833336 | 815725 | 40 | 0.162126 | 0.000049 |
South Asia | 13 | 1 | 0 | 2 | 0 | 0 | 0 | 1 | 17000 | 68817 | 1 | 15180 | 101015 | 85835 | 18 | 0.198054 | 0.000210 |
# Scatter plot: share of female biographies against the number of gendered
# biographies (log-scaled x axis), one labelled point per row of lang_cultures.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
lang_cultures[['gendered_total', 'fem_per']].plot(
    kind='scatter',
    x='gendered_total',
    y='fem_per',
    logx=True,
    ax=ax,
    c='#74bc3a')

# Percentages on the y axis, thousands-separated integers on the x axis.
percent_ticks = matplotlib.ticker.FuncFormatter(lambda value, pos: '{:.0%}'.format(value))
count_ticks = matplotlib.ticker.FuncFormatter(lambda value, pos: '{:,.0f}'.format(value))
ax.yaxis.set_major_formatter(percent_ticks)
ax.xaxis.set_major_formatter(count_ticks)

# Pad the axis limits so the points and their labels stay inside the frame.
ax.set_xlim(min(lang_cultures['gendered_total']) * 0.6,
            max(lang_cultures['gendered_total']) * 3)
ax.set_ylim(min(lang_cultures['fem_per']) * 0.85,
            max(lang_cultures['fem_per']) * 1.15)

# Write each row's index label just above and to the right of its point.
for label, x, y in zip(lang_cultures.index,
                       lang_cultures['gendered_total'],
                       lang_cultures['fem_per']):
    plt.annotate(label, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='left', va='bottom')

plt.annotate("", xy=(10000, 0.5), xytext=(0, 0))
plt.title('Female % vs. Total Biographies \nLanguage Aggregated By Culture', fontsize=24)
plt.xlabel('Number of Biographies Recorded')
plt.ylabel('Composition of Biographies Which Are Female')
<matplotlib.text.Text at 0x7feccbfe6a50>