In [8]:

import pandas as pd
import numpy
from collections import defaultdict
from matplotlib.pylab import style
import json
style.use('fivethirtyeight')
%pylab inline
java_min_int = -2147483648

Populating the interactive namespace from numpy and matplotlib

In [2]:

allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return q_str 
    if type(q_str) is str:
        qs = q_str.split('|')
        return qs[:-1] #cos the format will always end with a |

for col in ['gender','site_links']:
    allrecs[col] = allrecs[col].apply(split_column)

In [3]:

allrecs.head(5)

Out[3]:

	qid	dob	dod	gender	ethnic_group	citizenship	place_of_birth	site_links
0	Q23	1732	1799	[Q6581097]	NaN	Q30\|	Q494413\|	[zhwiki, kywiki, euwiki, plwiki, bswiki, angwi...
1	Q42	1952	2001	[Q6581097]	NaN	Q145\|	Q350\|	[zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik...
2	Q207	1946	NaN	[Q6581097]	NaN	Q30\|	Q49145\|	[uzwiki, eswiki, kowikiquote, huwiki, liwikiqu...
3	Q297	NaN	1660	[Q6581097]	NaN	Q29\|	Q8717\|	[zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik...
4	Q326	1942	NaN	[Q6581097]	NaN	Q298\|Q39\|	Q2887\|	[zhwiki, plwiki, euwiki, kowiki, frwiki, eswik...

In [4]:

lang_culture_map = pd.DataFrame.from_csv('helpers/aggregation_maps/lang_culture.csv')

def agg_culture(wikiname_list):
    cultures = set()
    if isinstance(wikiname_list, list):
        for wikiname in wikiname_list:
            parts = wikiname.split('wiki')
            if parts[1] == '': #this was a pedia
                lang_code = parts[0]
                try:
                    culture = lang_culture_map.ix[lang_code]['culture']
                    cultures.add(culture)
                except KeyError:
                    continue
    return list(cultures)

In [5]:

len(allrecs)

Out[5]:

In [6]:

allrecs['cultures'] = allrecs['site_links'].apply(agg_culture)

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-e25fe682b4b1> in <module>()
----> 1 allrecs['cultures'] = allrecs['site_links'].apply(agg_culture)

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2056             values = lib.map_infer(values, lib.Timestamp)
   2057 
-> 2058         mapped = lib.map_infer(values, f, convert=convert_dtype)
   2059         if len(mapped) and isinstance(mapped[0], Series):
   2060             from pandas.core.frame import DataFrame

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:57158)()

<ipython-input-4-ad100d1b5480> in agg_culture(wikiname_list)
      9                 lang_code = parts[0]
     10                 try:
---> 11                     culture = lang_culture_map.ix[lang_code]['culture']
     12                     cultures.add(culture)
     13                 except KeyError:

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in __getitem__(self, key)
     70             return self._getitem_tuple(key)
     71         else:
---> 72             return self._getitem_axis(key, axis=0)
     73 
     74     def _get_label(self, label, axis=0):

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _getitem_axis(self, key, axis)
    926                     return self._get_loc(key, axis=axis)
    927 
--> 928             return self._get_label(key, axis=axis)
    929 
    930     def _getitem_iterable(self, key, axis=0):

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _get_label(self, label, axis)
     86             raise IndexingError('no slices here, handle elsewhere')
     87 
---> 88         return self.obj._xs(label, axis=axis)
     89 
     90     def _get_loc(self, key, axis=0):

/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in xs(self, key, axis, level, copy, drop_level)
   1469 
   1470             result = Series(new_values, index=self.columns,
-> 1471                             name=self.index[loc])
   1472 
   1473         else:

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __init__(self, data, index, dtype, name, copy, fastpath)
    216 
    217         object.__setattr__(self, 'name', name)
--> 218         self._set_axis(0, index, fastpath=True)
    219 
    220     @classmethod

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in _set_axis(self, axis, labels, fastpath)
    259         self._set_subtyp(is_all_dates)
    260 
--> 261         object.__setattr__(self, '_index', labels)
    262         if not fastpath:
    263             self._data.set_axis(axis, labels)

KeyboardInterrupt:

In [ ]:

def dofd():
    return defaultdict(int)

culture_gender_dict = defaultdict(dofd)

for row in allrecs.iterrows():
    colbit = row[1]
    gender_list = colbit[3]
    if isinstance(gender_list, list):
        gender = gender_list[0]
    else:
        gender = None
    cultures = colbit[8]
    for culture in cultures:
        culture_gender_dict[culture][gender] += 1

In [ ]:

lang_cultures = pd.DataFrame.from_dict(culture_gender_dict, orient='index')

In [ ]:

lang_cultures.to_json('helpers/lang_cultures.json')

In [37]:

lang_cultures = pd.DataFrame.from_dict(json.load(open('helpers/lang_cultures.json','r')))

In [51]:

lang_cultures = lang_cultures.drop('Northern Sotho Wikipedia',axis=0)

In [55]:

import pywikibot
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    if qid:
        if type(qid) is float:
            if math.isnan(qid):
                return None
        #first see if we've done it
        try:
            return retrieved[qid]
        except KeyError:
            try:
                page = pywikibot.ItemPage(wikidata, qid)
                data = page.get()
                lab = data['labels']['en']
                retrieved[qid] = lab
                return lab
            except KeyError:
                retrieved[qid] = qid
                return qid
            except:
                return qid
    return qid

In [56]:

lang_cultures.columns = map(english_label, lang_cultures.columns)

VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.

In [57]:

lang_cultures['human_total'] = lang_cultures.sum(axis=1)
lang_cultures['gendered_total'] = lang_cultures['human_total'] - lang_cultures['null']
lang_cultures['nonbin_total'] = lang_cultures['gendered_total'] - lang_cultures['female'] - lang_cultures['male']
lang_cultures['fem_per'] = lang_cultures['female'] / lang_cultures['gendered_total']
lang_cultures['nonbin_per'] = lang_cultures['nonbin_total'] / lang_cultures['gendered_total']

In [58]:

lang_cultures.fillna(0,inplace=True)

In [59]:

lang_cultures

Out[59]:

	transgender female	intersex	fa'afafine	transgender male	female animal	male animal	woman	genderqueer	female	male	kathoey	null	human_total	gendered_total	nonbin_total	fem_per	nonbin_per
Africa	0	0	0	0	0	0	0	0	1888	12824	0	306	15018	14712	0	0.128331	0.000000
Catholic european	39	3	1	7	0	6	0	3	122048	712655	0	33480	868242	834762	59	0.146207	0.000071
Confucian	24	2	1	1	0	0	0	2	47355	133508	1	163710	344604	180894	31	0.261783	0.000171
Constructed	6	1	0	1	0	0	0	0	4321	27085	1	2252	33667	31415	9	0.137546	0.000286
English-speaking	99	17	1	19	0	0	0	7	186455	1016500	1	3259	1206358	1203099	144	0.154979	0.000120
Islamic	29	1	0	2	0	0	0	1	20931	100099	1	19529	140593	121064	34	0.172892	0.000281
Latin America	25	3	1	4	0	0	0	1	38195	202735	0	1979	242943	240964	34	0.158509	0.000141
Orthodox	23	1	0	1	1	2	1	1	53843	324884	1	15316	394074	378758	31	0.142157	0.000082
Protestant European	26	2	1	7	0	0	0	3	132250	683435	1	17611	833336	815725	40	0.162126	0.000049
South Asia	13	1	0	2	0	0	0	1	17000	68817	1	15180	101015	85835	18	0.198054	0.000210

In [88]:

fig, ax = plt.subplots(1,1,figsize=(8,8))
lang_cultures[['gendered_total','fem_per']].plot(kind='scatter', x='gendered_total', y='fem_per', logx=True, ax=ax, c='#74bc3a')

ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x )))

ax.set_xlim(min(lang_cultures['gendered_total']) * 0.6, max(lang_cultures['gendered_total']) *3)
ax.set_ylim(min(lang_cultures['fem_per']) * 0.85, max(lang_cultures['fem_per']) *1.15)

for label, x, y in zip(lang_cultures.index, lang_cultures['gendered_total'], lang_cultures['fem_per']):
    plt.annotate(
        label, 
        xy = (x, y), xytext = (5,2),
        textcoords = 'offset points', ha = 'left', va = 'bottom')
        #bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
        #arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

plt.annotate("", xy=(10000,0.5), xytext=(0,0)) 
plt.title('Female % vs. Total Biographies \nLanguage Aggregated By Culture', fontsize=24)
plt.xlabel('Number of Biographies Recorded')
plt.ylabel('Composition of Biographies Which Are Female')

Out[88]:

<matplotlib.text.Text at 0x7feccbfe6a50>

In [ ]: