import json
from collections import defaultdict

import numpy
import pandas as pd
from matplotlib.pylab import style

style.use('fivethirtyeight')
# Notebook-only: Python spelling of the ``%pylab inline`` cell magic, which is
# not valid syntax in a plain .py file. ``get_ipython`` is provided by IPython.
get_ipython().run_line_magic('pylab', 'inline')

# Sentinel value that read_csv is told to treat as missing data.
java_min_int = -2147483648
allrecs = pd.read_csv(
    'snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',
    na_values=[java_min_int],
)


def split_column(q_str):
    """Split a '|'-delimited cell into a list of its fields.

    Args:
        q_str: Raw cell value.

    Returns:
        * the value itself when it is a NaN float (missing data is kept);
        * a list of fields when it is a str, with the empty field produced
          by the terminating '|' removed;
        * None for any other value.
    """
    if isinstance(q_str, float) and numpy.isnan(q_str):
        return q_str
    if not isinstance(q_str, str):
        return None
    fields = q_str.split('|')
    # The format is expected to always end with a '|', which leaves one empty
    # trailing field. Only drop that empty field, so a value that lacks the
    # terminator does not silently lose its last real entry.
    if fields[-1] == '':
        fields.pop()
    return fields


for col in ['gender', 'site_links']:
    allrecs[col] = allrecs[col].apply(split_column)

allrecs.head(5)  # notebook display

# DataFrame.from_csv is gone from current pandas; read_csv with index_col=0
# keeps the first CSV column as the index, which agg_culture looks codes up in.
lang_culture_map = pd.read_csv(
    'helpers/aggregation_maps/lang_culture.csv',
    index_col=0,
)


def agg_culture(wikiname_list):
    """Collect the distinct cultures of the Wikipedias in a site-link list.

    A site name counts as a Wikipedia when splitting it on 'wiki' leaves an
    empty second part (e.g. 'enwiki' -> ['en', '']); the first part is then
    looked up in the index of ``lang_culture_map``.

    Args:
        wikiname_list: List of site names; any non-list value (e.g. NaN) is
            treated as "no site links".

    Returns:
        A list of the unique ``culture`` values found, in arbitrary order.
        Site names with no entry in ``lang_culture_map`` are skipped.
    """
    if not isinstance(wikiname_list, list):
        return []
    cultures = set()
    for wikiname in wikiname_list:
        parts = wikiname.split('wiki')
        # len check: a name without 'wiki' yields a single part and would
        # otherwise raise IndexError.
        if len(parts) < 2 or parts[1] != '':
            continue  # not a pedia
        try:
            cultures.add(lang_culture_map.loc[parts[0], 'culture'])
        except KeyError:
            continue
    return list(cultures)


len(allrecs)  # notebook display

allrecs['cultures'] = allrecs['site_links'].apply(agg_culture)


def dofd():
    """Return a fresh ``defaultdict(int)`` (factory for the nested counter)."""
    return defaultdict(int)


# culture -> gender -> number of records
culture_gender_dict = defaultdict(dofd)
# Columns are addressed by position, as in the original row-wise loop.
# NOTE(review): positions 3 and 8 are presumably 'gender' and 'cultures' --
# confirm against the CSV header and switch to column names if so.
for gender_list, cultures in zip(allrecs.iloc[:, 3], allrecs.iloc[:, 8]):
    # Only the first listed gender is counted; missing or empty -> None.
    if isinstance(gender_list, list) and gender_list:
        gender = gender_list[0]
    else:
        gender = None
    for culture in cultures:
        culture_gender_dict[culture][gender] += 1

lang_cultures = pd.DataFrame.from_dict(culture_gender_dict, orient='index')
lang_cultures.to_json('helpers/lang_cultures.json')

# Reload the table from the file that was just written.
with open('helpers/lang_cultures.json', 'r') as json_file:
    lang_cultures = pd.DataFrame.from_dict(json.load(json_file))
lang_cultures = lang_cultures.drop('Northern Sotho Wikipedia', axis=0)

# Imported here, right before first use, as in the original notebook.
import pywikibot
# Transforming QIDs into English labels.
# math was previously only reachable through the pylab star-import; import it
# explicitly so english_label does not depend on that.
import math

enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()
# Cache of qid -> label so each item is fetched at most once.
retrieved = {}


def english_label(qid):
    """Return the English label of a Wikidata item, falling back to the id.

    Args:
        qid: Item identifier, or a falsy / NaN placeholder.

    Returns:
        * ``qid`` unchanged when it is falsy;
        * None when it is a NaN float;
        * the cached or freshly fetched ``labels['en']`` value otherwise;
        * ``qid`` itself when the lookup fails. A KeyError (e.g. no 'en'
          label) is cached as a permanent miss; any other error is not
          cached, so a later call retries.
    """
    if not qid:
        return qid
    if isinstance(qid, float) and math.isnan(qid):
        return None
    # First see if we've done it.
    try:
        return retrieved[qid]
    except KeyError:
        pass
    try:
        label = pywikibot.ItemPage(wikidata, qid).get()['labels']['en']
    except KeyError:
        retrieved[qid] = qid
        return qid
    except Exception:
        # Best effort: keep the raw id on any lookup failure, but (unlike a
        # bare ``except:``) let KeyboardInterrupt / SystemExit propagate.
        return qid
    retrieved[qid] = label
    return label


# A list rather than a lazy map object, so the labels are materialised once.
lang_cultures.columns = [english_label(col) for col in lang_cultures.columns]

# A count that is missing for a culture means "no such records". Fill those
# with 0 *before* the arithmetic; otherwise one missing count turns the
# derived totals into NaN, which the final fillna would then report as 0.
lang_cultures.fillna(0, inplace=True)
lang_cultures['human_total'] = lang_cultures.sum(axis=1)
lang_cultures['gendered_total'] = (
    lang_cultures['human_total'] - lang_cultures['null']
)
lang_cultures['nonbin_total'] = (
    lang_cultures['gendered_total']
    - lang_cultures['female']
    - lang_cultures['male']
)
lang_cultures['fem_per'] = (
    lang_cultures['female'] / lang_cultures['gendered_total']
)
lang_cultures['nonbin_per'] = (
    lang_cultures['nonbin_total'] / lang_cultures['gendered_total']
)
# Ratios with a zero denominator come out as NaN; report them as 0.
lang_cultures.fillna(0, inplace=True)

lang_cultures  # notebook display

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
lang_cultures[['gendered_total', 'fem_per']].plot(
    kind='scatter',
    x='gendered_total',
    y='fem_per',
    logx=True,
    ax=ax,
    c='#74bc3a',
)
# y ticks as whole percentages, x ticks as thousands-separated integers.
ax.yaxis.set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda value, pos: '{:.0%}'.format(value))
)
ax.xaxis.set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda value, pos: '{:,.0f}'.format(value))
)
# Pad the limits so the point labels have room inside the axes.
ax.set_xlim(
    lang_cultures['gendered_total'].min() * 0.6,
    lang_cultures['gendered_total'].max() * 3,
)
ax.set_ylim(
    lang_cultures['fem_per'].min() * 0.85,
    lang_cultures['fem_per'].max() * 1.15,
)

# Label every point with its culture name, offset slightly up and right.
for label, x, y in zip(
    lang_cultures.index,
    lang_cultures['gendered_total'],
    lang_cultures['fem_per'],
):
    ax.annotate(
        label,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='left',
        va='bottom',
        # Optional styling, left disabled:
        # bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        # arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'),
    )

ax.annotate("", xy=(10000, 0.5), xytext=(0, 0))
ax.set_title(
    'Female % vs. Total Biographies \nLanguage Aggregated By Culture',
    fontsize=24,
)
ax.set_xlabel('Number of Biographies Recorded')
ax.set_ylabel('Composition of Biographies Which Are Female')