"""Explore the gender composition of biographies across Wikimedia sites.

Notebook-style script.  It loads a per-site index of biography counts by
gender, derives totals and ratios per wiki, and draws:

* bar charts of the female / non-binary ratio for every Wikipedia language,
  split into equally sized chunks,
* a bar chart of gendered biography counts per Wikipedia,
* annotated scatter plots for the largest Wikipedias,

and finally fits two small PCA models and a correlation table.

Bare expressions such as ``sitelinks.head()`` are kept on purpose: they are
the cell outputs when the code is run inside a notebook and are harmless
no-ops when it is run as a plain script.
"""
import json
import math

import pandas
from numpy.random import *  # noqa: F401,F403 - kept from the original notebook
import numpy as np  # noqa: F401 - name used to be injected by ``%pylab``
import matplotlib
import matplotlib.font_manager
import matplotlib.pyplot as plt
import matplotlib.ticker
from matplotlib import style

style.use('fivethirtyeight')

# The notebook used ``%pylab inline``, which is not valid Python and
# star-imports numpy/pylab over builtins such as ``min`` and ``max``.  The
# names the script needs are imported explicitly above; inline rendering is
# still requested when an IPython kernel is available.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')  # noqa: F821
except NameError:
    pass  # plain ``python`` run: keep matplotlib's default backend

SITELINKS_CSV = 'snapshot_data/2014-10-13/property_indexes/site_linkss-index.csv'
LANG_MAP_JSON = 'helpers/wiki_code_map.json'
SCATTER_CSV = 'Magnus Gender analysis/lang_scat.csv'

# ---------------------------------------------------------------------------
# Load the index and derive per-site totals and ratios
# ---------------------------------------------------------------------------
sitelinks = pandas.read_csv(SITELINKS_CSV, index_col=0).fillna(0)
sitelinks.head()

# Every column of the raw index is a count, so the row sum is the number of
# biographies; the 'nan' column holds those without a recorded gender.
sitelinks['human_total'] = sitelinks.sum(axis=1)
sitelinks['gendered_total'] = sitelinks['human_total'] - sitelinks['nan']
sitelinks['gendered_per'] = sitelinks['gendered_total'] / sitelinks['human_total']
sitelinks['nonbin_total'] = (
    sitelinks['gendered_total'] - sitelinks['female'] - sitelinks['male']
)
sitelinks[sitelinks['human_total'] > 10000].sort_values('gendered_per').head()

# Remove the row whose index label itself is NaN (no site name).
sitelinks.drop(float('nan'), inplace=True)

# Collect the distinct suffixes that follow 'wiki' in the site names.  Names
# that split into three parts (two occurrences of 'wiki') are skipped.
suffixes = {
    bits[1]
    for bits in (name.split('wiki') for name in sitelinks.index)
    if len(bits) != 3
}
suffixes


def wikityper(wikiname):
    """Return the project type encoded after 'wiki' in *wikiname*.

    An empty suffix (e.g. ``'enwiki'``) is reported as ``'pedia'``; any other
    suffix is returned unchanged.
    """
    suff = wikiname.split('wiki')[1]
    return 'pedia' if suff == '' else suff


def wikilanger(wikiname):
    """Return the prefix before 'wiki' in *wikiname*, or None if it is empty."""
    pre = wikiname.split('wiki')[0]
    return pre if pre else None


# List comprehensions rather than ``map``: a lazy ``map`` object cannot be
# assigned as a column on Python 3.
sitelinks['wikitype'] = [wikityper(name) for name in sitelinks.index]
sitelinks['wikilang'] = [wikilanger(name) for name in sitelinks.index]
sitelinks['fem_per'] = sitelinks['female'] / sitelinks['gendered_total']
sitelinks['nonbin_per'] = sitelinks['nonbin_total'] / sitelinks['gendered_total']

wikitypes = sitelinks.groupby(by='wikitype')
# Select the numeric columns before averaging so the string column
# 'wikilang' never takes part in the mean.
wikitypes[['female', 'male', 'fem_per', 'nonbin_per', 'gendered_total']].mean()

with open(LANG_MAP_JSON, 'r') as lang_map_file:
    lang_map = json.load(lang_map_file)


def lookup_lang(lang):
    """Return a display name for the wiki code *lang*.

    The name is looked up in ``lang_map``; a trailing word 'Wikipedia'
    (case-insensitive) is stripped.  If the code is unknown, or the mapped
    value cannot be processed as a non-empty string, *lang* itself is
    returned so that plotting never fails on a missing label.
    """
    try:
        full = lang_map[lang]
        words = full.split()
        if words[-1].lower() == 'wikipedia':
            return ' '.join(words[:-1])
        return full
    except (KeyError, IndexError, AttributeError, TypeError):
        return lang


# ---------------------------------------------------------------------------
# Chunked bar charts over all Wikipedia languages
# ---------------------------------------------------------------------------
splitpoints = 5


def plot_ratio_chunks(pedias_sorted, per_type, title_text, sort_term_text):
    """Draw one figure with ``splitpoints`` stacked bar charts of *per_type*.

    *pedias_sorted* is split into ``splitpoints`` consecutive chunks of
    (nearly) equal length; each chunk gets its own axes.  Bars are coloured
    by the log of the wiki's gendered biography count relative to the largest
    wiki in the same chunk.
    """
    planstep = len(pedias_sorted) / float(splitpoints)
    fig, axes = plt.subplots(nrows=splitpoints, ncols=1, figsize=(12, 20))
    plt.subplots_adjust(hspace=0.8)
    for index in range(splitpoints):
        ax = axes[index]
        begin = int(math.ceil(index * planstep))
        end = int(math.floor((index + 1) * planstep))
        chunk = pedias_sorted.iloc[begin:end]
        bios = chunk['gendered_total']
        ratios = chunk[per_type]
        minbio = int(bios.min())
        maxbio = int(bios.max())

        log_max = math.log(maxbio)
        relative_size = bios.apply(lambda count: math.log(count) / log_max)
        bar_colors = [(size, size / 2.0, 0.75) for size in relative_size]

        ratios.plot(ax=ax, kind='bar', color=bar_colors)
        ax.set_title(" %s with %s to %s gendered biographies"
                     % (title_text, minbio, maxbio))
        ax.set_ylim((ratios.min() * 0.9, ratios.max() * 1.1))
        ax.grid(False)
        ax.yaxis.grid(True, linestyle="--", linewidth=0.3)
        # Hide the first line artist on the axes (the line pandas adds to
        # bar plots).
        ax.lines[0].set_visible(False)
        ax.yaxis.set_ticks_position('none')
        ax.xaxis.set_ticks_position('none')

        # Replace the site codes on the x axis with readable language names.
        codes = [tick.get_text().split('wiki')[0] for tick in ax.get_xticklabels()]
        ax.set_xticklabels([lookup_lang(code) for code in codes])
    fig.suptitle(
        "%s of all Wikipedia Languages\n ordered by %s. \n"
        "Color is locally relative Wiki Size" % (title_text, sort_term_text),
        fontsize=24)
    return fig


for sort_term, sort_term_text in [
        ('gendered_total', 'number of Gendered Biographies'),
        ('fem_per', 'percentage of female Biographies')]:
    pedias_sorted = sitelinks[sitelinks['wikitype'] == 'pedia'].sort_values([sort_term])
    # The middle value of each tuple is a nominal y-limit that is currently
    # unused: the limits are derived from the data of each chunk instead.
    for per_type, std_ylim, title_text in [
            ('fem_per', 0.8, 'Female Composition'),
            ('nonbin_per', 0.005, 'Non-binary Gender Percentage')]:
        plot_ratio_chunks(pedias_sorted, per_type, title_text, sort_term_text)
        plt.show()  # the original referenced ``plt.show`` without calling it

sitelinks[sitelinks['wikitype'] == 'pedia'].sort_values(
    ['gendered_total'])['gendered_total'].plot(figsize=(36, 6), kind='bar', logy=True)


# ---------------------------------------------------------------------------
# Restrict to the TOP largest Wikipedias
# ---------------------------------------------------------------------------
def rows_above_cutoff(frame, column, top):
    """Return ``(cutoff, rows)`` keeping at most *top* rows of *frame*.

    *cutoff* is the smallest non-negative integer for which at most *top*
    rows have ``frame[column] > cutoff``; *rows* are exactly those rows.  For
    the non-negative count columns used in this script this selects the same
    rows as trying the thresholds 0, 1, 2, ... one by one, without
    re-filtering the frame for every candidate.
    """
    ranked = sorted(frame[column], reverse=True)
    if len(ranked) > top:
        # ranked[top] is the (top + 1)-th largest value: the threshold has to
        # reach it for that row (and all smaller ones) to drop out.
        cutoff = max(0, int(math.ceil(ranked[top])))
    else:
        cutoff = 0
    return cutoff, frame[frame[column] > cutoff]


# Find the cut-off so that only the top TOP wikis by gendered biographies
# (respectively by non-binary biographies) are inspected.
TOP = 50
pedias = sitelinks[sitelinks['wikitype'] == 'pedia']

vartotal, scatdata = rows_above_cutoff(pedias, 'gendered_total', TOP)
print(vartotal)

vartotal, nonbintot = rows_above_cutoff(pedias, 'nonbin_total', TOP)
print(vartotal)

scatdata.head()
scatdata[['gendered_total', 'fem_per']].to_csv(SCATTER_CSV)


# ---------------------------------------------------------------------------
# Annotated scatter plots
# ---------------------------------------------------------------------------
def annotated_scatter(data, x_col, y_col, color, y_format, title, xlabel,
                      ylabel, nudged_labels=(), nudge_offset=(0, 2)):
    """Scatter *y_col* against *x_col* (log x axis), labelling every point.

    Each point is annotated with the language name of its index entry.
    Labels listed in *nudged_labels* are placed at *nudge_offset* (in points)
    instead of the default ``(0, 2)`` to avoid overlapping neighbours.
    *y_format* is a ``str.format`` template for the y tick labels.
    Returns the matplotlib axes.
    """
    ax = data.plot(kind='scatter', x=x_col, y=y_col, logx=True,
                   figsize=(16, 10), c=color)
    ax.set_xlim(data[x_col].min() * 0.85, data[x_col].max() * 1.15)
    ax.set_ylim(data[y_col].min() * 0.95, data[y_col].max() * 1.05)
    ax.yaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda value, pos: y_format.format(value)))
    ax.xaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda value, pos: '{:,.0f}'.format(value)))
    ax.set_title(title, fontsize=24)
    ax.set_xlabel(xlabel, fontsize=18)
    ax.set_ylabel(ylabel, fontsize=18)

    label_font = matplotlib.font_manager.FontProperties()
    names = [lookup_lang(str(code).split('wiki')[0]) for code in data.index]
    for name, x, y in zip(names, data[x_col], data[y_col]):
        ax.annotate(
            name,
            xy=(x, y),
            xytext=nudge_offset if name in nudged_labels else (0, 2),
            textcoords='offset points', ha='center', va='bottom',
            fontsize=8, fontproperties=label_font)
    return ax


sp = annotated_scatter(
    scatdata, 'gendered_total', 'fem_per', '#e3ae3d', '{:.0%}',
    'Female ratio of biographies, by Wikipedia Language \n'
    'Top {} Wikipedias by Biography count\n'.format(TOP),
    'Number of gendered Biographies, log scale',
    'Female ratio of Biographies',
    nudged_labels=['Latvian', 'Polish', 'Dutch', 'Slovak', 'Hungarian'],
    nudge_offset=(3, -3))

# Named ``nonbin_ax`` rather than ``np`` so the numpy alias is not clobbered.
nonbin_ax = annotated_scatter(
    scatdata, 'gendered_total', 'nonbin_per', '#f34141', '{:.2%}',
    'Nonbinary ratio of biographies, by Wikipedia Language \n'
    'Top {} Wikipedias by Biography count\n'.format(TOP),
    'Number of gendered Biographies, log scale',
    'Nonbinary ratio of Biographies',
    nudged_labels=['Czech'],
    nudge_offset=(5, -5))
plt.show()

# NOTE: the original notebook also carried a disabled variant of the plot
# above, drawn from ``nonbintot`` with x='nonbin_total' and y='nonbin_per'
# (title 'Nonbinary percentage of biographies, ...').  It can be reproduced
# with ``annotated_scatter(nonbintot, 'nonbin_total', 'nonbin_per', ...)``.

# ---------------------------------------------------------------------------
# PCA and correlations
# ---------------------------------------------------------------------------
from sklearn.decomposition import PCA  # noqa: E402 - imported at first use

pca = PCA(n_components=2)
pca.fit(sitelinks[(sitelinks['wikitype'] == 'pedia')
                  & (sitelinks['gendered_total'] > 1000)][['fem_per', 'gendered_total']])

# Same idea on log-transformed values, restricted to Wikipedias whose
# non-binary ratio is not zero (the log of zero is undefined).
pca = PCA(n_components=1)
nonzero_pedias = sitelinks[(sitelinks['wikitype'] == 'pedia')
                           & (sitelinks['nonbin_per'] != 0)]
logtrans = pandas.DataFrame()
logtrans['nonbin_per'] = nonzero_pedias['nonbin_per'].apply(math.log)
logtrans['gen_tot'] = nonzero_pedias['gendered_total'].apply(math.log)
pca.fit(logtrans)
# Single formatted string so the output is the same on Python 2 and 3.
print("%s %s" % (pca.components_, pca.explained_variance_ratio_))

sitelinks[['gendered_total', 'female', 'male']].corr()