import pandas
import math
import json
from numpy.random import *
from matplotlib import style
style.use('fivethirtyeight')
%pylab inline

# Load the per-site gender index. The first CSV column becomes the row index
# and every missing cell is replaced by 0.
# presumably rows are wiki site names and columns are gender labels -- TODO confirm
sitelinks = pandas.read_csv('snapshot_data/2014-10-13/property_indexes/site_linkss-index.csv',index_col=0).fillna(0)

# Bare expression: previews the first rows when run interactively.
sitelinks.head()

# Row-wise sum over every column present at this point.
sitelinks['human_total'] = sitelinks.sum(axis=1)
# Everything except the 'nan' column
# (presumably the count of entries with no recorded gender -- TODO confirm).
sitelinks['gendered_total'] = sitelinks['human_total'] - sitelinks['nan']
# Share of all entries that are counted as gendered.
sitelinks['gendered_per'] = sitelinks['gendered_total'] / sitelinks['human_total']
# Gendered entries that are neither in the 'female' nor the 'male' column.
sitelinks['nonbin_total'] = sitelinks['gendered_total'] - sitelinks['female'] - sitelinks['male']

# Inspect sites with more than 10000 entries, lowest gendered share first.
sitelinks[sitelinks['human_total'] > 10000].sort('gendered_per').head()


# Bare expression: shows the frame with zero cells masked out; result is not stored.
sitelinks[sitelinks !=  0]

# Remove, in place, the row whose index label is NaN
# (presumably a record without a site name -- TODO confirm).
sitelinks.drop(float('nan'), inplace=True)

# Collect the distinct text fragments that follow the first 'wiki' in each
# site name. Names that split into exactly three pieces around 'wiki' are
# skipped.
suffixes = {
    pieces[1]
    for pieces in (site_name.split('wiki') for site_name in sitelinks.index)
    if len(pieces) != 3
}

# Bare expression: displays the collected suffixes when run interactively.
suffixes

def wikityper(wikiname):
    """Return the project type encoded in a site name.

    The type is the text between the first and second occurrence of
    'wiki' (or up to the end of the name); an empty fragment means a
    plain Wikipedia and is reported as 'pedia'.

    Raises IndexError when *wikiname* does not contain 'wiki'.
    """
    pieces = wikiname.split('wiki')
    kind = pieces[1]
    if kind == '':
        return 'pedia'
    return kind

def wikilanger(wikiname):
    """Return the text before the first 'wiki' in *wikiname*.

    Returns None when that prefix is empty (the name starts with 'wiki').
    """
    prefix = wikiname.partition('wiki')[0]
    return prefix or None
    
# Tag every site with its project type and language prefix, both derived
# from the index label.
sitelinks['wikitype'] = [wikityper(site_name) for site_name in sitelinks.index]
sitelinks['wikilang'] = [wikilanger(site_name) for site_name in sitelinks.index]

# Female and non-binary shares of the gendered entries.
sitelinks['fem_per'] = sitelinks['female'] / sitelinks['gendered_total']
sitelinks['nonbin_per'] = sitelinks['nonbin_total'] / sitelinks['gendered_total']

# Group the sites by project type.
wikitypes = sitelinks.groupby('wikitype')

# Bare expression: per-type averages of the headline columns.
wikitypes.mean()[['female','male','fem_per','nonbin_per','gendered_total']]

# Lookup table consulted by lookup_lang(); read once when this cell runs.
# presumably maps wiki language codes to display names -- TODO confirm
# The context manager makes sure the file handle is closed after parsing.
with open('helpers/wiki_code_map.json','r') as lang_map_file:
    lang_map = json.load(lang_map_file)
def lookup_lang(lang):
    """Return the display name for the wiki code *lang*.

    The code is looked up in the module-level ``lang_map``. If the mapped
    name ends with the word "Wikipedia" (any letter case), that word is
    removed and the remaining words are joined with single spaces;
    otherwise the mapped name is returned as stored.

    This is a best-effort lookup: when *lang* is not in the map, or the
    mapped value is not a non-blank string, *lang* itself is returned.
    """
    try:
        full = lang_map[lang]
        words = full.split()
        last_word = words[-1].lower()
    except (KeyError, TypeError, AttributeError, IndexError):
        # Unknown/unhashable code, non-string value, or blank name:
        # fall back to the raw code instead of failing the whole plot.
        return lang
    if last_word == 'wikipedia':
        return ' '.join(words[:-1])
    return full

# Number of stacked bar-chart panels each figure is divided into.
splitpoints = 5

# One figure per (sort order, plotted ratio) combination: 2 x 2 figures.
for sort_term, sort_term_text in [('gendered_total', 'number of Gendered Biographies'), ('fem_per', 'percentage of female Biographies')]:
    # Only the rows typed 'pedia', ordered by the current sort column.
    ssl = sitelinks[sitelinks['wikitype']=='pedia'].sort([sort_term])
    wiki_count = len(ssl)

    # NOTE: std_ylim is carried in the tuples but never used; the y-limits
    # are derived per panel from the data below.
    for per_type, std_ylim, title_text in [('fem_per', 0.8, 'Female Composition'), ('nonbin_per',0.005, 'Non-binary Gender Percentage')]:

        fig, axes = plt.subplots(nrows=splitpoints, ncols=1, figsize=(12,20))
        plt.subplots_adjust(hspace = 0.8 )
        for splitpoint in range(0,splitpoints):
            ax = axes[splitpoint]

            # Integer boundaries: each panel starts exactly where the
            # previous one ended and the last one ends at wiki_count, so no
            # row is skipped when wiki_count is not a multiple of
            # splitpoints (iloc slices exclude their end position).
            begin = (splitpoint * wiki_count) // splitpoints
            end = ((splitpoint + 1) * wiki_count) // splitpoints
            panel_rows = ssl.iloc[begin:end]

            bios_list = panel_rows['gendered_total']
            minbio = int(min(bios_list))
            maxbio = int(max(bios_list))
            ratios_list = panel_rows[per_type]
            maxratio = max(ratios_list)
            minratio = min(ratios_list)

            # Size of each wiki relative to the largest one in this panel,
            # on a log scale; drives the bar colour.
            bios_size = bios_list.apply(lambda x: math.log(x)/math.log(maxbio))
            my_colors = [(x, x/2, 0.75) for x in bios_size]

            ratios_list.plot(ax=ax, kind='bar', color=my_colors)

            ax.set_title(" %s with %s to %s gendered biographies" % (title_text, minbio, maxbio))
            # Zoom the y-axis onto the range actually present in the panel.
            ax.set_ylim((minratio*0.9,maxratio*1.1))
            ax.grid(False)
            ax.yaxis.grid(True, linestyle="--", linewidth=0.3)
            # presumably hides the horizontal baseline the bar plot adds -- TODO confirm
            ax.lines[0].set_visible(False)
            ax.yaxis.set_ticks_position('none')
            ax.xaxis.set_ticks_position('none')

            # Replace the site-name tick labels with readable language names.
            wikilabels = ax.get_xticklabels()
            wikinames = [tick.get_text().split('wiki')[0] for tick in wikilabels]
            fullnames = [lookup_lang(code) for code in wikinames]
            ax.set_xticklabels(fullnames)
        fig.suptitle("""%s of all Wikipedia Languages\n
        ordered by %s. Color is locally relative Wiki Size""" % (title_text, sort_term_text), fontsize=24)
        # Actually call show(); the bare attribute reference did nothing.
        plt.show()

# Log-scale bar chart of gendered-biography counts for every row typed
# 'pedia', ordered by that count.
pedia_rows = sitelinks[sitelinks['wikitype']=='pedia']
pedia_totals = pedia_rows.sort(['gendered_total'])['gendered_total']
pedia_totals.plot(figsize=(36,6), kind='bar', logy=True)

# Find the cut-offs so that we only inspect the top TOP wikis: first by
# gendered biographies (scatdata), then by non-binary biographies (nonbintot).
TOP = 50


def _top_cutoff(values, top):
    """Return the smallest integer c >= 0 with at most *top* values above c.

    This is the threshold the original search found by trying 0, 1, 2, ...
    in turn, computed directly from the (top+1)-th largest value.
    Assumes *values* contains no NaN.
    """
    ranked = sorted(values, reverse=True)
    if len(ranked) <= top:
        # Everything already fits; the first threshold tried (0) suffices.
        return 0
    return max(0, int(math.ceil(ranked[top])))


# Restrict to the rows typed 'pedia' once instead of on every probe.
pedia_sites = sitelinks[sitelinks['wikitype']=='pedia']

gendered_cutoff = _top_cutoff(pedia_sites['gendered_total'], TOP)
print(gendered_cutoff)
scatdata = pedia_sites[pedia_sites['gendered_total'] > gendered_cutoff]

nonbin_cutoff = _top_cutoff(pedia_sites['nonbin_total'], TOP)
print(nonbin_cutoff)
nonbintot = pedia_sites[pedia_sites['nonbin_total'] > nonbin_cutoff]

# Bare expression: preview the rows selected for the scatter plots.
scatdata.head()

# Export the two plotted columns.
scatdata[['gendered_total','fem_per']].to_csv('Magnus Gender analysis/lang_scat.csv')

# Scatter of female share against gendered-biography count (log x-axis).
sp = scatdata.plot(kind='scatter', x='gendered_total', y='fem_per', logx=True, figsize=(16,10), c='#e3ae3d')
codes = [str(site_name).split('wiki')[0] for site_name in scatdata.index]
fullnames = [lookup_lang(code) for code in codes]

# Pad the axis limits a little beyond the data range.
bio_counts = scatdata['gendered_total']
fem_ratios = scatdata['fem_per']
sp.set_xlim(min(bio_counts) * 0.85, max(bio_counts) *1.15)
sp.set_ylim(min(fem_ratios) * 0.95, max(fem_ratios) *1.05)

# Percent labels on y, thousands-separated counts on x.
percent_ticks = matplotlib.ticker.FuncFormatter(lambda value, pos: '{:.0%}'.format(value))
count_ticks = matplotlib.ticker.FuncFormatter(lambda value, pos: '{:,.0f}'.format(value))
sp.yaxis.set_major_formatter(percent_ticks)
sp.xaxis.set_major_formatter(count_ticks)

sp.set_title('Female ratio of biographies, by Wikipedia Language \nTop {} Wikipedias by Biography count\n'.format(TOP), fontsize=24)
sp.set_xlabel('Number of gendered Biographies, log scale', fontsize=18)
sp.set_ylabel('Female ratio of Biographies',  fontsize=18)

# Half the width and height of the final axis ranges (not used below).
x1, x2 = sp.get_xlim()
y1, y2 = sp.get_ylim()
middle = ((x2-x1)/2.0, (y2-y1)/2.0)

f = matplotlib.font_manager.FontProperties()
font1 = f.copy()

#font1.set_weight('light')

# Labels that get a different offset from their point than the default.
shifted_labels = ('Latvian','Polish','Dutch','Slovak','Hungarian')
for label, x, y in zip(fullnames, bio_counts, fem_ratios):
    if label in shifted_labels:
        text_offset = (3,-3)
    else:
        text_offset = (0,2)
    plt.annotate(
        label,
        xy = (x, y),
        xytext = text_offset,
        textcoords = 'offset points', ha = 'center', va = 'bottom',
        fontsize=8, fontproperties=font1)

# Scatter of non-binary share against gendered-biography count (log x-axis).
# The axes object gets its own name: binding it to `np` would overwrite the
# numpy alias that `%pylab inline` puts into the namespace.
nonbin_ax = scatdata.plot(kind='scatter', x='gendered_total', y='nonbin_per', logx=True, figsize=(16,10), c='#f34141')
codes = [str(site_name).split('wiki')[0] for site_name in scatdata.index]
fullnames = [lookup_lang(code) for code in codes]

# Two-decimal percent labels on y, thousands-separated counts on x.
nonbin_ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.2%}'.format(x )))
nonbin_ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x )))

# Pad the axis limits a little beyond the data range.
nonbin_ax.set_xlim(min(scatdata['gendered_total']) * 0.85, max(scatdata['gendered_total']) *1.15)
nonbin_ax.set_ylim(min(scatdata['nonbin_per']) * 0.95, max(scatdata['nonbin_per']) *1.05)
nonbin_ax.set_title('Nonbinary ratio of biographies, by Wikipedia Language \nTop {} Wikipedias by Biography count\n'.format(TOP), fontsize=24)
nonbin_ax.set_xlabel('Number of gendered Biographies, log scale', fontsize=18)
nonbin_ax.set_ylabel('Nonbinary ratio of Biographies',  fontsize=18)

f = matplotlib.font_manager.FontProperties()
font1 = f.copy()

#font1.set_weight('light')

# Annotate through the axes object itself rather than whichever axes
# pyplot currently considers active.
for label, x, y in zip(fullnames, scatdata['gendered_total'], scatdata['nonbin_per']):
    nonbin_ax.annotate(
        label,
        xy = (x, y),
        # 'Czech' gets a different offset from its point than the default.
        xytext = (5,-5) if label in ['Czech'] else (0,2),
        textcoords = 'offset points', ha = 'center', va = 'bottom',
        fontsize=8, fontproperties=font1)


plt.show()


# Disabled cell: the code below is wrapped in a string literal, so none of it
# executes. It is a variant of the non-binary scatter plot that reads from
# `nonbintot` and plots 'nonbin_total' on the x-axis.
'''
np = nonbintot.plot(kind='scatter', x='nonbin_total', y='nonbin_per', logx=True, figsize=(16,10))
codes = map(lambda x: str(x).split('wiki')[0], nonbintot.index)
fullnames = map(lookup_lang,codes)


np.set_xlim(min(nonbintot['nonbin_total']) * 0.85, max(nonbintot['nonbin_total']) *1.15)
np.set_ylim(min(nonbintot['nonbin_per']) * 0.95, max(nonbintot['nonbin_per']) *1.05)
np.set_title('Nonbinary percentage of biographies, by Wikipedia Language \nTop 50 Wikipedias by Biography count\n', fontsize=24)
np.set_xlabel('Number of gendered Biographies, log scale', fontsize=18)
np.set_ylabel('Non binary percentage of Biographies',  fontsize=18)


for label, x, y in zip(fullnames, nonbintot['nonbin_total'], nonbintot['nonbin_per']):
    plt.annotate(
        label,
        xy = (x, y), xytext = (0,2),
        textcoords = 'offset points', ha = 'center', va = 'bottom',
        fontsize=8)
plt.show()
'''
    

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(sitelinks[(sitelinks['wikitype']=='pedia')& (sitelinks['gendered_total']>1000)][['fem_per','gendered_total']])

int(float(-6.2590421446e-09))

from sklearn.decomposition import PCA
pca = PCA(n_components=1)
logtrans = pandas.DataFrame()
logtrans['nonbin_per'] = sitelinks[(sitelinks['wikitype']=='pedia') & (sitelinks['nonbin_per'] != 0)]['nonbin_per'].apply(math.log)
logtrans['gen_tot'] = sitelinks[(sitelinks['wikitype']=='pedia')& (sitelinks['nonbin_per'] != 0)]['gendered_total'].apply(math.log)
pca.fit(logtrans)

print pca.components_, pca.explained_variance_ratio_

sitelinks[['gendered_total','female','male']].corr()