import pandas as pd
import numpy
import json
from collections import defaultdict
from matplotlib.pylab import style
# Apply the 'fivethirtyeight' matplotlib style to every figure drawn below.
style.use('fivethirtyeight')
# IPython magic: this file is a notebook export, not a plain Python script.
# NOTE(review): later cells use `plt`, `matplotlib` and `legend` without
# importing them -- presumably this magic puts them in scope; confirm.
%pylab inline
# Sentinel that the CSV uses for "no value"; -2147483648 is the smallest
# 32-bit signed integer (the name suggests a Java exporter -- unverified).
java_min_int = -2147483648

# Load the snapshot; cells equal to the sentinel are read as NaN.
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])

def split_column(q_str):
    """Return the first value of a pipe-delimited cell.

    Args:
        q_str: raw cell value; a string such as ``'Q1|Q2|'`` or a float NaN
            for a missing cell.

    Returns:
        The text before the first ``|`` for a string, the NaN itself for a
        float NaN (so missing cells stay missing), and ``None`` for any
        other value.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses such as numpy.float64, whose NaN used to become None.
    if isinstance(q_str, str):
        # The format always ends with a '|', so element 0 is the first value.
        return q_str.split('|')[0]
    if isinstance(q_str, float) and numpy.isnan(q_str):
        return q_str
    return None
    

# Reduce each multi-valued, pipe-delimited column to its first value.
for col in ['place_of_birth','gender', 'citizenship','ethnic_group']:
    allrecs[col] = allrecs[col].apply(split_column)

allrecs.head(5)

# Lookup tables used to turn QIDs into cultures. Each JSON file is read
# inside a `with` block so its handle is closed right away instead of
# being left for the garbage collector.
with open('helpers/aggregation_maps/pobs_map.json','r') as map_file:
    pobs_map = json.load(map_file)
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')

with open('helpers/aggregation_maps/mechanical_turk/ethnic_groups_map.json','r') as map_file:
    ethnic_group_map = json.load(map_file)
with open('helpers/aggregation_maps/mechanical_turk/citizenship_map.json','r') as map_file:
    citizenship_map = json.load(map_file)

def map_pob(qid):
    """Translate a place-of-birth QID into a culture name.

    Args:
        qid: place-of-birth identifier; anything that is not exactly a
            ``str`` is treated as missing.

    Returns:
        ``None`` when ``qid`` is not a str or ``pobs_map`` lists no country
        for it; otherwise the ``'culture_name'`` entry that ``country_map``
        holds for the first listed country.

    Raises:
        KeyError: if a str ``qid`` is absent from ``pobs_map``.
    """
    if type(qid) is not str:
        return None

    countries = pobs_map[qid]
    if len(countries) == 0:
        return None

    # Assumption: when several countries are listed, the first one counts.
    first_country = countries[0]
    return country_map.ix[first_country]['culture_name']

def map_wrapper(m):
    """Wrap mapping ``m`` in a lookup function that tolerates unknown keys.

    Args:
        m: mapping indexed with ``m[key]``.

    Returns:
        A one-argument function giving ``m[key]``, or ``None`` when the
        lookup raises ``KeyError``.
    """
    def lookup(key):
        try:
            value = m[key]
        except KeyError:
            value = None
        return value

    return lookup

# Frame intended to hold rows whose sources suggest different cultures.
mismatch = pd.DataFrame()


# (column, mapper) pairs in the order determine_culture consults them.
# Order is important because it determines the preference we will use.
col_map_fun = [
    ('ethnic_group', map_wrapper(ethnic_group_map)),
    ('citizenship', map_wrapper(citizenship_map)),
    ('place_of_birth', map_pob),
]

def determine_culture(row):
    """Pick a culture for one record from its mapped columns.

    Every (column, mapper) pair in the module-level ``col_map_fun`` is tried
    in order. Each truthy guess replaces the previous one, so the last
    column that yields a guess wins.
    NOTE(review): confirm that "last wins" is the intended preference.

    When two non-None guesses disagree, the row is added to the module-level
    ``mismatch`` frame. Rows accumulate across calls, so reset ``mismatch``
    before a fresh pass if earlier (e.g. timing) runs should not be counted.

    Args:
        row: record indexable by column name (e.g. a DataFrame row).

    Returns:
        The chosen culture as a lower-cased string, or ``None`` when no
        column produced a guess.
    """
    global mismatch
    culture = None
    for col, map_fun in col_map_fun:
        guess = map_fun(row[col])
        if culture is not None and guess is not None and culture != guess:
            # Appending to a DataFrame yields a NEW frame; the result has to
            # be bound back to `mismatch`, otherwise the row is lost.
            mismatch = pd.concat([mismatch, pd.DataFrame([row])],
                                 ignore_index=True)
        if guess:
            culture = guess

    # None is passed through untouched rather than turned into 'none'.
    return str(culture).lower() if culture else culture

# Time a single run (-r 1 -n 1) of determine_culture over the first 2,500 rows.
%%timeit -r 1 -n 1
allrecs.iloc[0:2500].apply(lambda x: determine_culture(x), axis=1)

# Same timing over the first 25,000 rows, to see how the cost scales.
%%timeit -r 1 -n 1
allrecs.iloc[0:25000].apply(lambda x: determine_culture(x), axis=1)

# Real pass: compute a culture for every record.
allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1)

print mismatch

# Cache the enriched frame on disk ...
allrecs.to_json('helpers/world_cultures_shortcut.json')

# ... and reload it from that file (a shortcut for later sessions).
# NOTE(review): the handle returned by open() is never closed explicitly.
allrecs = pd.DataFrame.from_dict(json.load(open('helpers/world_cultures_shortcut.json','r')))

# Rank comparison statistics.
# NOTE(review): `rank_compare` is not defined anywhere in the visible file;
# these cells raise NameError unless it is created elsewhere -- confirm.
import scipy.stats
scipy.stats.spearmanr(rank_compare[['Rank','Rank_wikidata']])

scipy.stats.mannwhitneyu(rank_compare['Rank'],rank_compare['Rank_wikidata'])

scipy.stats.ranksums(rank_compare['Rank'],rank_compare['Rank_wikidata'])

print rank_compare.to_html()

# Reload the country table (same file as loaded near the top of the file).
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')

def map_culture(qid):
    """Return the culture name for a place-of-birth QID, or None.

    Args:
        qid: place-of-birth identifier; values that are not exactly ``str``
            count as missing.

    Returns:
        ``None`` for a non-str ``qid`` or when ``pobs_map`` has an empty
        country list for it; otherwise ``country_map``'s ``'culture_name'``
        for the first country in that list.

    Raises:
        KeyError: if a str ``qid`` has no entry in ``pobs_map``.
    """
    if type(qid) is not str:
        return None

    candidates = pobs_map[qid]
    if len(candidates) == 0:
        return None

    # Assumption: the first listed country is the relevant one.
    chosen = candidates[0]
    return country_map.ix[chosen]['culture_name']

# Replace the 'culture' column with a value derived from place of birth only.
allrecs['culture'] = allrecs['place_of_birth'].apply(map_culture)

import math
import pywikibot
# Transforming QIDs into English labels.
# Site handle for English Wikipedia and the data repository attached to it.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

# Cache of QID -> label filled by english_label, so each QID is fetched once.
retrieved = dict()

def english_label(qid):
    """Return the English label for a QID, caching results in ``retrieved``.

    Args:
        qid: item identifier; falsy values and float NaN count as missing.

    Returns:
        ``None`` for a missing ``qid``. Otherwise the cached label if there
        is one, else the ``'en'`` label fetched through pywikibot. When the
        fetch raises ``KeyError`` (e.g. no ``'en'`` label) the QID itself is
        cached and returned.
    """
    if not qid:
        return None
    if type(qid) is float and math.isnan(qid):
        return None

    # Serve from the cache when this QID has been resolved before.
    if qid in retrieved:
        return retrieved[qid]

    try:
        item = pywikibot.ItemPage(wikidata, qid)
        label = item.get()['labels']['en']
    except KeyError:
        # Fall back to the raw QID so the lookup is not retried.
        label = qid
    retrieved[qid] = label
    return label

# Single lookup; in the notebook this displays the label for this QID.
english_label('Q6581097')

# Human-readable gender label for every record (lookups are cached per QID).
allrecs['gender_name'] = allrecs['gender'].apply(english_label)

# Two-column extract written out for the chi-squared test.
outdf = allrecs[['gender_name','culture']]

outdf.to_csv('helpers/Chi_Squared_Test_Data.csv')

def test(x):
    """Return True when a cell holds a value (not None and not a float NaN)."""
    if isinstance(x, float):
        return not math.isnan(x)
    return x is not None


# Per-column coverage: how many records carry a value, and what share of all
# records that is. The predicate is defined once (it used to be re-created on
# every iteration) and True values are summed instead of materialising a
# filtered copy of the frame just to take its length.
has = defaultdict(dict)
total_records = float(len(allrecs))
for col in allrecs.columns:
    nonempty = int(allrecs[col].apply(test).sum())

    has[col]['Items with property'] = nonempty
    has[col]['% of total'] = nonempty / total_records

hasdf = pd.DataFrame.from_dict(has, orient='index')

# Single-argument print form: valid as a Python 2 statement and a Python 3 call.
print(hasdf.sort('% of total').to_html(
    justify='right',
    formatters={'% of total': lambda x: '%.2f' % (x*100),
                'Items with property': lambda x: '{0:,}'.format(x)}))

# Successive filters: keep records with a date of birth, then those that
# also have a gender, then those that also have a culture. The bare len()
# and head() expressions are notebook displays of each stage.
dob_known = allrecs['dob'].apply(lambda dob: not math.isnan(dob))
hasdob = allrecs[dob_known]
len(hasdob)

# Non-float gender values (QID strings) always pass; floats pass unless NaN.
gender_known = hasdob['gender'].apply(
    lambda g: type(g) is not float or not math.isnan(g))
hasgender = hasdob[gender_known]
len(hasgender)

culture_known = hasgender['culture'].apply(lambda c: c is not None)
hascult = hasgender[culture_known]
len(hascult)

hascult.head()

culture_groups = hascult.groupby('culture')

def make_perc_series(df):
    """Compute, per date of birth, the share of genders other than 'Q6581097'.

    Args:
        df: DataFrame with ``'dob'`` and ``'gender'`` columns.

    Returns:
        A float Series indexed by ``dob``. Each value is the number of
        non-null genders different from ``'Q6581097'`` (presumably
        Wikidata's "male" item -- confirm) divided by the number of non-null
        genders for that ``dob``. Empty when ``df`` has no ``dob`` groups.
    """
    years_per = dict()
    for year, group in df.groupby('dob'):
        genders = group['gender']
        nmcount = genders[genders != 'Q6581097'].count()
        totalcount = genders.count()
        years_per[year] = nmcount / float(totalcount)

    # Built once after the loop: building it per iteration was wasted work
    # and left the result undefined when there were no groups at all.
    # pd.Series is the class the old pd.TimeSeries name referred to.
    return pd.Series(data=years_per, dtype=float)
    
# One percentage series per culture, keyed by culture name.
perc_dict = dict()
for name, group in culture_groups:
    perc_dict[name] = make_perc_series(group)

# Build the frame BEFORE previewing it: the preview used to come first and
# raised NameError when the file was run top to bottom.
perc_df = pd.DataFrame.from_dict(perc_dict)
perc_df.tail(10)

# Sample the frame every int(200/6.0) == 33 years from 1800 and export it.
years = range(1800,2000,int(200/6.0))
subbd_df = perc_df.ix[years]
infogram = subbd_df
infogram.to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')

# Two side-by-side panels: the whole timeline smoothed with a 100-year
# rolling mean, and the period from 1800 smoothed with a 10-year one.
fig, (full, modern) = plt.subplots(1,2, figsize=(20,6))

end_year = 2000

for start_year, ra_len, ax in zip((-1000,1800), (100,10), (full, modern)):
    ra_dict = dict()
    for name, series in perc_dict.items():
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per  = pd.DataFrame(ra_dict)

    if start_year == 1800:
        # NOTE(review): this writes the 10-year smoothed values to the same
        # path the `infogram` frame is exported to, replacing that file.
        cult_dob_per.ix[years].to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')

    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False,zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    # Floor division keeps the tick step an integer, as range() requires.
    ax.set_xticks(range(start_year, end_year,(end_year-start_year) // 16))
    ax.set_ylim((0,0.6))
    ax.set_title(u'{}—{}, with {} year Rolling Average'.format(start_year, end_year,ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))

# Call the legend method of `full` itself. The old code assigned a Legend
# object over `full.legend` (clobbering the method) and created it with the
# pylab-level legend(), which targets the implicit current axes.
full.legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
full.set_xticks(range(-1000, end_year,(end_year+1000) // 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)


# Same two panels as a stacked (2 rows x 1 column) figure with independent
# x axes: full timeline with a 100-year rolling mean on top, the period from
# 1800 with a 10-year rolling mean below.
fig, (full, modern) = plt.subplots(2,1, figsize=(12,8), sharex=False)

end_year = 2000

for start_year, ra_len, ax in zip((-1000,1800), (100,10), (full, modern)):
    ra_dict = dict()
    for name, series in perc_dict.items():
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per  = pd.DataFrame(ra_dict)
    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False,zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    # Floor division keeps the tick step an integer, as range() requires.
    ax.set_xticks(range(start_year, end_year,(end_year-start_year) // 16))
    ax.set_ylim((0,0.6))
    ax.set_title(u'{}—{}, with {} year Rolling Average'.format(start_year, end_year,ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))

# Call the legend method of `full` itself. The old code assigned a Legend
# object over `full.legend` (clobbering the method) and created it with the
# pylab-level legend(), which targets the implicit current axes.
full.legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
# The x ticks of `full` are left as set inside the loop.
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)


# Records with a date of birth, narrowed to those that also have a culture
# (gender is not required here).
dob_present = allrecs['dob'].apply(lambda dob: not math.isnan(dob))
dobexists = allrecs[dob_present]
culture_present = dobexists['culture'].apply(lambda c: c is not None)
dobcultureexists = dobexists[culture_present]
len(dobcultureexists)

# Only the two columns needed for the totals, grouped by culture.
culture_groups = dobcultureexists[['dob','culture']].groupby(by='culture')

def make_tot_series(df):
    """Count, per date of birth, the records that have a culture.

    Args:
        df: DataFrame with ``'dob'`` and ``'culture'`` columns.

    Returns:
        A Series indexed by ``dob`` whose values are the number of non-null
        ``'culture'`` entries for that ``dob``. Empty when ``df`` has no
        ``dob`` groups.
    """
    years_tot = dict()
    for year, group in df.groupby('dob'):
        years_tot[year] = group['culture'].count()

    # Built once after the loop: building it per iteration was wasted work
    # and left the result undefined when there were no groups at all.
    # pd.Series is the class the old pd.TimeSeries name referred to.
    return pd.Series(data=years_tot)
    
# One yearly-totals series per culture, keyed by culture name.
tot_dict = dict()
for name, group in culture_groups:
    tot_dict[name] = make_tot_series(group)

# One figure per (start year, rolling-average window) combination.
end_year = 2014
for start_year in [1500, 1800]:
    for ra_len in [2, 5, 10]:
        ra_dict = dict()
        for name, series in tot_dict.items():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)

        cult_dob = pd.DataFrame(ra_dict)
        # Bind the returned Axes to `ax`; the old code bound it to `plt`,
        # which overwrote the pyplot module name for the rest of the session.
        ax = cult_dob.plot(figsize=(20,6),  cmap='Set2', linewidth=1.5)
        ax.set_xlim((start_year,end_year))
        # Floor division keeps the tick step an integer, as range() requires.
        ax.set_xticks(range(start_year,end_year,(end_year-start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth |  %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)

# Same totals plot for two early ranges (-2000..1000 and -1000..1500), drawn
# on a logarithmic y axis, one figure per rolling-average window.
for start_year, end_year in zip([-2000, -1000], [1000,1500]):
    for ra_len in [1,2,10]:
        ra_dict = dict()
        for name, series in tot_dict.items():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # Bind the returned Axes to `ax`; the old code bound it to `plt`,
        # which overwrote the pyplot module name for the rest of the session.
        ax = cult_dob.plot(figsize=(20,6),  cmap='Set2', linewidth=1.5)
        # NOTE(review): a lower limit of 0 cannot be shown on a log axis;
        # confirm the limits that matplotlib ends up using are the wanted ones.
        ax.set_ylim((0,50))
        ax.set_yscale('log')
        ax.set_xlim((start_year,end_year))
        # Floor division keeps the tick step an integer, as range() requires.
        ax.set_xticks(range(start_year,end_year,(end_year-start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth |  %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)