import pandas
import math
import datetime
import os
import json

import matplotlib.pyplot as plt

# Use the 'fivethirtyeight' matplotlib style sheet for the plots below.
plt.style.use('fivethirtyeight')

# IPython magic: this file is a notebook export and only runs under IPython.
# NOTE(review): later cells use bare `numpy`, `pylab`, `matplotlib`, `legend`
# and `subplots_adjust` without importing them -- presumably they are injected
# by this magic; confirm before running outside a notebook.
%pylab inline

# Passed as `na_values` to pandas.read_csv below, so this value is read as NaN.
java_min_int = -2147483648
# Column order used when reindexing the date-of-birth/death frames for plotting.
gender_ordered = [u'female', u'male', u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']

import pywikibot
# Transforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

# Cache of qid -> label, filled in by english_label().
retrieved = dict()

def english_label(qid):
    """Return the English Wikidata label for *qid*, caching every lookup.

    Results, including failed lookups, are stored in the module-level
    ``retrieved`` dict so each id is requested from Wikidata at most once.

    Args:
        qid: A Wikidata item id (e.g. ``'Q5'``) or a float NaN.

    Returns:
        NaN unchanged when *qid* is a NaN float; the item's English label
        when it can be fetched; otherwise *qid* itself.
    """
    # Missing values pass straight through without touching the cache.
    if isinstance(qid, float) and math.isnan(qid):
        return qid

    # First see if we've already done it.
    try:
        return retrieved[qid]
    except KeyError:
        pass

    try:
        page = pywikibot.ItemPage(wikidata, qid)
        label = page.get()['labels']['en']
    except Exception:
        # Best effort: any lookup failure (invalid id, missing English label,
        # network error) falls back to the raw id. ``Exception`` rather than a
        # bare ``except`` so Ctrl-C / SystemExit still interrupt a long run.
        label = qid

    retrieved[qid] = label
    return label

def engify_labels(df, index=False):
    """Replace the labels on one axis of *df* with English labels, in place.

    Every label is converted with ``str`` and passed through
    ``english_label``; labels that cannot be resolved are kept as their
    string form (so a NaN label becomes ``'nan'``).

    Args:
        df: DataFrame to relabel. It is modified in place.
        index: When true relabel ``df.index``; otherwise relabel
            ``df.columns``.

    Returns:
        None.
    """
    axis = df.index if index else df.columns
    labels = [english_label(str(q)) for q in axis]
    # The labels must be assigned back onto the frame; rebinding a local
    # name would leave df untouched.
    if index:
        df.index = labels
    else:
        df.columns = labels

# Load a single snapshot; java_min_int entries are treated as missing values.
df = pandas.read_csv('snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv', 
                                 na_values=[java_min_int])

# IPython shell passthrough: list the snapshot folders.
ls snapshot_data/

#in case these things need ungzipping
snap_folders = !ls snapshot_data/
for folder in snap_folders:
    # -k keeps the original .gz file next to the extracted one.
    !gunzip -k snapshot_data/$folder/*.gz

# Collect the first gender-index CSV path found in every snapshot folder.
folders = !ls snapshot_data/
locs = []
for folder in folders:
        loc = !ls snapshot_data/$folder/gender-index*.csv
        locs.append(loc[0])
print locs

# Key each frame by the 10 characters before the '.csv' extension; for the
# file name used above that is the date '2014-09-17'.
# NOTE(review): assumes every path ends in 'YYYY-MM-DD.csv' -- confirm.
snap_dfs = {loc[-14:-4]: pandas.read_csv(loc, na_values=[java_min_int]) for loc in locs}

def split_column(q_str):
    """Split a ``'|'``-terminated string of values into a list.

    Args:
        q_str: A string such as ``'Q1|Q2|'``, or a float NaN marking a
            missing value.

    Returns:
        The list of values for string input. NaN input comes back as the
        one-element list ``[nan]`` so the resulting column always contains
        lists. Any other input yields ``None``.
    """
    if isinstance(q_str, float) and math.isnan(q_str):
        # Returning it this way so we can guarantee the column contains lists.
        return [q_str]
    if isinstance(q_str, str):
        values = q_str.split('|')
        # The format ends with a '|', which leaves one empty trailing piece.
        # Drop only that piece so a string without the terminator does not
        # silently lose its last value.
        if values[-1] == '':
            values.pop()
        return values
    return None

# Add a list-valued "<column>s" companion for every '|'-delimited column.
for snap, df in snap_dfs.items():
    for column in ['gender', 'ethnic_group', 'citizenship', 'place_of_birth', 'site_links']:
        column_plural = column+'s'
        df[column_plural] = df[column].apply(split_column)

# Snapshot keys are date strings, so min/max pick the earliest/latest key.
latest = snap_dfs[max(snap_dfs.keys())]
earliest = snap_dfs[min(snap_dfs.keys())]

latest.query('dob == 999')

# Print a link for every item with dob == 999 in each of the two snapshots.
# The query has to run on the loop variable; querying `latest` both times
# would list the latest snapshot twice.
for df in [earliest, latest]:
    for qid in df.query('dob == 999')['qid']:
        print('http://wikidata.org/wiki/'+qid)
    print('\n')

from collections import defaultdict
import time

def make_reindex(snap_df):
    """Cross-tabulate gender against several properties of a snapshot.

    For every row, each value of each tracked property is counted once per
    gender listed in the row's ``genders`` cell. Rows whose ``genders`` cell
    is NaN, and property cells that are NaN, are skipped.

    Args:
        snap_df: DataFrame holding the columns named in ``params`` below.
            List cells are counted element by element; any other non-NaN
            cell is counted as a single value.

    Returns:
        A dict mapping each property name to a DataFrame whose columns are
        genders, whose index is the property's values and whose cells are
        the counts.
    """
    def is_missing(value):
        # math.isnan only accepts numbers; lists and strings are never NaN.
        try:
            return math.isnan(value)
        except TypeError:
            return False

    # Abstracted: we want year-gender, but also gender against ethnicity,
    # citizenship, place of birth and site links.
    params = ['dob','dod','genders','ethnic_groups','citizenships','place_of_births','site_linkss']
    counts = {param: defaultdict(lambda: defaultdict(int)) for param in params}

    for _, row in snap_df.iterrows():
        genders = row['genders']
        if is_missing(genders):
            continue
        for param in params:
            values = row[param]
            if is_missing(values):
                continue
            if type(values) is not list:
                # Scalars (e.g. a year) count as one value.
                values = [values]
            per_gender = counts[param]
            for gender in genders:
                for value in values:
                    per_gender[gender][value] += 1

    return {param: pandas.DataFrame.from_dict(counts[param], orient='columns')
            for param in params}

# For every snapshot: {property name: gender cross-tab DataFrame}.
gender_indexes = {snap: make_reindex(df) for snap, df in snap_dfs.items()}

# Relabel the columns of every cross-tab with English labels.
for snap, gender_dfs in gender_indexes.items():
    for param, gender_df in gender_dfs.items():
        print(param)
        engify_labels(gender_df)


# Write each snapshot's cross-tabs into that snapshot's own folder. The outer
# loop must bind `gender_dfs` so the inner loop walks the current snapshot's
# tables rather than whatever an earlier loop left behind.
for snap, gender_dfs in gender_indexes.items():
    property_index_dir = 'snapshot_data/%s/property_indexes' % (snap)
    if not os.path.exists(property_index_dir):
        os.makedirs(property_index_dir)
    for param, gender_df in gender_dfs.items():
        filename = '%s/%s-index.csv' % (property_index_dir, param)
        # Context manager closes the file even if to_csv() raises.
        with open(filename, 'w') as filepoint:
            filepoint.write(gender_df.to_csv())

latest_date = max(snap_dfs.keys())
earliest_date = min(snap_dfs.keys())

# Gender counts recorded for the year 999 in the latest snapshot.
gender_indexes[latest_date]['dob'].ix[999]

# Stacked area plot over the whole date-of-birth range.
gender_dfs['dob'].plot(kind='area',stacked=True, figsize=(24,8))


# Same plot restricted to 1800 onwards. The returned Axes gets its own name:
# binding it to `plt` would replace the imported matplotlib.pyplot module.
dob_ax = gender_dfs['dob'].ix[1800:].plot(kind='area',stacked=True, figsize=(24,8))
dob_ax.set_title('''Wikidata Biography Gender Quantities by Year
    1800 onwards''', size=24)
dob_ax.set_xlabel('Year', size=18)
dob_ax.set_ylabel('Biographies', size=18)
dob_ax.set_xlim((1800,2014))
dob_ax.legend(title='Gender', loc=2)

nonmale_early = gender_indexes[earliest_date]['dod'].copy(deep=True)
nonmale_late = gender_indexes[latest_date]['dod'].copy(deep=True)

# Add the non-male count and its share of all gendered entries to both copies.
for frame in (nonmale_early, nonmale_late):
    del frame['nan']
    frame['nonmale'] = frame.sum(axis=1) - frame['male']
    frame['nm_per'] = (frame['nonmale'] ) / (frame['nonmale'] + frame['male'])

# figsize is a keyword argument of plot(); each Axes gets its own name so the
# pyplot module `plt` is not overwritten.
early_ax = nonmale_early['nm_per'].plot(figsize=(24,8), kind='line')
early_ax.set_xlim((-1000,2014))

# 10-row rolling mean of the early non-male share.
ma = pandas.rolling_mean(nonmale_early['nm_per'], 10)

ma_ax = ma.plot(figsize=(24,8))
ma_ax.set_xlim((-1000,2014))

# Set the limits on the Axes this plot call returns, not on an earlier one.
late_ax = nonmale_late['nm_per'].plot(figsize=(24,8))
late_ax.set_xlim((-1000,2014))

gender_ordered

['Date of Birth']*2

plt.style

# Reload the date-of-birth cross-tab written for the 2014-10-13 snapshot.
infogram = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/dob-index.csv')
infogram.fillna(0, inplace=True)
# Drop the column labelled 'nan' before totalling.
del infogram['nan']
infogram['total'] = infogram.sum(axis=1)
# Everything that is neither 'male' nor 'female'.
infogram['nonbin'] = infogram['total'] - infogram['male'] - infogram['female']
infogram['fem_per'] = infogram['female']  / (infogram['total'])
infogram['nonbin_per'] = infogram['nonbin'] / infogram['total']
# 10-row rolling mean of both ratios; min_periods=10 requires a full window.
rm = pandas.rolling_mean(infogram[['fem_per','nonbin_per']], 10, min_periods=10)

# int(200/6.0) == 33, so this samples the years 1800, 1833, ..., 1998.
partyears = range(1800,2000,int(200/6.0))
# Export the sampled rows transposed (one column per sampled year).
rm.ix[partyears][['nonbin_per','fem_per']].T.to_csv('Magnus Gender analysis/infogram_dob_rm.csv')
infogram.ix[partyears][['nonbin_per','fem_per','nonbin']].T.to_csv('Magnus Gender analysis/infogram_dob.csv')

rm.ix[1880:1910]['nonbin_per']

# 2x2 grid of area plots sharing the y axis. The zip pairs the four axes with
# a start year (-4000 or 1800), a suffix ('b' for the first two panels, 'd'
# for the last two) and a (title, y-label) pair.
fig, axes = pylab.subplots(nrows=2, ncols=2, sharey=True)
for ax, beginning, l, (xtext, ytext) in zip(axes.ravel(), [-4000, 1800] * 2, ['b']*2+['d']*2, [('-4000 BCE to present','Date of Birth'),('1800 CE to present',''),('','Date of Death'),('','')]):
    # 'dob' or 'dod': selects which index CSV to load.
    acro = 'do'+l
    df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro)
    del df['nan']
    # Put the gender columns in the fixed gender_ordered order.
    df   = df.reindex_axis(gender_ordered,axis=1)
    p = df.plot(kind='area', figsize=(9,10), cmap='Accent', ax=ax, legend=False, linewidth=1)
    p.set_xlim((beginning,2014))
    #p.set_xlabel(xtext)
    p.set_ylabel(ytext)
    # xtext is shown as the panel title rather than as an x label.
    p.set_title(xtext, fontsize=12)

# NOTE(review): this assigns the result of legend() to the attribute
# `fig.legend`, replacing the figure's own legend method -- confirm intended.
fig.legend = legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0)
fig.suptitle('Volumes of Dates of Birth and Death in Wikidata \n by Gender \n Total and Modern Timeframes', fontsize=24)
fig.tight_layout()
subplots_adjust(hspace=0.1, top=0.82)


# Rolling-mean window length; 1 leaves the series unsmoothed.
ra_len = 1

# One column per date type ('dob', 'dod'): female ratio and non-binary ratio.
dox = pandas.DataFrame()
nonbindox = pandas.DataFrame()

for l in ['b','d']:
    acro = 'do'+l
    df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro)
    del df['nan']
    df['total'] = df.sum(axis=1)
    # Everything that is neither 'male' nor 'female'.
    df['nonbin'] = df['total'] - df['male'] - df['female']
    df['fem_per'] = df['female']  / (df['total'])
    df['nonbin_per'] = df['nonbin'] / df['total']
    
    ra = pandas.rolling_mean(df['fem_per'], ra_len)
    dox[acro] = ra
    
    nonbinra = pandas.rolling_mean(df['nonbin_per'], ra_len)
    nonbindox[acro] = nonbinra

# Two stacked panels sharing the x axis: female ratio on top, non-binary below.
fig, (pltf, pltb) =  pylab.subplots(nrows=2, ncols=1, sharex=True, figsize=(9,6))
dox.plot(kind='line',  cmap='Paired', linewidth=2, ax=pltf)
pltf.set_xlim((1400,2014))
pltf.set_ylim((0,0.7))
# Show tick values as whole-number percentages.
pltf.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{}%'.format(int(x*100) )))
pltf.set_title('Female ratio')
pltf.legend(('Date of Birth', 'Date of Death'),loc=4, bbox_to_anchor=(1.25,-0.25))

nonbindox.plot(kind='line',  cmap='Paired', linewidth=2, ax=pltb, legend=False)
pltb.set_xlim((1400,2014))
# Percent formatting without int(), so fractional percentages are kept.
pltb.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{}%'.format(x*100)))
pltb.set_title('Non-binary Ratio')

fig.suptitle('Composition of Wikidata Genders in Modern History', fontsize=24)
fig.subplots_adjust(top=0.87)


# Work on a deep copy of the latest snapshot's place-of-birth cross-tab.
pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)

len(pob)

# IPython timing magic (one loop, one repeat).
# NOTE(review): no statement follows on this line -- presumably left over
# from a cell magic that timed the cell below; confirm.
%timeit -n 1 -r 1
def is_or_has_country(qid):
    """Return the country QIDs associated with the Wikidata item *qid*.

    Two properties of the item's claims are inspected:

    * ``P17``: the title of every claim target is collected (the item is
      part of that country).
    * ``P31``: *qid* itself is collected for every claim whose target title
      is ``'Q6256'`` (the item actually is a country).

    Claims without a target are ignored.

    Args:
        qid: Wikidata item id to look up.

    Returns:
        A list of QID strings; empty when neither property yields a country.
    """
    page = pywikibot.ItemPage(wikidata, qid)
    claims = page.get()['claims']

    countries = []  # we're going to return this
    for pid, claimlist in claims.items():
        if pid not in ('P17', 'P31'):
            continue
        for claim in claimlist:
            target = claim.target
            if target is None:
                # No target to read a title from; calling .title() on it
                # would raise AttributeError.
                continue
            if pid == 'P17':
                countries.append(target.title())
            elif target.title() == 'Q6256':
                countries.append(qid)

    return countries

place_country = dict()

# Skip position 0 because the first index entry is nan.
places = pob.index[1:]

count = 0
for count, place in enumerate(places, 1):
    place_country[place] = is_or_has_country(place)
    if count % 100 == 0:
        # Progress indicator for the long run of Wikidata lookups.
        print(count)

pobs = list(places)
# Context manager so the JSON file is flushed and closed deterministically.
with open('pobs_list.json','w') as pobs_file:
    json.dump(pobs, pobs_file)

# Place QID -> list of country QIDs, loaded from the helper file.
with open('helpers/pobs_map.json','r') as pobs_map_file:
    pobs_map = json.load(pobs_map_file)

# First listed country of every place that has at least one.
have_country = [c[0] for c in pobs_map.values() if len(c) != 0]

# Number of distinct first countries.
len(set(have_country))

# Share of places with at least one country.
len(have_country) / float(len(pobs_map))

have_no_country = [p for p, c in pobs_map.items() if len(c) == 0]

len(have_country)

len(have_no_country)

for place in have_no_country[10:20]:
    print('http://wikidata.org/wiki/'+place)

# Total number of country entries over places that have any. This has to
# measure the lists in pobs_map: the items of have_country are single QID
# strings, whose len() is just the number of characters in the id.
country_lengths = sum(len(c) for c in pobs_map.values() if c)

# Mean number of countries per place that has a country.
country_lengths / float(len(have_country))

for place, country_list in pobs_map.items():
    if len(set(country_list)) > 1:  # more than one distinct country listed
        pass#print ['http://wikidata.org/wiki/'+place for place in country_list]

import IPython.display

# Show the reference image inline in the notebook (the file name suggests
# it is the Inglehart values map).
IPython.display.Image('helpers/Inglehart_Values_Map2.svg.png')

# All non-empty country lists from pobs_map.
coun = [c for c in pobs_map.itervalues() if c]

# Interactively assign a culture number to the first country of every place.
country_culture = dict()
for place, country_list in pobs_map.items():
    if not country_list:
        continue
    qid = country_list[0]
    # Membership test on the dict itself is a hash lookup; `.keys()` builds
    # a list on Python 2 and scans it.
    if qid in country_culture:
        continue
    link = 'http://wikidata.org/wiki/%s' % qid
    print(english_label(qid))
    print(link)
    # NOTE(security): on Python 2, input() evaluates whatever is typed as a
    # Python expression. Acceptable for a local interactive session only;
    # never feed it untrusted text.
    culture_num = input('enter culture num')
    country_culture[qid] = culture_num
        

# Write one "<qid>\t<english label>\t" line per distinct first country.
seen = set()  # set membership is O(1); a list would be scanned every time
# Context manager closes the file even if a label lookup raises.
with open('helpers/pob_agg.txt','w') as fp:
    for place, country_list in pobs_map.items():
        if not country_list:
            continue
        qid = country_list[0]
        if qid in seen:
            continue
        enlab = english_label(qid)
        writestr = u'%s\t%s\t\n' % (qid, enlab)
        fp.write(writestr.encode('utf-8'))
        seen.add(qid)


# Culture number -> culture name.
culture_map = {
    1: 'confucian',
    2: 'orthodox',
    3: 'islamic',
    4: 'south asia',
    5: 'africa',
    6: 'catholic europe',
    7: 'protestant europe',
    8: 'english speaking',
    9: 'latin america',
}

# NOTE(review): cultures_df is only assigned on the next statement, so on a
# fresh top-to-bottom run this line raises NameError; it presumably relied
# on notebook state from a cell that is no longer here -- confirm.
cultures_df.to_csv('helpers/culture_names.csv')

cultures_df = pandas.DataFrame.from_csv('helpers/culture_names.csv')
cultures_df['qid'] = cultures_df.index
# The column must be spelled 'culture_name': that is the name read back
# when the country -> culture mapping is built.
cultures_df['culture_name'] = cultures_df['culture_number'].apply(lambda x: culture_map[x])

pob = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/place_of_births-index.csv')
pob['qid'] = pob.index
#pob = pob.ix[1:] #remove nan row

# Place QID -> list of country QIDs.
with open('helpers/pobs_map.json','r') as pobs_map_file:
    qid_countryqid = json.load(pobs_map_file)

def qid_to_country(qid):
    """Look up the country entry recorded for a place QID.

    Args:
        qid: A key of the module-level ``qid_countryqid`` mapping, or a
            float NaN for a missing place.

    Returns:
        ``qid_countryqid[qid]`` for any non-float input, ``'no_data'`` for
        a NaN float, and ``None`` for any other float.
    """
    if type(qid) is not float:
        return qid_countryqid[qid]
    if math.isnan(qid):
        return 'no_data'
    
# Resolve every place QID to its country entry.
pob['country_qid'] = pob['qid'].apply(qid_to_country)

# Country QID -> culture name, built from the culture table.
country_culture = {
    country: culture
    for country, culture in zip(cultures_df['qid'], cultures_df['culture_name'])
}

def aggregate_culture(qid_list):
    """Map a place's country entry to a culture name.

    Args:
        qid_list: Either the string ``'no_data'`` or a sequence of country
            QIDs.

    Returns:
        ``'no_data'`` when given ``'no_data'``;
        ``'not_easily_aggregatable'`` for an empty sequence; otherwise the
        ``country_culture`` entry of the first element.
    """
    is_list = type(qid_list) is list
    if not is_list and qid_list == 'no_data':
        return 'no_data'
    if len(qid_list) == 0:
        return 'not_easily_aggregatable'
    # Only the first country is used.
    first_country = qid_list[0]
    return country_culture[first_country]

# Culture name for every place, derived from its country entry.
pob['culture_name'] = pob['country_qid'].apply(lambda qid_list: aggregate_culture(qid_list))

# Sum the gender count columns within each culture.
culture_groups = pob.groupby(by=['culture_name'])[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey'].sum().copy(deep=True)


# The raw per-culture counts are saved before the 'total' column is added.
culture_groups.to_csv('helpers/pob_plot_data_oct.csv')

culture_groups['total'] = culture_groups.sum(axis=1)

# Divide every row by its own total so each row holds shares.
normed_pobs_agg = culture_groups.apply(lambda x: x/ float(x['total']), axis=1)

# columns[:-1] drops the last column, which is the 'total' added above.
pobs_plot = normed_pobs_agg.sort('female')[normed_pobs_agg.columns[:-1]]
pobs_plot_mf = normed_pobs_agg.sort('female')[['male','female']]
# Every gender column except 'male' and 'female'.
pobs_plot_nmf = normed_pobs_agg[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']]


pobs_plot.plot(kind='bar', figsize=(10,10))

# Both charts below carry the same title.
pob_plot_title = 'Place of Birth Gender Composition by 9 world cultures.\nNo Data are items without Birth Data\nNot Easily Aggregratable means Place of Birth did Claim did not have a Country Property.(7%)'

# Male/female shares per culture. The Axes gets its own name so the pyplot
# module `plt` is not overwritten.
mf_ax = pobs_plot_mf.plot(kind='bar', figsize=(12,5), cmap='Paired')
mf_ax.set_title(pob_plot_title)
mf_ax.set_ylabel('Gender Composition')
mf_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)


# Shares of every other gender per culture.
nmf_ax = pobs_plot_nmf.plot(kind='bar', figsize=(12,5), cmap='Accent')
nmf_ax.set_title(pob_plot_title)
nmf_ax.set_ylabel('Gender Composition')
nmf_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)


pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)

eg = gender_indexes[latest_date]['ethnic_groups'].copy(deep=True)

# The in-memory copy is immediately replaced by the table saved on disk.
eg = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/ethnic_groups-index.csv')

len(eg)

# Drop the first row and treat missing counts as zero.
eg=eg.ix[1:].fillna(value=0)
eg['total'] = eg.sum(axis=1)

# Capture the index values before the index relabelling call, so the 'qid'
# column is filled from the index as loaded.
eg['qid'] = eg.index
engify_labels(eg,index=True)

eg['ethnic_name'] = eg['qid'].apply(lambda x: english_label(x))

eg.sort('total', ascending=False).head(50)

# Gender count columns plus the row total, used for both normalisations.
eg_ratio_columns = [u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey', 'total']

# Divide every row by its own total so each row holds shares.
eg_normed = eg[eg_ratio_columns].apply(lambda x: x/x['total'], axis=1)


# Same normalisation restricted to groups with more than one entry.
eg_cut = eg[eg['total']>1]
eg_cut_normed = eg_cut[eg_ratio_columns].apply(lambda x: x/x['total'], axis=1)


engify_labels(eg_normed, index=True)

eg_normed.sort(columns=['female'], ascending=False)['female'].plot(kind='bar', figsize=(18,8))

def export_for_crowd_aggregate(df, savename):
    """Write a CSV template for manually grouping the index values of *df*.

    The file ``helpers/<savename>_map.csv`` gets one row per index value
    with three columns: ``qid`` (the index value), ``en_label`` (its
    English label, UTF-8 encoded) and an empty ``aggregate_group``.

    Args:
        df: DataFrame whose index values are exported.
        savename: Prefix of the output file name.
    """
    def utf8_label(qid):
        return english_label(qid).encode('utf-8')

    export = pandas.DataFrame()
    export['qid'] = df.index
    export['en_label'] = export['qid'].apply(utf8_label)
    # Left blank on purpose: to be filled in by hand.
    export['aggregate_group'] = ''

    target_path = 'helpers/%s_map.csv' % savename
    export.to_csv(target_path)

# NOTE(review): the export uses eg.index as its 'qid' column, and
# engify_labels(eg, index=True) was called on eg earlier -- confirm the
# index still holds QIDs at this point.
export_for_crowd_aggregate(eg, 'ethnic_groups')

cz = gender_indexes[latest_date]['citizenships'].copy(deep=True)

# The in-memory copy is immediately replaced by the table saved on disk.
cz = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/citizenships-index.csv')

cz = cz.ix[1:] # remove the initial nan row, which holds items that had no citizenship

len(cz)

export_for_crowd_aggregate(cz, 'citizenships')

sl = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/site_linkss-index.csv')

len(sl)

yg_reg = yg
yg_reg['year'] = yg_reg.index

yg_reg['lnmale'] = numpy.log(yg_reg['male'])

# yg_reg is the same object as yg, so 'year' and 'lnmale' exist on yg only
# from here on; the scatter plot therefore has to come after they are made.
yg.plot(kind='scatter', x='year', y='lnmale')
plt.draw()

import statsmodels.api as sm

# NOTE(review): start_year is only assigned by the loop further down, so on
# a fresh top-to-bottom run it is undefined here -- presumably this cell was
# run after the loop in the notebook; confirm the intended value.
nonnan = yg_reg.ix[start_year:1986].fillna(value=0)
model = sm.OLS(nonnan['male'],nonnan['year'])
results = model.fit()
print(results.summary())
print(start_year, results.rsquared)

# R^2 of a linear fit of non-male share on year, for a range of start years.
# Rows are collected in a list and turned into a frame once, instead of
# copying the whole frame on every iteration.
rsquared_rows = []
for start_year in numpy.arange(-4000,1950,50):
    window = yg_reg.ix[start_year:1986]
    nm_model = sm.OLS(window['nm_per'], sm.add_constant(window['year']))
    nm_results = nm_model.fit()
    rsquared_rows.append({'start_year':start_year, 'rsquared':nm_results.rsquared})
rsquared_results = pandas.DataFrame(rsquared_rows, columns=['start_year','rsquared'])

ax = rsquared_results[rsquared_results['start_year']>500].plot(kind='line',x='start_year',y='rsquared', 
                                                                title=r'$R^2$ value for linear regression on non-male percentage')

ax.set_xlabel('starting year of regresssion untilt 1987')
ax.set_ylabel(r'$R^2$')