import pandas as pd import numpy import json from collections import defaultdict from matplotlib.pylab import style style.use('fivethirtyeight') %pylab inline java_min_int = -2147483648 allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int]) def split_column(q_str): if type(q_str) is float: if numpy.isnan(q_str): return q_str if type(q_str) is str: qs = q_str.split('|') return qs[0] #cos the format will always end with a | for col in ['place_of_birth','gender', 'citizenship','ethnic_group']: allrecs[col] = allrecs[col].apply(split_column) allrecs.head(5) pobs_map = json.load(open('helpers/aggregation_maps/pobs_map.json','r')) country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv') ethnic_group_map = json.load(open('helpers/aggregation_maps/mechanical_turk/ethnic_groups_map.json','r')) citizenship_map = json.load(open('helpers/aggregation_maps/mechanical_turk/citizenship_map.json','r')) def map_pob(qid): if not type(qid) is str: return None else: country_list = pobs_map[qid] if len(country_list) == 0: return None else: country = country_list[0] #assumption culture = country_map.ix[country]['culture_name'] return culture def map_wrapper(m): def return_fun(qid): try: return m[qid] except KeyError: return None return return_fun mismatch = pd.DataFrame() #order is important because it determines the preference we will use col_map_fun = zip(['ethnic_group', 'citizenship', 'place_of_birth'], [map_wrapper(ethnic_group_map),map_wrapper(citizenship_map), map_pob]) def determine_culture(row): culture = None for col, map_fun in col_map_fun: guess = map_fun(row[col]) if (culture is not None) and (guess is not None): if culture != guess: mismatch.append(row,ignore_index=True) if guess: culture = guess return str(culture).lower() if culture else culture #to return None properly %%timeit -r 1 -n 1 allrecs.iloc[0:2500].apply(lambda x: determine_culture(x), axis=1) %%timeit -r 1 -n 1 
allrecs.iloc[0:25000].apply(lambda x: determine_culture(x), axis=1) allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1) print mismatch allrecs.to_json('helpers/world_cultures_shortcut.json') allrecs = pd.DataFrame.from_dict(json.load(open('helpers/world_cultures_shortcut.json','r'))) import scipy.stats scipy.stats.spearmanr(rank_compare[['Rank','Rank_wikidata']]) scipy.stats.mannwhitneyu(rank_compare['Rank'],rank_compare['Rank_wikidata']) scipy.stats.ranksums(rank_compare['Rank'],rank_compare['Rank_wikidata']) print rank_compare.to_html() country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv') def map_culture(qid): if not type(qid) is str: return None else: country_list = pobs_map[qid] if len(country_list) == 0: return None else: country = country_list[0] #assumption culture = country_map.ix[country]['culture_name'] return culture allrecs['culture'] = allrecs['place_of_birth'].apply(map_culture) import math import pywikibot #Tranforming QIDs into English labels. 
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()
# Cache of QID -> label so that each item is fetched at most once.
retrieved = dict()


def english_label(qid):
    """Return the English label of Wikidata item *qid*.

    Returns None for empty or NaN input. If the item data has no English
    label, the QID itself is returned. Results are cached in ``retrieved``.
    """
    if not qid:
        return None
    if isinstance(qid, float) and math.isnan(qid):
        return None
    # First see if we've done it.
    try:
        return retrieved[qid]
    except KeyError:
        pass
    try:
        page = pywikibot.ItemPage(wikidata, qid)
        lab = page.get()['labels']['en']
    except KeyError:
        # No English label available: fall back to the raw QID.
        lab = qid
    retrieved[qid] = lab
    return lab


english_label('Q6581097')

allrecs['gender_name'] = allrecs['gender'].apply(english_label)

outdf = allrecs[['gender_name', 'culture']]
outdf.to_csv('helpers/Chi_Squared_Test_Data.csv')


def _has_value(x):
    """Return True unless *x* is None or a NaN float."""
    if isinstance(x, float):
        return not math.isnan(x)
    return x is not None


# Per-column coverage: how many items carry each property.
has = defaultdict(dict)
for col in allrecs.columns:
    nonempty = len(allrecs[allrecs[col].apply(_has_value)])
    nonemptyper = nonempty / float(len(allrecs))
    has[col]['Items with property'] = nonempty
    has[col]['% of total'] = nonemptyper

hasdf = pd.DataFrame.from_dict(has, orient='index')
print(hasdf.sort('% of total').to_html(
    justify='right',
    formatters={'% of total': lambda x: '%.2f' % (x * 100),
                'Items with property': lambda x: '{0:,}'.format(x)}))

# Successively narrow to records with a date of birth, gender and culture.
# These predicates are kept exactly as written: changing which rows pass
# would change the rolling-window results below.
hasdob = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
len(hasdob)
hasgender = hasdob[hasdob['gender'].apply(lambda x: not math.isnan(x) if type(x) is float else True)]
len(hasgender)
hascult = hasgender[hasgender['culture'].apply(lambda x: x is not None)]
len(hascult)
hascult.head()

culture_groups = hascult.groupby('culture')


def make_perc_series(df):
    """Return, per date of birth, the share of records whose gender is not 'Q6581097'.

    'Q6581097' is presumably Wikidata's "male" item (confirm); the figures
    below label the resulting share as female.
    """
    years_per = dict()
    # hate to use a for loop, fix later
    for year, group in df.groupby('dob'):
        nmcount = group[group['gender'] != 'Q6581097']['gender'].count()
        totalcount = group['gender'].count()
        years_per[year] = nmcount / float(totalcount)
    return pd.TimeSeries(data=years_per)


perc_dict = dict()
for name, group in culture_groups:
    perc_dict[name] = make_perc_series(group)

# Build the frame before inspecting it.
perc_df = pd.DataFrame.from_dict(perc_dict)
perc_df.tail(10)

years = list(range(1800, 2000, int(200 / 6.0)))
subbd_df = perc_df.ix[years]

infogram = subbd_df
infogram.to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv', index=True, encoding='utf-8')


def _rolling_frame(series_by_name, window, min_periods):
    """Return a DataFrame holding the rolling mean of every named series."""
    return pd.DataFrame({
        name: pd.rolling_mean(series, window, min_periods=min_periods)
        for name, series in series_by_name.items()
    })


def _draw_share_panel(ax, frame, start_year, end_year, window):
    """Plot *frame* on *ax* as one panel of the share-by-culture figure."""
    frame.plot(cmap='Paired', linewidth=2, ax=ax, legend=False, zorder=-window)
    ax.set_xlim((start_year, end_year))
    # Floor division keeps the tick step an integer on every Python version.
    ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 16))
    ax.set_ylim((0, 0.6))
    # \u2014 is an em dash, escaped so the source stays ASCII.
    ax.set_title(u'{}\u2014{}, with {} year Rolling Average'.format(start_year, end_year, window))
    ax.yaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x)))


def _plot_totals(frame, start_year, end_year, window, log_scale=False):
    """Plot biography totals in a new figure and return its Axes."""
    axes = frame.plot(figsize=(20, 6), cmap='Set2', linewidth=1.5)
    if log_scale:
        # NOTE(review): a lower limit of 0 is combined with a log scale here,
        # as in the original code. Confirm the intended y-range.
        axes.set_ylim((0, 50))
        axes.set_yscale('log')
    axes.set_xlim((start_year, end_year))
    axes.set_xticks(range(start_year, end_year, (end_year - start_year) // 15))
    axes.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(window))
    axes.legend(loc=2)
    return axes


# Side-by-side figure: full history (100-year window) and modern era (10-year window).
fig, (full, modern) = plt.subplots(1, 2, figsize=(20, 6))
end_year = 2000
for start_year, ra_len, ax in zip((-1000, 1800), (100, 10), (full, modern)):
    cult_dob_per = _rolling_frame(perc_dict, ra_len, min_periods=10)
    if start_year == 1800:
        # NOTE(review): year_list is not used; the export below selects
        # `years` and overwrites the CSV written above. Confirm which
        # selection is intended.
        year_list = range(1900, end_year, 10)
        cult_dob_per.ix[years].to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',
                                      index=True, encoding='utf-8')
    _draw_share_panel(ax, cult_dob_per, start_year, end_year, ra_len)
# Call the Axes method on `full`. Assigning to full.legend would overwrite
# that method, and the bare pylab legend() targets the current axes.
full.legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
full.set_xticks(range(-1000, end_year, (end_year + 1000) // 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)

# The same two panels, stacked vertically.
fig, (full, modern) = plt.subplots(2, 1, figsize=(12, 8), sharex=False)
end_year = 2000
for start_year, ra_len, ax in zip((-1000, 1800), (100, 10), (full, modern)):
    cult_dob_per = _rolling_frame(perc_dict, ra_len, min_periods=10)
    _draw_share_panel(ax, cult_dob_per, start_year, end_year, ra_len)
full.legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)

dobexists = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
dobcultureexists = dobexists[dobexists['culture'].apply(lambda x: x is not None)]
len(dobcultureexists)

culture_groups = dobcultureexists[['dob', 'culture']].groupby(by='culture')


def make_tot_series(df):
    """Return, per date of birth, the number of records in *df* with a culture."""
    years_tot = dict()
    # hate to use a for loop, fix later
    for year, group in df.groupby('dob'):
        years_tot[year] = group['culture'].count()
    return pd.TimeSeries(data=years_tot)


tot_dict = dict()
for name, group in culture_groups:
    tot_dict[name] = make_tot_series(group)

# Totals for recent centuries, one figure per start year and window length.
# The Axes is deliberately not bound to `plt`, which must keep referring to
# the pyplot module.
end_year = 2014
for start_year in [1500, 1800]:
    for ra_len in [2, 5, 10]:
        cult_dob = _rolling_frame(tot_dict, ra_len, min_periods=1)
        _plot_totals(cult_dob, start_year, end_year, ra_len)

# Totals for antiquity and the middle ages, on a log scale.
for start_year, end_year in zip([-2000, -1000], [1000, 1500]):
    for ra_len in [1, 2, 10]:
        cult_dob = _rolling_frame(tot_dict, ra_len, min_periods=1)
        _plot_totals(cult_dob, start_year, end_year, ra_len, log_scale=True)