import pandas import math import datetime import os import json import matplotlib.pyplot as plt plt.style.use('fivethirtyeight') %pylab inline java_min_int = -2147483648 gender_ordered = [u'female', u'male', u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey'] import pywikibot #Tranforming QIDs into English labels. enwp = pywikibot.Site('en','wikipedia') wikidata = enwp.data_repository() retrieved = dict() def english_label(qid): if type(qid) is float: if math.isnan(qid): return qid #first see if we've done it try: return retrieved[qid] except KeyError: try: page = pywikibot.ItemPage(wikidata, qid) data = page.get() lab = data['labels']['en'] retrieved[qid] = lab return lab except: retrieved[qid] = qid return qid def engify_labels(df, index=False): if index: axis = df.index else: axis = df.columns qids = [str(q) for q in axis] labels = [english_label(qid) for qid in qids] axis = labels df = pandas.read_csv('snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv', na_values=[java_min_int]) ls snapshot_data/ #in case these things need ungzipping snap_folders = !ls snapshot_data/ for folder in snap_folders: !gunzip -k snapshot_data/$folder/*.gz folders = !ls snapshot_data/ locs = [] for folder in folders: loc = !ls snapshot_data/$folder/gender-index*.csv locs.append(loc[0]) print locs snap_dfs = {loc[-14:-4]: pandas.read_csv(loc, na_values=[java_min_int]) for loc in locs} def split_column(q_str): if type(q_str) is float: if numpy.isnan(q_str): return [q_str] #returning this way so we can gurantee that column contains list if type(q_str) is str: qs = q_str.split('|') return qs[:-1] #cos the format will always end with a | for snap, df in snap_dfs.iteritems(): for column in ['gender', 'ethnic_group', 'citizenship', 'place_of_birth', 'site_links']: column_plural = column+'s' df[column_plural] = df[column].apply(split_column) latest = snap_dfs[max(snap_dfs.keys())] earliest = snap_dfs[min(snap_dfs.keys())] latest.query('dob == 999') for df in [earliest, latest]: for qid in latest.query('dob == 999')['qid']: print 'http://wikidata.org/wiki/'+qid print '\n' from collections import defaultdict import time def make_reindex(snap_df): def int_dict_factory(): return defaultdict(int) def nan_test(v): try: if math.isnan(v): return True except TypeError: return False #abstracted: we want year-gender, but also #gender-ethnicity -citizenship -place of birth, site-links params = ['dob','dod','genders','ethnic_groups','citizenships','place_of_births','site_linkss'] gender_param = {param:defaultdict(int_dict_factory) for param in params} for index, row in snap_df.iterrows(): row_data = {p : row[p] for p in params} for param in params: gender_dict = gender_param[param] vrs = row_data[param] genders = row_data['genders'] if not nan_test(vrs): if not nan_test(genders): for gender in genders: if type(vrs) is list: for var in vrs: gender_dict[gender][var] += 1 else: gender_dict[gender][vrs] += 1 gender_dfs = {param: pandas.DataFrame.from_dict(gender_param[param], orient='columns') for param in params} return gender_dfs gender_indexes = {snap: make_reindex(df) for snap, df in snap_dfs.iteritems()} for snap, gender_dfs in gender_indexes.iteritems(): for param, gender_df in gender_dfs.iteritems(): print param engify_labels(gender_df) for snap, gender_df in gender_indexes.iteritems(): for param, gender_df in gender_dfs.iteritems(): property_index_dir = 'snapshot_data/%s/property_indexes' % (snap) if not os.path.exists(property_index_dir): os.makedirs(property_index_dir) filename = '%s/%s-index.csv' % (property_index_dir, param) filepoint = open(filename, 'w') filepoint.write(gender_df.to_csv()) filepoint.close() latest_date = max(snap_dfs.keys()) earliest_date = min(snap_dfs.keys()) gender_indexes[latest_date]['dob'].ix[999] gender_dfs['dob'].plot(kind='area',stacked=True, figsize=(24,8)) plt = gender_dfs['dob'].ix[1800:].plot(kind='area',stacked=True, figsize=(24,8)) plt.set_title('''Wikidata Biography Gender Quantities by Year 1800 onwards''', size=24) plt.set_xlabel('Year', size=18) plt.set_ylabel('Biographies', size=18) plt.set_xlim((1800,2014)) plt.legend(title='Gender', loc=2) nonmale_early = gender_indexes[earliest_date]['dod'].copy(deep=True) nonmale_late = gender_indexes[latest_date]['dod'].copy(deep=True) del nonmale_early['nan'] nonmale_early['nonmale'] = nonmale_early.sum(axis=1) - nonmale_early['male'] nonmale_early['nm_per'] = (nonmale_early['nonmale'] ) / (nonmale_early['nonmale'] + nonmale_early['male']) del nonmale_late['nan'] nonmale_late['nonmale'] = nonmale_late.sum(axis=1) - nonmale_late['male'] nonmale_late['nm_per'] = (nonmale_late['nonmale'] ) / (nonmale_late['nonmale'] + nonmale_late['male']) plt = nonmale_early['nm_per'].plot(figsize(24,8), kind='line') plt.set_xlim((-1000,2014)) ma = pandas.rolling_mean(nonmale_early['nm_per'], 10) plt = ma.plot() plt.set_xlim((-1000,2014)) nonmale_late['nm_per'].plot(figsize=(24,8)) plt.set_xlim((-1000,2014)) gender_ordered ['Date of Birth']*2 plt.style infogram = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/dob-index.csv') infogram.fillna(0, inplace=True) del infogram['nan'] infogram['total'] = infogram.sum(axis=1) infogram['nonbin'] = infogram['total'] - infogram['male'] - infogram['female'] infogram['fem_per'] = infogram['female'] / (infogram['total']) infogram['nonbin_per'] = infogram['nonbin'] / infogram['total'] rm = pandas.rolling_mean(infogram[['fem_per','nonbin_per']], 10, min_periods=10) partyears = range(1800,2000,int(200/6.0)) rm.ix[partyears][['nonbin_per','fem_per']].T.to_csv('Magnus Gender analysis/infogram_dob_rm.csv') infogram.ix[partyears][['nonbin_per','fem_per','nonbin']].T.to_csv('Magnus Gender analysis/infogram_dob.csv') rm.ix[1880:1910]['nonbin_per'] fig, axes = pylab.subplots(nrows=2, ncols=2, sharey=True) for ax, beginning, l, (xtext, ytext) in zip(axes.ravel(), [-4000, 1800] * 2, ['b']*2+['d']*2, [('-4000 BCE to present','Date of Birth'),('1800 CE to present',''),('','Date of Death'),('','')]): acro = 'do'+l df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro) del df['nan'] df = df.reindex_axis(gender_ordered,axis=1) p = df.plot(kind='area', figsize=(9,10), cmap='Accent', ax=ax, legend=False, linewidth=1) p.set_xlim((beginning,2014)) #p.set_xlabel(xtext) p.set_ylabel(ytext) p.set_title(xtext, fontsize=12) fig.legend = legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0) fig.suptitle('Volumes of Dates of Birth and Death in Wikidata \n by Gender \n Total and Modern Timeframes', fontsize=24) fig.tight_layout() subplots_adjust(hspace=0.1, top=0.82) ra_len = 1 dox = pandas.DataFrame() nonbindox = pandas.DataFrame() for l in ['b','d']: acro = 'do'+l df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro) del df['nan'] df['total'] = df.sum(axis=1) df['nonbin'] = df['total'] - df['male'] - df['female'] df['fem_per'] = df['female'] / (df['total']) df['nonbin_per'] = df['nonbin'] / df['total'] ra = pandas.rolling_mean(df['fem_per'], ra_len) dox[acro] = ra nonbinra = pandas.rolling_mean(df['nonbin_per'], ra_len) nonbindox[acro] = nonbinra fig, (pltf, pltb) = pylab.subplots(nrows=2, ncols=1, sharex=True, figsize=(9,6)) dox.plot(kind='line', cmap='Paired', linewidth=2, ax=pltf) pltf.set_xlim((1400,2014)) pltf.set_ylim((0,0.7)) pltf.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{}%'.format(int(x*100) ))) pltf.set_title('Female ratio') pltf.legend(('Date of Birth', 'Date of Death'),loc=4, bbox_to_anchor=(1.25,-0.25)) nonbindox.plot(kind='line', cmap='Paired', linewidth=2, ax=pltb, legend=False) pltb.set_xlim((1400,2014)) pltb.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{}%'.format(x*100))) pltb.set_title('Non-binary Ratio') fig.suptitle('Composition of Wikidata Genders in Modern History', fontsize=24) fig.subplots_adjust(top=0.87) pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True) len(pob) %timeit -n 1 -r 1 def is_or_has_country(qid): countries = list() #we're going to return this page = pywikibot.ItemPage(wikidata, qid) data = page.get() claims = data['claims'] for pid, claimlist in claims.iteritems(): if pid == 'P17': for claim in claimlist: countries.append(claim.target.title()) #this is part of a country if pid == 'P31': for claim in claimlist: if claim.target.title() == 'Q6256': countries.append(qid) #this actually is a country return countries place_country = dict() count=0 for place in pob.index[1:]: #1 because the first index is nan place_country[place] = is_or_has_country(place) count += 1 if count % 100 == 0: print count pobs = [p for p in pob.index[1:]] json.dump(pobs, open('pobs_list.json','w')) pobs_map = json.load(open('helpers/pobs_map.json','r')) have_country = [c[0] for c in pobs_map.values() if len(c) != 0] len(list(set(have_country))) len(have_country) / float(len(pobs_map.values())) have_no_country = [p for p, c in pobs_map.iteritems() if len(c) == 0] len(have_country) len(have_no_country) for place in have_no_country[10:20]: print 'http://wikidata.org/wiki/'+place country_lengths = sum([len(c) for c in have_country]) country_lengths / float(len(have_country)) for place, country_list in pobs_map.iteritems(): if len(country_list) > 1: if reduce(lambda a, b: a != b, country_list ):#all elements are the not same pass#print ['http://wikidata.org/wiki/'+place for place in country_list] import IPython.display IPython.display.Image('helpers/Inglehart_Values_Map2.svg.png') coun = [c for c in pobs_map.itervalues() if c] country_culture = dict() for place, country_list in pobs_map.iteritems(): if country_list: qid = country_list[0] if qid in country_culture.keys(): continue else: link = 'http://wikidata.org/wiki/%s' % qid print english_label(qid) print link culture_num = input('enter culture num') country_culture[qid] = culture_num fp = open('helpers/pob_agg.txt','w') seen = list() for place, country_list in pobs_map.iteritems(): if country_list: qid = country_list[0] if qid in seen: continue else: enlab = english_label(qid) writestr = u'%s\t%s\t\n' % (qid, enlab) fp.write(writestr.encode('utf-8')) seen.append(qid) fp.close() culture_map ={1:'confucian', 2:'orthodox', 3:'islamic', 4:'south asia', 5:'africa', 6: 'catholic europe', 7: 'protestant europe', 8: 'english speaking', 9: 'latin america'} cultures_df.to_csv('helpers/culture_names.csv') cultures_df = pandas.DataFrame.from_csv('helpers/culture_names.csv') cultures_df['qid'] = cultures_df.index cultures_df['cutlure_name'] = cultures_df['culture_number'].apply(lambda x: culture_map[x]) pob = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/place_of_births-index.csv') pob['qid'] = pob.index #pob = pob.ix[1:] #remove nan row qid_countryqid = json.load(open('helpers/pobs_map.json','r')) def qid_to_country(qid): if type(qid) is float: if math.isnan(qid): return 'no_data' else: return qid_countryqid[qid] pob['country_qid'] = pob['qid'].apply(lambda qid: qid_to_country(qid)) country_culture = dict(zip(cultures_df['qid'], cultures_df['culture_name'])) def aggregate_culture(qid_list): if not type(qid_list) is list and qid_list == 'no_data': return 'no_data' if len(qid_list) > 0: culture_name = country_culture[qid_list[0]] return culture_name else: return 'not_easily_aggregatable' pob['culture_name'] = pob['country_qid'].apply(lambda qid_list: aggregate_culture(qid_list)) culture_groups = pob.groupby(by=['culture_name'])[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey'].sum().copy(deep=True) culture_groups.to_csv('helpers/pob_plot_data_oct.csv') culture_groups['total'] = culture_groups.sum(axis=1) normed_pobs_agg = culture_groups.apply(lambda x: x/ float(x['total']), axis=1) pobs_plot = normed_pobs_agg.sort('female')[normed_pobs_agg.columns[:-1]] pobs_plot_mf = normed_pobs_agg.sort('female')[['male','female']] pobs_plot_nmf = normed_pobs_agg[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']] pobs_plot.plot(kind='bar', figsize=(10,10)) plt = pobs_plot_mf.plot(kind='bar', figsize=(12,5), cmap='Paired') plt.set_title('Place of Birth Gender Composition by 9 world cultures.\nNo Data are items without Birth Data\nNot Easily Aggregratable means Place of Birth did Claim did not have a Country Property.(7%)') plt.set_ylabel('Gender Composition') plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0) plt = pobs_plot_nmf.plot(kind='bar', figsize=(12,5), cmap='Accent') plt.set_title('Place of Birth Gender Composition by 9 world cultures.\nNo Data are items without Birth Data\nNot Easily Aggregratable means Place of Birth did Claim did not have a Country Property.(7%)') plt.set_ylabel('Gender Composition') plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0) pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True) eg = gender_indexes[latest_date]['ethnic_groups'].copy(deep=True) eg = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/ethnic_groups-index.csv') len(eg) eg=eg.ix[1:].fillna(value=0) eg['total'] = eg.sum(axis=1) engify_labels(eg,index=True) eg['qid'] = eg.index eg['ethnic_name'] = eg['qid'].apply(lambda x: english_label(x)) eg.sort('total', ascending=False).head(50) eg_normed = eg[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey', 'total']].apply(lambda x: x/x['total'], axis=1) eg_cut = eg[eg['total']>1] eg_cut_normed = eg_cut[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey', 'total']].apply(lambda x: x/x['total'], axis=1) eg_ engify_labels(eg_normed, index=True) eg_normed.sort(columns=['female'], ascending=False)['female'].plot(kind='bar', figsize=(18,8)) def export_for_crowd_aggregate(df, savename): crowd_source_export = pandas.DataFrame() crowd_source_export['qid'] = df.index crowd_source_export['en_label'] = crowd_source_export['qid'].apply(lambda x: english_label(x).encode('utf-8')) crowd_source_export['aggregate_group'] = '' crowd_source_export.to_csv('helpers/%s_map.csv' % savename) export_for_crowd_aggregate(eg, 'ethnic_groups') cz = gender_indexes[latest_date]['citizenships'].copy(deep=True) cz = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/citizenships-index.csv') cz = cz.ix[1:] #remove the inintial nan ,which are items that had no citienzship len(cz) export_for_crowd_aggregate(cz, 'citizenships') sl = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/site_linkss-index.csv') len(sl) yg.plot(kind='scatter', x='year', y='lnmale') plt.draw() yg_reg = yg yg_reg['year'] = yg_reg.index yg_reg['lnmale'] = numpy.log(yg_reg['male']) import statsmodels.api as sm nonnan = yg_reg.ix[start_year:1986].fillna(value=0) model = sm.OLS(nonnan['male'],nonnan['year']) results = model.fit() print(results.summary()) print(start_year, results.rsquared) rsquared_results = pandas.DataFrame(columns=['start_year','rsquared']) for start_year in numpy.arange(-4000,1950,50): nm_model = sm.OLS(yg_reg.ix[start_year:1986]['nm_per'], sm.add_constant(yg_reg.ix[start_year:1986]['year']) ) nm_results = nm_model.fit() rsquared_results = rsquared_results.append({'start_year':start_year, 'rsquared':nm_results.rsquared}, ignore_index=True) ax = rsquared_results[rsquared_results['start_year']>500].plot(kind='line',x='start_year',y='rsquared', title=r'$R^2$ value for linear regression on non-male percentage') ax.set_xlabel('starting year of regresssion untilt 1987') ax.set_ylabel(r'$R^2$')