import pandas
import math
import datetime
import os
import json
import matplotlib.pyplot as plt
# Use the FiveThirtyEight matplotlib style for every figure in this notebook.
plt.style.use('fivethirtyeight')
# IPython magic: pulls numpy and matplotlib names into the interactive namespace.
%pylab inline
# Sentinel passed as na_values when the snapshot CSVs are read below.
# presumably the exporter writes Java's Integer.MIN_VALUE for missing values -- TODO confirm
java_min_int = -2147483648
# Gender labels in the column order used when re-indexing frames for the area plots.
gender_ordered = [u'female', u'male', u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']
import pywikibot
# Transforming QIDs into English labels.
# Site handle for English Wikipedia; its data repository is what ItemPage
# lookups in english_label are made against.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()
# Cache of qid -> label (or qid -> qid when the lookup failed), filled by english_label.
retrieved = dict()
def english_label(qid):
    """Return the English label of the Wikidata item ``qid``.

    Results are memoised in the module-level ``retrieved`` dict, so each
    item is fetched at most once per session.

    qid: a QID string such as 'Q5'; a NaN float is passed straight through.
    Returns the English label, or ``qid`` itself when the item cannot be
    fetched or has no English label (that fallback is cached as well).
    """
    # isinstance also recognises numpy float NaN, which subclasses float.
    if isinstance(qid, float) and math.isnan(qid):
        return qid
    # first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        pass
    try:
        label = pywikibot.ItemPage(wikidata, qid).get()['labels']['en']
    except Exception:
        # Best effort: any lookup failure falls back to the raw id, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        label = qid
    retrieved[qid] = label
    return label
def engify_labels(df, index=False):
    """Relabel one axis of ``df`` in place with English Wikidata labels.

    df: DataFrame whose columns (default) or index hold QIDs.
    index: when True relabel the index, otherwise the columns.

    Every axis value is stringified and passed through english_label.
    Returns None; ``df`` itself is modified, so callers that still need the
    QIDs must copy them out of the axis before calling this.
    """
    axis = df.index if index else df.columns
    labels = [english_label(str(q)) for q in axis]
    # Assign back onto the frame: rebinding a local name would leave df untouched.
    if index:
        df.index = labels
    else:
        df.columns = labels
VERBOSE:pywiki:Starting 1 threads...
Populating the interactive namespace from numpy and matplotlib
This is how you'd get a dataframe for a specific snapshot
# Load one snapshot; cells equal to java_min_int are read as missing (NaN).
df = pandas.read_csv('snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv',
na_values=[java_min_int])
ls snapshot_data/
2014-09-17/ 2014-10-13/
#in case these things need ungzipping
snap_folders = !ls snapshot_data/
for folder in snap_folders:
!gunzip -k snapshot_data/$folder/*.gz
gzip: snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv already exists; do you wish to overwrite (y or n)? ^C gzip: snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv already exists; do you wish to overwrite (y or n)? ^C
import glob

# Collect the first gender-index CSV of every snapshot folder. Pure-Python
# listing avoids shelling out and never puts ls error text into locs when a
# folder holds no matching file (such folders are simply skipped).
folders = sorted(os.listdir('snapshot_data'))
locs = []
for folder in folders:
    matches = sorted(glob.glob(os.path.join('snapshot_data', folder, 'gender-index*.csv')))
    if matches:
        locs.append(matches[0])
print(locs)
['snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv', 'snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv']
# Map snapshot date -> DataFrame. loc[-14:-4] is the 10-character date that
# sits just before the '.csv' suffix of each path in locs.
snap_dfs = {loc[-14:-4]: pandas.read_csv(loc, na_values=[java_min_int]) for loc in locs}
def split_column(q_str):
    """Split a '|'-terminated cell such as 'Q1|Q2|' into ['Q1', 'Q2'].

    q_str: a raw cell value from a snapshot CSV.
    Returns a list so the resulting column always contains lists: the parts
    of the string, or ``[nan]`` for a NaN cell. Any other value (non-NaN
    number, None, ...) yields None.
    """
    # math.isnan keeps this independent of the %pylab-injected numpy name;
    # isinstance also accepts numpy float NaN, which subclasses float.
    if isinstance(q_str, float) and math.isnan(q_str):
        return [q_str]  # returning this way so we can guarantee that column contains list
    if isinstance(q_str, str):
        return q_str.split('|')[:-1]  # cos the format will always end with a |
    return None
# Give every snapshot frame a list-valued companion column for each
# '|'-separated column; the new name is simply the old one plus 's'.
multi_valued = ['gender', 'ethnic_group', 'citizenship', 'place_of_birth', 'site_links']
for df in snap_dfs.values():
    for column in multi_valued:
        column_plural = column + 's'
        df[column_plural] = df[column].apply(split_column)
# Snapshot keys are date strings, so max/min select the newest and oldest snapshot.
latest = snap_dfs[max(snap_dfs.keys())]
earliest = snap_dfs[min(snap_dfs.keys())]
# Show the rows of the newest snapshot whose dob value is 999.
latest.query('dob == 999')
qid | dob | dod | gender | ethnic_group | citizenship | place_of_birth | site_links | genders | ethnic_groups | citizenships | place_of_births | site_linkss | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
25981 | Q337805 | 999 | 1062 | Q6581097| | NaN | Q29520| | Q1209298| | zhwiki|wuuwiki|zh_classicalwiki|bowiki|ruwiki|... | [Q6581097] | [nan] | [Q29520] | [Q1209298] | [zhwiki, wuuwiki, zh_classicalwiki, bowiki, ru... |
555620 | Q15645727 | 999 | 1025 | Q6581072| | NaN | NaN | NaN | jawiki| | [Q6581072] | [nan] | [nan] | [nan] | [jawiki] |
808984 | Q3012130 | 999 | 1082 | Q6581072| | NaN | NaN | NaN | jawiki|kowiki|frwiki|eswiki|cawiki| | [Q6581072] | [nan] | [nan] | [nan] | [jawiki, kowiki, frwiki, eswiki, cawiki] |
1947827 | Q336180 | 999 | 1072 | Q6581097| | NaN | Q794| | NaN | dewiki|jawiki|tgwiki|fawiki|ruwiki|frwiki|enwi... | [Q6581097] | [nan] | [Q794] | [nan] | [dewiki, jawiki, tgwiki, fawiki, ruwiki, frwik... |
1993569 | Q1093355 | 999 | 1025 | Q6581072| | NaN | NaN | NaN | zhwiki|jawiki|kowiki|frwiki|eswiki|jawikiquote| | [Q6581072] | [nan] | [nan] | [nan] | [zhwiki, jawiki, kowiki, frwiki, eswiki, jawik... |
# Print a Wikidata link for every dob == 999 item, one group per snapshot.
for df in [earliest, latest]:
    # Query the snapshot being iterated; the loop variable was previously
    # ignored and the latest snapshot was printed twice.
    for qid in df.query('dob == 999')['qid']:
        print('http://wikidata.org/wiki/'+qid)
    print('\n')
from collections import defaultdict
import time
def make_reindex(snap_df):
    """Count, per gender, how often each value of every property occurs.

    snap_df: snapshot DataFrame holding the columns named in ``params``;
        'genders' is expected to hold a list per row, the other columns
        either a list or a scalar.

    Returns a dict mapping each property name to a DataFrame with one
    column per gender and one row per property value, whose cells are
    occurrence counts (NaN where a combination never occurred). A row
    contributes once for every (gender, value) pair it contains; NaN
    property values and rows whose 'genders' cell is NaN are skipped.
    """
    def nan_test(v):
        # True only for NaN numbers; lists and strings raise TypeError in
        # math.isnan and therefore count as "not NaN".
        try:
            return math.isnan(v)
        except TypeError:
            return False

    # abstracted: we want year-gender, but also
    # gender-ethnicity -citizenship -place of birth, site-links
    params = ['dob','dod','genders','ethnic_groups','citizenships','place_of_births','site_linkss']
    # property -> gender -> value -> count
    gender_param = {param: defaultdict(lambda: defaultdict(int)) for param in params}
    for _, row in snap_df.iterrows():
        genders = row['genders']
        if nan_test(genders):
            continue  # nothing can be attributed without a gender
        for param in params:
            vrs = row[param]
            if nan_test(vrs):
                continue
            # Treat scalars as one-element lists so both shapes share a loop.
            values = vrs if isinstance(vrs, list) else [vrs]
            gender_dict = gender_param[param]
            for gender in genders:
                for var in values:
                    gender_dict[gender][var] += 1
    gender_dfs = {param: pandas.DataFrame.from_dict(gender_param[param], orient='columns') for param in params}
    return gender_dfs
# snapshot date -> {property name -> per-gender count DataFrame}, built by make_reindex.
gender_indexes = {snap: make_reindex(df) for snap, df in snap_dfs.iteritems()}
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-148-4a18d1828f7e> in <module>() ----> 1 gender_indexes = {snap: make_reindex(df) for snap, df in snap_dfs.iteritems()} NameError: name 'snap_dfs' is not defined
# Run engify_labels over the columns of every per-property frame of every
# snapshot. The loop names (snap, gender_dfs, param, gender_df) stay bound
# afterwards and later cells read gender_dfs.
for snap in gender_indexes:
    gender_dfs = gender_indexes[snap]
    for param in gender_dfs:
        gender_df = gender_dfs[param]
        print(param)
        engify_labels(gender_df)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-238-4c36c56aa8d6> in <module>() ----> 1 for snap, gender_dfs in gender_indexes.iteritems(): 2 for param, gender_df in gender_dfs.iteritems(): 3 print param 4 engify_labels(gender_df) 5 NameError: name 'gender_indexes' is not defined
# Write every per-property index of every snapshot to
# snapshot_data/<snapshot>/property_indexes/<property>-index.csv.
for snap, gender_dfs in gender_indexes.items():
    # gender_dfs is bound by this loop, so each snapshot directory receives
    # its own frames rather than whatever an earlier cell left in that name.
    property_index_dir = 'snapshot_data/%s/property_indexes' % (snap)
    if not os.path.exists(property_index_dir):
        os.makedirs(property_index_dir)
    for param, gender_df in gender_dfs.items():
        filename = '%s/%s-index.csv' % (property_index_dir, param)
        # with-block closes the file even if to_csv raises.
        with open(filename, 'w') as filepoint:
            filepoint.write(gender_df.to_csv())
# Keys of snap_dfs are snapshot date strings; pick the newest and the oldest.
latest_date = max(snap_dfs.keys())
earliest_date = min(snap_dfs.keys())
# Per-gender counts for the dob value 999 in the newest snapshot.
gender_indexes[latest_date]['dob'].ix[999]
nan NaN transgender female NaN intersex NaN fa'afafine NaN transgender male NaN male animal NaN woman NaN genderqueer NaN female 3 male 2 kathoey NaN Name: 999.0, dtype: float64
gender_dfs['dob'].plot(kind='area',stacked=True, figsize=(24,8))
<matplotlib.axes.AxesSubplot at 0x7f1c63aa9810>
# Stacked area chart of date-of-birth counts per gender from 1800 onwards.
# The Axes is kept in `ax` so the matplotlib.pyplot alias `plt` is not shadowed.
ax = gender_dfs['dob'].ix[1800:].plot(kind='area',stacked=True, figsize=(24,8))
ax.set_title('''Wikidata Biography Gender Quantities by Year
1800 onwards''', size=24)
ax.set_xlabel('Year', size=18)
ax.set_ylabel('Biographies', size=18)
ax.set_xlim((1800,2014))
ax.legend(title='Gender', loc=2)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-43-c0de4e952f65> in <module>() ----> 1 plt = gender_dfs['dob'].ix[1800:].plot(kind='area',stacked=True, figsize=(24,8)) 2 plt.set_title('''Wikidata Biography Gender Quantities by Year 3 1800 onwards''', size=24) 4 plt.set_xlabel('Year', size=18) 5 plt.set_ylabel('Biographies', size=18) NameError: name 'gender_dfs' is not defined
def _with_nonmale_share(dod_index):
    """Return a deep copy of a date-of-death index with non-male columns added.

    The 'nan' column is dropped, 'nonmale' is the row total minus 'male',
    and 'nm_per' is nonmale / (nonmale + male).
    """
    frame = dod_index.copy(deep=True)
    del frame['nan']
    frame['nonmale'] = frame.sum(axis=1) - frame['male']
    frame['nm_per'] = (frame['nonmale']) / (frame['nonmale'] + frame['male'])
    return frame


nonmale_early = _with_nonmale_share(gender_indexes[earliest_date]['dod'])
nonmale_late = _with_nonmale_share(gender_indexes[latest_date]['dod'])
# figsize must be passed as a keyword; the Axes is kept in `ax` so the
# matplotlib.pyplot alias `plt` is not shadowed.
ax = nonmale_early['nm_per'].plot(figsize=(24,8), kind='line')
ax.set_xlim((-1000,2014))
(-1000, 2014)
# 10-row rolling mean of the non-male share in the earliest snapshot.
ma = pandas.rolling_mean(nonmale_early['nm_per'], 10)
# NOTE(review): this rebinds `plt` (the matplotlib.pyplot alias) to an Axes object.
plt = ma.plot()
plt.set_xlim((-1000,2014))
(-1000, 2014)
# Keep the Axes returned by this plot so the x-limits are applied to it
# rather than to whatever axes a previous cell left bound.
ax = nonmale_late['nm_per'].plot(figsize=(24,8))
ax.set_xlim((-1000,2014))
gender_ordered
[u'female', u'male', u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']
['Date of Birth']*2
['Date of Birth', 'Date of Birth']
plt.style
'/usr/local/lib/python2.7/dist-packages/matplotlib/style/__init__.pyc'
# Build the small tables exported for the infographic from the dob index.
infogram = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/dob-index.csv')
# Missing gender/year combinations count as zero biographies.
infogram.fillna(0, inplace=True)
del infogram['nan']
infogram['total'] = infogram.sum(axis=1)
# Everything that is neither 'male' nor 'female'.
infogram['nonbin'] = infogram['total'] - infogram['male'] - infogram['female']
infogram['fem_per'] = infogram['female'] / (infogram['total'])
infogram['nonbin_per'] = infogram['nonbin'] / infogram['total']
# 10-row rolling mean; min_periods=10 leaves NaN until a full window is available.
rm = pandas.rolling_mean(infogram[['fem_per','nonbin_per']], 10, min_periods=10)
# Sample years from 1800 up to (not including) 2000 in steps of int(200/6.0) == 33.
partyears = range(1800,2000,int(200/6.0))
rm.ix[partyears][['nonbin_per','fem_per']].T.to_csv('Magnus Gender analysis/infogram_dob_rm.csv')
infogram.ix[partyears][['nonbin_per','fem_per','nonbin']].T.to_csv('Magnus Gender analysis/infogram_dob.csv')
rm.ix[1880:1910]['nonbin_per']
1880 0.000000 1881 0.000000 1882 0.000022 1883 0.000022 1884 0.000022 1885 0.000043 1886 0.000043 1887 0.000043 1888 0.000043 1889 0.000043 1890 0.000043 1891 0.000043 1892 0.000021 1893 0.000021 1894 0.000021 1895 0.000000 1896 0.000000 1897 0.000000 1898 0.000000 1899 0.000000 1900 0.000000 1901 0.000000 1902 0.000000 1903 0.000000 1904 0.000000 1905 0.000000 1906 0.000000 1907 0.000000 1908 0.000000 1909 0.000000 1910 0.000000 Name: nonbin_per, dtype: float64
# 2x2 grid of stacked area charts: dates of birth on the top row, dates of
# death on the bottom row; full timeline on the left, 1800 onwards on the right.
fig, axes = pylab.subplots(nrows=2, ncols=2, sharey=True)
for ax, beginning, l, (xtext, ytext) in zip(axes.ravel(), [-4000, 1800] * 2, ['b']*2+['d']*2, [('-4000 BCE to present','Date of Birth'),('1800 CE to present',''),('','Date of Death'),('','')]):
    acro = 'do'+l
    df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro)
    del df['nan']
    # Fix the column order so every panel stacks the genders identically.
    df = df.reindex_axis(gender_ordered,axis=1)
    p = df.plot(kind='area', figsize=(9,10), cmap='Accent', ax=ax, legend=False, linewidth=1)
    p.set_xlim((beginning,2014))
    #p.set_xlabel(xtext)
    p.set_ylabel(ytext)
    p.set_title(xtext, fontsize=12)
# Keep the Legend in its own name: assigning it to fig.legend would replace
# the Figure.legend method with a Legend instance.
shared_legend = legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0)
fig.suptitle('Volumes of Dates of Birth and Death in Wikidata \n by Gender \n Total and Modern Timeframes', fontsize=24)
fig.tight_layout()
subplots_adjust(hspace=0.1, top=0.82)
# Female and non-binary shares over time, for dates of birth and of death.
ra_len = 1  # rolling-mean window in rows; 1 leaves the series unsmoothed
dox = pandas.DataFrame()
nonbindox = pandas.DataFrame()
for l in ['b','d']:
    acro = 'do'+l
    df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro)
    del df['nan']
    df['total'] = df.sum(axis=1)
    df['nonbin'] = df['total'] - df['male'] - df['female']
    df['fem_per'] = df['female'] / (df['total'])
    df['nonbin_per'] = df['nonbin'] / df['total']
    ra = pandas.rolling_mean(df['fem_per'], ra_len)
    dox[acro] = ra
    nonbinra = pandas.rolling_mean(df['nonbin_per'], ra_len)
    nonbindox[acro] = nonbinra
fig, (pltf, pltb) = pylab.subplots(nrows=2, ncols=1, sharex=True, figsize=(9,6))
dox.plot(kind='line', cmap='Paired', linewidth=2, ax=pltf)
pltf.set_xlim((1400,2014))
pltf.set_ylim((0,0.7))
# round() before int(): x*100 can land just below a whole number
# (e.g. 28.999999999999996) and plain truncation would print the wrong percent.
pltf.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{}%'.format(int(round(x*100)))))
pltf.set_title('Female ratio')
pltf.legend(('Date of Birth', 'Date of Death'),loc=4, bbox_to_anchor=(1.25,-0.25))
nonbindox.plot(kind='line', cmap='Paired', linewidth=2, ax=pltb, legend=False)
pltb.set_xlim((1400,2014))
# {:g} keeps small fractional percentages but drops binary float noise
# such as 0.06999999999999999.
pltb.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:g}%'.format(x*100)))
pltb.set_title('Non-binary Ratio')
fig.suptitle('Composition of Wikidata Genders in Modern History', fontsize=24)
fig.subplots_adjust(top=0.87)
<matplotlib.text.Text at 0x7f1990eb4290>
pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-4-4e0945f5cefe> in <module>() ----> 1 pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True) NameError: name 'gender_indexes' is not defined
len(pob)
72894
72,000 "places" — how many of these are countries, that is, an instance of "Q6256"?
%timeit -n 1 -r 1
def is_or_has_country(qid):
    """Return the country QIDs associated with the Wikidata item ``qid``.

    Each P17 claim contributes the title of its target item; each P31 claim
    whose target is Q6256 contributes ``qid`` itself. The result is a
    (possibly empty) list, in the order the claims dict yields its entries.
    """
    claims = pywikibot.ItemPage(wikidata, qid).get()['claims']
    found = []  # we're going to return this
    for prop, statements in claims.items():
        if prop == 'P17':
            # this is part of a country
            found.extend(statement.target.title() for statement in statements)
        if prop == 'P31':
            # this actually is a country
            found.extend(qid for statement in statements
                         if statement.target.title() == 'Q6256')
    return found
# Resolve every place of birth to its countries, reporting progress every
# 100 lookups.
place_country = {}
count = 0
places = pob.index[1:]  # 1 because the first index is nan
for place in places:
    place_country[place] = is_or_has_country(place)
    count = count + 1
    if not count % 100:
        print(count)
# Dump the place-of-birth ids (skipping the first index entry) to JSON.
pobs = list(pob.index[1:])
# with-block guarantees the file is flushed and closed.
with open('pobs_list.json', 'w') as pobs_file:
    json.dump(pobs, pobs_file)
Do some processing on wmflabs to save on bandwidth.
# Load the place -> list-of-country-QIDs mapping; the with-block closes the file.
with open('helpers/pobs_map.json', 'r') as pobs_map_file:
    pobs_map = json.load(pobs_map_file)
What percentage of pobs are, or have, a country? Which have more than one country?
have_country = [c[0] for c in pobs_map.values() if len(c) != 0]
len(list(set(have_country)))
235
len(have_country) / float(len(pobs_map.values()))
0.9363862099241353
have_no_country = [p for p, c in pobs_map.iteritems() if len(c) == 0]
len(have_country)
68256
len(have_no_country)
4637
for place in have_no_country[10:20]:
print 'http://wikidata.org/wiki/'+place
http://wikidata.org/wiki/Q361099 http://wikidata.org/wiki/Q4180803 http://wikidata.org/wiki/Q579468 http://wikidata.org/wiki/Q504912 http://wikidata.org/wiki/Q1013242 http://wikidata.org/wiki/Q1958565 http://wikidata.org/wiki/Q15763 http://wikidata.org/wiki/Q1091714 http://wikidata.org/wiki/Q448469 http://wikidata.org/wiki/Q7285906
There's no obvious, easy way to determine these programmatically; we would probably have to go over them by hand.
The sad part is that they probably represent minority locations (and thus people).
# Average number of countries per place that has at least one country.
# have_country holds a single QID string per place, so len() of its items
# would count characters; measure the country lists in pobs_map instead.
country_lists = [c for c in pobs_map.values() if c]
country_lengths = sum(len(c) for c in country_lists)
country_lengths / float(len(country_lists))
1.0032524613220817
OK, which have more than one country?
# Visit the places that map to more than one *distinct* country.
for place, country_list in pobs_map.items():
    # A set collapses duplicates, so this is true exactly when the list
    # contains at least two different countries.
    if len(set(country_list)) > 1:
        pass  # print(['http://wikidata.org/wiki/' + c for c in country_list])
import IPython.display
IPython.display.Image('helpers/Inglehart_Values_Map2.svg.png')
coun = [c for c in pobs_map.itervalues() if c]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-3-03814802914c> in <module>() ----> 1 coun = [c for c in pobs_map.itervalues() if c] NameError: name 'pobs_map' is not defined
# Interactively assign a culture number to each distinct country. Only the
# first country of every place is considered.
country_culture = dict()
for place, country_list in pobs_map.items():
    if not country_list:
        continue
    qid = country_list[0]
    if qid in country_culture:  # already answered for this country
        continue
    link = 'http://wikidata.org/wiki/%s' % qid
    print(english_label(qid))
    print(link)
    # NOTE(review): on Python 2, input() evaluates whatever is typed as a
    # Python expression; only run this cell with trusted keyboard input.
    culture_num = input('enter culture num')
    country_culture[qid] = culture_num
# Write one "<qid>\t<english label>\t" line per distinct first-listed country.
seen = set()  # set membership keeps the duplicate check O(1)
# with-block closes the file even if a label lookup or write fails.
with open('helpers/pob_agg.txt','w') as fp:
    for country_list in pobs_map.values():
        if not country_list:
            continue
        qid = country_list[0]
        if qid in seen:
            continue
        enlab = english_label(qid)
        writestr = u'%s\t%s\t\n' % (qid, enlab)
        fp.write(writestr.encode('utf-8'))
        seen.add(qid)
# Culture number -> culture name; used below to turn the 'culture_number'
# column of cultures_df into readable names.
# presumably the nine groups follow the Inglehart values map displayed earlier -- TODO confirm
culture_map ={1:'confucian',
2:'orthodox',
3:'islamic',
4:'south asia',
5:'africa',
6: 'catholic europe',
7: 'protestant europe',
8: 'english speaking',
9: 'latin america'}
cultures_df.to_csv('helpers/culture_names.csv')
# Load the country -> culture-number table and add readable culture names.
cultures_df = pandas.DataFrame.from_csv('helpers/culture_names.csv')
cultures_df['qid'] = cultures_df.index
# The column must be called 'culture_name': that is the name the
# country_culture lookup built further down reads.
cultures_df['culture_name'] = cultures_df['culture_number'].apply(lambda x: culture_map[x])
# Reload the place-of-birth index and the place -> countries mapping.
pob = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/place_of_births-index.csv')
pob['qid'] = pob.index
#pob = pob.ix[1:] #remove nan row
# with-block closes the JSON file once it has been parsed.
with open('helpers/pobs_map.json','r') as map_file:
    qid_countryqid = json.load(map_file)
def qid_to_country(qid):
    """Return the list of country QIDs recorded for the place ``qid``.

    A NaN id yields the marker string 'no_data'; any other float yields
    None. Every other value is looked up in ``qid_countryqid`` and raises
    KeyError when it is not present there.
    """
    # isinstance also recognises numpy float NaN, which subclasses float.
    if isinstance(qid, float):
        if math.isnan(qid):
            return 'no_data'
        return None
    return qid_countryqid[qid]
# For every place, the value returned by qid_to_country for its id.
pob['country_qid'] = pob['qid'].apply(lambda qid: qid_to_country(qid))
# Lookup from the 'qid' column of cultures_df to its 'culture_name' column.
country_culture = dict(zip(cultures_df['qid'], cultures_df['culture_name']))
def aggregate_culture(qid_list):
    """Map a place's country list to a culture name.

    'no_data' is passed through unchanged, an empty list becomes
    'not_easily_aggregatable', and otherwise the culture of the first
    country is looked up in ``country_culture``.
    """
    is_list = type(qid_list) is list
    if not is_list and qid_list == 'no_data':
        return 'no_data'
    if not len(qid_list) > 0:
        return 'not_easily_aggregatable'
    # Only the first listed country decides the culture.
    first_country = qid_list[0]
    return country_culture[first_country]
# Label every place with a culture name (or one of the two marker strings).
pob['culture_name'] = pob['country_qid'].apply(lambda qid_list: aggregate_culture(qid_list))
# Sum the per-gender counts of all places that share a culture name.
culture_groups = pob.groupby(by=['culture_name'])[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey'].sum().copy(deep=True)
culture_groups.to_csv('helpers/pob_plot_data_oct.csv')
culture_groups['total'] = culture_groups.sum(axis=1)
# Divide every row by its own total so each culture's values are shares.
normed_pobs_agg = culture_groups.apply(lambda x: x/ float(x['total']), axis=1)
# columns[:-1] drops the trailing 'total' column added above.
pobs_plot = normed_pobs_agg.sort('female')[normed_pobs_agg.columns[:-1]]
pobs_plot_mf = normed_pobs_agg.sort('female')[['male','female']]
pobs_plot_nmf = normed_pobs_agg[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']]
pobs_plot.plot(kind='bar', figsize=(10,10))
<matplotlib.axes.AxesSubplot at 0x7fc4d9f927d0>
# Two bar charts per culture: male/female shares, then all other genders.
# Axes are kept in their own names so the matplotlib.pyplot alias `plt`
# is not shadowed.
pob_title = 'Place of Birth Gender Composition by 9 world cultures.\nNo Data are items without Birth Data\nNot Easily Aggregratable means Place of Birth did Claim did not have a Country Property.(7%)'
ax_mf = pobs_plot_mf.plot(kind='bar', figsize=(12,5), cmap='Paired')
ax_mf.set_title(pob_title)
ax_mf.set_ylabel('Gender Composition')
ax_mf.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
ax_nmf = pobs_plot_nmf.plot(kind='bar', figsize=(12,5), cmap='Accent')
ax_nmf.set_title(pob_title)
ax_nmf.set_ylabel('Gender Composition')
ax_nmf.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
<matplotlib.legend.Legend at 0x7fc4ac5825d0>
<matplotlib.legend.Legend at 0x7fc4ad56a710>
# In-memory copies of the newest snapshot's indexes (requires gender_indexes).
pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)
eg = gender_indexes[latest_date]['ethnic_groups'].copy(deep=True)
# NOTE(review): this rebinds eg right away, so the copy made on the previous line is discarded.
eg = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/ethnic_groups-index.csv')
len(eg)
683
# Drop the first row, treat missing counts as zero and add a row total.
eg = eg.ix[1:].fillna(value=0)
eg['total'] = eg.sum(axis=1)
# The index is deliberately left as QIDs: the 'qid' column below and
# export_for_crowd_aggregate both read ids from it. English names go into
# their own column, so no separate relabelling pass over the index is needed.
eg['qid'] = eg.index
eg['ethnic_name'] = eg['qid'].apply(lambda x: english_label(x))
eg.sort('total', ascending=False).head(50)
nan | transgender female | intersex | fa'afafine | transgender male | female animal | male animal | woman | genderqueer | female | male | kathoey | total | qid | ethinic_name | ethnic_name | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Q539051 | 116 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 190 | 2395 | 0 | 8103 | Q539051 | Greeks | Greeks |
Q127885 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 123 | 1135 | 0 | 3777 | Q127885 | Serbs | Serbs |
Q7325 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 30 | 157 | 0 | 564 | Q7325 | Jewish people | Jewish people |
Q7129609 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 115 | 57 | 0 | 522 | Q7129609 | Caucasian race | Caucasian race |
Q161652 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 150 | 20 | 0 | 513 | Q161652 | Japanese people | Japanese people |
Q49085 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 72 | 69 | 0 | 423 | Q49085 | African American | African American |
Q235155 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 81 | 35 | 0 | 351 | Q235155 | white people | white people |
Q402913 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | 74 | 0 | 306 | Q402913 | Bengali people | Bengali people |
Q187985 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 88 | 0 | 291 | Q187985 | Tibetan people | Tibetan people |
Q485150 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | 71 | 0 | 279 | Q485150 | Romanians | Romanians |
Q7994501 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 32 | 54 | 0 | 258 | Q7994501 | White British | White British |
Q4887679 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 71 | 0 | 240 | Q4887679 | Bengali Hindus | Bengali Hindus |
Q42406 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 43 | 30 | 0 | 219 | Q42406 | English people | English people |
Q34069 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 53 | 0 | 168 | Q34069 | Ashkenazi Jews | Ashkenazi Jews |
Q42884 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 39 | 0 | 162 | Q42884 | Germans | Germans |
Q49078 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | 30 | 0 | 156 | Q49078 | White American | White American |
Q2556103 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 41 | 0 | 144 | Q2556103 | Pashtun people | Pashtun people |
Q133032 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | 27 | 0 | 123 | Q133032 | Hungarian people | Hungarian people |
Q42740 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 37 | 0 | 123 | Q42740 | Han Chinese | Han Chinese |
Q50001 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | 14 | 0 | 120 | Q50001 | Italians | Italians |
Q1815623 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | 0 | 108 | Q1815623 | Sri Lankan Tamil people | Sri Lankan Tamil people |
Q35323 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 29 | 0 | 108 | Q35323 | Arab | Arab |
Q241696 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 22 | 0 | 99 | Q241696 | Somali people | Somali people |
Q403656 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | 17 | 0 | 99 | Q403656 | Baganda | Baganda |
Q932244 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 28 | 0 | 96 | Q932244 | Sinhalese people | Sinhalese people |
Q30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 25 | 0 | 93 | Q30 | United States of America | United States of America |
Q49542 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 23 | 0 | 87 | Q49542 | Russians | Russians |
Q43103 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 24 | 0 | 87 | Q43103 | European American | European American |
Q121842 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | 10 | 0 | 84 | Q121842 | French people | French people |
Q854323 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 22 | 0 | 81 | Q854323 | Punjabi people | Punjabi people |
Q84072 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 19 | 0 | 78 | Q84072 | Turkish people | Turkish people |
Q179248 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 22 | 0 | 78 | Q179248 | Albanians | Albanians |
Q1026 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 18 | 0 | 75 | Q1026 | Poles | Poles |
Q4172847 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | 8 | 0 | 72 | Q4172847 | Filipino people | Filipino people |
Q170826 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 12 | 0 | 69 | Q170826 | Irish people | Irish people |
Q79797 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 18 | 0 | 69 | Q79797 | Armenians | Armenians |
Q581780 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | 0 | 66 | Q581780 | Banu Khazraj | Banu Khazraj |
Q445618 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 21 | 0 | 66 | Q445618 | Banat Swabians | Banat Swabians |
Q170217 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | 7 | 0 | 63 | Q170217 | Czechs | Czechs |
Q160894 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 12 | 0 | 63 | Q160894 | Spanish people | Spanish people |
Q483569 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 18 | 0 | 63 | Q483569 | Belarusians | Belarusians |
Q974693 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | 9 | 0 | 63 | Q974693 | Italian American | Italian American |
Q1344183 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 12 | 0 | 57 | Q1344183 | English American | English American |
Q133255 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 16 | 0 | 57 | Q133255 | Bulgarians | Bulgarians |
Q181634 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 15 | 0 | 54 | Q181634 | Scottish people | Scottish people |
Q484464 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 9 | 0 | 54 | Q484464 | Koreans | Koreans |
Q842438 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 10 | 0 | 51 | Q842438 | British people | British people |
Q700469 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 16 | 0 | 51 | Q700469 | Germans of Romania | Germans of Romania |
Q6501380 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 12 | 0 | 51 | Q6501380 | Chinese people | Chinese people |
Q862086 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 7 | 0 | 48 | Q862086 | Indian people | Indian people |
# Per-ethnic-group gender shares: every count divided by the row's total.
ratio_columns = [u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey', 'total']
eg_normed = eg[ratio_columns].apply(lambda x: x/x['total'], axis=1)
# Same shares restricted to groups whose total exceeds 1.
eg_cut = eg[eg['total']>1]
eg_cut_normed = eg_cut[ratio_columns].apply(lambda x: x/x['total'], axis=1)
engify_labels(eg_normed, index=True)
eg_normed.sort(columns=['female'], ascending=False)['female'].plot(kind='bar', figsize=(18,8))
<matplotlib.axes.AxesSubplot at 0x7f1990731090>
def export_for_crowd_aggregate(df, savename):
    """Write a CSV template for manually grouping the items indexed by ``df``.

    df: DataFrame whose index holds the ids to export.
    savename: stem of the output file, written to helpers/<savename>_map.csv.

    The file has the columns 'qid', 'en_label' (UTF-8 encoded result of
    english_label) and an empty 'aggregate_group' column. Returns None.
    """
    template = pandas.DataFrame()
    template['qid'] = df.index
    template['en_label'] = [english_label(q).encode('utf-8') for q in template['qid']]
    template['aggregate_group'] = ''
    target = 'helpers/%s_map.csv' % savename
    template.to_csv(target)
export_for_crowd_aggregate(eg, 'ethnic_groups')
# NOTE(review): this copy is discarded by the reload from CSV on the next line.
cz = gender_indexes[latest_date]['citizenships'].copy(deep=True)
cz = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/citizenships-index.csv')
cz = cz.ix[1:] # remove the initial nan row, i.e. items that had no citizenship
len(cz)
732
export_for_crowd_aggregate(cz, 'citizenships')
sl = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/site_linkss-index.csv')
len(sl)
428
# Scatter plot of the 'lnmale' column against 'year'.
# NOTE(review): both columns are only added in the cell that follows (via
# yg_reg), and yg is not defined anywhere in this part of the notebook --
# verify the intended cell order.
yg.plot(kind='scatter', x='year', y='lnmale')
plt.draw()
<matplotlib.axes.AxesSubplot at 0x7f718fd5dc10>
# yg_reg is another name for the same DataFrame as yg (no copy is made), so
# the columns added here also appear on yg.
yg_reg = yg
yg_reg['year'] = yg_reg.index
# Natural log of the male counts.
yg_reg['lnmale'] = numpy.log(yg_reg['male'])
import statsmodels.api as sm
# Ordinary least squares of male counts on year for rows start_year..1986,
# with missing values replaced by 0.
# NOTE(review): start_year must already be bound when this runs; the recorded
# output suggests it was executed once per start year -- confirm.
nonnan = yg_reg.ix[start_year:1986].fillna(value=0)
# No constant column is added here, so this model is fitted without an intercept.
model = sm.OLS(nonnan['male'],nonnan['year'])
results = model.fit()
print(results.summary())
print(start_year, results.rsquared)
(None, 0.20534979805159537) (-500, 0.21745102948866801) (0, 0.21941326163168851) (500, 0.22247317415119383) (1000, 0.24992223055451823) (1500, 0.37760491504948823) (1800, 0.72773643874338734) (1900, 0.95518546745325672)
# R^2 of a linear fit of the non-male share on year, for every starting year
# from -4000 to 1900 in steps of 50 (each window ends at 1986).
rsquared_rows = []
for start_year in numpy.arange(-4000,1950,50):
    window = yg_reg.ix[start_year:1986]  # slice once, reuse for both operands
    nm_model = sm.OLS(window['nm_per'], sm.add_constant(window['year']) )
    nm_results = nm_model.fit()
    rsquared_rows.append({'start_year':start_year, 'rsquared':nm_results.rsquared})
# Build the frame once instead of re-allocating it on every iteration.
rsquared_results = pandas.DataFrame(rsquared_rows, columns=['start_year','rsquared'])
ax = rsquared_results[rsquared_results['start_year']>500].plot(kind='line',x='start_year',y='rsquared',
                                                                title=r'$R^2$ value for linear regression on non-male percentage')
ax.set_xlabel('starting year of regresssion untilt 1987')
ax.set_ylabel(r'$R^2$')
<matplotlib.text.Text at 0x7f719df8ef50>
This implies $y = -1.7495 + 0.001x$ with $R^2 = 0.885$
setting $y=0.5$ $\implies$ $x=2249.5$ or in the year 2250