import pandas as pd
import numpy
%pylab inline
# Java Integer.MIN_VALUE — the dump uses this sentinel where a value is missing
# (treated as NA in the read_csv call below).
java_min_int = -2147483648
Populating the interactive namespace from numpy and matplotlib
# Load the 2014-10-13 gender-index snapshot, mapping the min-int sentinel to NaN.
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    """Reduce a '|'-separated multi-value cell to its first QID.

    Parameters
    ----------
    q_str : str, float, or anything else
        Cell value. NaN floats are returned unchanged so the column keeps
        a marker pandas recognises as missing; strings such as
        'Q30|Q145|' yield their first entry; any other value maps to None.

    Returns
    -------
    str, float (NaN), or None
    """
    # isinstance instead of `type(...) is ...` so str/float subclasses work too
    if isinstance(q_str, float):
        if numpy.isnan(q_str):
            return q_str  # returning this way so we can guarantee that column contains list
        return None  # non-NaN floats are unexpected here; treat as missing
    if isinstance(q_str, str):
        # values always end with a trailing '|', so the first piece is the value
        return q_str.split('|')[0]
    return None  # any other type (e.g. None) carries no QID
# Collapse each multi-valued QID column down to its first entry (or NaN).
for col in ['place_of_birth','gender','citizenship','ethnic_group']:
    allrecs[col] = allrecs[col].apply(split_column)
allrecs.head(10)
qid | dob | dod | gender | ethnic_group | citizenship | place_of_birth | site_links | |
---|---|---|---|---|---|---|---|---|
0 | Q23 | 1732 | 1799 | Q6581097 | NaN | Q30 | Q494413 | zhwiki|kywiki|euwiki|plwiki|bswiki|angwiki|uzw... |
1 | Q42 | 1952 | 2001 | Q6581097 | NaN | Q145 | Q350 | zhwiki|jvwiki|euwiki|plwiki|bswiki|eswiki|tawi... |
2 | Q207 | 1946 | NaN | Q6581097 | NaN | Q30 | Q49145 | uzwiki|eswiki|kowikiquote|huwiki|liwikiquote|p... |
3 | Q297 | NaN | 1660 | Q6581097 | NaN | Q29 | Q8717 | zhwiki|kywiki|plwiki|euwiki|bswiki|uzwiki|eswi... |
4 | Q326 | 1942 | NaN | Q6581097 | NaN | Q298 | Q2887 | zhwiki|plwiki|euwiki|kowiki|frwiki|eswiki|yowi... |
5 | Q368 | 1915 | 2006 | Q6581097 | NaN | Q298 | Q33986 | lbwiki|zhwiki|plwiki|euwiki|bswiki|angwiki|esw... |
6 | Q377 | 1882 | 1942 | Q6581097 | NaN | Q34266 | Q658871 | zhwiki|kywiki|ukwikisource|jvwiki|plwiki|euwik... |
7 | Q475 | 1911 | 1982 | Q6581097 | NaN | Q298 | Q2887 | plwiki|euwiki|kowiki|frwiki|eswiki|yowiki|ocwi... |
8 | Q501 | 1821 | 1867 | Q6581097 | NaN | Q142 | Q90 | zhwiki|glwikisource|plwiki|euwiki|bswiki|ptwik... |
9 | Q530 | 1956 | NaN | Q6581097 | NaN | Q34 | Q499415 | plwiki|euwiki|frwiki|bswiki|bewiki|eswiki|ocwi... |
import json
# Aggregation maps prepared elsewhere: QID -> country list / culture metadata.
pobs_map = json.load(open('helpers/aggregation_maps/pobs_map.json','r'))
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')
# NOTE(review): the recorded run failed with IOError — 'ethnic_groups_map.json'
# was not found; confirm the file name/path before rerunning.
ethnic_group_map = json.load(open('helpers/aggregation_maps/ethnic_groups_map.json','r'))
citizenship_map = json.load(open('helpers/aggregation_maps/citizenship_map.json','r'))
--------------------------------------------------------------------------- IOError Traceback (most recent call last) <ipython-input-6-4eb645f6ebfb> in <module>() 3 country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv') 4 ----> 5 ethnic_group_map = json.load(open('helpers/aggregation_maps/ethnic_groups_map.json','r')) 6 citizenship_map = json.load(open('helpers/aggregation_maps/citizenship_map.json','r')) IOError: [Errno 2] No such file or directory: 'helpers/aggregation_maps/ethnic_groups_map.json'
def map_pob(qid):
    """Map a place-of-birth QID to a culture name.

    Looks the QID up in `pobs_map` (QID -> list of country QIDs), takes the
    first country, and resolves it to a culture via `country_map`.

    Returns None for non-string input, unknown QIDs, or empty country
    lists — mirroring the miss behaviour of the `map_wrapper` lookups.
    """
    if not isinstance(qid, str):
        return None
    try:
        country_list = pobs_map[qid]
    except KeyError:
        # Unknown place QID: behave like map_wrapper and report no culture
        # (previously this raised and aborted the whole apply()).
        return None
    if len(country_list) == 0:
        return None
    country = country_list[0]  # assumption: first listed country is primary
    # .loc replaces the long-removed .ix for label-based lookup
    return country_map.loc[country]['culture_name']
def map_wrapper(m):
    """Wrap mapping *m* in a lookup function that returns None on a miss.

    The returned callable behaves like ``m[qid]`` except that an absent
    key yields None instead of raising KeyError.
    """
    def lookup(qid):
        try:
            return m[qid]
        except KeyError:
            return None

    return lookup
# Accumulator for rows whose candidate cultures disagree; used in determine_culture.
mismatch = pd.DataFrame()
def determine_culture(row):
    """Derive a single lower-cased culture string for one record.

    Tries ethnic_group, citizenship, then place_of_birth; each successful
    lookup overwrites the previous guess, so later columns take precedence.
    Rows whose sources disagree are logged into the module-level
    `mismatch` DataFrame. Returns None when no column yields a culture.
    """
    global mismatch  # DataFrame.append returns a copy, so we must rebind it
    # order is important because it determines the preference we will use
    col_map_fun = zip(['ethnic_group', 'citizenship', 'place_of_birth'],
                      [map_wrapper(ethnic_group_map), map_wrapper(citizenship_map), map_pob])
    culture = None
    for col, map_fun in col_map_fun:
        guess = map_fun(row[col])
        if guess is None:
            # BUG FIX: str(None).lower() previously produced the truthy
            # string 'none', defeating both None checks below.
            continue
        guess = str(guess).lower()
        if culture is not None and culture != guess:
            # BUG FIX: append() is not in-place — without rebinding,
            # the mismatch log stayed empty forever.
            # NOTE(review): DataFrame.append is removed in pandas 2.x;
            # switch to pd.concat when upgrading.
            mismatch = mismatch.append(row, ignore_index=True)
        culture = guess
    return culture
# Derive one 'culture' value per record, then count how many records got one.
allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1)
len(allrecs[allrecs['culture'].apply(lambda x: x is not None)])
2561999
import math
import pywikibot
# Transforming QIDs into English labels via the Wikidata repository.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()
# Memoisation cache: QID -> English label (or the QID itself on a failed lookup).
retrieved = dict()
def english_label(qid):
    """Return the English Wikidata label for *qid*, memoised in `retrieved`.

    NaN inputs yield None. When the item has no English label (or the
    fetch surfaces a KeyError), the QID itself is cached and returned so
    the lookup is never retried.
    """
    if type(qid) is float:
        if math.isnan(qid):
            return None
    # Serve from the cache when we've already resolved this QID.
    if qid in retrieved:
        return retrieved[qid]
    try:
        item = pywikibot.ItemPage(wikidata, qid)
        label = item.get()['labels']['en']
    except KeyError:
        # No English label available — fall back to the raw QID.
        label = qid
    retrieved[qid] = label
    return label
VERBOSE:pywiki:Starting 1 threads...
english_label('Q6581097')
VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.
u'male'
# Resolve gender QIDs to English labels, then export the full and slim datasets.
allrecs['gender_name'] = allrecs['gender'].apply(english_label)
outdf = allrecs[['gender_name','culture']]
allrecs.to_csv('helpers/allrecords_culture_mapped.csv')
outdf.to_csv('helpers/Chi_Squared_Test_Data.csv')
How many records have gender, place of birth (pob) and date of birth (dob)?
# Keep only records with a known date of birth.
hasdob = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
len(hasdob)
1484003
# Keep records whose gender is present (floats in this column are NaN markers;
# strings are QIDs and count as present).
hasgender = hasdob[hasdob['gender'].apply(lambda x: not math.isnan(x) if type(x) is float else True)]
len(hasgender)
1470491
# Keep records that received a mapped culture.
hascult = hasgender[hasgender['culture'].apply(lambda x: x is not None)]
len(hascult)
1470491
# One group per culture for the per-year gender-ratio series below.
culture_groups = hascult.groupby('culture')
def make_perc_series(df):
    """Return the per-birth-year fraction of non-male records in *df*.

    Groups on 'dob' and, within each year, divides the count of non-null
    gender values that are not Q6581097 (male) by the count of all
    non-null gender values (pandas count() skips NaN in both cases).

    Returns
    -------
    pd.Series
        Float-valued series indexed by year of birth.
    """
    years_per = dict()
    #hate to use a for loop, fixlater
    for year, group in df.groupby('dob'):
        genders = group['gender']
        non_male_count = genders[genders != 'Q6581097'].count()
        total_count = genders.count()
        years_per[year] = non_male_count / float(total_count)
    # pd.TimeSeries was a deprecated alias of Series and no longer exists
    return pd.Series(data=years_per)
# Non-male percentage per birth year, one series per culture.
perc_dict = dict()
for name, group in culture_groups:
    perc_series = make_perc_series(group)
    perc_dict[name] = perc_series
end_year = 1988
# Plot rolling averages of the non-male share: two start years x two window sizes.
# NOTE(review): pd.rolling_mean and dict.iteritems are Python-2 / old-pandas
# idioms; modern equivalents are series.rolling(ra_len, min_periods=10).mean()
# and dict.items().
for start_year in [-1000, 1800]:
    for ra_len in [10, 100]:
        ra_dict = dict()
        for name, series in perc_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
        cult_dob_per = pd.DataFrame(ra_dict)
        # 'plt' here is the matplotlib Axes returned by DataFrame.plot,
        # shadowing the pyplot module pulled in by %pylab.
        plt = cult_dob_per.plot(figsize=(20,6), cmap='Paired', linewidth=1.5)
        plt.set_xlim((start_year, end_year))
        plt.set_xticks(range(start_year, end_year,(end_year-start_year) / 15))
        plt.set_ylim((0,0.6))
        plt.set_title('Non-male percentage of Biographies by Date of Birth - %s Year Rolling Average' % str(ra_len))
        plt.legend(loc=2)
# For the total-count plots we only need dob and culture to be present.
dobexists = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
dobcultureexists = dobexists[dobexists['culture'].apply(lambda x: x is not None)]
len(dobcultureexists)
1484003
# Regroup on culture, keeping only the columns the totals need.
culture_groups = dobcultureexists[['dob','culture']].groupby(by='culture')
def make_tot_series(df):
    """Return the number of biographies per birth year in *df*.

    Replaces the original manual loop (flagged '#hate to use a for loop,
    fixlater') with a vectorised groupby count; also swaps the removed
    pd.TimeSeries alias for pd.Series.

    Returns
    -------
    pd.Series
        Count of non-null 'culture' values, indexed by 'dob'.
    """
    counts = df.groupby('dob')['culture'].count()
    counts.name = None  # match the anonymous Series the old dict build produced
    return counts
# Total biographies per birth year, one series per culture.
tot_dict = dict()
for name, group in culture_groups:
    tot_dict[name] = make_tot_series(group)
end_year = 1988
# Plot rolling averages of total biography counts for the modern era.
# NOTE(review): pd.rolling_mean / dict.iteritems are Python-2 era APIs —
# use series.rolling(...).mean() and dict.items() when porting.
for start_year in [1500, 1800]:
    for ra_len in [2, 5, 10]:
        ra_dict = dict()
        for name, series in tot_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # 'plt' is the Axes returned by DataFrame.plot (shadows pyplot).
        plt = cult_dob.plot(figsize=(20,6), cmap='Paired', linewidth=1.5)
        plt.set_xlim((start_year,end_year))
        plt.set_xticks(range(start_year,end_year,(end_year-start_year) / 15))
        plt.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        plt.legend(loc=2)
# Same total-count plots for the ancient/pre-modern windows, on a log y-scale
# because counts per year are tiny that far back.
for start_year, end_year in zip([-2000, -1000], [1000,1500]):
    for ra_len in [1,2,10]:
        ra_dict = dict()
        for name, series in tot_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # 'plt' is the Axes returned by DataFrame.plot (shadows pyplot).
        plt = cult_dob.plot(figsize=(20,6), cmap='Set2', linewidth=1.5)
        plt.set_ylim((0,50))
        plt.set_yscale('log')
        plt.set_xlim((start_year,end_year))
        plt.set_xticks(range(start_year,end_year,(end_year-start_year) / 15))
        plt.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        plt.legend(loc=2)
/usr/local/lib/python2.7/dist-packages/numpy/ma/core.py:3895: UserWarning: Warning: converting a masked element to nan. warnings.warn("Warning: converting a masked element to nan.")