import pandas as pd
import numpy
import json
from collections import defaultdict
from matplotlib.pylab import style
style.use('fivethirtyeight')
%pylab inline
java_min_int = -2147483648
Populating the interactive namespace from numpy and matplotlib
# Load the 2014-10-13 snapshot. Cells equal to java_min_int (-2147483648) are
# read as NaN -- presumably the exporter's sentinel for a missing value;
# TODO confirm against the code that produced the CSV.
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    """Return the first entry of a '|'-delimited cell.

    Cells hold one or more identifiers separated by '|' (the format always
    ends with a '|'), e.g. ``'Q30|Q145|'``; only the leading one is kept.

    Parameters
    ----------
    q_str : str or float
        Raw cell value. Missing cells arrive as float NaN.

    Returns
    -------
    str, float or None
        The text before the first '|' for strings, the NaN itself for
        missing cells, and None for any other value.
    """
    # isinstance (rather than an exact type() match) also recognises
    # numpy.float64 NaNs and str subclasses.
    if isinstance(q_str, float):
        # NaN is handed back unchanged so the cell still counts as missing;
        # any other float is not a valid cell and maps to None.
        return q_str if numpy.isnan(q_str) else None
    if isinstance(q_str, str):
        return q_str.split('|')[0]
    return None
# Reduce every '|'-delimited column to its leading identifier.
pipe_delimited_columns = ['place_of_birth','gender', 'citizenship','ethnic_group']
for column_name in pipe_delimited_columns:
    allrecs[column_name] = allrecs[column_name].apply(split_column)
allrecs.head(5)
qid | dob | dod | gender | ethnic_group | citizenship | place_of_birth | site_links | |
---|---|---|---|---|---|---|---|---|
0 | Q23 | 1732 | 1799 | Q6581097 | NaN | Q30 | Q494413 | zhwiki|kywiki|euwiki|plwiki|bswiki|angwiki|uzw... |
1 | Q42 | 1952 | 2001 | Q6581097 | NaN | Q145 | Q350 | zhwiki|jvwiki|euwiki|plwiki|bswiki|eswiki|tawi... |
2 | Q207 | 1946 | NaN | Q6581097 | NaN | Q30 | Q49145 | uzwiki|eswiki|kowikiquote|huwiki|liwikiquote|p... |
3 | Q297 | NaN | 1660 | Q6581097 | NaN | Q29 | Q8717 | zhwiki|kywiki|plwiki|euwiki|bswiki|uzwiki|eswi... |
4 | Q326 | 1942 | NaN | Q6581097 | NaN | Q298 | Q2887 | zhwiki|plwiki|euwiki|kowiki|frwiki|eswiki|yowi... |
#todo what about mechanical maps
def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, 'r') as handle:
        return json.load(handle)


# place-of-birth QID -> list of countries (see map_pob)
pobs_map = _load_json('helpers/aggregation_maps/pobs_map.json')
# read_csv(index_col=0, parse_dates=True) is the documented replacement for
# the deprecated DataFrame.from_csv and reads the file the same way.
country_map = pd.read_csv('helpers/aggregation_maps/country_maps.csv', index_col=0, parse_dates=True)
ethnic_group_map = _load_json('helpers/aggregation_maps/mechanical_turk/ethnic_groups_map.json')
citizenship_map = _load_json('helpers/aggregation_maps/mechanical_turk/citizenship_map.json')
def map_pob(qid):
    """Map a place-of-birth QID to the culture name of its country.

    Parameters
    ----------
    qid : str or other
        Place-of-birth identifier. Non-string values (such as the NaN of a
        missing cell) yield None.

    Returns
    -------
    The ``culture_name`` entry of ``country_map`` for the first country
    listed for *qid* in ``pobs_map``; None when *qid* is not a string, is
    absent from ``pobs_map``, or has no country listed.
    """
    if not isinstance(qid, str):
        return None
    # .get() rather than [] so an unmapped place yields None, consistent
    # with the map_wrapper lookups, instead of aborting a whole apply() run.
    country_list = pobs_map.get(qid)
    if not country_list:
        return None
    country = country_list[0] #assumption
    # .loc is the label-based replacement for the removed .ix indexer.
    # NOTE(review): assumes country_map is indexed by the same country
    # identifiers that pobs_map stores -- confirm against the CSV.
    return country_map.loc[country]['culture_name']
def map_wrapper(m):
    """Build a lookup function over mapping *m* that gives None for unknown keys."""
    def return_fun(qid):
        try:
            found = m[qid]
        except KeyError:
            # Key not present in the mapping.
            found = None
        return found
    return return_fun
# Rows whose sources disagree about the culture; filled by determine_culture.
mismatch = pd.DataFrame()
#order is important because it determines the preference we will use
# list(...) so the pairs can be walked once per row: a bare zip() is a
# one-shot iterator on Python 3 and would be empty after the first row.
col_map_fun = list(zip(['ethnic_group', 'citizenship', 'place_of_birth'],
                       [map_wrapper(ethnic_group_map), map_wrapper(citizenship_map), map_pob]))


def determine_culture(row):
    """Pick a culture for one record from its ethnic group, citizenship and birth place.

    Every (column, mapper) pair of ``col_map_fun`` is tried in order. A later
    truthy guess replaces an earlier one, so the last column that yields a
    culture wins. Whenever a new guess differs from the culture held so far,
    the row is recorded in the module-level ``mismatch`` frame.

    Parameters
    ----------
    row : pandas.Series
        One record, indexable by the column names in ``col_map_fun``.

    Returns
    -------
    The lower-cased culture name, or the falsy value left in ``culture``
    (None when no mapper produced a guess).
    """
    # NOTE(review): because later guesses overwrite earlier ones,
    # place_of_birth currently takes precedence over citizenship and
    # ethnic_group -- confirm this is the intended preference order.
    global mismatch
    culture = None
    for col, map_fun in col_map_fun:
        guess = map_fun(row[col])
        if (culture is not None) and (guess is not None) and culture != guess:
            # The old DataFrame.append call returned a new frame that was
            # thrown away, so nothing was ever recorded. Rebind the global
            # to the concatenated result; the row is copied immediately,
            # which stays correct even if apply() reuses the row object.
            mismatch = pd.concat([mismatch, row.to_frame().T], ignore_index=True)
        if guess:
            culture = guess
    return str(culture).lower() if culture else culture #to return None properly
%%timeit -r 1 -n 1
allrecs.iloc[0:2500].apply(lambda x: determine_culture(x), axis=1)
1 loops, best of 1: 1.77 s per loop
%%timeit -r 1 -n 1
allrecs.iloc[0:25000].apply(lambda x: determine_culture(x), axis=1)
1 loops, best of 1: 17 s per loop
# Assign a culture to every record (row-wise apply over the whole frame).
allrecs['culture'] = allrecs.apply(determine_culture, axis=1)
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-17-5dc819b596ea> in <module>() ----> 1 allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1) /usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds) 3594 if reduce is None: 3595 reduce = True -> 3596 return self._apply_standard(f, axis, reduce=reduce) 3597 else: 3598 return self._apply_broadcast(f, axis) /usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce) 3646 labels = self._get_agg_axis(axis) 3647 result = lib.reduce(values, func, axis=axis, dummy=dummy, -> 3648 labels=labels) 3649 return Series(result, index=labels) 3650 except Exception: /usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.reduce (pandas/lib.c:40234)() /usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.Reducer.get_result (pandas/lib.c:30025)() <ipython-input-17-5dc819b596ea> in <lambda>(x) ----> 1 allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1) <ipython-input-16-ff2687e8a0fd> in determine_culture(row) 38 if (culture is not None) and (guess is not None): 39 if culture != guess: ---> 40 mismatch.append(row,ignore_index=True) 41 if guess: 42 culture = guess /usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in append(self, other, ignore_index, verify_integrity) 3818 to_concat = [self, other] 3819 return concat(to_concat, ignore_index=ignore_index, -> 3820 verify_integrity=verify_integrity) 3821 3822 def join(self, other, on=None, how='left', lsuffix='', rsuffix='', /usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy) 723 verify_integrity=verify_integrity, 724 copy=copy) --> 725 return op.get_result() 726 727 
/usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in get_result(self) 894 895 new_data = concatenate_block_managers( --> 896 mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy) 897 if not self.copy: 898 new_data._consolidate_inplace() /usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy) 4044 copy=copy), 4045 placement=placement) -> 4046 for placement, join_units in concat_plan] 4047 4048 return BlockManager(blocks, axes) /usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in concatenate_join_units(join_units, concat_axis, copy) 4133 raise AssertionError("Concatenating join units along axis0") 4134 -> 4135 empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) 4136 4137 to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype, /usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in get_empty_dtype_and_na(join_units) 4072 has_none_blocks = True 4073 else: -> 4074 dtypes[i] = unit.dtype 4075 4076 # dtypes = set() /usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.cache_readonly.__get__ (pandas/lib.c:40766)() /usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in dtype(self) 4343 raise AssertionError("Block is None, no dtype") 4344 -> 4345 if not self.needs_filling: 4346 return self.block.dtype 4347 else: /usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.cache_readonly.__get__ (pandas/lib.c:40766)() /usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in needs_filling(self) 4333 for indexer in self.indexers.values(): 4334 # FIXME: cache results of indexer == -1 checks. -> 4335 if (indexer == -1).any(): 4336 return True 4337 KeyboardInterrupt:
print mismatch
Empty DataFrame Columns: [] Index: []
# Write the frame to disk and read it straight back
# (presumably a cache so the slow culture mapping can be skipped -- TODO confirm).
allrecs.to_json('helpers/world_cultures_shortcut.json')
# A context manager closes the file handle as soon as the JSON is parsed.
with open('helpers/world_cultures_shortcut.json','r') as shortcut_file:
    allrecs = pd.DataFrame.from_dict(json.load(shortcut_file))
import scipy.stats
scipy.stats.spearmanr(rank_compare[['Rank','Rank_wikidata']])
(0.09637690726400637, 0.25388210576052661)
scipy.stats.mannwhitneyu(rank_compare['Rank'],rank_compare['Rank_wikidata'])
(10078.0, 0.49798226262171613)
scipy.stats.ranksums(rank_compare['Rank'],rank_compare['Rank_wikidata'])
(0.0057801597300572065, 0.99538812547307132)
print rank_compare.to_html()
<table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>Economy</th> <th>Rank</th> <th>Rank_wikidata</th> <th>diff</th> <th>abs_diff</th> </tr> </thead> <tbody> <tr> <th>0 </th> <td> Iceland</td> <td> 1</td> <td> 73</td> <td> -72</td> <td> 72</td> </tr> <tr> <th>1 </th> <td> Finland</td> <td> 2</td> <td> 49</td> <td> -47</td> <td> 47</td> </tr> <tr> <th>2 </th> <td> Norway</td> <td> 3</td> <td> 58</td> <td> -55</td> <td> 55</td> </tr> <tr> <th>3 </th> <td> Sweden</td> <td> 4</td> <td> 32</td> <td> -28</td> <td> 28</td> </tr> <tr> <th>4 </th> <td> Denmark</td> <td> 5</td> <td> 59</td> <td> -54</td> <td> 54</td> </tr> <tr> <th>5 </th> <td> Nicaragua</td> <td> 6</td> <td> 25</td> <td> -19</td> <td> 19</td> </tr> <tr> <th>6 </th> <td> Rwanda</td> <td> 7</td> <td> 67</td> <td> -60</td> <td> 60</td> </tr> <tr> <th>7 </th> <td> Ireland</td> <td> 8</td> <td> 77</td> <td> -69</td> <td> 69</td> </tr> <tr> <th>8 </th> <td> Philippines</td> <td> 9</td> <td> 2</td> <td> 7</td> <td> 7</td> </tr> <tr> <th>9 </th> <td> Belgium</td> <td> 10</td> <td> 98</td> <td> -88</td> <td> 88</td> </tr> <tr> <th>10 </th> <td> Switzerland</td> <td> 11</td> <td> 116</td> <td>-105</td> <td> 105</td> </tr> <tr> <th>11 </th> <td> Germany</td> <td> 12</td> <td> 117</td> <td>-105</td> <td> 105</td> </tr> <tr> <th>12 </th> <td> New Zealand</td> <td> 13</td> <td> 38</td> <td> -25</td> <td> 25</td> </tr> <tr> <th>13 </th> <td> Netherlands</td> <td> 14</td> <td> 72</td> <td> -58</td> <td> 58</td> </tr> <tr> <th>14 </th> <td> Latvia</td> <td> 15</td> <td> 60</td> <td> -45</td> <td> 45</td> </tr> <tr> <th>15 </th> <td> France</td> <td> 16</td> <td> 96</td> <td> -80</td> <td> 80</td> </tr> <tr> <th>16 </th> <td> Burundi</td> <td> 17</td> <td> 141</td> <td>-124</td> <td> 124</td> </tr> <tr> <th>17 </th> <td> South Africa</td> <td> 18</td> <td> 103</td> <td> -85</td> <td> 85</td> </tr> <tr> <th>18 </th> <td> Canada</td> <td> 19</td> <td> 23</td> <td> -4</td> <td> 4</td> </tr> 
<tr> <th>19 </th> <td> United States of America</td> <td> 20</td> <td> 31</td> <td> -11</td> <td> 11</td> </tr> <tr> <th>20 </th> <td> Ecuador</td> <td> 21</td> <td> 109</td> <td> -88</td> <td> 88</td> </tr> <tr> <th>21 </th> <td> Bulgaria</td> <td> 22</td> <td> 53</td> <td> -31</td> <td> 31</td> </tr> <tr> <th>22 </th> <td> Slovenia</td> <td> 23</td> <td> 78</td> <td> -55</td> <td> 55</td> </tr> <tr> <th>23 </th> <td> Australia</td> <td> 24</td> <td> 18</td> <td> 6</td> <td> 6</td> </tr> <tr> <th>24 </th> <td> Moldova</td> <td> 25</td> <td> 68</td> <td> -43</td> <td> 43</td> </tr> <tr> <th>25 </th> <td> United Kingdom</td> <td> 26</td> <td> 42</td> <td> -16</td> <td> 16</td> </tr> <tr> <th>26 </th> <td> Mozambique</td> <td> 27</td> <td> 61</td> <td> -34</td> <td> 34</td> </tr> <tr> <th>27 </th> <td> Luxembourg</td> <td> 28</td> <td> 107</td> <td> -79</td> <td> 79</td> </tr> <tr> <th>28 </th> <td> Spain</td> <td> 29</td> <td> 88</td> <td> -59</td> <td> 59</td> </tr> <tr> <th>29 </th> <td> Cuba</td> <td> 30</td> <td> 26</td> <td> 4</td> <td> 4</td> </tr> <tr> <th>30 </th> <td> Argentina</td> <td> 31</td> <td> 102</td> <td> -71</td> <td> 71</td> </tr> <tr> <th>31 </th> <td> Belarus</td> <td> 32</td> <td> 70</td> <td> -38</td> <td> 38</td> </tr> <tr> <th>32 </th> <td> Barbados</td> <td> 33</td> <td> 55</td> <td> -22</td> <td> 22</td> </tr> <tr> <th>33 </th> <td> Malawi</td> <td> 34</td> <td> 92</td> <td> -58</td> <td> 58</td> </tr> <tr> <th>34 </th> <td> The Bahamas</td> <td> 35</td> <td> 36</td> <td> -1</td> <td> 1</td> </tr> <tr> <th>35 </th> <td> Austria</td> <td> 36</td> <td> 82</td> <td> -46</td> <td> 46</td> </tr> <tr> <th>36 </th> <td> Kenya</td> <td> 37</td> <td> 9</td> <td> 28</td> <td> 28</td> </tr> <tr> <th>37 </th> <td> Lesotho</td> <td> 38</td> <td> 43</td> <td> -5</td> <td> 5</td> </tr> <tr> <th>38 </th> <td> Portugal</td> <td> 39</td> <td> 95</td> <td> -56</td> <td> 56</td> </tr> <tr> <th>39 </th> <td> Namibia</td> <td> 40</td> <td> 112</td> <td> 
-72</td> <td> 72</td> </tr> <tr> <th>40 </th> <td> Madagascar</td> <td> 41</td> <td> 99</td> <td> -58</td> <td> 58</td> </tr> <tr> <th>41 </th> <td> Mongolia</td> <td> 42</td> <td> 71</td> <td> -29</td> <td> 29</td> </tr> <tr> <th>42 </th> <td> Kazakhstan</td> <td> 43</td> <td> 44</td> <td> -1</td> <td> 1</td> </tr> <tr> <th>43 </th> <td> Lithuania</td> <td> 44</td> <td> 65</td> <td> -21</td> <td> 21</td> </tr> <tr> <th>44 </th> <td> Peru</td> <td> 45</td> <td> 97</td> <td> -52</td> <td> 52</td> </tr> <tr> <th>45 </th> <td> Panama</td> <td> 46</td> <td> 39</td> <td> 7</td> <td> 7</td> </tr> <tr> <th>46 </th> <td> Tanzania</td> <td> 47</td> <td> 16</td> <td> 31</td> <td> 31</td> </tr> <tr> <th>47 </th> <td> Costa Rica</td> <td> 48</td> <td> 129</td> <td> -81</td> <td> 81</td> </tr> <tr> <th>48 </th> <td> Trinidad and Tobago</td> <td> 49</td> <td> 24</td> <td> 25</td> <td> 25</td> </tr> <tr> <th>49 </th> <td> Cape Verde</td> <td> 50</td> <td> 136</td> <td> -86</td> <td> 86</td> </tr> <tr> <th>50 </th> <td> Botswana</td> <td> 51</td> <td> 46</td> <td> 5</td> <td> 5</td> </tr> <tr> <th>51 </th> <td> Jamaica</td> <td> 52</td> <td> 21</td> <td> 31</td> <td> 31</td> </tr> <tr> <th>52 </th> <td> Colombia</td> <td> 53</td> <td> 63</td> <td> -10</td> <td> 10</td> </tr> <tr> <th>53 </th> <td> Serbia</td> <td> 54</td> <td> 62</td> <td> -8</td> <td> 8</td> </tr> <tr> <th>54 </th> <td> Croatia</td> <td> 55</td> <td> 86</td> <td> -31</td> <td> 31</td> </tr> <tr> <th>55 </th> <td> Ukraine</td> <td> 56</td> <td> 79</td> <td> -23</td> <td> 23</td> </tr> <tr> <th>56 </th> <td> Poland</td> <td> 57</td> <td> 84</td> <td> -27</td> <td> 27</td> </tr> <tr> <th>57 </th> <td> Bolivia</td> <td> 58</td> <td> 128</td> <td> -70</td> <td> 70</td> </tr> <tr> <th>58 </th> <td> Singapore</td> <td> 59</td> <td> 8</td> <td> 51</td> <td> 51</td> </tr> <tr> <th>59 </th> <td> Laos</td> <td> 60</td> <td> 137</td> <td> -77</td> <td> 77</td> </tr> <tr> <th>60 </th> <td> Thailand</td> <td> 61</td> <td> 
11</td> <td> 50</td> <td> 50</td> </tr> <tr> <th>61 </th> <td> Estonia</td> <td> 62</td> <td> 94</td> <td> -32</td> <td> 32</td> </tr> <tr> <th>62 </th> <td> Zimbabwe</td> <td> 63</td> <td> 35</td> <td> 28</td> <td> 28</td> </tr> <tr> <th>63 </th> <td> Guyana</td> <td> 64</td> <td> 134</td> <td> -70</td> <td> 70</td> </tr> <tr> <th>64 </th> <td> Israel</td> <td> 65</td> <td> 34</td> <td> 31</td> <td> 31</td> </tr> <tr> <th>65 </th> <td> Chile</td> <td> 66</td> <td> 47</td> <td> 19</td> <td> 19</td> </tr> <tr> <th>66 </th> <td> Kyrgyzstan</td> <td> 67</td> <td> 51</td> <td> 16</td> <td> 16</td> </tr> <tr> <th>67 </th> <td> Bangladesh</td> <td> 68</td> <td> 64</td> <td> 4</td> <td> 4</td> </tr> <tr> <th>68 </th> <td> Italy</td> <td> 69</td> <td> 105</td> <td> -36</td> <td> 36</td> </tr> <tr> <th>69 </th> <td> Republic of Macedonia</td> <td> 70</td> <td> 125</td> <td> -55</td> <td> 55</td> </tr> <tr> <th>70 </th> <td> Brazil</td> <td> 71</td> <td> 57</td> <td> 14</td> <td> 14</td> </tr> <tr> <th>71 </th> <td> Romania</td> <td> 72</td> <td> 54</td> <td> 18</td> <td> 18</td> </tr> <tr> <th>72 </th> <td> Honduras</td> <td> 73</td> <td> 131</td> <td> -58</td> <td> 58</td> </tr> <tr> <th>73 </th> <td> Montenegro</td> <td> 74</td> <td> 81</td> <td> -7</td> <td> 7</td> </tr> <tr> <th>74 </th> <td> Russia</td> <td> 75</td> <td> 52</td> <td> 23</td> <td> 23</td> </tr> <tr> <th>75 </th> <td> Vietnam</td> <td> 76</td> <td> 29</td> <td> 47</td> <td> 47</td> </tr> <tr> <th>76 </th> <td> Senegal</td> <td> 77</td> <td> 119</td> <td> -42</td> <td> 42</td> </tr> <tr> <th>77 </th> <td> Dominican Republic</td> <td> 78</td> <td> 22</td> <td> 56</td> <td> 56</td> </tr> <tr> <th>78 </th> <td> Sri Lanka</td> <td> 79</td> <td> 80</td> <td> -1</td> <td> 1</td> </tr> <tr> <th>79 </th> <td> Mexico</td> <td> 80</td> <td> 45</td> <td> 35</td> <td> 35</td> </tr> <tr> <th>80 </th> <td> Paraguay</td> <td> 81</td> <td> 132</td> <td> -51</td> <td> 51</td> </tr> <tr> <th>81 </th> <td> Uruguay</td> <td> 
82</td> <td> 135</td> <td> -53</td> <td> 53</td> </tr> <tr> <th>82 </th> <td> Albania</td> <td> 83</td> <td> 115</td> <td> -32</td> <td> 32</td> </tr> <tr> <th>83 </th> <td> El Salvador</td> <td> 84</td> <td> 113</td> <td> -29</td> <td> 29</td> </tr> <tr> <th>84 </th> <td> Georgia</td> <td> 85</td> <td> 91</td> <td> -6</td> <td> 6</td> </tr> <tr> <th>85 </th> <td> Venezuela</td> <td> 86</td> <td> 12</td> <td> 74</td> <td> 74</td> </tr> <tr> <th>86 </th> <td> People's Republic of China</td> <td> 87</td> <td> 13</td> <td> 74</td> <td> 74</td> </tr> <tr> <th>87 </th> <td> Uganda</td> <td> 88</td> <td> 20</td> <td> 68</td> <td> 68</td> </tr> <tr> <th>88 </th> <td> Guatemala</td> <td> 89</td> <td> 120</td> <td> -31</td> <td> 31</td> </tr> <tr> <th>89 </th> <td> Slovakia</td> <td> 90</td> <td> 56</td> <td> 34</td> <td> 34</td> </tr> <tr> <th>90 </th> <td> Greece</td> <td> 91</td> <td> 83</td> <td> 8</td> <td> 8</td> </tr> <tr> <th>91 </th> <td> Swaziland</td> <td> 92</td> <td> 14</td> <td> 78</td> <td> 78</td> </tr> <tr> <th>92 </th> <td> Hungary</td> <td> 93</td> <td> 66</td> <td> 27</td> <td> 27</td> </tr> <tr> <th>93 </th> <td> Azerbaijan</td> <td> 94</td> <td> 106</td> <td> -12</td> <td> 12</td> </tr> <tr> <th>94 </th> <td> Cyprus</td> <td> 95</td> <td> 111</td> <td> -16</td> <td> 16</td> </tr> <tr> <th>95 </th> <td> Czech Republic</td> <td> 96</td> <td> 87</td> <td> 9</td> <td> 9</td> </tr> <tr> <th>96 </th> <td> Indonesia</td> <td> 97</td> <td> 17</td> <td> 80</td> <td> 80</td> </tr> <tr> <th>97 </th> <td> Brunei</td> <td> 98</td> <td> 14</td> <td> 84</td> <td> 84</td> </tr> <tr> <th>98 </th> <td> Malta</td> <td> 99</td> <td> 122</td> <td> -23</td> <td> 23</td> </tr> <tr> <th>99 </th> <td> Belize</td> <td> 100</td> <td> 40</td> <td> 60</td> <td> 60</td> </tr> <tr> <th>100</th> <td> Ghana</td> <td> 101</td> <td> 133</td> <td> -32</td> <td> 32</td> </tr> <tr> <th>101</th> <td> Tajikistan</td> <td> 102</td> <td> 85</td> <td> 17</td> <td> 17</td> </tr> <tr> 
<th>102</th> <td> Armenia</td> <td> 103</td> <td> 126</td> <td> -23</td> <td> 23</td> </tr> <tr> <th>103</th> <td> Japan</td> <td> 104</td> <td> 4</td> <td> 100</td> <td> 100</td> </tr> <tr> <th>104</th> <td> Maldives</td> <td> 105</td> <td> 101</td> <td> 4</td> <td> 4</td> </tr> <tr> <th>105</th> <td> Mauritius</td> <td> 106</td> <td> 6</td> <td> 100</td> <td> 100</td> </tr> <tr> <th>106</th> <td> Malaysia</td> <td> 107</td> <td> 5</td> <td> 102</td> <td> 102</td> </tr> <tr> <th>107</th> <td> Cambodia</td> <td> 108</td> <td> 30</td> <td> 78</td> <td> 78</td> </tr> <tr> <th>108</th> <td> Suriname</td> <td> 109</td> <td> 74</td> <td> 35</td> <td> 35</td> </tr> <tr> <th>109</th> <td> Burkina Faso</td> <td> 110</td> <td> 138</td> <td> -28</td> <td> 28</td> </tr> <tr> <th>110</th> <td> Liberia</td> <td> 111</td> <td> 50</td> <td> 61</td> <td> 61</td> </tr> <tr> <th>111</th> <td> Nepal</td> <td> 112</td> <td> 3</td> <td> 109</td> <td> 109</td> </tr> <tr> <th>112</th> <td> Kuwait</td> <td> 113</td> <td> 114</td> <td> -1</td> <td> 1</td> </tr> <tr> <th>113</th> <td> India</td> <td> 114</td> <td> 19</td> <td> 95</td> <td> 95</td> </tr> <tr> <th>114</th> <td> United Arab Emirates</td> <td> 115</td> <td> 118</td> <td> -3</td> <td> 3</td> </tr> <tr> <th>115</th> <td> Qatar</td> <td> 116</td> <td> 139</td> <td> -23</td> <td> 23</td> </tr> <tr> <th>116</th> <td> South Korea</td> <td> 117</td> <td> 1</td> <td> 116</td> <td> 116</td> </tr> <tr> <th>117</th> <td> Nigeria</td> <td> 118</td> <td> 92</td> <td> 26</td> <td> 26</td> </tr> <tr> <th>118</th> <td> Zambia</td> <td> 119</td> <td> 127</td> <td> -8</td> <td> 8</td> </tr> <tr> <th>119</th> <td> Bhutan</td> <td> 120</td> <td> 33</td> <td> 87</td> <td> 87</td> </tr> <tr> <th>120</th> <td> Angola</td> <td> 121</td> <td> 90</td> <td> 31</td> <td> 31</td> </tr> <tr> <th>121</th> <td> Fiji</td> <td> 122</td> <td> 104</td> <td> 18</td> <td> 18</td> </tr> <tr> <th>122</th> <td> Tunisia</td> <td> 123</td> <td> 110</td> <td> 13</td> 
<td> 13</td> </tr> <tr> <th>123</th> <td> Bahrain</td> <td> 124</td> <td> 6</td> <td> 118</td> <td> 118</td> </tr> <tr> <th>124</th> <td> Turkey</td> <td> 125</td> <td> 41</td> <td> 84</td> <td> 84</td> </tr> <tr> <th>125</th> <td> Algeria</td> <td> 126</td> <td> 75</td> <td> 51</td> <td> 51</td> </tr> <tr> <th>126</th> <td> Ethiopia</td> <td> 127</td> <td> 10</td> <td> 117</td> <td> 117</td> </tr> <tr> <th>127</th> <td> Oman</td> <td> 128</td> <td> 28</td> <td> 100</td> <td> 100</td> </tr> <tr> <th>128</th> <td> Egypt</td> <td> 129</td> <td> 48</td> <td> 81</td> <td> 81</td> </tr> <tr> <th>129</th> <td> Saudi Arabia</td> <td> 130</td> <td> 123</td> <td> 7</td> <td> 7</td> </tr> <tr> <th>130</th> <td> Mauritania</td> <td> 131</td> <td> 142</td> <td> -11</td> <td> 11</td> </tr> <tr> <th>131</th> <td> Guinea</td> <td> 132</td> <td> 140</td> <td> -8</td> <td> 8</td> </tr> <tr> <th>132</th> <td> Morocco</td> <td> 133</td> <td> 100</td> <td> 33</td> <td> 33</td> </tr> <tr> <th>133</th> <td> Jordan</td> <td> 134</td> <td> 108</td> <td> 26</td> <td> 26</td> </tr> <tr> <th>134</th> <td> Lebanon</td> <td> 135</td> <td> 76</td> <td> 59</td> <td> 59</td> </tr> <tr> <th>135</th> <td> Côte d'Ivoire</td> <td> 136</td> <td> 130</td> <td> 6</td> <td> 6</td> </tr> <tr> <th>136</th> <td> Iran</td> <td> 137</td> <td> 69</td> <td> 68</td> <td> 68</td> </tr> <tr> <th>137</th> <td> Mali</td> <td> 138</td> <td> 124</td> <td> 14</td> <td> 14</td> </tr> <tr> <th>138</th> <td> Syria</td> <td> 139</td> <td> 121</td> <td> 18</td> <td> 18</td> </tr> <tr> <th>139</th> <td> Chad</td> <td> 140</td> <td> 37</td> <td> 103</td> <td> 103</td> </tr> <tr> <th>140</th> <td> Pakistan</td> <td> 141</td> <td> 27</td> <td> 114</td> <td> 114</td> </tr> <tr> <th>141</th> <td> Yemen</td> <td> 142</td> <td> 88</td> <td> 54</td> <td> 54</td> </tr> </tbody> </table>
Quite uncorrelated. That means either that the data is not good, or that the World Economic Forum's methods have little to do with the percentage of women born in those countries, as recorded semantically on a historic level. Spearman's $\rho$ is low (about 0.10) and its p-value is high (about 0.25).
# read_csv(index_col=0, parse_dates=True) is the documented replacement for
# the deprecated DataFrame.from_csv and reads the file the same way.
country_map = pd.read_csv('helpers/aggregation_maps/country_maps.csv', index_col=0, parse_dates=True)
def map_culture(qid):
    """Map a place-of-birth QID to the culture name of its country.

    Parameters
    ----------
    qid : str or other
        Place-of-birth identifier. Non-string values (such as the NaN of a
        missing cell) yield None.

    Returns
    -------
    The ``culture_name`` entry of ``country_map`` for the first country
    listed for *qid* in ``pobs_map``; None when *qid* is not a string, is
    absent from ``pobs_map``, or has no country listed.
    """
    if not isinstance(qid, str):
        return None
    # .get() rather than [] so an unmapped place yields None instead of
    # aborting a whole apply() run with a KeyError.
    country_list = pobs_map.get(qid)
    if not country_list:
        return None
    country = country_list[0] #assumption
    # .loc is the label-based replacement for the removed .ix indexer.
    # NOTE(review): assumes country_map is indexed by the same country
    # identifiers that pobs_map stores -- confirm against the CSV.
    return country_map.loc[country]['culture_name']
allrecs['culture'] = allrecs['place_of_birth'].apply(map_culture)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-15-ce3e2a640676> in <module>() ----> 1 allrecs['culture'] = allrecs['place_of_birth'].apply(map_culture) /usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds) 2056 values = lib.map_infer(values, lib.Timestamp) 2057 -> 2058 mapped = lib.map_infer(values, f, convert=convert_dtype) 2059 if len(mapped) and isinstance(mapped[0], Series): 2060 from pandas.core.frame import DataFrame /usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:57158)() <ipython-input-14-dd8662bb0567> in map_culture(qid) 3 return None 4 else: ----> 5 country_list = pobs_map[qid] 6 if len(country_list) == 0: 7 return None NameError: global name 'pobs_map' is not defined
import math
import pywikibot
#Transforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()
retrieved = dict()
def english_label(qid):
    """Return the English Wikidata label for *qid*, caching results in ``retrieved``.

    Falsy values and float NaN give None. If fetching the label raises
    KeyError (for instance the item has no 'en' label), *qid* itself is
    cached and returned instead.
    """
    if not qid:
        return None
    if type(qid) is float and math.isnan(qid):
        return None
    #first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        pass
    # Not cached yet: fetch the item and remember whatever we end up with.
    try:
        item = pywikibot.ItemPage(wikidata, qid)
        label = item.get()['labels']['en']
    except KeyError:
        label = qid
    retrieved[qid] = label
    return label
english_label('Q6581097')
u'male'
allrecs['gender_name'] = allrecs['gender'].apply(english_label)
VERBOSE:pywiki:Found 1 commons:commons processes running, including this one.
outdf = allrecs[['gender_name','culture']]
outdf.to_csv('helpers/Chi_Squared_Test_Data.csv')
How many records have a gender, a place of birth (pob) and a date of birth (dob)?
# For every column, count the items that carry a value and express that
# count as a share of all items.
has = defaultdict(dict)
total_items = float(len(allrecs))
for col in allrecs.columns:
    # Series.count() skips NaN and None -- the same cells the former
    # per-element test() helper rejected -- without a Python call per cell
    # and without redefining a function on every iteration.
    nonempty = int(allrecs[col].count())
    nonemptyper = nonempty / total_items
    has[col]['Items with property'] = nonempty
    has[col]['% of total'] = nonemptyper
hasdf = pd.DataFrame.from_dict(has, orient='index')
# NOTE: DataFrame.sort is the API of the pandas release this notebook ran on
# (newer releases call it sort_values).
print(hasdf.sort('% of total').to_html(justify='right', formatters={'% of total':lambda x: '%.2f' % (x*100),
                                                                'Items with property':lambda x: '{0:,}'.format(x)}))
<table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>% of total</th> <th>Items with property</th> </tr> </thead> <tbody> <tr> <th>ethnic_group</th> <td> 0.30</td> <td> 7,772</td> </tr> <tr> <th>country</th> <td> 23.47</td> <td> 601,361</td> </tr> <tr> <th>place_of_birth</th> <td> 23.93</td> <td> 613,092</td> </tr> <tr> <th>dod</th> <td> 28.79</td> <td> 737,522</td> </tr> <tr> <th>citizenship</th> <td> 41.44</td> <td>1,061,634</td> </tr> <tr> <th>culture</th> <td> 45.20</td> <td>1,158,086</td> </tr> <tr> <th>dob</th> <td> 57.92</td> <td>1,484,003</td> </tr> <tr> <th>gender</th> <td> 89.40</td> <td>2,290,433</td> </tr> <tr> <th>site_links</th> <td> 99.05</td> <td>2,537,545</td> </tr> <tr> <th>qid</th> <td>100.00</td> <td>2,561,999</td> </tr> </tbody> </table>
# Keep the records that have a date of birth; notnull() is the vectorised
# form of the per-element "not math.isnan(x)" check.
hasdob = allrecs[allrecs['dob'].notnull()]
len(hasdob)
1484003
# notnull() rejects both NaN and None. The previous check only tested floats
# for NaN, so a gender stored as None passed the filter and no row was
# dropped (len(hasgender) came out equal to len(hasdob)).
hasgender = hasdob[hasdob['gender'].notnull()]
len(hasgender)
1484003
hascult = hasgender[hasgender['culture'].apply(lambda x: x is not None)]
len(hascult)
915101
hascult.head()
citizenship | country | culture | dob | dod | ethnic_group | gender | place_of_birth | qid | site_links | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Q30 | Q30 | english-speaking | 1732 | 1799 | None | Q6581097 | Q494413 | Q23 | zhwiki|kywiki|euwiki|plwiki|bswiki|angwiki|uzw... |
1 | Q145 | Q145 | english-speaking | 1952 | 2001 | None | Q6581097 | Q350 | Q42 | zhwiki|jvwiki|euwiki|plwiki|bswiki|eswiki|tawi... |
10 | Q30 | Q30 | english-speaking | 1973 | NaN | None | Q6581072 | Q1020700 | Q555 | zhwiki|eowiki|plwiki|kowiki|ruwiki|frwiki|eswi... |
100 | Q36 | None | catholic european | 1989 | NaN | None | Q6581097 | None | Q2327 | dewiki|plwiki|ruwiki|enwiki|ocwiki|svwiki|dawiki| |
1000 | Q21 | None | english-speaking | 1944 | 1994 | None | Q6581097 | None | Q28348 | ptwiki|plwiki|kowiki|hewiki|frwiki|ruwiki|eswi... |
culture_groups = hascult.groupby('culture')
def make_perc_series(df):
    """Return, per birth year, the share of non-male records in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'dob' column to group on and a 'gender' column of gender
        QIDs; 'Q6581097' is the QID counted as male.

    Returns
    -------
    pandas.Series
        Indexed by the distinct 'dob' values. Each value is the number of
        records whose gender is present and not 'Q6581097', divided by the
        number of records whose gender is present. A year without any
        gender value is NaN.
    """
    gender = df['gender']
    has_gender = gender.notnull()
    non_male = has_gender & (gender != 'Q6581097')
    # Summing the boolean masks per year replaces the per-group Python loop.
    non_male_count = non_male.groupby(df['dob']).sum().astype(float)
    total_count = has_gender.groupby(df['dob']).sum().astype(float)
    shares = non_male_count / total_count
    # Return a plain, unnamed Series (the pd.TimeSeries alias used before no
    # longer exists in current pandas).
    return pd.Series(shares.values, index=shares.index.values)
# Non-male share per birth year for every culture, keyed by culture name.
perc_dict = dict(
    (culture_name, make_perc_series(members))
    for culture_name, members in culture_groups
)
perc_df.tail(10)
africa | catholic european | confucian | english-speaking | islamic | latin america | orthodox | protestant european | south asia | euro | |
---|---|---|---|---|---|---|---|---|---|---|
2007 | 0 | 1.000000 | 0 | 0.6 | 1 | NaN | NaN | 0.750000 | 0 | 1.750000 |
2008 | NaN | 1.000000 | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
2009 | NaN | 0.000000 | NaN | 0.0 | NaN | NaN | NaN | 0.000000 | 0 | 0.000000 |
2010 | 0 | NaN | NaN | 0.4 | NaN | NaN | 0 | 0.000000 | 0 | NaN |
2011 | NaN | 0.333333 | NaN | 0.0 | NaN | 1 | 0 | 0.250000 | 1 | 0.583333 |
2012 | NaN | NaN | NaN | 0.0 | 0 | NaN | 0 | 0.666667 | 0 | NaN |
2013 | NaN | 0.000000 | NaN | 0.0 | NaN | 0 | 0 | NaN | 0 | NaN |
2014 | NaN | NaN | NaN | 0.5 | NaN | NaN | NaN | NaN | NaN | NaN |
2411 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2426 | NaN | NaN | NaN | NaN | 0 | NaN | 0 | NaN | NaN | NaN |
perc_df = pd.DataFrame.from_dict(perc_dict)
# Every 33rd year from 1800 up to (not including) 2000.
years = list(range(1800,2000,int(200/6.0)))
# reindex() selects rows by year label and yields all-NaN rows for years
# that are absent, which is what the removed .ix indexer did for a list of
# labels.
subbd_df = perc_df.reindex(years)
infogram = subbd_df
infogram.to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')
# Two panels side by side: the full range from -1000 with a 100-year rolling
# mean, and the range from 1800 with a 10-year rolling mean.
fig, (full, modern) = plt.subplots(1,2, figsize=(20,6))
end_year = 2000
for start_year, ra_len, ax in zip((-1000,1800), (100,10), (full, modern)):
    ra_dict = dict()
    for name, series in perc_dict.iteritems():
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per = pd.DataFrame(ra_dict)
    if start_year == 1800:
        # NOTE(review): year_list is computed but the export below selects
        # `years` instead -- confirm which list was meant.
        year_list = range(1900,end_year,10)
        # reindex() is the label-based replacement for the removed .ix.
        cult_dob_per.reindex(years).to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')
    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False,zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    # // keeps the tick step an integer on Python 3 as well (range() rejects floats).
    ax.set_xticks(range(start_year, end_year,(end_year-start_year) // 16))
    ax.set_ylim((0,0.6))
    ax.set_title(u'{}—{}, with {} year Rolling Average'.format(start_year, end_year,ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
# Keep the Legend in its own name: assigning it to full.legend replaced the
# Axes.legend method with a Legend object.
# NOTE(review): pylab's legend() draws on the *current* axes, which is not
# necessarily `full` -- use full.legend(...) if the legend belongs there.
full_legend = legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
full.set_xticks(range(-1000, end_year,(end_year+1000) // 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)
# Two stacked panels: the full range from -1000 with a 100-year rolling mean
# on top, and the range from 1800 with a 10-year rolling mean below.
fig, (full, modern) = plt.subplots(2,1, figsize=(12,8), sharex=False)
end_year = 2000
for start_year, ra_len, ax in zip((-1000,1800), (100,10), (full, modern)):
    ra_dict = dict()
    for name, series in perc_dict.iteritems():
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per = pd.DataFrame(ra_dict)
    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False,zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    # // keeps the tick step an integer on Python 3 as well (range() rejects floats).
    ax.set_xticks(range(start_year, end_year,(end_year-start_year) // 16))
    ax.set_ylim((0,0.6))
    ax.set_title(u'{}—{}, with {} year Rolling Average'.format(start_year, end_year,ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
# Keep the Legend in its own name: assigning it to full.legend replaced the
# Axes.legend method with a Legend object.
# NOTE(review): pylab's legend() draws on the *current* axes, which is not
# necessarily `full` -- use full.legend(...) if the legend belongs there.
full_legend = legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
#full.set_xticks(range(-1000, end_year,(end_year+1000) / 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)
# Records with a date of birth ...
dob_present = allrecs['dob'].apply(lambda year: not math.isnan(year))
dobexists = allrecs[dob_present]
# ... that also have a culture assigned.
culture_present = dobexists['culture'].apply(lambda label: label is not None)
dobcultureexists = dobexists[culture_present]
len(dobcultureexists)
915101
culture_groups = dobcultureexists[['dob','culture']].groupby(by='culture')
def make_tot_series(df):
    """Count, per birth year, the records in *df* that have a culture.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'dob' column to group on and a 'culture' column.

    Returns
    -------
    pandas.Series
        Indexed by the distinct 'dob' values; each value is the number of
        non-null 'culture' entries for that year.
    """
    # A grouped count replaces the per-group Python loop.
    counts = df.groupby('dob')['culture'].count()
    # Return a plain, unnamed Series (the pd.TimeSeries alias used before no
    # longer exists in current pandas).
    return pd.Series(counts.values, index=counts.index.values)
# Biography counts per birth year for every culture, keyed by culture name.
tot_dict = dict(
    (culture_name, make_tot_series(members))
    for culture_name, members in culture_groups
)
# One figure per (start year, rolling-average length) combination showing the
# number of biographies per birth year for every culture.
end_year = 2014
for start_year in [1500, 1800]:
    for ra_len in [2, 5, 10]:
        ra_dict = dict()
        for name, series in tot_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # DataFrame.plot returns an Axes; binding it to `plt` shadowed the
        # pyplot module and broke later plt.subplots(...) calls.
        ax = cult_dob.plot(figsize=(20,6), cmap='Set2', linewidth=1.5)
        ax.set_xlim((start_year,end_year))
        # // keeps the tick step an integer on Python 3 as well.
        ax.set_xticks(range(start_year,end_year,(end_year-start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)
# Same totals plot for two early ranges (-2000..1000 and -1000..1500), drawn
# on a logarithmic y axis.
for start_year, end_year in zip([-2000, -1000], [1000,1500]):
    for ra_len in [1,2,10]:
        ra_dict = dict()
        for name, series in tot_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # DataFrame.plot returns an Axes; binding it to `plt` shadowed the
        # pyplot module and broke later plt.subplots(...) calls.
        ax = cult_dob.plot(figsize=(20,6), cmap='Set2', linewidth=1.5)
        # NOTE(review): a lower limit of 0 cannot be shown on the log scale
        # set on the next line -- confirm the intended lower bound.
        ax.set_ylim((0,50))
        ax.set_yscale('log')
        ax.set_xlim((start_year,end_year))
        # // keeps the tick step an integer on Python 3 as well.
        ax.set_xticks(range(start_year,end_year,(end_year-start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)
/usr/local/lib/python2.7/dist-packages/numpy/ma/core.py:3895: UserWarning: Warning: converting a masked element to nan. warnings.warn("Warning: converting a masked element to nan.")