What's for those recently in confucian wikipedias, what's
import pandas as pd
import numpy
from collections import defaultdict
import json
import statsmodels.api as sm
from matplotlib.pylab import style
style.use('fivethirtyeight')
%pylab inline
# -2**31; passed to read_csv below as the value to be read as NaN.
java_min_int = -2147483648
# Wiki codes iterated by the export loop further down.
WIKIS =('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'dewiki', 'enwiki','kowiki')
Populating the interactive namespace from numpy and matplotlib
# Load the 2014-10-13 snapshot; cells equal to java_min_int are read as NaN.
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    """Split a '|'-delimited cell into a list of its entries.

    Args:
        q_str: a cell value; either a string such as ``'Q1|Q2|'`` or a
            float NaN marking a missing value.

    Returns:
        A list of the entries for a string, the NaN itself for a missing
        value, and None for anything else.
    """
    if isinstance(q_str, str):
        if not q_str:
            return []
        # The export normally ends every cell with '|'; strip only that
        # terminator so a cell lacking it does not lose its last entry.
        if q_str.endswith('|'):
            q_str = q_str[:-1]
        return q_str.split('|')
    # isinstance (not an exact type check) also lets numpy float NaNs through.
    if isinstance(q_str, float) and numpy.isnan(q_str):
        return q_str
    return None
# Turn the two '|'-delimited columns into Python lists, in place.
for col in ('gender', 'site_links'):
    raw_cells = allrecs[col]
    allrecs[col] = raw_cells.apply(split_column)
def has(xxwiki):
    """Return a row predicate that is True when *xxwiki* is among the row's site links."""
    def has_xx(row):
        links = row['site_links']
        # A non-list cell (e.g. NaN) means the record has no site links at all.
        if not isinstance(links, list):
            return False
        return xxwiki in links
    return has_xx
Generate files
# Preview the first 20 records after the column split.
allrecs.head(20)
qid | dob | dod | gender | ethnic_group | citizenship | place_of_birth | site_links | |
---|---|---|---|---|---|---|---|---|
0 | Q23 | 1732 | 1799 | [Q6581097] | NaN | Q30| | Q494413| | [zhwiki, kywiki, euwiki, plwiki, bswiki, angwi... |
1 | Q42 | 1952 | 2001 | [Q6581097] | NaN | Q145| | Q350| | [zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik... |
2 | Q207 | 1946 | NaN | [Q6581097] | NaN | Q30| | Q49145| | [uzwiki, eswiki, kowikiquote, huwiki, liwikiqu... |
3 | Q297 | NaN | 1660 | [Q6581097] | NaN | Q29| | Q8717| | [zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik... |
4 | Q326 | 1942 | NaN | [Q6581097] | NaN | Q298|Q39| | Q2887| | [zhwiki, plwiki, euwiki, kowiki, frwiki, eswik... |
5 | Q368 | 1915 | 2006 | [Q6581097] | NaN | Q298| | Q33986| | [lbwiki, zhwiki, plwiki, euwiki, bswiki, angwi... |
6 | Q377 | 1882 | 1942 | [Q6581097] | NaN | Q34266|Q2895|Q15180| | Q658871| | [zhwiki, kywiki, ukwikisource, jvwiki, plwiki,... |
7 | Q475 | 1911 | 1982 | [Q6581097] | NaN | Q298| | Q2887| | [plwiki, euwiki, kowiki, frwiki, eswiki, yowik... |
8 | Q501 | 1821 | 1867 | [Q6581097] | NaN | Q142| | Q90| | [zhwiki, glwikisource, plwiki, euwiki, bswiki,... |
9 | Q530 | 1956 | NaN | [Q6581097] | NaN | Q34| | Q499415| | [plwiki, euwiki, frwiki, bswiki, bewiki, eswik... |
10 | Q555 | 1973 | NaN | [Q6581072] | NaN | Q30| | Q1020700| | [zhwiki, eowiki, plwiki, kowiki, ruwiki, frwik... |
11 | Q619 | 1473 | 1543 | [Q6581097] | NaN | Q36| | Q47554| | [szlwiki, zhwiki, kywiki, plwiki, euwiki, bswi... |
12 | Q633 | 1945 | NaN | [Q6581097] | NaN | Q16| | Q172| | [euwikiquote, zhwiki, plwiki, euwiki, eswiki, ... |
13 | Q635 | -68 | -29 | [Q6581072] | NaN | Q11768| | Q87| | [zhwiki, plwiki, euwiki, bswiki, uzwiki, eswik... |
14 | Q747 | 1606 | 1684 | [Q6581097] | NaN | Q70972| | Q30974| | [lbwiki, zhwiki, plwiki, euwiki, bswiki, eswik... |
15 | Q815 | 1898 | 1980 | [Q6581097] | NaN | Q298| | Q14467| | [ptwiki, plwiki, ruwiki, kowiki, frwiki, enwik... |
16 | Q849 | 1431 | 1463 | [Q6581097] | NaN | Q70972| | Q90| | [zhwiki, plwiki, euwiki, ptwikisource, eswiki,... |
17 | Q853 | 1932 | 1986 | [Q6581097] | NaN | Q15180| | Q15651436| | [zhwiki, plwiki, euwiki, eswiki, afwiki, ocwik... |
18 | Q859 | -427 | -347 | [Q6581097] | NaN | Q844930| | Q1779520| | [uzwiki, bhwiki, eswiki, ptwikisource, huwiki,... |
19 | Q873 | 1949 | NaN | [Q6581072] | NaN | Q30| | Q1000642| | [lbwiki, zhwiki, jvwiki, plwiki, euwiki, bswik... |
def makedecades(b, e):
    """Yield (start, start + 10) pairs for every tenth year from *b* up to, but excluding, *e*."""
    for decade_start in range(b, e, 10):
        decade_stop = decade_start + 10
        yield (decade_start, decade_stop)
def isfemale(x):
    """Return True when the first gender entry of *x* is 'Q6581072'.

    Args:
        x: a cell of the ``gender`` column; a list of ids, or a non-list
            value when no gender was recorded.

    Returns:
        False for non-lists and for empty lists (which previously raised
        IndexError); otherwise whether the first entry is 'Q6581072'.
    """
    return isinstance(x, list) and bool(x) and x[0] == 'Q6581072'
def ismale(x):
    """Return True when the first gender entry of *x* is 'Q6581097'.

    Args:
        x: a cell of the ``gender`` column; a list of ids, or a non-list
            value when no gender was recorded.

    Returns:
        False for non-lists and for empty lists (which previously raised
        IndexError); otherwise whether the first entry is 'Q6581097'.
    """
    return isinstance(x, list) and bool(x) and x[0] == 'Q6581097'
def nogender(x):
    """Return True when *x* is not a list, i.e. the cell holds no gender entries."""
    return not isinstance(x, list)
for xxwiki in WIKIS:
has_wiki = has(xxwiki)
recs = allrecs[allrecs.apply(has_wiki, axis=1)]
for gender, gender_test in (('female', isfemale), ('male', ismale), ('nogender', nogender)):
grecs = recs[recs['gender'].apply(gender_test)]
for start_year, stop_year in makedecades(1930,1990):
modrecs = grecs[(grecs['dob'] >= start_year) &(grecs['dob'] < stop_year)]
#print len(modrecs), xxwiki, start_year
filepath = 'helpers/inspection/{}_{}_{}.json'.format(xxwiki, start_year,gender)
json.dump(list(modrecs['qid']), open(filepath,'w'))
!scp $filepath wmflabs-tools:/home/maximilianklein/inspectionshortcut
772 jawiki 1930 1202 jawiki 1940 1457 jawiki 1950 2194 jawiki 1960 3138 jawiki 1970 4304 jawiki 1980 4718 jawiki 1930 6520 jawiki 1940 6555 jawiki 1950 7536 jawiki 1960 9715 jawiki 1970 11830 jawiki 1980 340 jawiki 1930 422 jawiki 1940 379 jawiki 1950 376 jawiki 1960 379 jawiki 1970 305 jawiki 1980 177 tlwiki 1930 235 tlwiki 1940 294 tlwiki 1950 427 tlwiki 1960 625 tlwiki 1970 837 tlwiki 1980 359 tlwiki 1930 480 tlwiki 1940 479 tlwiki 1950 557 tlwiki 1960 598 tlwiki 1970 683 tlwiki 1980 0 tlwiki 1930 1 tlwiki 1940 2 tlwiki 1950 2 tlwiki 1960 6 tlwiki 1970 3 tlwiki 1980 109 urwiki 1930 156 urwiki 1940 189 urwiki 1950 229 urwiki 1960 422 urwiki 1970 697 urwiki 1980 176 urwiki 1930 199 urwiki 1940 176 urwiki 1950 142 urwiki 1960 126 urwiki 1970 133 urwiki 1980 0 urwiki 1930 0 urwiki 1940 0 urwiki 1950 1 urwiki 1960 0 urwiki 1970 1 urwiki 1980 307 zhwiki 1930 468 zhwiki 1940 717 zhwiki 1950 996 zhwiki 1960 1637 zhwiki 1970 3014 zhwiki 1980 2448 zhwiki 1930 3339 zhwiki 1940 3495 zhwiki 1950 3794 zhwiki 1960 4753 zhwiki 1970 6477 zhwiki 1980 257 zhwiki 1930 389 zhwiki 1940 624 zhwiki 1950 543 zhwiki 1960 257 zhwiki 1970 299 zhwiki 1980 4101 dewiki 1930 7107 dewiki 1940 8671 dewiki 1950 10194 dewiki 1960 10401 dewiki 1970 10982 dewiki 1980 31003 dewiki 1930 37919 dewiki 1940 35954 dewiki 1950 33389 dewiki 1960 28342 dewiki 1970 29896 dewiki 1980 0 dewiki 1930 0 dewiki 1940 1 dewiki 1950 1 dewiki 1960 0 dewiki 1970 0 dewiki 1980 7338 enwiki 1930 12475 enwiki 1940 15750 enwiki 1950 17603 enwiki 1960 20983 enwiki 1970 24977 enwiki 1980 50168 enwiki 1930 67619 enwiki 1940 71683 enwiki 1950 71914 enwiki 1960 78598 enwiki 1970 96430 enwiki 1980 48 enwiki 1930 63 enwiki 1940 88 enwiki 1950 88 enwiki 1960 92 enwiki 1970 93 enwiki 1980 215 kowiki 1930 397 kowiki 1940 552 kowiki 1950 920 kowiki 1960 1454 kowiki 1970 2056 kowiki 1980 1405 kowiki 1930 2172 kowiki 1940 2240 kowiki 1950 2913 kowiki 1960 3851 kowiki 1970 5287 kowiki 1980 8 kowiki 1930 13 kowiki 1940 20 kowiki 1950 24 
kowiki 1960 37 kowiki 1970 34 kowiki 1980
Then wait for the remote 200-word summary task to finish on Labs (under viafbot/inspection).
!mkdir helpers/inspection/expanded_descriptions
!scp wmflabs-tools:/home/maximilianklein/inspectionshortcut/output/* helpers/inspection/expanded_descriptions/.
If you are having access problems, please see: https://wikitech.wikimedia.org/wiki/Access#Accessing_public_and_private_instances dewiki_1930_female_descriptions.json 100% 1054KB 1.0MB/s 00:01 dewiki_1930_male_descriptions.json 100% 2604KB 650.9KB/s 00:04 dewiki_1940_female_descriptions.json 100% 1824KB 456.0KB/s 00:04 dewiki_1940_male_descriptions.json 100% 2556KB 638.9KB/s 00:04 dewiki_1950_female_descriptions.json 100% 2225KB 556.2KB/s 00:04 dewiki_1950_male_descriptions.json 100% 2562KB 512.3KB/s 00:05 dewiki_1950_nogender_descriptions.json 100% 263 0.3KB/s 00:00 dewiki_1960_female_descriptions.json 100% 2563KB 640.8KB/s 00:04 dewiki_1960_male_descriptions.json 100% 2581KB 645.1KB/s 00:04 dewiki_1960_nogender_descriptions.json 100% 257 0.3KB/s 00:00 dewiki_1970_female_descriptions.json 100% 2546KB 636.5KB/s 00:04 dewiki_1970_male_descriptions.json 100% 2574KB 643.6KB/s 00:04 dewiki_1980_female_descriptions.json 100% 2544KB 508.7KB/s 00:05 dewiki_1980_male_descriptions.json 100% 2595KB 432.5KB/s 00:06 enwiki_1930_female_descriptions.json 100% 1858KB 619.2KB/s 00:03 enwiki_1930_male_descriptions.json 100% 2505KB 626.2KB/s 00:04 enwiki_1930_nogender_descriptions.json 100% 12KB 12.2KB/s 00:00 enwiki_1940_female_descriptions.json 100% 2528KB 842.7KB/s 00:03 enwiki_1940_male_descriptions.json 100% 2509KB 836.2KB/s 00:03 enwiki_1940_nogender_descriptions.json 100% 16KB 16.0KB/s 00:00 enwiki_1950_female_descriptions.json 100% 2505KB 626.3KB/s 00:04 enwiki_1950_male_descriptions.json 100% 2528KB 632.0KB/s 00:04 enwiki_1950_nogender_descriptions.json 100% 22KB 21.9KB/s 00:00 enwiki_1960_female_descriptions.json 100% 2524KB 631.1KB/s 00:04 enwiki_1960_male_descriptions.json 100% 2518KB 629.6KB/s 00:04 enwiki_1960_nogender_descriptions.json 100% 21KB 21.2KB/s 00:00 enwiki_1970_female_descriptions.json 100% 2534KB 633.4KB/s 00:04 enwiki_1970_male_descriptions.json 100% 2530KB 632.4KB/s 00:04 enwiki_1970_nogender_descriptions.json 100% 22KB 22.2KB/s 00:00 
enwiki_1980_female_descriptions.json 100% 2494KB 623.6KB/s 00:04 enwiki_1980_male_descriptions.json 100% 2528KB 632.0KB/s 00:04 enwiki_1980_nogender_descriptions.json 100% 22KB 22.4KB/s 00:00 jawiki_1930_female_descriptions.json 100% 228KB 228.1KB/s 00:01 jawiki_1930_male_descriptions.json 100% 1426KB 712.8KB/s 00:02 jawiki_1930_nogender_descriptions.json 100% 354KB 354.2KB/s 00:01 jawiki_1940_female_descriptions.json 100% 363KB 363.0KB/s 00:01 jawiki_1940_male_descriptions.json 100% 1902KB 475.6KB/s 00:04 jawiki_1940_nogender_descriptions.json 100% 443KB 443.5KB/s 00:01 jawiki_1950_female_descriptions.json 100% 452KB 451.7KB/s 00:01 jawiki_1950_male_descriptions.json 100% 1904KB 634.6KB/s 00:03 jawiki_1950_nogender_descriptions.json 100% 398KB 398.3KB/s 00:01 jawiki_1960_female_descriptions.json 100% 681KB 340.5KB/s 00:02 jawiki_1960_male_descriptions.json 100% 2177KB 544.2KB/s 00:04 jawiki_1960_nogender_descriptions.json 100% 394KB 394.4KB/s 00:01 jawiki_1970_female_descriptions.json 100% 984KB 492.0KB/s 00:02 jawiki_1970_male_descriptions.json 100% 2705KB 541.0KB/s 00:05 jawiki_1970_nogender_descriptions.json 100% 387KB 387.5KB/s 00:01 jawiki_1980_female_descriptions.json 100% 1522KB 760.9KB/s 00:02 jawiki_1980_male_descriptions.json 100% 2710KB 677.5KB/s 00:04 jawiki_1980_nogender_descriptions.json 100% 296KB 296.5KB/s 00:00 kowiki_1930_female_descriptions.json 100% 56KB 56.4KB/s 00:01 kowiki_1930_male_descriptions.json 100% 368KB 367.7KB/s 00:00 kowiki_1930_nogender_descriptions.json 100% 6241 6.1KB/s 00:01 kowiki_1940_female_descriptions.json 100% 102KB 102.4KB/s 00:00 kowiki_1940_male_descriptions.json 100% 567KB 567.0KB/s 00:01 kowiki_1940_nogender_descriptions.json 100% 10KB 10.2KB/s 00:00 kowiki_1950_female_descriptions.json 100% 144KB 144.3KB/s 00:00 kowiki_1950_male_descriptions.json 100% 584KB 292.2KB/s 00:02 kowiki_1950_nogender_descriptions.json 100% 16KB 15.8KB/s 00:00 kowiki_1960_female_descriptions.json 100% 236KB 235.8KB/s 00:01 
kowiki_1960_male_descriptions.json 100% 751KB 375.6KB/s 00:02 kowiki_1960_nogender_descriptions.json 100% 18KB 17.7KB/s 00:00 kowiki_1970_female_descriptions.json 100% 375KB 374.6KB/s 00:01 kowiki_1970_male_descriptions.json 100% 984KB 491.8KB/s 00:02 kowiki_1970_nogender_descriptions.json 100% 27KB 26.6KB/s 00:00 kowiki_1980_female_descriptions.json 100% 548KB 548.4KB/s 00:01 kowiki_1980_male_descriptions.json 100% 1360KB 679.9KB/s 00:02 kowiki_1980_nogender_descriptions.json 100% 22KB 21.8KB/s 00:00 tlwiki_1930_female_descriptions.json 100% 44KB 43.8KB/s 00:00 tlwiki_1930_male_descriptions.json 100% 90KB 89.9KB/s 00:00 tlwiki_1940_female_descriptions.json 100% 58KB 58.4KB/s 00:00 tlwiki_1940_male_descriptions.json 100% 120KB 119.9KB/s 00:01 tlwiki_1940_nogender_descriptions.json 100% 143 0.1KB/s 00:00 tlwiki_1950_female_descriptions.json 100% 73KB 73.3KB/s 00:00 tlwiki_1950_male_descriptions.json 100% 119KB 118.6KB/s 00:01 tlwiki_1950_nogender_descriptions.json 100% 396 0.4KB/s 00:00 tlwiki_1960_female_descriptions.json 100% 106KB 106.1KB/s 00:00 tlwiki_1960_male_descriptions.json 100% 138KB 137.6KB/s 00:01 tlwiki_1960_nogender_descriptions.json 100% 286 0.3KB/s 00:00 tlwiki_1970_female_descriptions.json 100% 155KB 155.0KB/s 00:01 tlwiki_1970_male_descriptions.json 100% 149KB 148.9KB/s 00:00 tlwiki_1970_nogender_descriptions.json 100% 854 0.8KB/s 00:00 tlwiki_1980_female_descriptions.json 100% 207KB 207.2KB/s 00:00 tlwiki_1980_male_descriptions.json 100% 168KB 168.3KB/s 00:01 tlwiki_1980_nogender_descriptions.json 100% 440 0.4KB/s 00:00 urwiki_1930_female_descriptions.json 100% 27KB 27.2KB/s 00:00 urwiki_1930_male_descriptions.json 100% 45KB 45.5KB/s 00:00 urwiki_1940_female_descriptions.json 100% 39KB 39.5KB/s 00:00 urwiki_1940_male_descriptions.json 100% 52KB 51.6KB/s 00:00 urwiki_1950_female_descriptions.json 100% 47KB 47.0KB/s 00:00 urwiki_1950_male_descriptions.json 100% 47KB 47.0KB/s 00:00 urwiki_1960_female_descriptions.json 100% 57KB 57.4KB/s 00:01 
urwiki_1960_male_descriptions.json 100% 37KB 37.2KB/s 00:00 urwiki_1960_nogender_descriptions.json 100% 1113 1.1KB/s 00:00 urwiki_1970_female_descriptions.json 100% 114KB 113.7KB/s 00:00 urwiki_1970_male_descriptions.json 100% 33KB 33.2KB/s 00:01 urwiki_1980_female_descriptions.json 100% 196KB 195.5KB/s 00:00 urwiki_1980_male_descriptions.json 100% 33KB 33.3KB/s 00:00 urwiki_1980_nogender_descriptions.json 100% 1036 1.0KB/s 00:00 zhwiki_1930_female_descriptions.json 100% 95KB 95.4KB/s 00:00 zhwiki_1930_male_descriptions.json 100% 683KB 341.6KB/s 00:02 zhwiki_1930_nogender_descriptions.json 100% 251KB 251.2KB/s 00:01 zhwiki_1940_female_descriptions.json 100% 148KB 148.0KB/s 00:01 zhwiki_1940_male_descriptions.json 100% 908KB 302.6KB/s 00:03 zhwiki_1940_nogender_descriptions.json 100% 363KB 362.6KB/s 00:01 zhwiki_1950_female_descriptions.json 100% 209KB 209.2KB/s 00:00 zhwiki_1950_male_descriptions.json 100% 985KB 492.6KB/s 00:02 zhwiki_1950_nogender_descriptions.json 100% 580KB 289.8KB/s 00:02 zhwiki_1960_female_descriptions.json 100% 307KB 307.3KB/s 00:01 zhwiki_1960_male_descriptions.json 100% 1043KB 1.0MB/s 00:01 zhwiki_1960_nogender_descriptions.json 100% 499KB 499.2KB/s 00:01 zhwiki_1970_female_descriptions.json 100% 492KB 492.5KB/s 00:01 zhwiki_1970_male_descriptions.json 100% 1283KB 641.4KB/s 00:02 zhwiki_1970_nogender_descriptions.json 100% 243KB 242.7KB/s 00:01 zhwiki_1980_female_descriptions.json 100% 990KB 495.0KB/s 00:02 zhwiki_1980_male_descriptions.json 100% 1768KB 589.4KB/s 00:03 zhwiki_1980_nogender_descriptions.json 100% 280KB 280.2KB/s 00:00
# IPython shell capture: the output lines of `ls` become the list of file names.
description_files = !ls helpers/inspection/expanded_descriptions/
# Number of description files found.
len(description_files)
117
# Per-wiki keywords whose presence in a biography text marks it as a
# "celebrity" biography for the purposes of this notebook.
# NOTE(review): the first urwiki entry looks like the word "Urdu" (the
# language name) rather than a word for actor - confirm with a speaker.
celebrity_dict = {'jawiki': [u'俳優', u'選手', u'歌手', u'ミュージシャン', u'モデル', u'アイドル'],
                  'zhwiki': [u'演員', u'運動員', u'歌手', u'音乐家', u'模特兒', u'偶像'],
                  'tlwiki': [u'artista', u'aktor', u'player', u'mang-aawit', u'musikero', u'modelo', u'idolo'],
                  'urwiki': [u'اردو', u'کھلاڑ', u'گلوکار' , u'موسیقار' , u'ماڈل', u'بت'],
                  'dewiki': [u'schauspieler' , u'spieler', u'Musiker', u'Sänger', u'Modell', u'Idol'],
                  'enwiki' :[u'actor', u'actress', u'player', u'singer', u'musician', u'model', u'idol'],
                  'kowiki' : [u'배우', u'선수', u'가수', u'음악가', u'모델', u'우상']}


def intext(text, xxwiki):
    """Tell whether *text* mentions any celebrity keyword.

    Both the English keywords and those listed for *xxwiki* are tried, as
    case-insensitive substrings.

    Args:
        text: the biography text (unicode), or a falsy value when missing.
        xxwiki: a key of ``celebrity_dict`` such as ``'dewiki'``.

    Returns:
        True when at least one keyword occurs in the text, else False.

    Raises:
        KeyError: if *xxwiki* is not a key of ``celebrity_dict``.
    """
    if not text:
        return False
    # Fold case on the text itself, not on its UTF-8 bytes: byte-level
    # lower() only touches ASCII, so e.g. an upper-case umlaut never matched.
    haystack = text.lower()
    keywords = celebrity_dict['enwiki'] + celebrity_dict[xxwiki]
    return any(word.lower() in haystack for word in keywords)
# Build one row per description file: the share of its biographies whose text
# contains a celebrity keyword. File names are <wiki>_<decade>_<gender>_...
celeb_rows = []
for f in description_files:
    parts = f.split('_')
    xxwiki, decade, gender = parts[0], parts[1], parts[2]
    with open('helpers/inspection/expanded_descriptions/{}'.format(f), 'r') as infile:
        descriptions = json.load(infile)
    if not descriptions:
        # No biographies in this file, so no proportion can be computed.
        continue
    df = pd.DataFrame.from_dict(descriptions, orient='index')
    df['celeb'] = df['text'].apply(lambda text: intext(text, xxwiki))
    test_per = df['celeb'].sum()/float(len(df))
    celeb_rows.append({'wiki':xxwiki, 'decade':int(decade), 'gender':gender, 'celeb_per':test_per})
# Assemble the frame once instead of re-copying it on every append.
celebdf = pd.DataFrame(celeb_rows, columns=['wiki','decade','gender','celeb_per'])
# Indicator (0/1) columns for each wiki and each gender bucket in celebdf.
dummy_langs = pd.get_dummies(celebdf['wiki'])
dummy_gender = pd.get_dummies(celebdf['gender'])
# Fix the column order to male, female, nogender.
dummy_gender = dummy_gender[['male','female','nogender']]
print dummy_langs.head(2)
print dummy_gender.head(2)
dewiki enwiki jawiki kowiki tlwiki urwiki zhwiki 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 male female nogender 0 0 1 0 1 1 0 0
# Regression table: celeb_per and decade plus indicator columns.
# The label slice 'enwiki': drops every wiki column before enwiki (dewiki in
# the recorded output), and only the 'female' gender column is kept.
# .loc performs the same label-based slicing as the deprecated .ix.
catdf = celebdf[['celeb_per','decade']].join(dummy_langs.loc[:, 'enwiki':]).join(dummy_gender.loc[:, 'female':'female'])
# Constant column so the fitted model has an intercept term.
catdf['intercept'] = 1.0
catdf.corr()
celeb_per | decade | enwiki | jawiki | kowiki | tlwiki | urwiki | zhwiki | female | intercept | |
---|---|---|---|---|---|---|---|---|---|---|
celeb_per | 1.000000 | 0.284661 | -0.290744 | -0.079482 | 0.125096 | 0.606231 | 0.017558 | -0.154101 | 0.460273 | NaN |
decade | 0.284661 | 1.000000 | -0.011882 | -0.011882 | -0.011882 | 0.024452 | 0.036556 | -0.011882 | -0.020852 | NaN |
enwiki | -0.290744 | -0.011882 | 1.000000 | -0.181818 | -0.181818 | -0.175810 | -0.157204 | -0.181818 | -0.022792 | NaN |
jawiki | -0.079482 | -0.011882 | -0.181818 | 1.000000 | -0.181818 | -0.175810 | -0.157204 | -0.181818 | -0.022792 | NaN |
kowiki | 0.125096 | -0.011882 | -0.181818 | -0.181818 | 1.000000 | -0.175810 | -0.157204 | -0.181818 | -0.022792 | NaN |
tlwiki | 0.606231 | 0.024452 | -0.175810 | -0.175810 | -0.175810 | 1.000000 | -0.152009 | -0.175810 | -0.005186 | NaN |
urwiki | 0.017558 | 0.036556 | -0.157204 | -0.157204 | -0.157204 | -0.152009 | 1.000000 | -0.157204 | 0.053489 | NaN |
zhwiki | -0.154101 | -0.011882 | -0.181818 | -0.181818 | -0.181818 | -0.175810 | -0.157204 | 1.000000 | -0.022792 | NaN |
female | 0.460273 | -0.020852 | -0.022792 | -0.022792 | -0.022792 | -0.005186 | 0.053489 | -0.022792 | 1.000000 | NaN |
intercept | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Use every column after the first as a regressor (celeb_per, the response,
# is the first column of catdf).
train_cols = catdf.columns[1:]
# NOTE(review): celeb_per is a per-file proportion rather than a 0/1 outcome,
# and the number of biographies behind each proportion is not passed to the
# model - confirm that Logit is the intended estimator.
logit = sm.Logit(catdf['celeb_per'], catdf[train_cols])
result= logit.fit()
Optimization terminated successfully. Current function value: 0.466279 Iterations 6
# Show the summary table of the fitted model.
result.summary()
Dep. Variable: | celeb_per | No. Observations: | 117 |
---|---|---|---|
Model: | Logit | Df Residuals: | 108 |
Method: | MLE | Df Model: | 8 |
Date: | Wed, 07 Jan 2015 | Pseudo R-squ.: | 0.3079 |
Time: | 11:25:05 | Log-Likelihood: | -54.555 |
converged: | True | LL-Null: | -78.830 |
LLR p-value: | 7.752e-08 |
coef | std err | z | P>|z| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
decade | 0.0236 | 0.013 | 1.823 | 0.068 | -0.002 0.049 |
enwiki | 0.0509 | 0.875 | 0.058 | 0.954 | -1.664 1.766 |
jawiki | 0.7763 | 0.837 | 0.927 | 0.354 | -0.865 2.418 |
kowiki | 1.3834 | 0.832 | 1.662 | 0.097 | -0.248 3.015 |
tlwiki | 3.0009 | 0.945 | 3.176 | 0.001 | 1.149 4.853 |
urwiki | 0.8901 | 0.869 | 1.025 | 0.306 | -0.813 2.593 |
zhwiki | 0.5383 | 0.846 | 0.637 | 0.524 | -1.119 2.196 |
female | 1.3580 | 0.453 | 2.999 | 0.003 | 0.471 2.245 |
intercept | -47.9056 | 25.368 | -1.888 | 0.059 | -97.626 1.815 |
# One heatmap per gender bucket: rows are birth decades, columns are wikis,
# colour is the percentage of biographies flagged as celebrity.
# NOTE(review): the export lost indentation; the loop body is assumed to end
# after the tick-label calls, with title/size/colorbar set once afterwards.
subj_list = ['female','male','nogender']
fig, axes = plt.subplots(nrows = 1, ncols = len(subj_list), sharex='col', sharey='row')
for ax, subj in zip(axes, subj_list):
    natlangdf = celebdf[celebdf['gender'] == subj]
    # index=/columns= replace the deprecated rows=/cols= keywords.
    natlangpiv = pd.pivot_table(natlangdf, values='celeb_per', index='decade', columns='wiki')
    natlangpiv = natlangpiv[['jawiki','zhwiki','kowiki','tlwiki','urwiki','dewiki','enwiki']]
    natlangpiv.columns = ['Japanese', 'Chinese', 'Korean', 'Tagalog', 'Urdu', 'German', 'English']
    natlangpiv = natlangpiv * 100
    heatmap = ax.pcolor(natlangpiv, cmap='Purples', vmin=0, vmax=100)
    # Ticks at +0.5 so the labels sit in the middle of each cell.
    ax.set_yticks(np.arange(0.5, len(natlangpiv.index), 1))
    # A real list (not a lazy map object) so this also works on Python 3.
    ax.set_yticklabels([int(decade) for decade in natlangpiv.index])
    ax.set_xticks(np.arange(0.5, len(natlangpiv.columns), 1))
    ax.set_xticklabels(natlangpiv.columns, rotation=90)
fig.suptitle('''Heatmap of Celebrity Biography %, By Decade of Birth versus Wikipedia Language by Gender''', fontsize=18)
# set_size_inches takes no dpi argument, so the former dpi=600 is dropped.
fig.set_size_inches(12, 4)
#fig.tight_layout()
subj_titles = ['Female','Male','Not Recorded or Non-Binary']
metric_titles =['Decade']
# Single colour bar, built from the last heatmap and attached to the last axes.
cbar = plt.colorbar(mappable=heatmap, ax=ax, format="%.0f%%")
for title_ax, title in zip(axes, subj_titles):
    title_ax.set_title(title)
fig.subplots_adjust(wspace=0.0, hspace=0.0, top=0.85)
# Bare name, no call: the cell only displays the function object.
subplots_adjust
<function matplotlib.pyplot.subplots_adjust>
# One keyword per wiki for each profession; consumed by multiword() below.
# Neither dict has a 'kowiki' entry.
# NOTE(review): the urwiki actress keyword looks like the word "Urdu" (the
# language name) rather than a word for actor/actress - confirm with a speaker.
actress_dict = {'jawiki': u'俳優', 'zhwiki': u'演員', 'tlwiki':u'artista', 'urwiki': u'اردو', 'dewiki': 'schauspieler' , 'enwiki' :'actress'}
player_dict = {'jawiki': u'選手', 'zhwiki': u'運動員', 'tlwiki':u'player', 'urwiki': u'کھلاڑ', 'dewiki': 'spieler' , 'enwiki' :'player'}
def multiword(xxwiki, prof_dict):
    """Build a predicate that looks for one profession keyword in a text.

    Args:
        xxwiki: wiki code whose keyword is used next to the English one.
        prof_dict: mapping of wiki code to that profession's keyword; must
            contain 'enwiki' and *xxwiki*.

    Returns:
        A function ``intext(text)`` returning True when the English or the
        *xxwiki* keyword occurs in *text*, ignoring case; False for falsy
        text.

    Raises:
        KeyError: if 'enwiki' or *xxwiki* is missing from *prof_dict*
            (raised here, when the predicate is built).
    """
    # Lower-case the keywords once: the text is lower-cased before matching,
    # so a capitalised keyword could otherwise never be found.
    needles = (prof_dict['enwiki'].lower(), prof_dict[xxwiki].lower())

    def intext(text):
        if not text:
            return False
        haystack = text.lower()
        return any(needle in haystack for needle in needles)

    return intext
# celeb[wiki][profession] = share of that wiki's descriptions whose text
# contains the profession keyword (English or local).
celeb = defaultdict(dict)
for xxwiki in ('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'enwiki', 'dewiki'):
    # Context manager so the JSON file handle is closed after loading.
    with open('helpers/inspection/descriptions/{}_descriptions.json'.format(xxwiki), 'r') as infile:
        df = pd.DataFrame.from_dict(json.load(infile), orient='index')
    for prof, test in (('actress', multiword(xxwiki, actress_dict)), ('player',multiword(xxwiki, player_dict))):
        df[prof] = df['text'].apply(test)
        test_per = df[prof].sum()/float(len(df))
        celeb[xxwiki][prof] = test_per
# Write each wiki's descriptions to a CSV with newlines in the text replaced
# by spaces, so every biography stays on one line.
for xxwiki in ('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'enwiki', 'dewiki'):
    # Context manager so the JSON file handle is closed after loading.
    with open('helpers/inspection/descriptions/{}_descriptions.json'.format(xxwiki), 'r') as infile:
        df = pd.DataFrame.from_dict(json.load(infile), orient='index')
    df['text'] = df['text'].apply(lambda x: x.replace('\n',' ') if x else x)
    df.to_csv('helpers/inspection/readable/{}_modern_bios_for_inspection.csv'.format(xxwiki), encoding='utf-8')
# One row per wiki, one column per profession. This rebinds the name celebdf,
# which is also used for the per-decade frame built earlier in the notebook.
celebdf = pd.DataFrame.from_dict(celeb, orient='index')
# NOTE(review): this adds the two proportions, so a biography matching both
# keywords is counted twice - it is not the share matching either keyword.
celebdf['either'] = celebdf['player'] + celebdf['actress']
celebdf.sort('either')
player | actress | either | |
---|---|---|---|
enwiki | 0.223995 | 0.042483 | 0.266478 |
jawiki | 0.198402 | 0.090304 | 0.288705 |
dewiki | 0.287200 | 0.075000 | 0.362200 |
zhwiki | 0.251717 | 0.121909 | 0.373626 |
tlwiki | 0.068075 | 0.419601 | 0.487676 |
urwiki | 0.016148 | 0.741638 | 0.757785 |
# NOTE(review): the recorded run of this cell raised AttributeError because
# Series has no `convert_object`; possibly `convert_objects` was meant - confirm
# the intent, or remove the cell.
celebdf['wiki'].convert_object(convert_dates=True)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-19-abc28e5c26e7> in <module>() ----> 1 celebdf['wiki'].convert_object(convert_dates=True) /usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in __getattr__(self, name) 1945 return self[name] 1946 raise AttributeError("'%s' object has no attribute '%s'" % -> 1947 (type(self).__name__, name)) 1948 1949 def __setattr__(self, name, value): AttributeError: 'Series' object has no attribute 'convert_object'
# Display celebdf; the recorded output is the per-file frame with the
# wiki / decade / gender / celeb_per columns.
celebdf
wiki | decade | gender | celeb_per | |
---|---|---|---|---|
0 | dewiki | 1930 | female | 0.313582 |
1 | dewiki | 1930 | male | 0.147053 |
2 | dewiki | 1940 | female | 0.271985 |
3 | dewiki | 1940 | male | 0.162964 |
4 | dewiki | 1950 | female | 0.246338 |
5 | dewiki | 1950 | male | 0.186657 |
6 | dewiki | 1950 | nogender | 0.000000 |
7 | dewiki | 1960 | female | 0.323635 |
8 | dewiki | 1960 | male | 0.256650 |
9 | dewiki | 1960 | nogender | 0.000000 |
10 | dewiki | 1970 | female | 0.461909 |
11 | dewiki | 1970 | male | 0.328388 |
12 | dewiki | 1980 | female | 0.511078 |
13 | dewiki | 1980 | male | 0.331519 |
14 | enwiki | 1930 | female | 0.293813 |
15 | enwiki | 1930 | male | 0.167428 |
16 | enwiki | 1930 | nogender | 0.104167 |
17 | enwiki | 1940 | female | 0.262633 |
18 | enwiki | 1940 | male | 0.179598 |
19 | enwiki | 1940 | nogender | 0.079365 |
20 | enwiki | 1950 | female | 0.253739 |
21 | enwiki | 1950 | male | 0.202080 |
22 | enwiki | 1950 | nogender | 0.102273 |
23 | enwiki | 1960 | female | 0.332370 |
24 | enwiki | 1960 | male | 0.250150 |
25 | enwiki | 1960 | nogender | 0.159091 |
26 | enwiki | 1970 | female | 0.439121 |
27 | enwiki | 1970 | male | 0.296730 |
28 | enwiki | 1970 | nogender | 0.152174 |
29 | enwiki | 1980 | female | 0.494689 |
... | ... | ... | ... | ... |
87 | urwiki | 1940 | female | 0.839744 |
88 | urwiki | 1940 | male | 0.075377 |
89 | urwiki | 1950 | female | 0.841270 |
90 | urwiki | 1950 | male | 0.107955 |
91 | urwiki | 1960 | female | 0.868996 |
92 | urwiki | 1960 | male | 0.161972 |
93 | urwiki | 1960 | nogender | 0.000000 |
94 | urwiki | 1970 | female | 0.902844 |
95 | urwiki | 1970 | male | 0.198413 |
96 | urwiki | 1980 | female | 0.928264 |
97 | urwiki | 1980 | male | 0.203008 |
98 | urwiki | 1980 | nogender | 0.000000 |
99 | zhwiki | 1930 | female | 0.371336 |
100 | zhwiki | 1930 | male | 0.191993 |
101 | zhwiki | 1930 | nogender | 0.070039 |
102 | zhwiki | 1940 | female | 0.275641 |
103 | zhwiki | 1940 | male | 0.225217 |
104 | zhwiki | 1940 | nogender | 0.043702 |
105 | zhwiki | 1950 | female | 0.358438 |
106 | zhwiki | 1950 | male | 0.269242 |
107 | zhwiki | 1950 | nogender | 0.038462 |
108 | zhwiki | 1960 | female | 0.560241 |
109 | zhwiki | 1960 | male | 0.423300 |
110 | zhwiki | 1960 | nogender | 0.069982 |
111 | zhwiki | 1970 | female | 0.684178 |
112 | zhwiki | 1970 | male | 0.489796 |
113 | zhwiki | 1970 | nogender | 0.272374 |
114 | zhwiki | 1980 | female | 0.720637 |
115 | zhwiki | 1980 | male | 0.413772 |
116 | zhwiki | 1980 | nogender | 0.387960 |
117 rows × 4 columns
# Mean celeb_per for each decade (rows) and wiki (columns), taken over the
# gender buckets; index=/columns= replace the deprecated rows=/cols= keywords.
natlangpiv = pd.pivot_table(celebdf, values='celeb_per', index='decade', columns='wiki')
natlangpiv
wiki | dewiki | enwiki | jawiki | kowiki | tlwiki | urwiki | zhwiki |
---|---|---|---|---|---|---|---|
decade | |||||||
1930 | 0.230318 | 0.188469 | 0.283101 | 0.311064 | 0.717719 | 0.496977 | 0.211123 |
1940 | 0.217475 | 0.173866 | 0.291716 | 0.448845 | 0.812707 | 0.457560 | 0.181520 |
1950 | 0.144332 | 0.186030 | 0.314639 | 0.446219 | 0.666099 | 0.474612 | 0.222047 |
1960 | 0.193429 | 0.247204 | 0.383604 | 0.584960 | 0.876718 | 0.343656 | 0.351174 |
1970 | 0.395148 | 0.296008 | 0.460160 | 0.639824 | 0.914757 | 0.550628 | 0.482116 |
1980 | 0.421298 | 0.340357 | 0.508095 | 0.594178 | 0.924142 | 0.377091 | 0.507456 |