What's for those recently in confucian wikipedias, what's import pandas as pd import numpy from collections import defaultdict import json import statsmodels.api as sm from matplotlib.pylab import style style.use('fivethirtyeight') %pylab inline java_min_int = -2147483648 WIKIS =('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'dewiki', 'enwiki','kowiki') allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int]) def split_column(q_str): if type(q_str) is float: if numpy.isnan(q_str): return q_str if type(q_str) is str: qs = q_str.split('|') return qs[:-1] #cos the format will always end with a | for col in ['gender','site_links']: allrecs[col] = allrecs[col].apply(split_column) def has(xxwiki): def has_xx(row): if isinstance(row['site_links'], list): return xxwiki in row['site_links'] else: return False return has_xx allrecs.head(20) def makedecades(b,e): for y in range(b, e, 10): yield y, y+10 def isfemale(x): if isinstance(x, list): return x[0] == 'Q6581072' else: return False def ismale(x): if isinstance(x, list): return x[0] == 'Q6581097' else: return False def nogender(x): if not isinstance(x, list): return True else: return False for xxwiki in WIKIS: has_wiki = has(xxwiki) recs = allrecs[allrecs.apply(has_wiki, axis=1)] for gender, gender_test in (('female', isfemale), ('male', ismale), ('nogender', nogender)): grecs = recs[recs['gender'].apply(gender_test)] for start_year, stop_year in makedecades(1930,1990): modrecs = grecs[(grecs['dob'] >= start_year) &(grecs['dob'] < stop_year)] #print len(modrecs), xxwiki, start_year filepath = 'helpers/inspection/{}_{}_{}.json'.format(xxwiki, start_year,gender) json.dump(list(modrecs['qid']), open(filepath,'w')) !scp $filepath wmflabs-tools:/home/maximilianklein/inspectionshortcut !mkdir helpers/inspection/expanded_descriptions !scp wmflabs-tools:/home/maximilianklein/inspectionshortcut/output/* helpers/inspection/expanded_descriptions/. description_files = !ls helpers/inspection/expanded_descriptions/ len(description_files) celebrity_dict = {'jawiki': [u'俳優', u'選手', u'歌手', u'ミュージシャン', u'モデル', u'アイドル'], 'zhwiki': [u'演員', u'運動員', u'歌手', u'音乐家', u'模特兒', u'偶像'], 'tlwiki': [u'artista', 'aktor', u'player', u'mang-aawit', u'musikero', u'modelo', u'idolo'], 'urwiki': [u'اردو', u'کھلاڑ', u'گلوکار' , u'موسیقار' , u'ماڈل', u'بت'], 'dewiki': [u'schauspieler' , u'spieler', u'Musiker', u'Sänger', u'Modell', u'Idol'], 'enwiki' :[u'actor', u'actress', u'player', u'singer', u'musician', u'model', u'idol'], 'kowiki' : [u'배우', u'선수', u'가수', u'음악가', u'모델', u'우상']} def intext(text, xxwiki): if text: text = text.encode('utf-8').lower() engwords = celebrity_dict['enwiki'] foreignwords = celebrity_dict[xxwiki] for word in engwords + foreignwords: if word.encode('utf-8').lower() in text: return True #if we get to this point its too late return False else: return False celebdf = pd.DataFrame(columns=['wiki','decade','gender','celeb_per']) for f in description_files: parts = f.split('_') xxwiki, decade, gender = parts[0], parts[1], parts[2] df = pd.DataFrame.from_dict(json.load(open('helpers/inspection/expanded_descriptions/{}'.format(f), 'r')), orient='index') df['celeb'] = df['text'].apply(lambda text: intext(text,xxwiki)) test_per = df['celeb'].sum()/float(len(df)) celebdf = celebdf.append({'wiki':xxwiki, 'decade':int(decade), 'gender':gender, 'celeb_per':test_per}, ignore_index=True) dummy_langs = pd.get_dummies(celebdf['wiki']) dummy_gender = pd.get_dummies(celebdf['gender']) dummy_gender = dummy_gender[['male','female','nogender']] print dummy_langs.head(2) print dummy_gender.head(2) catdf = celebdf[['celeb_per','decade']].join(dummy_langs.ix[:,'enwiki':]).join(dummy_gender.ix[:,'female':'female']) catdf['intercept'] = 1.0 catdf.corr() train_cols = catdf.columns[1:] logit = sm.Logit(catdf['celeb_per'], catdf[train_cols]) result= logit.fit() result.summary() subj_list = ['female','male','nogender'] fig, axes = plt.subplots(nrows = 1, ncols = len(subj_list), sharex='col', sharey='row') for ax, subj in zip(axes, subj_list): natlangdf = celebdf[celebdf['gender'] == subj] natlangpiv = pd.pivot_table(natlangdf, values='celeb_per', rows='decade', cols='wiki') natlangpiv = natlangpiv[['jawiki','zhwiki','kowiki','tlwiki','urwiki','dewiki','enwiki']] natlangpiv.columns = ['Japanese', 'Chinese', 'Korean', 'Tagalog', 'Urdu', 'German', 'English'] natlangpiv = natlangpiv * 100 heatmap = ax.pcolor(natlangpiv, cmap='Purples', vmin=0, vmax=100) ax.set_yticks(np.arange(0.5, len(natlangpiv.index), 1)) ax.set_yticklabels(map(int, natlangpiv.index)) ax.set_xticks(np.arange(0.5, len(natlangpiv.columns), 1)) ax.set_xticklabels(natlangpiv.columns, rotation=90) fig.suptitle('''Heatmap of Celebrity Biography %, By Decade of Birth versus Wikipedia Language by Gender''', fontsize=18) fig.set_size_inches(12,4,dpi=600) #fig.tight_layout() subj_titles = ['Female','Male','Not Recorded or Non-Binary'] metric_titles =['Decade'] cbar = plt.colorbar(mappable=heatmap, ax=ax, format="%.0f%%") for i in range(len(subj_titles)): axes[i].set_title(subj_titles[i]) fig.subplots_adjust(wspace=0.0, hspace=0.0, top=0.85) subplots_adjust actress_dict = {'jawiki': u'俳優', 'zhwiki': u'演員', 'tlwiki':u'artista', 'urwiki': u'اردو', 'dewiki': 'schauspieler' , 'enwiki' :'actress'} player_dict = {'jawiki': u'選手', 'zhwiki': u'運動員', 'tlwiki':u'player', 'urwiki': u'کھلاڑ', 'dewiki': 'spieler' , 'enwiki' :'player'} def multiword(xxwiki, prof_dict): def intext(text): if text: text = text.lower() eng = prof_dict['enwiki'] foreign = prof_dict[xxwiki] if eng in text or foreign in text: return True else: return False else: return False return intext celeb = defaultdict(dict) for xxwiki in ('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'enwiki', 'dewiki'): df = pd.DataFrame.from_dict(json.load(open('helpers/inspection/descriptions/{}_descriptions.json'.format(xxwiki), 'r')), orient='index') for prof, test in (('actress', multiword(xxwiki, actress_dict)), ('player',multiword(xxwiki, player_dict))): df[prof] = df['text'].apply(test) test_per = df[prof].sum()/float(len(df)) celeb[xxwiki][prof] = test_per for xxwiki in ('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'enwiki', 'dewiki'): df = pd.DataFrame.from_dict(json.load(open('helpers/inspection/descriptions/{}_descriptions.json'.format(xxwiki), 'r')), orient='index') df['text'] = df['text'].apply(lambda x: x.replace('\n',' ') if x else x) df.to_csv('helpers/inspection/readable/{}_modern_bios_for_inspection.csv'.format(xxwiki), encoding='utf-8') celebdf = pd.DataFrame.from_dict(celeb, orient='index') celebdf['either'] = celebdf['player'] + celebdf['actress'] celebdf.sort('either') celebdf['wiki'].convert_object(convert_dates=True) celebdf natlangpiv = pd.pivot_table(celebdf, values='celeb_per', rows='decade', cols='wiki') natlangpiv