#!/usr/bin/env python # coding: utf-8 # # Data Processing # Reproduce results of our [AAAI paper](http://www.cs.iit.edu/~culotta/pubs/culotta15predicting.pdf). # This notebook assumes the data is in place. You can get data either by running [data_collection.ipynb](https://github.com/tapilab/aaai-2015-demographics/blob/master/src/data_collection.ipynb) # or by running the next cell, which downloads it. # In[2]: # Download Twitter data from server if not already present. import os import urllib for fname in ['username2brand.pkl', 'brand2counts.pkl', 'id2brand.pkl', 'brands.json']: if not os.path.isfile('../data/' + fname): url = 'http://tapi.cs.iit.edu/data/aaai-2015-demographics/originals/' + fname print('downloading %s to %s' % (url, '../data/' + fname)) urllib.urlretrieve(url, "../data/" + fname) else: print(fname + 'already exists.') # In[3]: # Unpickle everything import pickle id2brand = pickle.load(open('../data/id2brand.pkl', 'rb')) brand2counts = pickle.load(open('../data/brand2counts.pkl', 'rb')) username2brand = pickle.load(open('../data/username2brand.pkl', 'rb')) # In[5]: import numpy as np # Plot descriptive stats of the data. import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') def plot_data_figs(): figure, axes = plt.subplots(2, 1, sharex=True) unique_friends = sorted([len(d.keys()) for d in brand2counts.values()], reverse=True) axes[0].plot(unique_friends) axes[0].set_xscale('log') axes[0].set_yscale('log') axes[0].set_title('number of unique neighbors', size=16) brcounts = sorted([sum(d.values()) for d in brand2counts.values()], reverse=True) print('total friend links:', sum(brcounts)) axes[1].plot(brcounts) axes[1].set_xscale('log') axes[1].set_yscale('log') axes[1].set_title('number of neighbor links', size=16) axes[1].set_xlim((0,1500)) axes[1].set_xlabel('rank', size=14) axes[1].set_ylabel(' ' * 30 + 'count', size=14) figure.tight_layout() plt.savefig('data.pdf', bbox_inches='tight') plot_data_figs() # In[31]: # Normalize data and create sparse matrix. import numpy as np from numpy import array as npa from sklearn.feature_extraction import DictVectorizer from sklearn.feature_extraction.text import TfidfTransformer from sklearn.preprocessing import scale brand_ids = npa(list(brand2counts.keys())) vec = DictVectorizer() X = vec.fit_transform(brand2counts.values()) print('The feature vector for one brand looks like this:\n%s' % str(X[0])) # In[32]: # Normalize by row. from sklearn.preprocessing import normalize print('%d total friend links' % X.sum()) X = normalize(X, norm='l1', axis=1) print('The normalized feature vector for one brand looks like this:\n%s' % str(X[0])) # In[33]: # Do cross-fold validation for different demographics. from scipy.stats import pearsonr from sklearn.cross_validation import KFold from sklearn.feature_selection import f_regression from sklearn.linear_model import ElasticNet, Lasso, MultiTaskElasticNet, MultiTaskElasticNetCV, Ridge, RidgeCV from sklearn.metrics import mean_squared_error feats = npa(vec.get_feature_names()) def plot_scatter(preds, truths, ylabels): for yi, ylabel in enumerate(ylabels): pr = [p[yi] for p in preds] tr = [t[yi] for t in truths] plt.figure() plt.scatter(tr, pr) plt.xlabel('truth') plt.ylabel('pred') corr = pearsonr(pr, tr) plt.title('%s r=%.2f (%.2g)' % (ylabel, corr[0], corr[1])) plt.show() def print_top_feats(m, feature_names, labels, n=10): for yi, ylabel in enumerate(labels): print('Top Coefficients for', ylabel) coef = m.coef_[yi] srted = np.argsort(coef) topi = srted[::-1][:n] boti = srted[:n] print('pos:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[topi], coef[topi]))) print('neg:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[boti], coef[boti]))) def get_yvalues(ylabels, demo): return npa([float(demo[yl][:-1]) for yl in ylabels]) def get_correlations(preds, truths, ylabels): results = [] for i, y in enumerate(ylabels): pr = [p[i] for p in preds] tr = [t[i] for t in truths] results.append(pearsonr(pr, tr)[0]) return results correlations = [] category_results = {} outputs = {'Education': ['No College', 'College', 'Grad School'], 'Children': ['No Kids', 'Has Kids'], 'Income': ['$0-50k', '$50-100k', '$100-150k', '$150k+'], 'Gender': ['Male', 'Female'], 'Age': ['18-24', '25-34', '35-44', '45-54', '55-64', '65+'], 'Ethnicity': ['Caucasian', 'Hispanic', 'African American', 'Asian']} def get_model(): # return Ridge(.1) # return ElasticNet(alpha=1e-5, l1_ratio=0.5) return MultiTaskElasticNet(alpha=1e-5, l1_ratio=0.5) # Labels grouped together for use by MultiTaskElasticNet. for category, ylabels in outputs.items(): indices = [i for i, bid in enumerate(brand_ids) if len(set(ylabels) & set(id2brand[bid]['demo'].keys())) == len(ylabels)] print('predicting', ylabels, 'for', len(indices), 'brands') y = npa([get_yvalues(ylabels, id2brand[brand_ids[bid]]['demo']) for bid in indices]) thisX = X[indices].toarray() cv = KFold(len(y), 5, shuffle=True, random_state=123456) preds = [] truths = [] for train, test in cv: m = get_model() m.fit(thisX[train], y[train]) pred = m.predict(thisX[test]) preds.extend(pred) truths.extend(y[test]) m = get_model() m.fit(thisX, y) category_results[category] = {'preds': preds, 'truths': truths, 'model': m} plot_scatter(preds, truths, ylabels) print_top_feats(m, feats, ylabels) correlations.append(np.mean(get_correlations(preds, truths, ylabels))) print('average correlation=', np.mean(correlations)) # In[34]: # Plot scatters. import math from matplotlib import lines def nrmsd(truths, preds): """ Normalized root mean squared deviation. """ return rmsd(truths, preds) / (max(truths) - min(truths)) def rmsd(truths, preds): """ Normalized root mean squared deviation. """ return math.sqrt(mean_squared_error(preds, truths)) def plot_scatter_subfig(axis, category, yidx): results = category_results[category] name = outputs[category][yidx] preds = [p[yidx] for p in results['preds']] truths = [p[yidx] for p in results['truths']] fit = np.polyfit(truths, preds, 1) fit_fn = np.poly1d(fit) axis.plot(truths, preds, 'o', truths, fit_fn(truths), 'k', linewidth=1.5, ms=2, markerfacecolor='None', markeredgecolor='b') axis.set_title('%s\n$r=%.2f$' % (name, pearsonr(preds, truths)[0]), size=14) axis.locator_params(nbins=4, tight=True) mean = np.mean(truths) start, end = axis.get_xlim() def make_scatters_fig(): figure, axes = plt.subplots(3, 7, figsize=(15,8)) # Row 1 plot_scatter_subfig(axes[0][0], 'Education', 0) plot_scatter_subfig(axes[0][1], 'Education', 1) plot_scatter_subfig(axes[0][2], 'Education', 2) plot_scatter_subfig(axes[0][3], 'Income', 0) plot_scatter_subfig(axes[0][4], 'Income', 1) plot_scatter_subfig(axes[0][5], 'Income', 2) plot_scatter_subfig(axes[0][6], 'Income', 3) # Row 2 for i in range(6): plot_scatter_subfig(axes[1][i], 'Age', i) # Row 3 for i in range(4): plot_scatter_subfig(axes[2][i], 'Ethnicity', i) plot_scatter_subfig(axes[2][4], 'Gender', 0) plot_scatter_subfig(axes[2][5], 'Children', 0) # Now add titles. axes[1, 6].axis('off') axes[2, 6].axis('off') axes[0, 1].text(.5, 1.35, 'Education', verticalalignment='bottom', horizontalalignment='center', color='black', fontsize=18, weight='bold', transform=axes[0, 1].transAxes) axes[0, 4].text(1.1, 1.35, 'Income', verticalalignment='bottom', horizontalalignment='center', color='black', fontsize=18, weight='bold', transform=axes[0, 4].transAxes) axes[1, 2].text(1.1, 1.3, 'Age', verticalalignment='bottom', horizontalalignment='center', color='black', fontsize=18, weight='bold', transform=axes[1, 2].transAxes) axes[2, 1].text(1.1, 1.32, 'Ethnicity', verticalalignment='bottom', horizontalalignment='center', color='black', fontsize=18, weight='bold', transform=axes[2, 1].transAxes) axes[2, 4].text(.5, 1.32, 'Gender', verticalalignment='bottom', horizontalalignment='center', color='black', fontsize=18, weight='bold', transform=axes[2, 4].transAxes) axes[2, 5].text(.5, 1.32, 'Family', verticalalignment='bottom', horizontalalignment='center', color='black', fontsize=18, weight='bold', transform=axes[2, 5].transAxes) axes[1][0].set_ylabel('Predicted Value (%)', size=18) plt.subplots_adjust(hspace=.7) plt.figtext(0.5,.08,"True Value (%)",fontdict={'fontsize':18}, verticalalignment='top', horizontalalignment='center') plt.savefig('scatters.pdf', bbox_inches='tight') make_scatters_fig() # In[ ]: # Print the top features. from collections import defaultdict from twutil import collect def get_top_user_ids(): id_list = [] top_user_ids = defaultdict(lambda: defaultdict(lambda: [])) for category in category_results: results = category_results[category] coef = results['model'].coef_ for yi, ylabel in enumerate(outputs[category]): topi = np.argsort(coef[yi])[::-1][:5] print(category, ylabel, ' '.join('%d' % x for x in feats[topi])) id_list.extend(feats[topi]) top_user_ids[category][ylabel] = feats[topi] return top_user_ids, id_list def get_top_user_names(): top_user_ids, id_list = get_top_user_ids() user_names = collect.lookup_handles(id_list) id2user = dict([(int(x[1]), x[0]) for x in user_names]) for category in top_user_ids: for label in top_user_ids[category]: top_user_ids[category][label] = [id2user[x] for x in top_user_ids[category][label] if x in id2user] return top_user_ids top_users = get_top_user_names() # In[16]: import re def list2row(mylist, fmt='%s'): return ' & '.join([fmt % i for i in mylist]) def verb(s, delim=';'): return '\\verb' + delim + s + delim def clean(s): return re.sub('_', '\\_', re.sub('\$', '\\$', s)) def make_user_table(top_users): outf = open('users.tex', 'wt') outf.write('\\begin{table*}[t]\n\\centering\n\\begin{tabular}{|c|c|l|}\n\\hline\n') outf.write(list2row(['{\\bf Category}', '{\\bf Value}', '{\\bf Top Accounts}']) + '\\\\\n\\hline\n') for ci, category in enumerate(outputs): for li, label in enumerate(outputs[category]): row = [''] * 3 row[0] = category if li == 0 else '' row[1] = clean(label) row[2] = ', '.join(clean(x) for x in top_users[category][label]) outf.write(list2row(row) + '\\\\\n') outf.write('\\hline\n') outf.write('\\end{tabular}\\caption{Accounts with the highest estimated coefficients for each category.\\label{tab.users}}\n\\end{table*}\n') make_user_table(top_users) # In[17]: get_ipython().system('cat users.tex') # **Comparison with supervised learning (logistic regression)** # # We manually labeled individual Twitter users with race/gender to compare accuracy of the model trained above. For comparison, we also train a supervised logistic regression classifier, which uses the same feature vector as our model. # # Because the labeled data contains personally identifiable information, we have elected not to share it publicly. Please contact the authors to discuss possible data sharing agreements. # In[39]: # Compute accuracy on users labeled by race. from collections import Counter import random from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, confusion_matrix, f1_score from sklearn.utils.extmath import safe_sparse_dot def train_demo_model(ylabels): indices = [i for i, bid in enumerate(brand_ids) if len(set(ylabels) & set(id2brand[bid]['demo'].keys())) == len(ylabels)] print('training race model on', len(indices), 'brands') y = npa([get_yvalues(ylabels, id2brand[brand_ids[bid]]['demo']) for bid in indices]) thisX = X[indices].toarray() m = get_model() m.fit(thisX, scale(y)) m.coef_ = m.coef_[0:3] return m def map_race_label(label): return ['white', 'latin', 'black', 'asian'].index(label) def read_labeled_data(fname, label_map_f): users = [] labels = [] friends = [] for line in open(fname): parts = line.strip().split() if len(parts) > 10: users.append(parts[0]) labels.append(label_map_f(parts[1])) friends.append(Counter([int(x) for x in parts[2:]])) X_race = vec.transform(friends) return users, npa(labels), X_race def label_by_reg(X_race, m): """ Scale coefficients per class to make them comparable; then keep only positive coefficients. """ coef = m.coef_ coef = scale(m.coef_, axis=0) # Scale by class label for i in range(len(coef)): topi = np.where(coef[i] > 0) topv = coef[i][topi] coef[i] = [0] * len(coef[i]) coef[i][topi] = topv pred = safe_sparse_dot(coef, X_race.T, dense_output=True).T return np.argmax(pred, axis=1) def label_by_clf(X_race, y_race, pct): clf = LogisticRegression() cv = KFold(len(y_race), 3, shuffle=True, random_state=123456) preds = np.zeros(len(y_race), int) for train, test in cv: train = random.sample(set(train), int(len(train) * pct)) clf.fit(X_race[train], y_race[train]) preds[test] = clf.predict(X_race[test]) return preds def eval_labeled(truth, pred, labels): label_idx = np.arange(len(labels)) acc, f1 = (accuracy_score(pred, truth), f1_score(truth, pred, labels=label_idx, average='macro', pos_label=None)) print('acc=', acc, 'f1=', f1) print(confusion_matrix(truth, pred)) return f1 def do_race_expt(): labels = ['Caucasian', 'Hispanic', 'African American', 'Asian'] users_race, y_race, X_race = read_labeled_data('../data/race.txt', map_race_label) X_race = X_race[np.where(y_race != 3)] y_race = y_race[np.where(y_race != 3)] print('X_race shape=', str(X_race.get_shape()), 'total matches=', X_race.sum()) labels = labels[0:3] reg = train_demo_model(labels) pred_reg = label_by_reg(X_race, reg) reg_f1 = eval_labeled(y_race, pred_reg, labels) clf_f1s = [] for pct in [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]: pred_clf = label_by_clf(X_race, y_race, pct) clf_f1s.append(eval_labeled(y_race, pred_clf, labels)) return reg_f1, clf_f1s race_results = do_race_expt() # In[40]: # Compute accuracy on data labeled by gender. def map_gender_label(label): return ['Male', 'Female'].index(label) def do_gender_expt(): labels = ['Male', 'Female'] users_gender, y_gender, X_gender = read_labeled_data('../data/gender.txt', map_gender_label) print('X_gender shape=', str(X_gender.get_shape()), 'total matches=', X_gender.sum()) reg = train_demo_model(labels) pred_reg = label_by_reg(X_gender, reg) reg_f1 = eval_labeled(y_gender, pred_reg, labels) clf_f1s = [] for pct in [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]: pred_clf = label_by_clf(X_gender, y_gender, pct) clf_f1s.append(eval_labeled(y_gender, pred_clf, labels)) return reg_f1, clf_f1s gender_results = do_gender_expt() # In[41]: def plot_labeled_results(reg_results, clf_results, xticks, axis, title): axis.plot(xticks, [reg_results] * len(clf_results), 'g--', label='regression', lw=3) axis.plot(xticks, clf_results, 'bo-', label='classification') axis.set_title(title, size=16) def make_labeled_plot(gender_results, race_results): xticks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] figure, axes = plt.subplots(2, 1, sharex=True) plot_labeled_results(gender_results[0], gender_results[1], xticks, axes[0], 'Gender') plot_labeled_results(race_results[0], race_results[1], xticks, axes[1], 'Ethnicity') axes[1].set_ylabel((' ' * 25) + 'Macro F1', size=16) axes[1].set_xlabel('% of labeled training data', size=16) axes[1].legend(loc='lower right') plt.savefig('labeled.pdf', bbox_inches='tight') make_labeled_plot(gender_results, race_results) # In[42]: # Plot F1 as the number of friends per user increases. import random def sample_friends(X, n): X_sample = X.copy() for i, xi in enumerate(X_sample): nnz = xi.getnnz() if n < nnz: nzcols = xi.nonzero()[1] indices = random.sample(range(nnz), nnz - n) X_sample[i, nzcols[indices]] = 0. X_sample.eliminate_zeros() return X_sample def _do_nfriends_expt(XX, y, m, labels): ys = [] stderrs = [] xs = [1,2,3,4,5,10,20,30,40,50] # range(1, 50)[::5] # [::50] for nfriends in xs: f1s = [] for sample in range(5): X_sample = sample_friends(XX, nfriends) pred_reg = label_by_reg(X_sample, m) reg_f1 = eval_labeled(y, pred_reg, labels) f1s.append(reg_f1) ys.append(np.mean(f1s)) stderrs.append(np.std(f1s) / math.sqrt(len(f1s))) return npa(xs), npa(ys), npa(stderrs) def do_nfriends_expt(): random.seed(1234) labels = ['Caucasian', 'Hispanic', 'African American', 'Asian'] users_race, y_race, X_race = read_labeled_data('../data/race.txt', map_race_label) labels = labels[:3] m = train_demo_model(labels) xs_r, ys_r, stderrs_r = _do_nfriends_expt(X_race, y_race, m, labels) labels = ['Male', 'Female'] users_race, y_gender, X_gender = read_labeled_data('../data/gender.txt', map_gender_label) m = train_demo_model(labels) xs_g, ys_g, stderrs_g = _do_nfriends_expt(X_gender, y_gender, m, labels) figure, axes = plt.subplots(2, 1, sharex=True) axes[0].plot(xs_g, ys_g, 'bo-', ms=3) axes[0].fill_between(xs_g, ys_g - stderrs_g, ys_g + stderrs_g, alpha=0.4, facecolor='b') axes[0].set_title('Gender', size=16) axes[1].plot(xs_r, ys_r, 'bo-', ms=3) axes[1].fill_between(xs_r, ys_r - stderrs_r, ys_r + stderrs_r, alpha=0.4, facecolor='b') axes[1].set_title('Ethnicity', size=16) axes[1].set_xlabel('# of friends per user', size=16) axes[1].set_ylabel((' ' * 25) + 'Macro F1', size=16) axes[1].legend(loc='lower right') plt.savefig('friends.pdf', bbox_inches='tight') do_nfriends_expt()