#!/usr/bin/env python
# coding: utf-8

# #  Data Processing

# Reproduce results of our [AAAI paper](http://www.cs.iit.edu/~culotta/pubs/culotta15predicting.pdf).

# This notebook assumes the data is in place. You can get data either by running [data_collection.ipynb](https://github.com/tapilab/aaai-2015-demographics/blob/master/src/data_collection.ipynb)
# or by running the next cell, which downloads it.

# In[2]:


# Download Twitter data from server if not already present.
import os
import urllib

for fname in ['username2brand.pkl', 'brand2counts.pkl', 'id2brand.pkl', 'brands.json']:
    if not os.path.isfile('../data/' + fname):
        url = 'http://tapi.cs.iit.edu/data/aaai-2015-demographics/originals/' + fname
        print('downloading %s to %s' % (url, '../data/' + fname))
        urllib.urlretrieve(url, "../data/" + fname)
    else:
        print(fname + 'already exists.')


# In[3]:


# Unpickle everything
import pickle
id2brand = pickle.load(open('../data/id2brand.pkl', 'rb'))
brand2counts = pickle.load(open('../data/brand2counts.pkl', 'rb'))
username2brand = pickle.load(open('../data/username2brand.pkl', 'rb'))


# In[5]:


import numpy as np

# Plot descriptive stats of the data.
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

def plot_data_figs():
    figure, axes = plt.subplots(2, 1, sharex=True)
    unique_friends = sorted([len(d.keys()) for d in brand2counts.values()], reverse=True)
    axes[0].plot(unique_friends)
    axes[0].set_xscale('log')
    axes[0].set_yscale('log')
    axes[0].set_title('number of unique neighbors', size=16)

    brcounts = sorted([sum(d.values()) for d in brand2counts.values()], reverse=True)
    print('total friend links:', sum(brcounts))
    axes[1].plot(brcounts)
    axes[1].set_xscale('log')
    axes[1].set_yscale('log')
    axes[1].set_title('number of neighbor links', size=16)
    axes[1].set_xlim((0,1500))
    axes[1].set_xlabel('rank', size=14)
    axes[1].set_ylabel(' ' * 30 + 'count', size=14)
    figure.tight_layout()
    plt.savefig('data.pdf', bbox_inches='tight')

plot_data_figs()


# In[31]:


# Normalize data and create sparse matrix.
import numpy as np
from numpy import array as npa
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import scale

brand_ids = npa(list(brand2counts.keys()))
vec = DictVectorizer()
X = vec.fit_transform(brand2counts.values())
print('The feature vector for one brand looks like this:\n%s' % str(X[0]))


# In[32]:


# Normalize by row.
from sklearn.preprocessing import normalize
print('%d total friend links' % X.sum())
X = normalize(X, norm='l1', axis=1)
print('The normalized feature vector for one brand looks like this:\n%s' % str(X[0]))


# In[33]:


# Do cross-fold validation for different demographics.
from scipy.stats import pearsonr
from sklearn.cross_validation import KFold
from sklearn.feature_selection import f_regression
from sklearn.linear_model import ElasticNet, Lasso, MultiTaskElasticNet, MultiTaskElasticNetCV, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error

feats = npa(vec.get_feature_names())

def plot_scatter(preds, truths, ylabels):
    for yi, ylabel in enumerate(ylabels):
        pr = [p[yi] for p in preds]
        tr = [t[yi] for t in truths]
        plt.figure()
        plt.scatter(tr, pr)
        plt.xlabel('truth')
        plt.ylabel('pred')
        corr = pearsonr(pr, tr)
        plt.title('%s r=%.2f (%.2g)' % (ylabel, corr[0], corr[1]))
        plt.show()
    
def print_top_feats(m, feature_names, labels, n=10):
    for yi, ylabel in enumerate(labels):
        print('Top Coefficients for', ylabel)
        coef = m.coef_[yi]
        srted = np.argsort(coef)
        topi = srted[::-1][:n]
        boti = srted[:n]
        print('pos:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[topi], coef[topi])))
        print('neg:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[boti], coef[boti])))

def get_yvalues(ylabels, demo):
    return npa([float(demo[yl][:-1]) for yl in ylabels])

def get_correlations(preds, truths, ylabels):
    results = []
    for i, y in enumerate(ylabels):
        pr = [p[i] for p in preds]
        tr = [t[i] for t in truths]
        results.append(pearsonr(pr, tr)[0])
    return results

correlations = []
category_results = {}
outputs = {'Education': ['No College', 'College', 'Grad School'],
           'Children': ['No Kids', 'Has Kids'],
           'Income': ['$0-50k', '$50-100k', '$100-150k', '$150k+'],
           'Gender': ['Male', 'Female'],
           'Age': ['18-24', '25-34', '35-44', '45-54', '55-64', '65+'],
           'Ethnicity': ['Caucasian', 'Hispanic', 'African American', 'Asian']}

def get_model():
    # return Ridge(.1)
    # return ElasticNet(alpha=1e-5, l1_ratio=0.5)
    return MultiTaskElasticNet(alpha=1e-5, l1_ratio=0.5)
    
# Labels grouped together for use by MultiTaskElasticNet.
for category, ylabels in outputs.items():
    indices = [i for i, bid in enumerate(brand_ids) if len(set(ylabels) & set(id2brand[bid]['demo'].keys())) == len(ylabels)]
    print('predicting', ylabels, 'for', len(indices), 'brands')
    y = npa([get_yvalues(ylabels, id2brand[brand_ids[bid]]['demo']) for bid in indices])
    thisX = X[indices].toarray()
    cv = KFold(len(y), 5, shuffle=True, random_state=123456)
    preds = []
    truths = []
    for train, test in cv:
        m = get_model()
        m.fit(thisX[train], y[train])
        pred = m.predict(thisX[test])
        preds.extend(pred)
        truths.extend(y[test])
    m = get_model()
    m.fit(thisX, y)
    category_results[category] = {'preds': preds, 'truths': truths, 'model': m}
    plot_scatter(preds, truths, ylabels)
    print_top_feats(m, feats, ylabels)
    correlations.append(np.mean(get_correlations(preds, truths, ylabels)))
print('average correlation=', np.mean(correlations))


# In[34]:


# Plot scatters.
import math
from matplotlib import lines

def nrmsd(truths, preds):
    """ Normalized root mean squared deviation. """
    return rmsd(truths, preds) / (max(truths) - min(truths))

def rmsd(truths, preds):
    """ Normalized root mean squared deviation. """
    return math.sqrt(mean_squared_error(preds, truths))

def plot_scatter_subfig(axis, category, yidx):
    results = category_results[category]
    name = outputs[category][yidx]
    preds = [p[yidx] for p in results['preds']]
    truths = [p[yidx] for p in results['truths']]

    fit = np.polyfit(truths, preds, 1)
    fit_fn = np.poly1d(fit)
    axis.plot(truths, preds, 'o', truths, fit_fn(truths), 'k', linewidth=1.5,
              ms=2, markerfacecolor='None', markeredgecolor='b')
    axis.set_title('%s\n$r=%.2f$' % (name, pearsonr(preds, truths)[0]), size=14)
    axis.locator_params(nbins=4, tight=True) 
    mean = np.mean(truths)
    start, end = axis.get_xlim()
 
def make_scatters_fig():
    figure, axes = plt.subplots(3, 7, figsize=(15,8))
        # Row 1
    plot_scatter_subfig(axes[0][0], 'Education', 0)
    plot_scatter_subfig(axes[0][1], 'Education', 1)
    plot_scatter_subfig(axes[0][2], 'Education', 2)
    plot_scatter_subfig(axes[0][3], 'Income', 0)
    plot_scatter_subfig(axes[0][4], 'Income', 1)
    plot_scatter_subfig(axes[0][5], 'Income', 2)
    plot_scatter_subfig(axes[0][6], 'Income', 3)
    # Row 2
    for i in range(6):
        plot_scatter_subfig(axes[1][i], 'Age', i)        
    # Row 3
    for i in range(4):
        plot_scatter_subfig(axes[2][i], 'Ethnicity', i)        
    plot_scatter_subfig(axes[2][4], 'Gender', 0)        
    plot_scatter_subfig(axes[2][5], 'Children', 0)    
    # Now add titles.
    axes[1, 6].axis('off')
    axes[2, 6].axis('off')
    axes[0, 1].text(.5, 1.35, 'Education',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[0, 1].transAxes)
    axes[0, 4].text(1.1, 1.35, 'Income',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[0, 4].transAxes)
    axes[1, 2].text(1.1, 1.3, 'Age',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[1, 2].transAxes)
    axes[2, 1].text(1.1, 1.32, 'Ethnicity',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[2, 1].transAxes)
    axes[2, 4].text(.5, 1.32, 'Gender',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[2, 4].transAxes)
    axes[2, 5].text(.5, 1.32, 'Family',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[2, 5].transAxes)

    axes[1][0].set_ylabel('Predicted Value (%)', size=18)
    plt.subplots_adjust(hspace=.7)   
    plt.figtext(0.5,.08,"True Value (%)",fontdict={'fontsize':18}, verticalalignment='top', horizontalalignment='center')
    plt.savefig('scatters.pdf', bbox_inches='tight')

make_scatters_fig()


# In[ ]:


# Print the top features.
from collections import defaultdict
from twutil import collect

def get_top_user_ids():
    id_list = []
    top_user_ids = defaultdict(lambda: defaultdict(lambda: []))
    for category in category_results:
        results = category_results[category]
        coef = results['model'].coef_
        for yi, ylabel in enumerate(outputs[category]):
            topi = np.argsort(coef[yi])[::-1][:5]
            print(category, ylabel, ' '.join('%d' % x for x in feats[topi]))
            id_list.extend(feats[topi])
            top_user_ids[category][ylabel] = feats[topi]
    return top_user_ids, id_list

def get_top_user_names():
    top_user_ids, id_list = get_top_user_ids()
    user_names = collect.lookup_handles(id_list)
    id2user = dict([(int(x[1]), x[0]) for x in user_names])
    for category in top_user_ids:
        for label in top_user_ids[category]:
            top_user_ids[category][label] = [id2user[x] for x in top_user_ids[category][label] if x in id2user]
    return top_user_ids

top_users = get_top_user_names()


# In[16]:


import re

def list2row(mylist, fmt='%s'):
    return ' & '.join([fmt % i for i in mylist])

def verb(s, delim=';'):
    return '\\verb' + delim + s + delim

def clean(s):
    return re.sub('_', '\\_', re.sub('\$', '\\$', s))

def make_user_table(top_users):
    outf = open('users.tex', 'wt')
    outf.write('\\begin{table*}[t]\n\\centering\n\\begin{tabular}{|c|c|l|}\n\\hline\n')
    outf.write(list2row(['{\\bf Category}', '{\\bf Value}', '{\\bf Top Accounts}']) +
                   '\\\\\n\\hline\n')
    for ci, category in enumerate(outputs):
        for li, label in enumerate(outputs[category]):
            row = [''] * 3
            row[0] = category if li == 0 else ''
            row[1] = clean(label)
            row[2] = ', '.join(clean(x) for x in top_users[category][label])
            outf.write(list2row(row) + '\\\\\n')
        outf.write('\\hline\n')
    outf.write('\\end{tabular}\\caption{Accounts with the highest estimated coefficients for each category.\\label{tab.users}}\n\\end{table*}\n')

make_user_table(top_users)
    

# In[17]:


get_ipython().system('cat users.tex')


# **Comparison with supervised learning (logistic regression)**
# 
# We manually labeled individual Twitter users with race/gender to compare accuracy of the model trained above. For comparison, we also train a supervised logistic regression classifier, which uses the same feature vector as our model.
# 
# Because the labeled data contains personally identifiable information, we have elected not to share it publicly. Please contact the authors to discuss possible data sharing agreements.

# In[39]:


# Compute accuracy on users labeled by race.
from collections import Counter
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.utils.extmath import safe_sparse_dot

def train_demo_model(ylabels):
    indices = [i for i, bid in enumerate(brand_ids) if len(set(ylabels) & set(id2brand[bid]['demo'].keys())) == len(ylabels)]
    print('training race model on', len(indices), 'brands')
    y = npa([get_yvalues(ylabels, id2brand[brand_ids[bid]]['demo']) for bid in indices])
    thisX = X[indices].toarray()
    m = get_model()
    m.fit(thisX, scale(y))
    m.coef_ = m.coef_[0:3]
    return m

def map_race_label(label):
    return ['white', 'latin', 'black', 'asian'].index(label)
    
def read_labeled_data(fname, label_map_f):
    users = []
    labels = []
    friends = []
    for line in open(fname):
        parts = line.strip().split()
        if len(parts) > 10:
            users.append(parts[0])
            labels.append(label_map_f(parts[1]))
            friends.append(Counter([int(x) for x in parts[2:]]))
    X_race = vec.transform(friends)
    return users, npa(labels), X_race

def label_by_reg(X_race, m):
    """ Scale coefficients per class to make them comparable;
    then keep only positive coefficients. """
    coef = m.coef_
    coef = scale(m.coef_, axis=0)  # Scale by class label
    for i in range(len(coef)):
        topi = np.where(coef[i] > 0)
        topv = coef[i][topi]
        coef[i] = [0] * len(coef[i])
        coef[i][topi] = topv
    pred = safe_sparse_dot(coef, X_race.T, dense_output=True).T
    return np.argmax(pred, axis=1)

def label_by_clf(X_race, y_race, pct):
    clf = LogisticRegression()
    cv = KFold(len(y_race), 3, shuffle=True, random_state=123456)
    preds = np.zeros(len(y_race), int)
    for train, test in cv:
        train = random.sample(set(train), int(len(train) * pct))
        clf.fit(X_race[train], y_race[train])
        preds[test] = clf.predict(X_race[test])
    return preds

def eval_labeled(truth, pred, labels):
    label_idx = np.arange(len(labels))
    acc, f1 = (accuracy_score(pred, truth),
                       f1_score(truth, pred, labels=label_idx,
                                average='macro', pos_label=None))
    print('acc=', acc, 'f1=', f1)
    print(confusion_matrix(truth, pred))
    return f1

    
def do_race_expt():
    labels = ['Caucasian', 'Hispanic', 'African American', 'Asian']
    users_race, y_race, X_race = read_labeled_data('../data/race.txt', map_race_label)
    X_race = X_race[np.where(y_race != 3)]
    y_race = y_race[np.where(y_race != 3)]
    print('X_race shape=', str(X_race.get_shape()), 'total matches=', X_race.sum())
    labels = labels[0:3]
    reg = train_demo_model(labels)
    pred_reg = label_by_reg(X_race, reg)
    reg_f1 = eval_labeled(y_race, pred_reg, labels)
    clf_f1s = []
    for pct in [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]:
        pred_clf = label_by_clf(X_race, y_race, pct)
        clf_f1s.append(eval_labeled(y_race, pred_clf, labels))
    return reg_f1, clf_f1s

race_results = do_race_expt()


# In[40]:


# Compute accuracy on data labeled by gender.
def map_gender_label(label):
    return ['Male', 'Female'].index(label)

def do_gender_expt():
    labels = ['Male', 'Female']
    users_gender, y_gender, X_gender = read_labeled_data('../data/gender.txt', map_gender_label)
    print('X_gender shape=', str(X_gender.get_shape()), 'total matches=', X_gender.sum())
    
    reg = train_demo_model(labels)
    pred_reg = label_by_reg(X_gender, reg)
    reg_f1 = eval_labeled(y_gender, pred_reg, labels)
    clf_f1s = []
    for pct in [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]:
        pred_clf = label_by_clf(X_gender, y_gender, pct)
        clf_f1s.append(eval_labeled(y_gender, pred_clf, labels))
    return reg_f1, clf_f1s
    
gender_results = do_gender_expt()


# In[41]:


def plot_labeled_results(reg_results, clf_results, xticks, axis, title):
    axis.plot(xticks, [reg_results] * len(clf_results), 'g--', label='regression', lw=3)
    axis.plot(xticks, clf_results, 'bo-', label='classification')
    axis.set_title(title, size=16)
    
def make_labeled_plot(gender_results, race_results):
    xticks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    figure, axes = plt.subplots(2, 1, sharex=True)
    plot_labeled_results(gender_results[0], gender_results[1], xticks, axes[0], 'Gender')
    plot_labeled_results(race_results[0], race_results[1], xticks, axes[1], 'Ethnicity')
    axes[1].set_ylabel((' ' * 25) + 'Macro F1', size=16)
    axes[1].set_xlabel('% of labeled training data', size=16)
    axes[1].legend(loc='lower right')
    plt.savefig('labeled.pdf', bbox_inches='tight')

    
make_labeled_plot(gender_results, race_results)


# In[42]:


# Plot F1 as the number of friends per user increases.
import random

def sample_friends(X, n):
    X_sample = X.copy()
    for i, xi in enumerate(X_sample):
        nnz = xi.getnnz()
        if n < nnz:
            nzcols = xi.nonzero()[1]
            indices = random.sample(range(nnz), nnz - n)
            X_sample[i, nzcols[indices]] = 0.
            X_sample.eliminate_zeros()
    return X_sample

def _do_nfriends_expt(XX, y, m, labels):
    ys = []
    stderrs = []
    xs = [1,2,3,4,5,10,20,30,40,50]  # range(1, 50)[::5]  # [::50]
    for nfriends in xs:
        f1s = []
        for sample in range(5):
            X_sample = sample_friends(XX, nfriends)
            pred_reg = label_by_reg(X_sample, m)
            reg_f1 = eval_labeled(y, pred_reg, labels)
            f1s.append(reg_f1)
        ys.append(np.mean(f1s))
        stderrs.append(np.std(f1s) / math.sqrt(len(f1s)))
    return npa(xs), npa(ys), npa(stderrs)

def do_nfriends_expt():
    random.seed(1234)
    labels = ['Caucasian', 'Hispanic', 'African American', 'Asian']
    users_race, y_race, X_race = read_labeled_data('../data/race.txt', map_race_label)
    labels = labels[:3]
    m = train_demo_model(labels)
    xs_r, ys_r, stderrs_r = _do_nfriends_expt(X_race, y_race, m, labels)

    labels = ['Male', 'Female']
    users_race, y_gender, X_gender = read_labeled_data('../data/gender.txt', map_gender_label)
    m = train_demo_model(labels)
    xs_g, ys_g, stderrs_g = _do_nfriends_expt(X_gender, y_gender, m, labels)
    
    figure, axes = plt.subplots(2, 1, sharex=True)
    axes[0].plot(xs_g, ys_g, 'bo-', ms=3)
    axes[0].fill_between(xs_g, ys_g - stderrs_g, ys_g + stderrs_g, alpha=0.4, facecolor='b')
    axes[0].set_title('Gender', size=16)
    axes[1].plot(xs_r, ys_r, 'bo-', ms=3)
    axes[1].fill_between(xs_r, ys_r - stderrs_r, ys_r + stderrs_r, alpha=0.4, facecolor='b')
    axes[1].set_title('Ethnicity', size=16)
    
    axes[1].set_xlabel('# of friends per user', size=16)
    axes[1].set_ylabel((' ' * 25) + 'Macro F1', size=16)
    axes[1].legend(loc='lower right')
    plt.savefig('friends.pdf', bbox_inches='tight')

do_nfriends_expt()