# Classify users as male or female based on first names based on Census name frequency.
from collections import defaultdict
import re
import requests

def names2dict(url):
    """ Fetch data from census and parse into dict mapping name to frequency. """
    names = defaultdict(lambda: 0)
    for line in requests.get(url).text.split('\n'):
        parts = line.lower().split()
        if len(parts) >= 2:
            names[parts[0]] = float(parts[1])
    return names

def getCensusNames():
    """ Fetch census name data and remove ambiguous names. """
    males = names2dict('http://www.census.gov/genealogy/www/data/1990surnames/dist.male.first')
    females = names2dict('http://www.census.gov/genealogy/www/data/1990surnames/dist.female.first')
    print len(set(males.keys() + females.keys())), 'total names'
    eps = 10.  # keep names that are eps times more frequent in one gender than the other.
    tokeep = []
    for name in set(males.keys() + females.keys()):
        mscore = males[name]
        fscore = females[name]
        if mscore == 0 or fscore == 0 or mscore / fscore > eps or fscore / mscore > eps:
            tokeep.append(name)
    print 'keeping', len(tokeep)
    m = set([n for n in tokeep if males[n] > females[n]])
    f = set([n for n in tokeep if females[n] > males[n]])
    return m, f

males, females = getCensusNames()
print 'found', len(males), 'males and', len(females), 'females'

def labelGender(tweet, males, females):
    """ Classify a tweet as male (m) female (f) or neutral (n) based on first token in name field. """
    name = tweet['user']['name'].lower().split()
    if len(name) == 0:
        name = ['']
    name = re.findall('\w+', name[0])
    if len(name) == 0:
        name = ''
    else:
        name = name[0]
    if name in males:
        tweet['user']['gender'] = 'm'
    elif name in females:
        tweet['user']['gender'] = 'f'
    else:
        tweet['user']['gender'] = 'n'
    return tweet

print 'John is', labelGender({'user':{'name':'John smith'}}, males, females)['user']['gender']
print 'Jane is', labelGender({'user':{'name':'jane doe'}}, males, females)['user']['gender']
print 'kris is', labelGender({'user':{'name':'Kris doe'}}, males, females)['user']['gender']

def labelRace(tweet):
    
    desc = tweet['user']['description']
    if not desc:
        desc = ''
    toks = set(re.findall('\w+', desc.lower()))
    
    if len(set(['african', 'black', 'aa', 'sbm', 'sbf']) & toks) > 0:
        tweet['user']['race'] = 'b'
        print 'black:', desc
    elif len(set(['latin', 'latino', 'latina', 'hispanic']) & toks) > 0:
        tweet['user']['race'] = 'l'
        print 'latino:', desc
    else:
        tweet['user']['race'] = 'n'
    return tweet

print 'John is', labelRace({'user':{'description':'african-american'}})['user']['race']
print 'Jane is', labelRace({'user':{'description':'black man'}})['user']['race']
print 'kris is', labelRace({'user':{'description':'a latino'}})['user']['race']
print 'foo is', labelRace({'user':{'description':'blah'}})['user']['race']

%pylab inline
# Train a race classifier based on description field.
import glob
import io
import json
import os
import re

import numpy as np

from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.preprocessing import label_binarize, scale

from twutil import preprocess

def print_top_feats_cl(m, feature_names, labels, n=10):
    for i, coef in enumerate(m.coef_):
        srted = np.argsort(coef)
        topi = srted[::-1][:n]
        boti = srted[:n]
        print 'label=', labels[i]
        print 'pos:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[topi], coef[topi]))
        print 'neg:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[boti], coef[boti]))

def prec_rec(truths_bin, probs, races):
    truths_bin = np.array(truths_bin)
    probs = np.array(probs)
    precision = dict()
    recall = dict()
    for i in range(len(races)):
        precision[i], recall[i], _ = precision_recall_curve(truths_bin[:, i],
                                                            probs[:, i])
        
    # Plot Precision-Recall curve for each class
    for i in range(len(races)):
        plot(recall[i], precision[i], label=races[i])
    xlabel('Recall')
    ylabel('Precision')
    legend(loc="lower right")
    
def prec_rec_afro_hisp(truths_bin, probs, races):
    truths_bin = np.array(truths_bin)
    probs = np.array(probs)
    truths_ah = np.array(truths_bin[:, 0] + truths_bin[:, 1])
    probs_ah = np.array([max(i, j) for i, j in zip(probs[:, 0], probs[:, 1])])
    precision, recall, _ = precision_recall_curve(truths_ah,  probs_ah)
    plot(recall, precision)
    xlabel('Recall')
    ylabel('Precision')
    legend(loc="lower right")
    plt.savefig('race.pdf', bbox_inches='tight')
    
def read_race_data(race_path, tzer, races):
    tokens = []
    labels = []
    for fname in glob.glob(race_path + '/*.*'):
        label = os.path.splitext(os.path.basename(fname))[0]
        for line in io.open(fname, encoding='utf8'):
            js = json.loads(line)
            toks = tzer.do_tokenize(js['description'])
            if len(toks) > 0:
                labels.append(races.index(label))
                tokens.append(' '.join(toks))
    return tokens, labels

races = ['black', 'latin', 'white']

def train_race_clf():
    print 'training race classifier'
    tzer = preprocess.Tokenizer(fields='description', retain_punc_toks=False, collapse_digits=True)
    tokens, labels = read_race_data('/data/twcounty/labeled_users/race', tzer, races)
    vec = TfidfVectorizer(token_pattern='\S+', min_df=2, binary=True)
    X = vec.fit_transform(tokens)    
    y = np.array(labels)
    print 'vocab size=', len(vec.get_feature_names())

    cv = KFold(len(y), 10, shuffle=True, random_state=123456)
    m = LogisticRegression(class_weight='auto')
    preds = []
    truths = []
    probs = [] 
    y_bin = label_binarize(y, classes=range(len(races)))
    truths_bin = []
    for train, test in cv:
        m.fit(X[train], y[train])
        preds.extend(m.predict(X[test]))
        truths.extend(y[test])
        probs.extend(m.predict_proba(X[test]))
        truths_bin.extend(y_bin[test])
        
    print classification_report(truths, preds, target_names=races)
    m.fit(X, y)
    m.tzer = tzer
    m.vec = vec
    print_top_feats_cl(m, np.array(vec.get_feature_names()), races, 5)
    # What is precision of classifications with probability > .5?
    threshold = 0.5
    y_filtered = [i for i, pred in enumerate(preds) if probs[i][pred] > threshold]
    print len(y_filtered), 'retained of', len(preds), 'total using threshold', threshold
    print classification_report(np.array(truths)[y_filtered], np.array(preds)[y_filtered], target_names=races)
    prec_rec_afro_hisp(truths_bin, probs, races)
    return m
    
    
def label_race_clf(tweet, clf):
    desc = tweet['user']['description']
    if not desc:
        desc = ''
    preds = clf.predict_proba(clf.vec.transform([' '.join(clf.tzer.do_tokenize(desc))]))[0]
    if max(preds) > 0.5:
        # print 'predicting', races[np.argmax(preds)], 'for', tweet['user']['screen_name'], desc
        tweet['user']['race'] = races[np.argmax(preds)]
    else:
        tweet['user']['race'] = 'n'
    return tweet

race_clf = train_race_clf()
print 'John is', label_race_clf({'user':{'description':'r.i.p.', 'screen_name':'john'}}, race_clf)['user']['race']
print 'Jane is', label_race_clf({'user':{'description':'que', 'screen_name':'jane'}}, race_clf)['user']['race']
print 'Joe is', label_race_clf({'user':{'description':'like', 'screen_name':'joe'}}, race_clf)['user']['race']
print 'Jesse is', label_race_clf({'user':{'description':'asdf', 'screen_name':'jesse'}}, race_clf)['user']['race']

# Read in Twitter json files, classifying each by gender and/or race.
from collections import Counter
import io
import glob
import json
import os
import re
import sys
import tok.unicode_props

import tweet
import json2tsv

JSON_D='/data/twcounty/json/'  # Directory containing one json file per county, named according to FIPS.
    
def iter_jsons(f):
    """ Iterate tweet json objects from a file, appending county information from file name."""
    fp = io.open(f, mode='rt', encoding='utf8')
    county = os.path.basename(f)
    count = 0    
    for line in fp:
        try:
            line = re.sub('\t', '     ', line)
            js = json.loads(line)
            if json2tsv.valid_line(js):
                js['county'] = county
                yield js
                count += 1
                if count >= 10000:  # FIXME: limiting to first 1000 users per county for testing.
                    return
        except:
            pass

def iter_files(path):
    """ Iterate tweets in directory of json files. """
    fcount = 1
    for fname in glob.glob(path + '/*'):
        print 'parsing', fname, '(#', fcount, ')'
        fcount += 1
        for js in iter_jsons(fname):
            yield js

def iter_gender(jss):
    for js in jss:
        labelGender(js, males, females)
        yield js

def iter_race(jss, race_clf):
    for js in jss:
        label_race_clf(js, race_clf)
        yield js

def iter_tokenize(jss):
    all_users = set()
    for js in jss:
        if not js['user']['description']:
            js['user']['description'] = ' '
        tw = tweet.Tweet(js['county'], js['id'], js['user']['screen_name'],
                            json2tsv.tokenize(js['text'], tokenizer),
                            json2tsv.tokenize(js['user']['description'], tokenizer))
        tw.gender = js['user']['gender']
        tw.race = js['user']['race']
        if tw.screen_name not in all_users:
            county2gender[js['county']][tw.gender] += 1
            county2race[js['county']][tw.race] += 1
            all_users.add(tw.screen_name)
        gender_counts.update([tw.gender])
        race_counts.update([tw.race])
        yield tw


def do_featurize(feats, words, prefix, alpha, unigrams=True, liwc=False, perma=False):
    lexi = tsv2feats.lexicons(words, liwc, perma)
    if unigrams:
        lexi.extend(words)
    for t in lexi:
        feats[alpha[prefix + t]] += 1
    return feats


# def featurize(tw, alpha, unigrams, liwc, perma):
#     feats = defaultdict(lambda: 0)
#    if tw.text:
#        do_featurize(feats, tw.text.split(' '), '', alpha, unigrams, liwc, perma)
#        if tw.gender != 'n':
#            do_featurize(feats, tw.text.split(' '), tw.gender + "_", alpha, unigrams, liwc, perma)
#    if tw.description:
#        do_featurize(feats, tw.description.split(' '), 'd=', alpha, unigrams, liwc, perma)
#        if tw.gender != 'n':
#            do_featurize(feats, tw.description.split(' '), tw.gender + '_d=', alpha, unigrams, liwc, perma)
#    return feats

def featurize(tw, alpha, unigrams, liwc, perma):
    """
    >>> alpha = defaultdict(lambda: len(alpha))
    >>> feats = featurize(tweet.Tweet('cty1', '123', 'joe', 'i', 'abject'), alpha, True, True, True)
    >>> all(feats[alpha[word]] == 1 for word in [u'd=P-', u'Pronoun', u'I', u'Self', 'i', 'd=abject'])
    True
    """
    feats = defaultdict(lambda: 0)
    if tw.text:
        toks = tw.text.split(' ')
        do_featurize(feats, toks, '', alpha, unigrams, liwc, perma)
        if tw.race != 'n':
            do_featurize(feats, toks, 'race=' + tw.race + "_", alpha, unigrams, liwc, perma)
        if tw.gender != 'n':
            do_featurize(feats, toks, 'gender=' + tw.gender + "_", alpha, unigrams, liwc, perma)
    if tw.description:
        toks = tw.description.split(' ')
        do_featurize(feats, toks, 'd=', alpha, unigrams, liwc, perma)
        if tw.gender != 'n':
            do_featurize(feats, toks, 'gender=' + tw.gender + '_d=', alpha, unigrams, liwc, perma)
        if tw.race != 'n':
            do_featurize(feats, toks, 'race=' + tw.race + '_d=', alpha, unigrams, liwc, perma)
    return feats

def read_tweets(tweets, user_norm=False, unigrams=True, liwc=False, perma=False):
    alpha = defaultdict(lambda: len(alpha))
    # county -> tweets
    tweets_by_county = defaultdict(lambda: defaultdict(lambda: 0))
    users_by_county = defaultdict(lambda: set())
    tweets_by_user = defaultdict(lambda: defaultdict(lambda: set()))
    i = 0
    for tw in tweets:
        users_by_county[tw.county].add(tw.screen_name)
        feats = featurize(tw, alpha, unigrams, liwc, perma)
        for k, v in feats.iteritems():
            if user_norm:
                if k not in tweets_by_user[tw.county][tw.screen_name]:
                    tweets_by_county[tw.county][k] += 1
                    tweets_by_user[tw.county][tw.screen_name].add(k)
            else:
                tweets_by_county[tw.county][k] += v
        i += 1
    return tsv2feats.default_dict_to_dict(tweets_by_county), users_by_county, alpha

# READ DATA
# Parse all json files. This takes a while.
import data
import expt
import tsv2feats

tokenizer = tok.unicode_props.UP_Tiny(1)

gender_counts = Counter()
race_counts = Counter()
county2race = defaultdict(lambda: {'black': 0., 'latin': 0., 'white':0., 'n': 0.})
county2gender = defaultdict(lambda: {'m': 0., 'f': 0., 'n':0.})
race_clf = train_race_clf()

features_og, users_by_county_og, alpha_og = tsv2feats.read_tweets(iter_tokenize(iter_race(iter_gender(iter_files(JSON_D)), race_clf)),
                                                                               user_norm=True, unigrams=False, liwc=True, perma=True)
features_og_gender, users_by_county_og_gender, alpha_og_gender = read_tweets(iter_tokenize(iter_race(iter_gender(iter_files(JSON_D)), race_clf)), user_norm=True, unigrams=False, liwc=True, perma=True)
print 'genders:', gender_counts
print 'races:', race_counts


def norm_by_demo(tweets_by_county, users_by_county, alpha, include_og_feature, include_race, include_gender):
    """ Consider various normalization strategies using gender/race info."""
    alpha = defaultdict(lambda: len(alpha), alpha)
    rev_alpha = tsv2feats.reverse_dict(alpha)
    for cty_id in tweets_by_county:
        nusers = len(users_by_county[cty_id])
        features = tweets_by_county[cty_id]
        for feature, value in features.items():
            feat_name = rev_alpha[feature]
            # Divide by the number of that race (m,f,n), which prefixes the feature
            if feat_name[0:5] == 'race=':
                if include_race:
                    if county2race[cty_id][feat_name[5:10]] == 0:
                        county2race[cty_id][feat_name[5:10]] = 1.
                    features[feature] = 1. * value / county2race[cty_id][feat_name[5:10]]
                else:
                    features[feature] = 0.
            elif feat_name[0:7] == 'gender=':
                if include_gender:
                    features[feature] = 1. * value / county2gender[cty_id][feat_name[7:8]]
                else:
                    features[feature] = 0.
            else:
                if include_og_feature:
                    features[feature] = 1. * value / nusers            
                else:
                    features[feature] = 0.
    # return alpha

def compute_reweighting():
    targets, target_alpha = expt.read_targets('/data/twcounty/targets.tsv', county2gender.keys())
    female_idx = target_alpha.index('Female')
    female_true = dict((fips, target[female_idx]) for fips, target in targets.iteritems())
    female_pred = dict((fips, 100. * v['f'] / (v['m'] + v['f'])) for fips, v in county2gender.iteritems())
    female_wt = dict((fips, female_true[fips] / female_pred[fips]) for fips in county2gender)

    afro_idx = target_alpha.index('Afro-hispanic')
    afro_true = dict((fips, target[afro_idx]) for fips, target in targets.iteritems())
    afro_pred = dict((fips, 100. * (v['black'] + v['latin']) / (v['black'] + v['latin'] + v['white'])) for fips, v in county2race.iteritems())
    afro_wt = dict((fips, afro_true[fips] / afro_pred[fips]) for fips in county2gender)
    return female_wt, afro_wt

def norm_by_demo_reweight(tweets_by_county, users_by_county, alpha, include_og_feature, include_race, include_gender):
    """ Consider various reweighting strategies using gender/race info."""
    female_wt, afro_wt = compute_reweighting()

    alpha = defaultdict(lambda: len(alpha), alpha)
    rev_alpha = tsv2feats.reverse_dict(alpha)
    for cty_id in tweets_by_county:
        nusers = len(users_by_county[cty_id])
        features = tweets_by_county[cty_id]
        for feature, value in features.items():
            feat_name = rev_alpha[feature]
            # If race or gender is known, reweight
            if feat_name[0:5] == 'race=':
                if include_race:
                    if county2race[cty_id][feat_name[5:10]] == 0:
                        county2race[cty_id][feat_name[5:10]] = 1.
                    if feat_name[5:10] == 'black' or feat_name[5:10] == 'latin':
                        wt = afro_wt[cty_id]
                    else:
                        wt = 1.
                        #wt = 1. / afro_wt[cty_id]
                    features[alpha[feat_name[11:]]] += (wt * features[feature] - features[feature])
                features[feature] = 0.
            elif feat_name[0:7] == 'gender=':
                if include_gender:
                    if feat_name[7:8] == 'f':
                        wt = female_wt[cty_id]
                    else:
                        wt = 1.
                        #wt = 1. / female_wt[cty_id]
                    features[alpha[feat_name[9:]]] += (wt * features[feature] - features[feature])
                features[feature] = 0.
    # Now that we've reweighted, normalize as usual.
    tsv2feats.norm_by_user(tweets_by_county, users_by_county, alpha)


def get_results(train_folds, test_folds, m, vocab, target_name, coords):
    """ Write prediction results to disk. """
    target_name = expt.fmt_target(target_name)
    train_results = expt.evaluate(train_folds, coords)
    test_results = expt.evaluate(test_folds, coords)
    # print('%20s:\ttrain\t%s' % (target_name[:20], expt.fmt_eval(train_results, ['smape_m', 'correl', 'correl_a'])))
    # print('%20s:\ttest\t%s' % (target_name[:20], expt.fmt_eval(test_results, ['smape_m', 'correl', 'correl_a'])))
    return test_results

import copy
import numpy as np

from scipy.stats import pearsonr
from sklearn import linear_model
from sklearn.feature_selection import f_regression, SelectKBest

def print_top_feats(m, feature_names, n=10):
    srted = np.argsort(m.coef_)
    topi = srted[::-1][:n]
    boti = srted[:n]
    print 'pos:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[topi], m.coef_[topi]))
    print 'neg:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[boti], m.coef_[boti]))

def avg_results(all_results):
    """ Average all_results. """
    result = {}
    for key in all_results[0].keys():
        result[key] = np.mean([d[key] for d in all_results])
    return result

def myridge(x, y, train, test, alpha=1):
    fsel = SelectKBest(f_regression, k=min(160, len(x[0])))
    m = linear_model.Ridge(alpha)
    xtrain = fsel.fit_transform(x[train], y[train])
    xtest = fsel.transform(x[test])
    m.fit(xtrain, y[train])
    return (m, m.predict(xtrain), m.predict(xtest), fsel)
    #m.fit(x[train], y[train])
    #return (m, m.predict(x[train]), m.predict(x[test]))

def run_expt(features_og, users_by_county_og, alpha_og, use_race, use_gender, use_og_feature, ridge_alpha):
    features = copy.deepcopy(features_og)
    users_by_county = copy.deepcopy(users_by_county_og)
    alpha = copy.deepcopy(alpha_og)
    features, alpha = tsv2feats.filter_by_count(features, alpha, min_df=20)
    if use_gender or use_race:
        norm_by_demo_reweight(features, users_by_county, alpha, include_og_feature=use_og_feature, include_race=use_race, include_gender=use_gender)
    else:
        tsv2feats.norm_by_user(features, users_by_county, alpha)
    features = tsv2feats.densify(features, alpha)  # default_dict_to_dict(features)
    # print('last 10 features from first county: %s' % features.values()[0][-10:])
    # print('last 10 words from alpha: %s' % alpha.keys()[-10:])
    county_data = data.Data(None, features, alpha)
    coords = expt.read_coords('/data/twcounty/stats2/counties/counties.top100.bounding.txt')
    counties = county_data.features
    data_alpha = np.array(sorted(county_data.alpha.keys(), key=lambda k: county_data.alpha[k]))
    targets, target_alpha = expt.read_targets('/data/twcounty/targets.tsv', counties.keys())
    states = expt.read_states('/data/twcounty/states.tsv', counties.keys())
 
    
    cv = expt.make_state_cv(counties, states, 5)
    X, Y = expt.to_matrices(counties, targets)
    county_ids = np.array(sorted(counties))
    all_results = []
    all_smapes = []
    for yi, ylabel in enumerate(target_alpha):
        y = Y[:, yi]
        # XX = fsel.fit_transform(X, y)
        train_folds = []
        test_folds = []
        for train, test in cv:
            m, train_pred, test_pred, fsel = myridge(X, y, train, test, alpha=ridge_alpha)
            train_folds.append((y[train], train_pred, county_ids[train]))
            test_folds.append((y[test], test_pred, county_ids[test]))
        m, train_pred, test_pred, fsel = myridge(X, y, range(len(y)), range(len(y)), alpha=ridge_alpha)
        #fvals, pvals = expt.feature_correls(X, y, [coords[cty] for cty in county_ids])
        all_results.append(get_results(train_folds, test_folds, m, data_alpha, ylabel, coords))
        smapes = [expt.smape(t[0], t[1]) for t in test_folds]
        all_smapes.append(smapes)
        # smapes = [pearsonr(t[0], t[1])[0] for t in test_folds]
        # all_smapes.append(smapes)
        if ylabel in ['% Obese', '% No Social-Emotional Support', 'Teen Birth Rate']:
            print ylabel, 'features:'
            print_top_feats(m, np.array(data_alpha[fsel.get_support(True)]))
    summary = avg_results(all_results)
    print(u'\t'.join(str(k) for k in sorted(summary.keys())) + '\n')
    print(u'\t'.join(str(summary[k]) for k in sorted(summary.keys())) + '\n')
    return target_alpha, np.array(all_smapes)

targets, gender_results = run_expt(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=True, use_og_feature=True, use_race=True, ridge_alpha=.1)

targets, og_results = run_expt(features_og, users_by_county_og, alpha_og, use_race=False, use_gender=False, use_og_feature=True, ridge_alpha=.1)

# Search for best alpha
def avg_results_alpha(my_results):
    return np.mean([np.mean(r) for r in my_results])

def best_alpha(features_og_gender, users_by_county_og_gender, alpha_og_gender,
               use_gender, use_race, use_og_feature):
    """ Hill climb to find best alpha. """
    values = [.1 * (i+1) for i in range(25)]
    i = len(values) / 2
    print 'ALPHA=', values[i], 'i=', i
    direction = -1
    _, my_results = run_expt(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=use_gender, use_race=use_race, use_og_feature=use_og_feature, ridge_alpha=values[i])
    while True:
        newi = i + direction
        if newi >= 0 and newi < len(values):
            print 'ALPHA=', values[newi], 'i=', newi
            t, new_results = run_expt(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=use_gender, use_race=use_race, use_og_feature=use_og_feature, ridge_alpha=values[newi])
            if avg_results_alpha(new_results) < avg_results_alpha(my_results):
                i = newi
                print 'accepting new alpha'
                my_results = new_results
                continue
            else:
                direction *= -1
                newi = i + direction
                print 'ALPHA=', values[newi], 'i=', newi
                t, new_results = run_expt(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=use_gender, use_race=use_race, use_og_feature=use_og_feature, ridge_alpha=values[newi])
                if avg_results_alpha(new_results) < avg_results_alpha(my_results):
                    i = newi
                    print 'accepting new alpha'
                    my_results = new_results
                    continue
        # No better direction. Stop.
        print 'stopping.'
        return values[i], avg_results_alpha(my_results)

print best_alpha(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=True, use_race=True, use_og_feature=True)

smape  alpha  use_gender  use_race  use_og
.0969   .3     t            t         t
.0973   .4     f            t         t
.1002   .4     f            t         f
.0934   .2     t            f         t
.0938   .2     t            f         f

import expt
import texify_results
def pct_impr(old, new):
    return 100. * (old - new) / old

improvements = []
impr_err = []
target_pr = []
for target, gender_r, og_r in sorted(zip(targets, gender_results, og_results),
                                     key=lambda x:np.mean(pct_impr(x[2], x[1]))):
    if target not in ['Female', 'Afro-hispanic', '< 18', '65 and over', 'med_income']:
        imprs = pct_impr(og_r, gender_r)
        print texify_results.label_map[expt.fmt_target(target)], ',', np.mean(imprs), ',', np.std(imprs), ',', np.mean(og_r), np.mean(gender_r)
        improvements.append(np.mean(imprs))
        impr_err.append(np.std(imprs) / sqrt(len(imprs)) / 2)
        target_pr.append(target)
    
y_pos = np.arange(len(target_pr))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.tick_params(axis='both', labelsize='8')
plt.barh(y_pos, improvements, xerr=impr_err, align='center', alpha=0.8)
plt.yticks(y_pos, target_pr)
plt.xlabel('% SMAPE improvement')
plt.savefig('reweight.pdf', bbox_inches='tight')
print 'mean improvement=', np.mean(improvements)

%pylab inline --no-import-all

import spatial
# Plot discrepancy between estimated %female and census
# Print gender by county.
targets, target_alpha = expt.read_targets('/data/twcounty/targets.tsv', county2gender.keys())
female_idx = target_alpha.index('Female')
female_true = dict((fips, target[female_idx]) for fips, target in targets.iteritems())
female_pred = dict((fips, 100. * v['f'] / (v['m'] + v['f'])) for fips, v in county2gender.iteritems())

coords = expt.read_coords('/data/twcounty/stats2/counties/counties.top100.bounding.txt')

truths = []
errs = []
preds = []
coords_fips = []
for fips, f_true in sorted(female_true.items(), key=lambda x: x[1]):
    truths.append(f_true)
    errs.append(female_pred[fips] - f_true)
    preds.append(female_pred[fips])
    coords_fips = coords[fips]

# Plot truth vs error.
scatter(truths, errs)
axhline(y=0)
xlabel("% Female (truth)")
ylabel("Error (Predicted - Truth)")

# Plot truth vs predicted.
figure()
scatter(truths,preds)
ylabel("% Female (predicted)", size=18)
xlabel("% Female (truth)", size=18)
title('r=%.2f' % pearsonr(truths, preds)[0], size=18)
plt.savefig('gender-scatter.pdf', bbox_inches='tight')

from scipy.stats import pearsonr
print 'correlation is', pearsonr(truths, preds)


%pylab inline --no-import-all
from scipy.stats import pearsonr
import spatial
# Plot discrepancy between estimated race and census
targets, target_alpha = expt.read_targets('/data/twcounty/targets.tsv', county2race.keys())
race_idx = target_alpha.index('Afro-hispanic')
race_true = dict((fips, target[race_idx]) for fips, target in targets.iteritems())
race_pred = dict((fips, 100. * (v['black'] + v['latin']) / (v['black'] + v['latin'] + v['white'])) for fips, v in county2race.iteritems())

coords = expt.read_coords('/data/twcounty/stats2/counties/counties.top100.bounding.txt')

truths = []
errs = []
preds = []
coords_fips = []
for fips, f_true in sorted(race_true.items(), key=lambda x: x[1]):
    if race_pred[fips] > 70:
        print fips, f_true, race_pred[fips], county2race[fips]
    else:
        truths.append(f_true)
        errs.append(race_pred[fips] - f_true)
        preds.append(race_pred[fips])
        coords_fips = coords[fips]

# Plot truth vs error.
scatter(truths, errs)
axhline(y=0)
xlabel("% Afro-hispanic (truth)")
ylabel("Error (Predicted - Truth)")

# Plot truth vs predicted.
figure()
scatter(truths,preds)
ylabel("% Afro-hispanic (predicted)", size='18')
xlabel("% Afro-hispanic(truth)", size='18')
title('r=%.2f' % pearsonr(truths, preds)[0], size=18)
plt.savefig('race-scatter.pdf', bbox_inches='tight')

print 'correlation is', pearsonr(truths, preds)