# Classify users as male or female based on first names based on Census name frequency. from collections import defaultdict import re import requests def names2dict(url): """ Fetch data from census and parse into dict mapping name to frequency. """ names = defaultdict(lambda: 0) for line in requests.get(url).text.split('\n'): parts = line.lower().split() if len(parts) >= 2: names[parts[0]] = float(parts[1]) return names def getCensusNames(): """ Fetch census name data and remove ambiguous names. """ males = names2dict('http://www.census.gov/genealogy/www/data/1990surnames/dist.male.first') females = names2dict('http://www.census.gov/genealogy/www/data/1990surnames/dist.female.first') print len(set(males.keys() + females.keys())), 'total names' eps = 10. # keep names that are eps times more frequent in one gender than the other. tokeep = [] for name in set(males.keys() + females.keys()): mscore = males[name] fscore = females[name] if mscore == 0 or fscore == 0 or mscore / fscore > eps or fscore / mscore > eps: tokeep.append(name) print 'keeping', len(tokeep) m = set([n for n in tokeep if males[n] > females[n]]) f = set([n for n in tokeep if females[n] > males[n]]) return m, f males, females = getCensusNames() print 'found', len(males), 'males and', len(females), 'females' def labelGender(tweet, males, females): """ Classify a tweet as male (m) female (f) or neutral (n) based on first token in name field. """ name = tweet['user']['name'].lower().split() if len(name) == 0: name = [''] name = re.findall('\w+', name[0]) if len(name) == 0: name = '' else: name = name[0] if name in males: tweet['user']['gender'] = 'm' elif name in females: tweet['user']['gender'] = 'f' else: tweet['user']['gender'] = 'n' return tweet print 'John is', labelGender({'user':{'name':'John smith'}}, males, females)['user']['gender'] print 'Jane is', labelGender({'user':{'name':'jane doe'}}, males, females)['user']['gender'] print 'kris is', labelGender({'user':{'name':'Kris doe'}}, males, females)['user']['gender'] def labelRace(tweet): desc = tweet['user']['description'] if not desc: desc = '' toks = set(re.findall('\w+', desc.lower())) if len(set(['african', 'black', 'aa', 'sbm', 'sbf']) & toks) > 0: tweet['user']['race'] = 'b' print 'black:', desc elif len(set(['latin', 'latino', 'latina', 'hispanic']) & toks) > 0: tweet['user']['race'] = 'l' print 'latino:', desc else: tweet['user']['race'] = 'n' return tweet print 'John is', labelRace({'user':{'description':'african-american'}})['user']['race'] print 'Jane is', labelRace({'user':{'description':'black man'}})['user']['race'] print 'kris is', labelRace({'user':{'description':'a latino'}})['user']['race'] print 'foo is', labelRace({'user':{'description':'blah'}})['user']['race'] %pylab inline # Train a race classifier based on description field. import glob import io import json import os import re import numpy as np from sklearn.cross_validation import KFold from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report, precision_recall_curve from sklearn.preprocessing import label_binarize, scale from twutil import preprocess def print_top_feats_cl(m, feature_names, labels, n=10): for i, coef in enumerate(m.coef_): srted = np.argsort(coef) topi = srted[::-1][:n] boti = srted[:n] print 'label=', labels[i] print 'pos:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[topi], coef[topi])) print 'neg:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[boti], coef[boti])) def prec_rec(truths_bin, probs, races): truths_bin = np.array(truths_bin) probs = np.array(probs) precision = dict() recall = dict() for i in range(len(races)): precision[i], recall[i], _ = precision_recall_curve(truths_bin[:, i], probs[:, i]) # Plot Precision-Recall curve for each class for i in range(len(races)): plot(recall[i], precision[i], label=races[i]) xlabel('Recall') ylabel('Precision') legend(loc="lower right") def prec_rec_afro_hisp(truths_bin, probs, races): truths_bin = np.array(truths_bin) probs = np.array(probs) truths_ah = np.array(truths_bin[:, 0] + truths_bin[:, 1]) probs_ah = np.array([max(i, j) for i, j in zip(probs[:, 0], probs[:, 1])]) precision, recall, _ = precision_recall_curve(truths_ah, probs_ah) plot(recall, precision) xlabel('Recall') ylabel('Precision') legend(loc="lower right") plt.savefig('race.pdf', bbox_inches='tight') def read_race_data(race_path, tzer, races): tokens = [] labels = [] for fname in glob.glob(race_path + '/*.*'): label = os.path.splitext(os.path.basename(fname))[0] for line in io.open(fname, encoding='utf8'): js = json.loads(line) toks = tzer.do_tokenize(js['description']) if len(toks) > 0: labels.append(races.index(label)) tokens.append(' '.join(toks)) return tokens, labels races = ['black', 'latin', 'white'] def train_race_clf(): print 'training race classifier' tzer = preprocess.Tokenizer(fields='description', retain_punc_toks=False, collapse_digits=True) tokens, labels = read_race_data('/data/twcounty/labeled_users/race', tzer, races) vec = TfidfVectorizer(token_pattern='\S+', min_df=2, binary=True) X = vec.fit_transform(tokens) y = np.array(labels) print 'vocab size=', len(vec.get_feature_names()) cv = KFold(len(y), 10, shuffle=True, random_state=123456) m = LogisticRegression(class_weight='auto') preds = [] truths = [] probs = [] y_bin = label_binarize(y, classes=range(len(races))) truths_bin = [] for train, test in cv: m.fit(X[train], y[train]) preds.extend(m.predict(X[test])) truths.extend(y[test]) probs.extend(m.predict_proba(X[test])) truths_bin.extend(y_bin[test]) print classification_report(truths, preds, target_names=races) m.fit(X, y) m.tzer = tzer m.vec = vec print_top_feats_cl(m, np.array(vec.get_feature_names()), races, 5) # What is precision of classifications with probability > .5? threshold = 0.5 y_filtered = [i for i, pred in enumerate(preds) if probs[i][pred] > threshold] print len(y_filtered), 'retained of', len(preds), 'total using threshold', threshold print classification_report(np.array(truths)[y_filtered], np.array(preds)[y_filtered], target_names=races) prec_rec_afro_hisp(truths_bin, probs, races) return m def label_race_clf(tweet, clf): desc = tweet['user']['description'] if not desc: desc = '' preds = clf.predict_proba(clf.vec.transform([' '.join(clf.tzer.do_tokenize(desc))]))[0] if max(preds) > 0.5: # print 'predicting', races[np.argmax(preds)], 'for', tweet['user']['screen_name'], desc tweet['user']['race'] = races[np.argmax(preds)] else: tweet['user']['race'] = 'n' return tweet race_clf = train_race_clf() print 'John is', label_race_clf({'user':{'description':'r.i.p.', 'screen_name':'john'}}, race_clf)['user']['race'] print 'Jane is', label_race_clf({'user':{'description':'que', 'screen_name':'jane'}}, race_clf)['user']['race'] print 'Joe is', label_race_clf({'user':{'description':'like', 'screen_name':'joe'}}, race_clf)['user']['race'] print 'Jesse is', label_race_clf({'user':{'description':'asdf', 'screen_name':'jesse'}}, race_clf)['user']['race'] # Read in Twitter json files, classifying each by gender and/or race. from collections import Counter import io import glob import json import os import re import sys import tok.unicode_props import tweet import json2tsv JSON_D='/data/twcounty/json/' # Directory containing one json file per county, named according to FIPS. def iter_jsons(f): """ Iterate tweet json objects from a file, appending county information from file name.""" fp = io.open(f, mode='rt', encoding='utf8') county = os.path.basename(f) count = 0 for line in fp: try: line = re.sub('\t', ' ', line) js = json.loads(line) if json2tsv.valid_line(js): js['county'] = county yield js count += 1 if count >= 10000: # FIXME: limiting to first 1000 users per county for testing. return except: pass def iter_files(path): """ Iterate tweets in directory of json files. """ fcount = 1 for fname in glob.glob(path + '/*'): print 'parsing', fname, '(#', fcount, ')' fcount += 1 for js in iter_jsons(fname): yield js def iter_gender(jss): for js in jss: labelGender(js, males, females) yield js def iter_race(jss, race_clf): for js in jss: label_race_clf(js, race_clf) yield js def iter_tokenize(jss): all_users = set() for js in jss: if not js['user']['description']: js['user']['description'] = ' ' tw = tweet.Tweet(js['county'], js['id'], js['user']['screen_name'], json2tsv.tokenize(js['text'], tokenizer), json2tsv.tokenize(js['user']['description'], tokenizer)) tw.gender = js['user']['gender'] tw.race = js['user']['race'] if tw.screen_name not in all_users: county2gender[js['county']][tw.gender] += 1 county2race[js['county']][tw.race] += 1 all_users.add(tw.screen_name) gender_counts.update([tw.gender]) race_counts.update([tw.race]) yield tw def do_featurize(feats, words, prefix, alpha, unigrams=True, liwc=False, perma=False): lexi = tsv2feats.lexicons(words, liwc, perma) if unigrams: lexi.extend(words) for t in lexi: feats[alpha[prefix + t]] += 1 return feats # def featurize(tw, alpha, unigrams, liwc, perma): # feats = defaultdict(lambda: 0) # if tw.text: # do_featurize(feats, tw.text.split(' '), '', alpha, unigrams, liwc, perma) # if tw.gender != 'n': # do_featurize(feats, tw.text.split(' '), tw.gender + "_", alpha, unigrams, liwc, perma) # if tw.description: # do_featurize(feats, tw.description.split(' '), 'd=', alpha, unigrams, liwc, perma) # if tw.gender != 'n': # do_featurize(feats, tw.description.split(' '), tw.gender + '_d=', alpha, unigrams, liwc, perma) # return feats def featurize(tw, alpha, unigrams, liwc, perma): """ >>> alpha = defaultdict(lambda: len(alpha)) >>> feats = featurize(tweet.Tweet('cty1', '123', 'joe', 'i', 'abject'), alpha, True, True, True) >>> all(feats[alpha[word]] == 1 for word in [u'd=P-', u'Pronoun', u'I', u'Self', 'i', 'd=abject']) True """ feats = defaultdict(lambda: 0) if tw.text: toks = tw.text.split(' ') do_featurize(feats, toks, '', alpha, unigrams, liwc, perma) if tw.race != 'n': do_featurize(feats, toks, 'race=' + tw.race + "_", alpha, unigrams, liwc, perma) if tw.gender != 'n': do_featurize(feats, toks, 'gender=' + tw.gender + "_", alpha, unigrams, liwc, perma) if tw.description: toks = tw.description.split(' ') do_featurize(feats, toks, 'd=', alpha, unigrams, liwc, perma) if tw.gender != 'n': do_featurize(feats, toks, 'gender=' + tw.gender + '_d=', alpha, unigrams, liwc, perma) if tw.race != 'n': do_featurize(feats, toks, 'race=' + tw.race + '_d=', alpha, unigrams, liwc, perma) return feats def read_tweets(tweets, user_norm=False, unigrams=True, liwc=False, perma=False): alpha = defaultdict(lambda: len(alpha)) # county -> tweets tweets_by_county = defaultdict(lambda: defaultdict(lambda: 0)) users_by_county = defaultdict(lambda: set()) tweets_by_user = defaultdict(lambda: defaultdict(lambda: set())) i = 0 for tw in tweets: users_by_county[tw.county].add(tw.screen_name) feats = featurize(tw, alpha, unigrams, liwc, perma) for k, v in feats.iteritems(): if user_norm: if k not in tweets_by_user[tw.county][tw.screen_name]: tweets_by_county[tw.county][k] += 1 tweets_by_user[tw.county][tw.screen_name].add(k) else: tweets_by_county[tw.county][k] += v i += 1 return tsv2feats.default_dict_to_dict(tweets_by_county), users_by_county, alpha # READ DATA # Parse all json files. This takes a while. import data import expt import tsv2feats tokenizer = tok.unicode_props.UP_Tiny(1) gender_counts = Counter() race_counts = Counter() county2race = defaultdict(lambda: {'black': 0., 'latin': 0., 'white':0., 'n': 0.}) county2gender = defaultdict(lambda: {'m': 0., 'f': 0., 'n':0.}) race_clf = train_race_clf() features_og, users_by_county_og, alpha_og = tsv2feats.read_tweets(iter_tokenize(iter_race(iter_gender(iter_files(JSON_D)), race_clf)), user_norm=True, unigrams=False, liwc=True, perma=True) features_og_gender, users_by_county_og_gender, alpha_og_gender = read_tweets(iter_tokenize(iter_race(iter_gender(iter_files(JSON_D)), race_clf)), user_norm=True, unigrams=False, liwc=True, perma=True) print 'genders:', gender_counts print 'races:', race_counts def norm_by_demo(tweets_by_county, users_by_county, alpha, include_og_feature, include_race, include_gender): """ Consider various normalization strategies using gender/race info.""" alpha = defaultdict(lambda: len(alpha), alpha) rev_alpha = tsv2feats.reverse_dict(alpha) for cty_id in tweets_by_county: nusers = len(users_by_county[cty_id]) features = tweets_by_county[cty_id] for feature, value in features.items(): feat_name = rev_alpha[feature] # Divide by the number of that race (m,f,n), which prefixes the feature if feat_name[0:5] == 'race=': if include_race: if county2race[cty_id][feat_name[5:10]] == 0: county2race[cty_id][feat_name[5:10]] = 1. features[feature] = 1. * value / county2race[cty_id][feat_name[5:10]] else: features[feature] = 0. elif feat_name[0:7] == 'gender=': if include_gender: features[feature] = 1. * value / county2gender[cty_id][feat_name[7:8]] else: features[feature] = 0. else: if include_og_feature: features[feature] = 1. * value / nusers else: features[feature] = 0. # return alpha def compute_reweighting(): targets, target_alpha = expt.read_targets('/data/twcounty/targets.tsv', county2gender.keys()) female_idx = target_alpha.index('Female') female_true = dict((fips, target[female_idx]) for fips, target in targets.iteritems()) female_pred = dict((fips, 100. * v['f'] / (v['m'] + v['f'])) for fips, v in county2gender.iteritems()) female_wt = dict((fips, female_true[fips] / female_pred[fips]) for fips in county2gender) afro_idx = target_alpha.index('Afro-hispanic') afro_true = dict((fips, target[afro_idx]) for fips, target in targets.iteritems()) afro_pred = dict((fips, 100. * (v['black'] + v['latin']) / (v['black'] + v['latin'] + v['white'])) for fips, v in county2race.iteritems()) afro_wt = dict((fips, afro_true[fips] / afro_pred[fips]) for fips in county2gender) return female_wt, afro_wt def norm_by_demo_reweight(tweets_by_county, users_by_county, alpha, include_og_feature, include_race, include_gender): """ Consider various reweighting strategies using gender/race info.""" female_wt, afro_wt = compute_reweighting() alpha = defaultdict(lambda: len(alpha), alpha) rev_alpha = tsv2feats.reverse_dict(alpha) for cty_id in tweets_by_county: nusers = len(users_by_county[cty_id]) features = tweets_by_county[cty_id] for feature, value in features.items(): feat_name = rev_alpha[feature] # If race or gender is known, reweight if feat_name[0:5] == 'race=': if include_race: if county2race[cty_id][feat_name[5:10]] == 0: county2race[cty_id][feat_name[5:10]] = 1. if feat_name[5:10] == 'black' or feat_name[5:10] == 'latin': wt = afro_wt[cty_id] else: wt = 1. #wt = 1. / afro_wt[cty_id] features[alpha[feat_name[11:]]] += (wt * features[feature] - features[feature]) features[feature] = 0. elif feat_name[0:7] == 'gender=': if include_gender: if feat_name[7:8] == 'f': wt = female_wt[cty_id] else: wt = 1. #wt = 1. / female_wt[cty_id] features[alpha[feat_name[9:]]] += (wt * features[feature] - features[feature]) features[feature] = 0. # Now that we've reweighted, normalize as usual. tsv2feats.norm_by_user(tweets_by_county, users_by_county, alpha) def get_results(train_folds, test_folds, m, vocab, target_name, coords): """ Write prediction results to disk. """ target_name = expt.fmt_target(target_name) train_results = expt.evaluate(train_folds, coords) test_results = expt.evaluate(test_folds, coords) # print('%20s:\ttrain\t%s' % (target_name[:20], expt.fmt_eval(train_results, ['smape_m', 'correl', 'correl_a']))) # print('%20s:\ttest\t%s' % (target_name[:20], expt.fmt_eval(test_results, ['smape_m', 'correl', 'correl_a']))) return test_results import copy import numpy as np from scipy.stats import pearsonr from sklearn import linear_model from sklearn.feature_selection import f_regression, SelectKBest def print_top_feats(m, feature_names, n=10): srted = np.argsort(m.coef_) topi = srted[::-1][:n] boti = srted[:n] print 'pos:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[topi], m.coef_[topi])) print 'neg:' + ' '.join('%s (%.2g)' % (n, c) for n, c in zip(feature_names[boti], m.coef_[boti])) def avg_results(all_results): """ Average all_results. """ result = {} for key in all_results[0].keys(): result[key] = np.mean([d[key] for d in all_results]) return result def myridge(x, y, train, test, alpha=1): fsel = SelectKBest(f_regression, k=min(160, len(x[0]))) m = linear_model.Ridge(alpha) xtrain = fsel.fit_transform(x[train], y[train]) xtest = fsel.transform(x[test]) m.fit(xtrain, y[train]) return (m, m.predict(xtrain), m.predict(xtest), fsel) #m.fit(x[train], y[train]) #return (m, m.predict(x[train]), m.predict(x[test])) def run_expt(features_og, users_by_county_og, alpha_og, use_race, use_gender, use_og_feature, ridge_alpha): features = copy.deepcopy(features_og) users_by_county = copy.deepcopy(users_by_county_og) alpha = copy.deepcopy(alpha_og) features, alpha = tsv2feats.filter_by_count(features, alpha, min_df=20) if use_gender or use_race: norm_by_demo_reweight(features, users_by_county, alpha, include_og_feature=use_og_feature, include_race=use_race, include_gender=use_gender) else: tsv2feats.norm_by_user(features, users_by_county, alpha) features = tsv2feats.densify(features, alpha) # default_dict_to_dict(features) # print('last 10 features from first county: %s' % features.values()[0][-10:]) # print('last 10 words from alpha: %s' % alpha.keys()[-10:]) county_data = data.Data(None, features, alpha) coords = expt.read_coords('/data/twcounty/stats2/counties/counties.top100.bounding.txt') counties = county_data.features data_alpha = np.array(sorted(county_data.alpha.keys(), key=lambda k: county_data.alpha[k])) targets, target_alpha = expt.read_targets('/data/twcounty/targets.tsv', counties.keys()) states = expt.read_states('/data/twcounty/states.tsv', counties.keys()) cv = expt.make_state_cv(counties, states, 5) X, Y = expt.to_matrices(counties, targets) county_ids = np.array(sorted(counties)) all_results = [] all_smapes = [] for yi, ylabel in enumerate(target_alpha): y = Y[:, yi] # XX = fsel.fit_transform(X, y) train_folds = [] test_folds = [] for train, test in cv: m, train_pred, test_pred, fsel = myridge(X, y, train, test, alpha=ridge_alpha) train_folds.append((y[train], train_pred, county_ids[train])) test_folds.append((y[test], test_pred, county_ids[test])) m, train_pred, test_pred, fsel = myridge(X, y, range(len(y)), range(len(y)), alpha=ridge_alpha) #fvals, pvals = expt.feature_correls(X, y, [coords[cty] for cty in county_ids]) all_results.append(get_results(train_folds, test_folds, m, data_alpha, ylabel, coords)) smapes = [expt.smape(t[0], t[1]) for t in test_folds] all_smapes.append(smapes) # smapes = [pearsonr(t[0], t[1])[0] for t in test_folds] # all_smapes.append(smapes) if ylabel in ['% Obese', '% No Social-Emotional Support', 'Teen Birth Rate']: print ylabel, 'features:' print_top_feats(m, np.array(data_alpha[fsel.get_support(True)])) summary = avg_results(all_results) print(u'\t'.join(str(k) for k in sorted(summary.keys())) + '\n') print(u'\t'.join(str(summary[k]) for k in sorted(summary.keys())) + '\n') return target_alpha, np.array(all_smapes) targets, gender_results = run_expt(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=True, use_og_feature=True, use_race=True, ridge_alpha=.1) targets, og_results = run_expt(features_og, users_by_county_og, alpha_og, use_race=False, use_gender=False, use_og_feature=True, ridge_alpha=.1) # Search for best alpha def avg_results_alpha(my_results): return np.mean([np.mean(r) for r in my_results]) def best_alpha(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender, use_race, use_og_feature): """ Hill climb to find best alpha. """ values = [.1 * (i+1) for i in range(25)] i = len(values) / 2 print 'ALPHA=', values[i], 'i=', i direction = -1 _, my_results = run_expt(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=use_gender, use_race=use_race, use_og_feature=use_og_feature, ridge_alpha=values[i]) while True: newi = i + direction if newi >= 0 and newi < len(values): print 'ALPHA=', values[newi], 'i=', newi t, new_results = run_expt(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=use_gender, use_race=use_race, use_og_feature=use_og_feature, ridge_alpha=values[newi]) if avg_results_alpha(new_results) < avg_results_alpha(my_results): i = newi print 'accepting new alpha' my_results = new_results continue else: direction *= -1 newi = i + direction print 'ALPHA=', values[newi], 'i=', newi t, new_results = run_expt(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=use_gender, use_race=use_race, use_og_feature=use_og_feature, ridge_alpha=values[newi]) if avg_results_alpha(new_results) < avg_results_alpha(my_results): i = newi print 'accepting new alpha' my_results = new_results continue # No better direction. Stop. print 'stopping.' return values[i], avg_results_alpha(my_results) print best_alpha(features_og_gender, users_by_county_og_gender, alpha_og_gender, use_gender=True, use_race=True, use_og_feature=True) smape alpha use_gender use_race use_og .0969 .3 t t t .0973 .4 f t t .1002 .4 f t f .0934 .2 t f t .0938 .2 t f f import expt import texify_results def pct_impr(old, new): return 100. * (old - new) / old improvements = [] impr_err = [] target_pr = [] for target, gender_r, og_r in sorted(zip(targets, gender_results, og_results), key=lambda x:np.mean(pct_impr(x[2], x[1]))): if target not in ['Female', 'Afro-hispanic', '< 18', '65 and over', 'med_income']: imprs = pct_impr(og_r, gender_r) print texify_results.label_map[expt.fmt_target(target)], ',', np.mean(imprs), ',', np.std(imprs), ',', np.mean(og_r), np.mean(gender_r) improvements.append(np.mean(imprs)) impr_err.append(np.std(imprs) / sqrt(len(imprs)) / 2) target_pr.append(target) y_pos = np.arange(len(target_pr)) fig = plt.figure() ax = fig.add_subplot(111) ax.tick_params(axis='both', labelsize='8') plt.barh(y_pos, improvements, xerr=impr_err, align='center', alpha=0.8) plt.yticks(y_pos, target_pr) plt.xlabel('% SMAPE improvement') plt.savefig('reweight.pdf', bbox_inches='tight') print 'mean improvement=', np.mean(improvements) %pylab inline --no-import-all import spatial # Plot discrepancy between estimated %female and census # Print gender by county. targets, target_alpha = expt.read_targets('/data/twcounty/targets.tsv', county2gender.keys()) female_idx = target_alpha.index('Female') female_true = dict((fips, target[female_idx]) for fips, target in targets.iteritems()) female_pred = dict((fips, 100. * v['f'] / (v['m'] + v['f'])) for fips, v in county2gender.iteritems()) coords = expt.read_coords('/data/twcounty/stats2/counties/counties.top100.bounding.txt') truths = [] errs = [] preds = [] coords_fips = [] for fips, f_true in sorted(female_true.items(), key=lambda x: x[1]): truths.append(f_true) errs.append(female_pred[fips] - f_true) preds.append(female_pred[fips]) coords_fips = coords[fips] # Plot truth vs error. scatter(truths, errs) axhline(y=0) xlabel("% Female (truth)") ylabel("Error (Predicted - Truth)") # Plot truth vs predicted. figure() scatter(truths,preds) ylabel("% Female (predicted)", size=18) xlabel("% Female (truth)", size=18) title('r=%.2f' % pearsonr(truths, preds)[0], size=18) plt.savefig('gender-scatter.pdf', bbox_inches='tight') from scipy.stats import pearsonr print 'correlation is', pearsonr(truths, preds) %pylab inline --no-import-all from scipy.stats import pearsonr import spatial # Plot discrepancy between estimated race and census targets, target_alpha = expt.read_targets('/data/twcounty/targets.tsv', county2race.keys()) race_idx = target_alpha.index('Afro-hispanic') race_true = dict((fips, target[race_idx]) for fips, target in targets.iteritems()) race_pred = dict((fips, 100. * (v['black'] + v['latin']) / (v['black'] + v['latin'] + v['white'])) for fips, v in county2race.iteritems()) coords = expt.read_coords('/data/twcounty/stats2/counties/counties.top100.bounding.txt') truths = [] errs = [] preds = [] coords_fips = [] for fips, f_true in sorted(race_true.items(), key=lambda x: x[1]): if race_pred[fips] > 70: print fips, f_true, race_pred[fips], county2race[fips] else: truths.append(f_true) errs.append(race_pred[fips] - f_true) preds.append(race_pred[fips]) coords_fips = coords[fips] # Plot truth vs error. scatter(truths, errs) axhline(y=0) xlabel("% Afro-hispanic (truth)") ylabel("Error (Predicted - Truth)") # Plot truth vs predicted. figure() scatter(truths,preds) ylabel("% Afro-hispanic (predicted)", size='18') xlabel("% Afro-hispanic(truth)", size='18') title('r=%.2f' % pearsonr(truths, preds)[0], size=18) plt.savefig('race-scatter.pdf', bbox_inches='tight') print 'correlation is', pearsonr(truths, preds)