Text Regression II
Dr. Aron Culotta
Illinois Institute of Technology
Today, we'll look more closely at the flu-tracking example.
RAW_DATA='tweets' # I haven't placed this directory in GitHub, since it's so large.
# Data looks like this. (contents of the first file)
! head `ls $RAW_DATA/* | head -1 `
# This is already tokenized.
nice exclamationpoint looking forward to seeing the piece on you twittelator dude exclamationpoint congratulations exclamationpoint
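The preprocessing script isn't included here, but a minimal sketch of a tokenizer that could produce lines like the one above (lowercasing, spelling out punctuation, splitting on whitespace; the details are an assumption, not the actual pipeline used to build these files):
import re
def tokenize(tweet):
    # Hypothetical tokenizer: lowercase, spell out punctuation marks,
    # strip remaining non-alphanumeric characters, split on whitespace.
    tweet = tweet.lower()
    tweet = tweet.replace('!', ' exclamationpoint ')
    tweet = tweet.replace('?', ' questionmark ')
    tweet = re.sub(r'[^a-z0-9 ]', ' ', tweet)
    return tweet.split()
print tokenize('Nice! Looking forward to seeing the piece on you, Twittelator dude! Congratulations!')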
# Read tweets. One file per day, one tweet per line.
# Store a list of Counter objects, tracking the frequency of terms for each day.
from collections import Counter
import glob
import io
import os
import subprocess
import sys
def read_data():
    counts = []
    dates = []
    lines = []
    # For demonstration purposes, we'll only read the first 10K lines of a file.
    max_lines = 10000
    for filename in glob.glob(RAW_DATA + '/*'):
        print 'reading', filename
        lines.append(file_length(filename))
        dates.append(os.path.basename(filename))
        this_counter = Counter()
        line_ct = 0
        for line in io.open(filename, 'rt', encoding='utf8'):
            this_counter.update(line.split())
            line_ct += 1
            if line_ct > max_lines:
                break
        print 'read %d tokens for %d words' % (sum(this_counter.values()),
                                               len(this_counter.keys()))
        counts.append(this_counter)
    return counts, dates, lines
def file_length(filename):
    p = subprocess.Popen(['wc', '-l', filename], stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])
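Since file_length shells out to wc, it assumes a Unix-like environment. A pure-Python alternative (a sketch; not part of the original code) would be:
def file_length_py(filename):
    # Count lines without relying on the external wc command.
    with io.open(filename, 'rt', encoding='utf8') as f:
        return sum(1 for _ in f)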
counts, dates, lines = read_data()
reading tweets/2009-08-20
read 15 tokens for 13 words
reading tweets/2009-09-01
read 112851 tokens for 14763 words
reading tweets/2009-09-02
read 113951 tokens for 15052 words
... (similar output for each of the remaining days) ...
reading tweets/2010-05-05
read 105700 tokens for 15480 words
reading tweets/2010-05-06
read 105047 tokens for 15206 words
# Convert to np arrays.
import numpy as np
counts = np.array(counts)
dates = np.array(dates)
lines = np.array(lines)
# Pickle everything
import pickle
pickle.dump((counts, dates, lines), open('data.pkl', 'wb'))
# ~73MB
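# On later runs we can skip read_data() entirely and reload the pickled tuple
# (a convenience sketch; not part of the original notebook):
counts, dates, lines = pickle.load(open('data.pkl', 'rb'))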
print '%d days of tweets' % len(dates)
243 days of tweets
print sum(sum(c.values()) for c in counts), 'total tokens'
words = set()
for c in counts:
    words.update(c.keys())
print len(words), 'total unique words'
26302164 total tokens
97329 total unique words
# Plot occurrences of 'flu'
import matplotlib.pyplot as plt
def get_trend(counts, term):
    return np.array([c[term] for c in counts])

def plot_trend(x, y, title):
    plt.figure()
    plt.plot(y)
    plt.xticks(range(len(x))[::20], x[::20], rotation='90')
    plt.title(title)
    plt.show()
flu_trend = get_trend(counts, 'flu')
plot_trend(dates, flu_trend, 'flu')
# Normalize by number of tweets per day.
norm_flu_trend = 1. * flu_trend / lines
def plot_trends(x, y1, y2, label1, label2):
    # Plot two trends with different y-scales.
    fig, ax1 = plt.subplots()
    ax1.plot(y1, 'b.-', label=label1)
    ax1.set_ylabel(label1, color='b')
    plt.xticks(range(len(x))[::20], x[::20], rotation='90')
    ax2 = ax1.twinx()
    ax2.plot(y2, 'g.-', label=label2)
    ax2.set_ylabel(label2, color='g')
    plt.show()
plot_trends(dates, flu_trend, norm_flu_trend, 'flu', 'norm_flu')
**Need to smooth!**
We'll group by week rather than day to deal with the high variance of counts.
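As an aside, another common option would be a simple moving average over the daily series; a short sketch (the 7-day window is an arbitrary choice, and this isn't used in what follows):
def moving_average(y, window=7):
    # Smooth a daily series by averaging over a sliding window of `window` days.
    return np.convolve(y, np.ones(window) / window, mode='valid')
smoothed_flu = moving_average(norm_flu_trend, window=7)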
# Read CDC's ILI data, which looks like this:
!head ili.csv
2009-08-29,2009-09-05,0.03475
2009-09-05,2009-09-12,0.04060
2009-09-12,2009-09-19,0.04248
2009-09-19,2009-09-26,0.04189
2009-09-26,2009-10-03,0.04869
2009-10-03,2009-10-10,0.06076
2009-10-10,2009-10-17,0.07028
2009-10-17,2009-10-24,0.07688
2009-10-24,2009-10-31,0.07514
2009-11-01,2009-11-07,0.06615
ili_dates = [l.strip().split(',') for l in open('ili.csv')]
print ili_dates[0]
['2009-08-29', '2009-09-05', '0.03475']
# Sum together tweet data for each week.
def get_week_counts(counts, dates, ili_dates, lines):
    week_counts = []
    week_dates = []
    week_lines = []
    for start, end, value in ili_dates:
        week_dates.append(end)
        this_counts, this_lines = get_counts_in_range(counts, lines, dates, start, end)
        week_counts.append(this_counts)
        week_lines.append(this_lines)
    return np.array(week_counts), np.array(week_dates), np.array(week_lines)

def get_counts_in_range(counts, lines, dates, start, end):
    indices = [i for i, v in enumerate(dates)
               if dates[i] > start and dates[i] <= end]
    print 'got %d dates from %s to %s' % (len(indices), start, end)
    sumc = Counter()
    for cts in counts[indices]:
        sumc.update(cts)
    return sumc, sum(lines[indices])
week_counts, week_dates, week_lines = get_week_counts(counts, dates, ili_dates, lines)
got 5 dates from 2009-08-29 to 2009-09-05
got 7 dates from 2009-09-05 to 2009-09-12
got 7 dates from 2009-09-12 to 2009-09-19
... (similar output for each week; most weeks contain 7 days, a few have 5 or 6) ...
got 7 dates from 2010-04-24 to 2010-05-01
got 5 dates from 2010-05-01 to 2010-05-08
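As a quick sanity check (not in the original notebook), we can confirm how much of the daily data was assigned to an ILI reporting week:
print '%d weeks; %d of %d daily tweets fall inside an ILI week' % (
    len(week_dates), week_lines.sum(), lines.sum())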
flu_trend = get_trend(week_counts, 'flu')
plot_trend(week_dates, flu_trend, 'flu')
norm_flu_trend = 1. * flu_trend / week_lines
plot_trends(week_dates, flu_trend, norm_flu_trend, 'flu', 'norm_flu')
# Compare norm_flu_trend and ILI data
ilis = [float(v[2]) for v in ili_dates]
plot_trends(week_dates, norm_flu_trend, ilis, 'flu', 'ILI')
# What is correlation?
from scipy.stats.stats import pearsonr
print pearsonr(norm_flu_trend, ilis)
(0.92186448815003175, 1.4721218653313829e-15)
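Pearson's r is just the covariance of the two series divided by the product of their standard deviations; a quick check by hand (a sketch, which should reproduce the scipy value above):
flu_arr = np.array(norm_flu_trend)
ili_arr = np.array(ilis)
r = np.mean((flu_arr - flu_arr.mean()) * (ili_arr - ili_arr.mean())) / (flu_arr.std() * ili_arr.std())
print 'pearson r computed by hand:', r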
# Try some other terms.
def evaluate_terms(terms, week_counts, week_lines, week_dates, ilis):
    for term in terms:
        trend = 1. * get_trend(week_counts, term) / week_lines
        plot_trends(week_dates, trend, ilis, term, 'ILI')
        print 'correlation for', term, pearsonr(trend, ilis)
evaluate_terms(['zebra', 'cough', 'headache', 'fever', 'sick'], week_counts, week_lines, week_dates, ilis)
correlation for zebra (0.11135875119317874, 0.51789856630330067)
correlation for cough (0.73992319956590169, 2.5123341693956533e-07)
correlation for headache (0.81506787372637202, 1.4331660744340132e-09)
correlation for fever (0.69213469769648095, 2.9244777232518385e-06)
correlation for sick (0.85208866376956682, 4.3486363046679577e-11)
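Rather than guessing terms by hand, we could also score every reasonably frequent word in the vocabulary by its correlation with ILI. A sketch (the helper and its min_count threshold are assumptions, not part of the original notebook):
def top_correlated_terms(week_counts, week_lines, ilis, n=10, min_count=500):
    # Rank terms by the Pearson correlation between their normalized weekly trend and ILI.
    totals = Counter()
    for c in week_counts:
        totals.update(c)
    scores = []
    for term, count in totals.items():
        if count < min_count:   # skip rare terms, whose trends are mostly noise
            continue
        trend = 1. * get_trend(week_counts, term) / week_lines
        scores.append((pearsonr(trend, ilis)[0], term))
    return sorted(scores, reverse=True)[:n]
print top_correlated_terms(week_counts, week_lines, ilis)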
# Fit a model on first N weeks, then predict on remaining.
from sklearn.linear_model import LinearRegression
import math
def train_test(trend, ilis, dates, train_size):
    reg = LinearRegression()
    x = [[i] for i in trend]
    reg.fit(x[:train_size], ilis[:train_size])
    print 'slope=', reg.coef_, 'intercept=', reg.intercept_
    train_preds = reg.predict(x[:train_size])
    test_preds = reg.predict(x[train_size:])
    all_preds = np.concatenate((train_preds, test_preds))
    plot_trends(dates, all_preds, ilis, 'flu', 'ILI')
    print 'correlation:', pearsonr(test_preds,
                                   ilis[train_size:])
    # Note: this is the square root of the mean squared error (i.e., the RMSE).
    print 'mse=', math.sqrt(np.mean((test_preds -
                                     ilis[train_size:])**2))
train_test(norm_flu_trend, ilis, week_dates, 10)
slope= [ 4875.46999025] intercept= 0.0111974820114
correlation: (0.88896023873430152, 1.3017011234610459e-09)
mse= 0.00536794341945
def train_test_multi(trends, ilis, dates, train_size):
    reg = LinearRegression()
    x = []
    for i in range(len(trends[0])):
        x.append([trends[j][i] for j in range(len(trends))])
    reg.fit(x[:train_size], ilis[:train_size])
    print 'coef=', reg.coef_, 'intercept=', reg.intercept_
    train_preds = reg.predict(x[:train_size])
    test_preds = reg.predict(x[train_size:])
    all_preds = np.concatenate((train_preds, test_preds))
    plot_trends(dates, all_preds, ilis, 'flu', 'ILI')
    corr = pearsonr(test_preds, ilis[train_size:])
    print 'correlation:', corr[0]
    return corr[0]
fever = 1. * get_trend(week_counts, 'fever') / week_lines
cough = 1. * get_trend(week_counts, 'cough') / week_lines
sick = 1. * get_trend(week_counts, 'sick') / week_lines
corr = train_test_multi([norm_flu_trend, fever, cough, sick],
ilis, week_dates, 15)
coef= [ 5251.82254491 -84.69640503 -1916.69957167 875.47184323] intercept= -0.0061411779179
correlation: 0.717677334484
Adding terms blindly will not necessarily improve held-out accuracy, so we need a principled way to choose which terms to include. Given a list of N potential terms to track, we want to search for the subset that results in the most accurate model.
Assume we have a training set Dt and a validation set Dv of tweets and CDC values.
A common iterative feature selection strategy is as follows:
Let F={} be the set of terms used in the regression model.
Let T={t1…tN} be the set of possible terms to use.
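The remaining steps are not spelled out above, but a minimal sketch of one such greedy forward-selection loop (under the assumptions that we add one term at a time, score each candidate by held-out correlation via train_test_multi, and stop when no candidate improves the score) might look like:
def greedy_select(candidates, week_counts, week_lines, ilis, week_dates, train_size, max_terms=5):
    trends = dict((t, 1. * get_trend(week_counts, t) / week_lines) for t in candidates)
    F = []                    # selected terms
    T = list(candidates)      # remaining candidate terms
    best_corr = -1.
    while T and len(F) < max_terms:
        scored = []
        for t in T:
            feats = [trends[f] for f in F + [t]]
            # Note: train_test_multi prints and plots for every candidate it scores.
            corr = train_test_multi(feats, ilis, week_dates, train_size)
            scored.append((corr, t))
        corr, best = max(scored)
        if corr <= best_corr:
            break             # no candidate improves the held-out correlation; stop
        best_corr = corr
        F.append(best)
        T.remove(best)
    return F, best_corr
print greedy_select(['flu', 'fever', 'cough', 'sick', 'headache'],
                    week_counts, week_lines, ilis, week_dates, train_size=15)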