import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
# Load the search-trend table from trends.csv into a record array so that
# each column can be read as an attribute (e.g. trends.week_start).
# NOTE(review): csv2rec appears to have been removed from newer matplotlib
# releases -- confirm the installed version still provides it.
trends = plt.csv2rec('trends.csv')
# Show the column names and dtypes of the loaded table.
trends.dtype
dtype((numpy.record, [('week_start', 'O'), ('week_end', 'O'), ('textbooks', '<i8'), ('spring_break', '<i8'), ('kayak', '<i8'), ('skiing', '<i8'), ('global_warming', '<i8')]))
# Overview plot: weekly relative search volume of four seasonal search terms
# drawn on a shared axis, one labelled line per term.
f, ax = plt.subplots()
ax.plot(trends.week_start, trends.spring_break, label='spring break')
# Legend label previously read 'texbooks' (typo).
ax.plot(trends.week_start, trends.textbooks, label='textbooks')
ax.plot(trends.week_start, trends.skiing, label='skiing')
ax.plot(trends.week_start, trends.kayak, label='kayak')
ax.set_xlabel("Date")
ax.set_ylabel("Relative Search Volume")
# Kept as the final expression so the notebook cell still echoes the legend.
ax.legend(loc="upper center")
<matplotlib.legend.Legend at 0x10de90978>
II. Determine maxima and minima of each trend term
Create vectors of year and week numbers
# Build two arrays parallel to the week_start column: the calendar year and
# the ISO week number of every row.
dates = trends.week_start
# zeros_like keeps the dtype of the date column; each slot is overwritten below.
yrs = np.zeros_like(dates)
wks = np.zeros_like(dates)
# NOTE(review): the year is the calendar year while the week comes from
# isocalendar(); the two can disagree for dates near a year boundary -- confirm
# this pairing is intended.
for i, _ in enumerate(dates):
    yrs[i] = dates[i].year
    wks[i] = dates[i].isocalendar()[1]
Determine, for each year and for each of the five search terms (including "global warming"), the week in which search volume reached its peak and the week in which it reached its minimum. Can you spot any trends in any of the terms?
# For every search term, report for each year 2004-2013 the maximum and the
# minimum search volume together with the week number in which each occurred.
for name in trends.dtype.names:
    # The two date columns are not search terms.
    if name in ["week_start","week_end"]:
        continue
    print("\n",name,"\n","-"*50)
    trend = trends[name]
    for yr in range(2004,2014):
        # Restrict both the trend values and the week numbers to this year.
        in_year = yrs == yr
        trend_year = trend[in_year]
        wks_year = wks[in_year]
        # argmax/argmin return positions within the per-year subset, so the
        # week must be looked up in wks_year, not in the full-length wks array.
        print(yr, "max=",np.amax(trend_year),"@ week =",
              wks_year[np.argmax(trend_year)],
              "min=",np.amin(trend_year),"@ week =",
              wks_year[np.argmin(trend_year)])
textbooks -------------------------------------------------- 2004 max= 60 @ week = 34 min= 7 @ week = 47 2005 max= 48 @ week = 34 min= 7 @ week = 47 2006 max= 42 @ week = 34 min= 5 @ week = 47 2007 max= 32 @ week = 33 min= 6 @ week = 44 2008 max= 36 @ week = 34 min= 5 @ week = 47 2009 max= 39 @ week = 34 min= 6 @ week = 15 2010 max= 31 @ week = 34 min= 6 @ week = 44 2011 max= 24 @ week = 34 min= 5 @ week = 44 2012 max= 21 @ week = 34 min= 4 @ week = 47 2013 max= 18 @ week = 34 min= 3 @ week = 47 spring_break -------------------------------------------------- 2004 max= 60 @ week = 11 min= 10 @ week = 33 2005 max= 63 @ week = 12 min= 9 @ week = 24 2006 max= 46 @ week = 11 min= 7 @ week = 32 2007 max= 40 @ week = 10 min= 5 @ week = 22 2008 max= 38 @ week = 10 min= 5 @ week = 22 2009 max= 42 @ week = 10 min= 5 @ week = 25 2010 max= 39 @ week = 10 min= 5 @ week = 23 2011 max= 40 @ week = 10 min= 5 @ week = 27 2012 max= 43 @ week = 10 min= 5 @ week = 25 2013 max= 47 @ week = 11 min= 6 @ week = 21 kayak -------------------------------------------------- 2004 max= 52 @ week = 29 min= 18 @ week = 43 2005 max= 51 @ week = 30 min= 20 @ week = 43 2006 max= 55 @ week = 29 min= 23 @ week = 7 2007 max= 53 @ week = 28 min= 27 @ week = 2 2008 max= 57 @ week = 29 min= 29 @ week = 47 2009 max= 55 @ week = 28 min= 29 @ week = 41 2010 max= 58 @ week = 29 min= 33 @ week = 47 2011 max= 81 @ week = 30 min= 45 @ week = 11 2012 max= 100 @ week = 28 min= 49 @ week = 51 2013 max= 94 @ week = 28 min= 46 @ week = 47 skiing -------------------------------------------------- 2004 max= 93 @ week = 52 min= 14 @ week = 20 2005 max= 89 @ week = 1 min= 12 @ week = 22 2006 max= 86 @ week = 7 min= 11 @ week = 22 2007 max= 64 @ week = 52 min= 10 @ week = 17 2008 max= 53 @ week = 1 min= 8 @ week = 18 2009 max= 47 @ week = 1 min= 7 @ week = 18 2010 max= 64 @ week = 7 min= 6 @ week = 19 2011 max= 34 @ week = 1 min= 6 @ week = 17 2012 max= 32 @ week = 53 min= 5 @ week = 21 2013 max= 36 @ week = 52 min= 5 @ 
week = 16 global_warming -------------------------------------------------- 2004 max= 29 @ week = 46 min= 8 @ week = 28 2005 max= 27 @ week = 38 min= 8 @ week = 52 2006 max= 41 @ week = 49 min= 12 @ week = 33 2007 max= 75 @ week = 10 min= 18 @ week = 51 2008 max= 48 @ week = 16 min= 14 @ week = 51 2009 max= 50 @ week = 49 min= 13 @ week = 32 2010 max= 33 @ week = 1 min= 10 @ week = 51 2011 max= 20 @ week = 5 min= 9 @ week = 52 2012 max= 18 @ week = 9 min= 6 @ week = 52 2013 max= 13 @ week = 16 min= 5 @ week = 51
Which term has the largest scatter about its median value? Which term has the smallest scatter?
def std_median(datums):
    """Return the root-sum-square deviation of *datums* about its median.

    The squared deviations are summed, not averaged, so the result is not
    normalised by the number of samples.
    """
    deviations = datums - np.median(datums)
    return np.sqrt(np.sum(deviations ** 2))
# Print the scatter about the median for every search-term column; the two
# date columns are skipped because they hold no search volumes.
for name in trends.dtype.names:
    if name not in ("week_start", "week_end"):
        print("\n", name, "\n", "-" * 15)
        print(std_median(trends[name]))
textbooks --------------- 205.684223994 spring_break --------------- 310.621956726 kayak --------------- 462.606744439 skiing --------------- 449.831079406 global_warming --------------- 305.695927353
Determine the time lag, in units of weeks, that maximizes the cross-correlation between "skiing" and "spring break". Do this also for "skiing" and "global warming"
# Cross-correlate skiing against spring break over every possible shift.
result = np.correlate(trends.skiing,trends.spring_break, mode='full')
# In 'full' mode the first output sample corresponds to a lag of
# -(len(second input) - 1). Subtracting that offset gives whole-week lags with
# zero lag at the correct sample (result.size/2 was off by half a week).
gap = np.arange(result.size) - (len(trends.spring_break) - 1)
plt.plot(gap,result)
plt.xlim(-30,30)
# Lag, in weeks, at which the cross-correlation peaks.
print(gap[np.argmax(result)])
-6.5
Skiing precedes spring break by about 6.5 weeks.
# Cross-correlate skiing against global warming over every possible shift.
result = np.correlate(trends.skiing,trends.global_warming, mode='full')
# In 'full' mode the first output sample corresponds to a lag of
# -(len(second input) - 1). Subtracting that offset gives whole-week lags with
# zero lag at the correct sample (result.size/2 was off by half a week).
gap = np.arange(result.size) - (len(trends.global_warming) - 1)
plt.plot(gap,result)
# Lag, in weeks, at which the cross-correlation peaks.
print(gap[np.argmax(result)])
# Kept as the final expression so the notebook cell still echoes the limits.
plt.xlim(-90,60)
-61.5
(-90, 60)
Skiing precedes global warming by about 1 year and 10 weeks.