Josh Montague (@jrmontag)
This notebook includes the code for munging data and creating the presentation figures. Executing the code in this notebook requires these packages: numpy, pandas, statsmodels, matplotlib. There is also a static version of this notebook available on nbviewer, where you can see the output without needing a pydata stack.
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import json
# prettier graphs, thanks to @Cmrn_DP
# Read the style overrides from bmh_matplotlibrc.json. The context manager
# closes the file handle as soon as the JSON has been parsed instead of
# leaving it open until garbage collection.
with open("./bmh_matplotlibrc.json") as rc_file:
    s = json.load(rc_file)
# apply the loaded overrides, then switch on automatic figure layout
matplotlib.rcParams.update(s)
matplotlib.rcParams.update({'figure.autolayout': True})
# Load the Twitter counts. names= is passed without header=, so pandas
# reads every line of the file as data and labels the columns for us.
twitter_columns = ["publisher", "datetime", "count"]
tw = pd.read_csv("./twitter.csv", sep=",", names=twitter_columns)
# the data is a 20% sample, so scale the counts up by 5
tw["count"] = tw["count"] * 5
# preview the first rows
tw.head()
publisher | datetime | count | |
---|---|---|---|
0 | 2014-01-28 12:00:00 | 1644105 | |
1 | 2014-01-28 12:05:00 | 1660765 | |
2 | 2014-01-28 12:10:00 | 1716535 | |
3 | 2014-01-28 12:15:00 | 1727800 | |
4 | 2014-01-28 12:20:00 | 1689110 |
# Only the timestamp and the count are needed downstream; keep those two
# columns and rename the count column after its source.
tw = tw[["datetime", "count"]].rename(columns={"count": "twitter"})
# Load the Tumblr counts. This time usecols selects just the second and
# third CSV fields at read time, and names labels them directly, so no
# column clean-up is needed afterwards.
tu = pd.read_csv(
    "./tumblr.csv",
    sep=",",
    usecols=[1, 2],
    names=["datetime", "tumblr"],
)
# same 20% sampling as the Twitter data, so apply the same x5 scaling
tu["tumblr"] = tu["tumblr"] * 5
# preview the first rows
tu.head()
| | datetime | tumblr |
---|---|---|
0 | 2014-01-28 12:00:00 | 416630 |
1 | 2014-01-28 12:05:00 | 383835 |
2 | 2014-01-28 12:10:00 | 399445 |
3 | 2014-01-28 12:15:00 | 389190 |
4 | 2014-01-28 12:20:00 | 399800 |
# Combine the two dataframes on their shared timestamp column. The outer
# join keeps a row even when only one of the two sources has that timestamp.
both = pd.merge(tw, tu, how="outer", on="datetime")
# convert the timestamp strings to datetimes and use them as the index
both["datetime"] = pd.to_datetime(both["datetime"], format="%Y-%m-%d %H:%M:%S")
both = both.set_index("datetime")
# optionally, plot the data with hour buckets.
# NOTE: the old resample(rule=..., how="sum") spelling was deprecated in
# pandas 0.18 and later removed; resample() now returns a Resampler object
# and the aggregation is an explicit method call on it.
both_hr = both.resample("60Min").sum()
#both.head()
both_hr.head()
| | twitter | tumblr |
---|---|---|
datetime | ||
2014-01-28 12:00:00 | 19094940 | 4933735 |
2014-01-28 13:00:00 | 23684955 | 5669370 |
2014-01-28 14:00:00 | 23095720 | 6479420 |
2014-01-28 15:00:00 | 25310145 | 7421830 |
2014-01-28 16:00:00 | 23610840 | 8310870 |
%pylab inline
# Plot both series through the pandas wrapper for matplotlib, restricting
# the x-axis to a fixed window. Swap `both` for `both_hr` below to plot the
# hourly buckets instead.
timestamp_format = "%Y-%m-%d %H:%M:%S"
window_start = pd.to_datetime("2014-01-29 00:00:00", format=timestamp_format)
window_end = pd.to_datetime("2014-02-02 00:00:00", format=timestamp_format)
ax = both.plot(
    style="-",
    markersize=10,
    alpha=0.75,
    xlim=[window_start, window_end],
    legend=True,
    label="firehose activities",
    figsize=(12, 5),
)
# ... the remaining parameters still have to be set through pyplot
plt.legend(loc="best")
plt.ylabel("firehose activities / 5-min")
plt.xlabel("Datetime (UTC)")
# overwrite the presentation figure so the slides stay up to date
plt.savefig("../presentation/images/tw-tu_grapherator.png")
Populating the interactive namespace from numpy and matplotlib
# Load the storm ("janus") counts; names= labels the two CSV fields.
janus_columns = ["datetime", "count"]
janus = pd.read_csv("./janus.csv", sep=",", names=janus_columns)
# look at the last few rows
janus.tail()
| | datetime | count |
---|---|---|
5276 | 2014-01-23T11:56:00 | 3 |
5277 | 2014-01-23T11:57:00 | 0 |
5278 | 2014-01-23T11:58:00 | 0 |
5279 | 2014-01-23T11:59:00 | 4 |
5280 | 2014-01-23T12:00:00 | 3 |
# lowess = Locally Weighted Scatterplot Smoothing
from statsmodels.nonparametric.smoothers_lowess import lowess
# The `frac` kwarg sets the fraction of the data used in each local
# regression; experiment with it to vary the smoothing. The fit is made
# against the dataframe's row index rather than the timestamps -- the use
# of .index is discussed below.
smooth_fraction = 0.15
lowess_line = lowess(janus["count"], janus.index, frac=smooth_fraction)
# the result is a numpy array; display it from row 10 onward
lowess_line[10:]
array([[ 1.00000000e+01, 4.79654443e-01], [ 1.10000000e+01, 4.79721823e-01], [ 1.20000000e+01, 4.79789741e-01], ..., [ 5.27800000e+03, 1.19832803e+00], [ 5.27900000e+03, 1.19781658e+00], [ 5.28000000e+03, 1.19730998e+00]])
%pylab inline
# start with the data plot; the lowess smooth is drawn on top further down
ax = janus.plot(
    style="-",
    markersize=1,
    linewidth=1,
    alpha=0.75,
    figsize=(12, 6),
    ylim=[-1, 50],
    xlim=[0, 5200],
    legend=False,
    label="'janus' activities (per min)",
)
# apparently the 'label' kwarg above doesn't do what I'd like,
# so reach all the way in to the axis object and set it
ax.set_ylabel('"janus" activities / min')
# this is a little sneaky. I wasn't able to get the lowess smooth
# to play nice with datetime objects (or a pandas datetimeindex)
# so I let it fit with the monotonic index (which you can also
# use to plot). we then swap the integer indexes shown on the major
# ticks for the corresponding datetime string in that row.
#
# The tick positions are pinned explicitly before relabelling, so each
# label is guaranteed to sit on the row it was looked up from instead of
# relying on wherever matplotlib's automatic locator happens to put ticks.
tick_positions = list(range(0, 6000, 1000))
# .ix was deprecated in pandas 0.20 and removed in 1.0; with the default
# integer index, label-based .loc performs the same lookup.
sneaky_labels = [janus.loc[i, "datetime"] for i in tick_positions]
ax.set_xticks(tick_positions)
ax.set_xticklabels(sneaky_labels)
# overlay the smooth: column 0 holds the x values, column 1 the fitted values
plt.plot(
    lowess_line[:, 0],
    lowess_line[:, 1],
    "-",
    lw=3,
    alpha=0.75,
    color="black",
    label="Lowess smooth",
)
plt.legend(loc="upper left")
# update the presentation figure
plt.savefig("../presentation/images/janus-smooth.png")
Populating the interactive namespace from numpy and matplotlib