import json

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

# Timestamp layout used by the CSV exports and by the plot window below.
TS_FORMAT = "%Y-%m-%d %H:%M:%S"

# prettier graphs, thanks to @Cmrn_DP
# (use a context manager so the rc file handle is closed deterministically)
with open("./bmh_matplotlibrc.json", encoding="utf-8") as rc_file:
    s = json.load(rc_file)
matplotlib.rcParams.update(s)
matplotlib.rcParams.update({'figure.autolayout': True})

# read in twitter data
tw = pd.read_csv(
    './twitter.csv',
    sep=',',
    names=["publisher", "datetime", "count"],
)
# data is sampled @ 20%, so scale the counts back up
tw["count"] *= 5
# have a look
tw.head()

# we actually only need two columns, and update the names
tw = tw[["datetime", "count"]]
tw.columns = ["datetime", "twitter"]

# read in tumblr data, smarter this time via the usecols & names kwargs
tu = pd.read_csv(
    './tumblr.csv',
    sep=',',
    usecols=[1, 2],
    names=["datetime", "tumblr"],
)
# same sampling
tu["tumblr"] *= 5
tu.head()

# combine the dataframes, convert the datetimes, and create a datetimeindex
both = pd.merge(tw, tu, how="outer", on="datetime")
both["datetime"] = pd.to_datetime(both["datetime"], format=TS_FORMAT)
both = both.set_index("datetime")

# optionally, plot the data with hour buckets.
# The old `resample(rule=..., how="sum")` spelling is no longer accepted by
# pandas; resample() now returns a Resampler that is aggregated explicitly.
both_hr = both.resample("60min").sum()
both_hr.head()

# `%pylab inline` is an IPython magic, not Python syntax (and it star-imports
# numpy/pyplot into the namespace). Enable inline figures only when an
# IPython shell is actually present, so this file also runs as a script.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # plain `python` run: the figure is only written by savefig

# use the pandas wrapper for matplotlib
# (swap `both` for `both_hr` to plot the hourly buckets instead; other knobs
#  that were tried here: ylim=[0, 2.6e6], subplots=True)
ax = both.plot(
    style='-',
    markersize=10,
    alpha=0.75,
    xlim=[
        pd.to_datetime("2014-01-29 00:00:00", format=TS_FORMAT),
        pd.to_datetime("2014-02-02 00:00:00", format=TS_FORMAT),
    ],
    legend=True,
    label="firehose activities",
    figsize=(12, 5),
)
# ...
# but still have to set some params via mpl.plt
plt.legend(loc='best')
plt.ylabel('firehose activities / 5-min')
plt.xlabel('Datetime (UTC)')

# update the presentation figure automatically
# (pad_inches=0.5 was tried here and left disabled)
plt.savefig('../presentation/images/tw-tu_grapherator.png')

# read in storm data
janus = pd.read_csv(
    './janus.csv',
    sep=',',
    names=["datetime", "count"],
)
janus.tail()

# lowess = Locally Weighted Scatterplot Smoothing
from statsmodels.nonparametric.smoothers_lowess import lowess

# you can experiment with the 'fraction' kwarg to vary the
# amount of data used in each local regression. the use
# of .index is discussed below
lowess_line = lowess(janus["count"], janus.index, frac=0.15)
# returns a numpy array; column 0 is used as x and column 1 as the
# smoothed y when it is plotted further down
lowess_line[10:]

# `%pylab inline` is an IPython magic, not Python syntax. Enable inline
# figures only when an IPython shell is actually present, so this file
# also runs as a plain script.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # plain `python` run: the figure is only written by savefig

# start with the data plot
ax = janus.plot(
    style='-',
    markersize=1,
    linewidth=1,
    alpha=0.75,
    figsize=(12, 6),
    ylim=[-1, 50],
    xlim=[0, 5200],
    legend=False,
    label="'janus' activities (per min)",
)

# apparently the 'label' kwarg above doesn't do what I'd like,
# so reach all the way in to the axis object and set it
ax.set_ylabel('"janus" activities / min')

# this is a little sneaky. I wasn't able to get the lowess smooth
# to play nice with datetime objects (or a pandas datetimeindex)
# so I let it fit with the monotonic index (which you can also
# use to plot). we then swap the integer indexes shown on the major
# ticks for the corresponding datetime string in that row.
#
# Pin the tick positions explicitly before relabelling them: otherwise the
# labels are attached to whatever ticks the auto-locator happens to return,
# which is not guaranteed to be exactly these six positions.
tick_positions = list(range(0, 6000, 1000))
# `.ix` was removed from pandas; `.loc` is the label-based equivalent on the
# default integer index that read_csv produces here.
sneaky_labels = [janus.loc[i, "datetime"] for i in tick_positions]
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels=sneaky_labels)

plt.plot(
    lowess_line[:, 0],
    lowess_line[:, 1],
    '-',
    lw=3,
    alpha=0.75,
    color='black',
    label="Lowess smooth",
)
plt.legend(loc="upper left")

# update the presentation figure
# (pad_inches=0.5 was tried here and left disabled)
plt.savefig('../presentation/images/janus-smooth.png')