import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import json

# prettier graphs, thanks to @Cmrn_DP
# Load the matplotlib rc overrides from JSON. The context manager closes the
# file handle as soon as parsing is done instead of leaking it.
with open("./bmh_matplotlibrc.json") as rc_file:
    s = json.load(rc_file)
matplotlib.rcParams.update(s)
# Applied after the JSON settings so autolayout is on regardless of what
# the JSON file contains.
matplotlib.rcParams.update({'figure.autolayout': True})

# Load the twitter counts. `names=` is given without `header=`, so pandas
# treats the first row of the file as data and uses these column names.
tw = pd.read_csv(
    './twitter.csv',
    sep=',',
    names=["publisher", "datetime", "count"],
)
# data is sampled @ 20%, so multiply by 5 to scale the counts back up
tw["count"] = tw["count"] * 5
# have a look (displayed when run as a notebook cell)
tw.head()

# Only the timestamp and the count are needed from here on; name the count
# column after its source so the two feeds stay distinguishable later.
tw = tw[["datetime", "count"]].rename(columns={"count": "twitter"})

#tw.head()

# Load the tumblr counts. This time `usecols` picks columns 1 and 2 at read
# time and `names` gives them their final labels, so no reshaping is needed.
tu = pd.read_csv(
    './tumblr.csv',
    sep=',',
    usecols=[1, 2],
    names=["datetime", "tumblr"],
)
# same 20% sampling as the twitter feed, so scale by 5 here too
tu["tumblr"] = tu["tumblr"] * 5
tu.head()

# combine the dataframes, convert the datetimes, and create a datetimeindex
# The outer join keeps every timestamp that appears in either feed; a
# timestamp missing from one feed gets NaN in that feed's column.
both = pd.merge(tw, tu, how="outer", on="datetime")
both["datetime"] = pd.to_datetime(both["datetime"], format="%Y-%m-%d %H:%M:%S")
both = both.set_index("datetime")

# optionally, plot the data with hour buckets
# The `how=` keyword of resample() was deprecated in pandas 0.18 and later
# removed; calling the aggregation on the resampler object is the supported
# spelling and yields the same hourly sums.
both_hr = both.resample(rule="60Min").sum()

#both.head()
both_hr.head()

%pylab inline

# Plot through the pandas wrapper around matplotlib. Swap `both` for
# `both_hr` below to draw the hourly buckets instead.
window_start = pd.to_datetime("2014-01-29 00:00:00", format='%Y-%m-%d %H:%M:%S')
window_end = pd.to_datetime("2014-02-02 00:00:00", format='%Y-%m-%d %H:%M:%S')

# other options that have been tried here: ylim=[0, 2.6e6], subplots=True
ax = both.plot(
    style='-',
    markersize=10,
    alpha=0.75,
    xlim=[window_start, window_end],
    legend=True,
    label="firehose activities",
    figsize=(12, 5),
)

# ... some settings still have to go through pyplot directly
plt.legend(loc='best')
plt.xlabel('Datetime (UTC)')
plt.ylabel('firehose activities / 5-min')

# overwrite the presentation figure so the slides pick up the new plot
# (pad_inches=0.5 was tried here as well)
plt.savefig('../presentation/images/tw-tu_grapherator.png')

# Load the storm ("janus") data. No index column is requested, so the frame
# is read with two named columns and whatever index pandas assigns.
janus = pd.read_csv(
    './janus.csv',
    sep=',',
    names=["datetime", "count"],
)

# peek at the end of the series
janus.tail()

# lowess = Locally Weighted Scatterplot Smoothing
from statsmodels.nonparametric.smoothers_lowess import lowess

# `frac` sets the fraction of the data used in each local regression;
#    vary it to change how smooth the fitted line is. the x values are
#    the frame's .index rather than the datetime column (the reason is
#    explained where the line is plotted).
lowess_line = lowess(
    endog=janus["count"],
    exog=janus.index,
    frac=0.15,
)

# the result is a numpy array; column 0 and column 1 are used as the
#    x and y values when the smooth is drawn
lowess_line[10:]

%pylab inline

# start with the data plot
ax = janus.plot(
    style='-',
    markersize=1,
    linewidth=1,
    alpha=0.75,
    figsize=(12, 6),
    ylim=[-1, 50],
    xlim=[0, 5200],
    legend=False,
    label="'janus' activities (per min)",
)

# apparently the 'label' kwarg above doesn't do what I'd like,
#    so reach all the way in to the axis object and set it
ax.set_ylabel('"janus" activities / min')

# this is a little sneaky. I wasn't able to get the lowess smooth
#    to play nice with datetime objects (or a pandas datetimeindex)
#    so I let it fit with the monotonic index (which you can also
#    use to plot). the major ticks are then relabelled by swapping
#    each integer index for the datetime string in that row.
#
# The tick positions are pinned explicitly before relabelling: relying on
#    matplotlib's automatic locator to land on exactly these values would
#    silently mislabel the axis if it ever chose different ticks.
# `.ix` was removed in pandas 1.0; on this integer index it did a label
#    lookup, which `.loc` reproduces exactly.
tick_positions = list(range(0, 6000, 1000))
sneaky_labels = [janus.loc[i, "datetime"] for i in tick_positions]
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels=sneaky_labels)

# overlay the smooth: column 0 holds the x values, column 1 the fitted y
plt.plot(lowess_line[:, 0], lowess_line[:, 1]
         , '-'
         , lw=3
         , alpha=0.75
         , color='black'
         , label="Lowess smooth"
         )

plt.legend(loc="upper left")

# update the presentation figure
plt.savefig('../presentation/images/janus-smooth.png'
            #, pad_inches=0.5
            )