from ggplot import ggplot
import ggplot as gg
from IPython.html.widgets import interact
import matplotlib.pyplot as plt
import pandas as pd
import qgrid
import seaborn as sns

# Equivalent of the `%matplotlib inline` notebook magic, written as a plain
# Python call so the file still parses as Python. `get_ipython` is only
# available inside an IPython/Jupyter session.
get_ipython().run_line_magic('matplotlib', 'inline')
qgrid.nbinstall()

# Pull in the CSV, drop NAs.
# header=7: the column names are read from row index 7 of the file.
df = pd.read_csv('mthood_snotel.csv', header=7, parse_dates=['Date']).dropna()
qgrid.show_grid(df, remote_js=True)

# Let's start with some basic histograms of our key dimensions
sns.set_context(rc={"figure.figsize": (15, 7)})
sns.distplot(df['Precipitation Accumulation (in)'], bins=50)
sns.distplot(df['Snow Water Equivalent (in)'], bins=100)

# Trailing semicolons keep the notebook from echoing the returned object.
sns.kdeplot(df['Air Temperature Maximum (degF)'], shade=True);
sns.kdeplot(df['Air Temperature Minimum (degF)'], shade=True);
sns.kdeplot(df['Air Temperature Average (degF)'], shade=True);

# We can use Seaborn + IPython interact widgets to do quick comparison of dimensions
subset = df.drop(['Date'], axis=1)
dims = subset.columns.tolist()


@interact
def linear_comp(x=dims, y=dims):
    """Draw a seaborn joint plot of column ``x`` against column ``y``.

    Both arguments are column names of ``subset``; ``interact`` offers the
    entries of ``dims`` as the selectable values for each of them.
    """
    sns.jointplot(x, y, data=subset, size=9)


# How closely do minimum and maximum temps follow one another?
sns.lmplot("Air Temperature Minimum (degF)", "Air Temperature Maximum (degF)", df, size=10)

# Now to use some Pandas timeseries magic to look at monthly trends
# First we need to set the Date column as the Index
indexed = df.set_index('Date')

# Month-start ('MS') bins, averaged. Older pandas returned the per-bin mean
# straight from resample(); pandas >= 0.18 returns a Resampler object that
# must be aggregated explicitly. Handle both so the result is always a frame.
monthly = indexed.resample('MS')
if not isinstance(monthly, pd.DataFrame):
    monthly = monthly.mean()
resampled = monthly.dropna()
qgrid.show_grid(resampled, remote_js=True)

# Exploratory: Pandas plotting should let us take a nice quick look at the data
# Going to use Seaborn to set our plot context
sns.set_context(rc={"figure.figsize": (18, 9)})
resampled.plot()

# ggplot is quite good at handling timeseries. Let's use it to look at long-term trends
# ggplot aesthetics refer to columns by name, so expose the index as a column.
resampled['Date'] = resampled.index
(ggplot(gg.aes(x='Date', y='Snow Water Equivalent (in)'), data=resampled)
 + gg.geom_line()
 + gg.stat_smooth())

# What about temperatures?
(ggplot(gg.aes(x='Date', y='Air Temperature Average (degF)'), data=resampled)
 + gg.geom_line()
 + gg.stat_smooth())

# I want to look at monthly statistics, so need to create a column that's just months
resampled['Month'] = resampled.index.month
monthly_grouped = resampled.groupby('Month').mean()

# Matplotlib now has context managers to set styles. Let's try the bmh style
# NOTE(review): the original indentation was lost; both statements are assumed
# to belong inside the style context so the plot is drawn with 'bmh' applied.
with plt.style.context('bmh'):
    sns.set_context(rc={"figure.figsize": (18, 9)})
    monthly_grouped.plot()

res_dims = resampled.columns.tolist()


@interact
def res_comp(x=res_dims, y=res_dims):
    """Draw a seaborn joint plot of column ``x`` against column ``y``.

    Both arguments are column names of ``resampled``; ``interact`` offers the
    entries of ``res_dims`` as the selectable values for each of them.
    """
    sns.jointplot(x, y, data=resampled, size=9)


qgrid.show_grid(monthly_grouped)

# Back to ggplot
# As with 'Date' earlier, ggplot needs the month as a regular column.
monthly_grouped['Month'] = monthly_grouped.index
ggplot(gg.aes(x='Month', y='Snow Water Equivalent (in)'), data=monthly_grouped) + gg.geom_line()

# Let's do some faceting to look at some monthly statistics
(ggplot(gg.aes(x='Air Temperature Average (degF)'), data=resampled)
 + gg.geom_density(alpha=0.25)
 + gg.facet_wrap('Month')
 + gg.labs("Air Temperature Average (degF)", "Freq"))

(ggplot(gg.aes(x='Snow Water Equivalent (in)'), data=resampled)
 + gg.geom_density(alpha=0.25)
 + gg.facet_wrap('Month')
 + gg.labs("Snow Water Equivalent (in)", "Freq"))

# Seaborn also has very powerful faceting mechanisms.
# Let's look at the monthly average temperatures
# again, but in a FacetGrid
months = resampled['Month'].unique()
months.sort()  # in-place sort so rows and hues appear in calendar order
months  # bare expression: echoes the array when run as a notebook cell

g = sns.FacetGrid(resampled, row="Month", hue="Month", palette="deep",
                  size=1.8, aspect=4, hue_order=months, row_order=months)
# Trailing semicolon keeps the notebook from echoing the returned object.
g.map(sns.distplot, 'Air Temperature Average (degF)');

pair_cols = resampled[['Snow Water Equivalent (in)',
                       'Precipitation Accumulation (in)',
                       'Air Temperature Average (degF)',
                       'Month']].reset_index(drop=True)
pair_cols.head()

pair = sns.PairGrid(pair_cols, hue="Month", palette="GnBu_d")
pair.map(plt.scatter)
pair.add_legend()

from IPython.core.display import HTML

# Use the following path if running locally: "styles/custom.css"
# This is for nbviewer:
# Read the stylesheet inside a context manager so the file handle is closed
# as soon as its contents have been loaded.
with open("custom.css", "r") as css_file:
    styles = css_file.read()
HTML(styles)