import pandas as pd
import numpy as np
from datetime import datetime as dt
# Build a sample DataFrame that looks like what you will get back from a history() call.
df = pd.DataFrame(np.array([np.arange(10), np.arange(10)]).transpose(),
index=pd.date_range(start=pd.Timestamp('2014-06-17'), periods=10),
# In the actual history dataframe, columns are Security objects.
columns=['sid_1', 'sid_2'])
df
sid_1 | sid_2 | |
---|---|---|
2014-06-17 | 0 | 0 |
2014-06-18 | 1 | 1 |
2014-06-19 | 2 | 2 |
2014-06-20 | 3 | 3 |
2014-06-21 | 4 | 4 |
2014-06-22 | 5 | 5 |
2014-06-23 | 6 | 6 |
2014-06-24 | 7 | 7 |
2014-06-25 | 8 | 8 |
2014-06-26 | 9 | 9 |
# Get the mean of all prices in this window.
means = df.mean()
means
sid_1 4.5 sid_2 4.5 dtype: float64
# Get a rolling means window in which each cell contains
# the mean of the current day and the previous three days.
# Note that the first three windows have NaN (not a number),
# because they don't have a full 3 days yet.
rolling_means = pd.rolling_mean(df, window=4)
rolling_means
sid_1 | sid_2 | |
---|---|---|
2014-06-17 | NaN | NaN |
2014-06-18 | NaN | NaN |
2014-06-19 | NaN | NaN |
2014-06-20 | 1.5 | 1.5 |
2014-06-21 | 2.5 | 2.5 |
2014-06-22 | 3.5 | 3.5 |
2014-06-23 | 4.5 | 4.5 |
2014-06-24 | 5.5 | 5.5 |
2014-06-25 | 6.5 | 6.5 |
2014-06-26 | 7.5 | 7.5 |
# We can customize the minumum number of required observations by specifying min_periods.
rolling_means_less_nans = pd.rolling_mean(df, window=4, min_periods=2)
rolling_means_less_nans
sid_1 | sid_2 | |
---|---|---|
2014-06-17 | NaN | NaN |
2014-06-18 | 0.5 | 0.5 |
2014-06-19 | 1.0 | 1.0 |
2014-06-20 | 1.5 | 1.5 |
2014-06-21 | 2.5 | 2.5 |
2014-06-22 | 3.5 | 3.5 |
2014-06-23 | 4.5 | 4.5 |
2014-06-24 | 5.5 | 5.5 |
2014-06-25 | 6.5 | 6.5 |
2014-06-26 | 7.5 | 7.5 |
# Pandas has similar rolling functions for sum, stddev,
# and a variety of other statistical operations.
rolling_sums = pd.rolling_sum(df, window=2)
rolling_stddevs = pd.rolling_std(df, window=3)
# Combine both DataFrames by joining on the date index.
combined = rolling_sums.merge(rolling_stddevs, left_index=True, right_index=True, suffixes=("_sum", "_stddev"))
combined
sid_1_sum | sid_2_sum | sid_1_stddev | sid_2_stddev | |
---|---|---|---|---|
2014-06-17 | NaN | NaN | NaN | NaN |
2014-06-18 | 1 | 1 | NaN | NaN |
2014-06-19 | 3 | 3 | 1 | 1 |
2014-06-20 | 5 | 5 | 1 | 1 |
2014-06-21 | 7 | 7 | 1 | 1 |
2014-06-22 | 9 | 9 | 1 | 1 |
2014-06-23 | 11 | 11 | 1 | 1 |
2014-06-24 | 13 | 13 | 1 | 1 |
2014-06-25 | 15 | 15 | 1 | 1 |
2014-06-26 | 17 | 17 | 1 | 1 |
# Note that these outputs are still DataFrames, so we can do nice things like this:
# Get the first 5 elements of the frame.
combined.iloc[:5]
sid_1_sum | sid_2_sum | sid_1_stddev | sid_2_stddev | |
---|---|---|---|---|
2014-06-17 | NaN | NaN | NaN | NaN |
2014-06-18 | 1 | 1 | NaN | NaN |
2014-06-19 | 3 | 3 | 1 | 1 |
2014-06-20 | 5 | 5 | 1 | 1 |
2014-06-21 | 7 | 7 | 1 | 1 |
# Or this:
# Get all but the last element of the frame:
combined.iloc[:-1]
sid_1_sum | sid_2_sum | sid_1_stddev | sid_2_stddev | |
---|---|---|---|---|
2014-06-17 | NaN | NaN | NaN | NaN |
2014-06-18 | 1 | 1 | NaN | NaN |
2014-06-19 | 3 | 3 | 1 | 1 |
2014-06-20 | 5 | 5 | 1 | 1 |
2014-06-21 | 7 | 7 | 1 | 1 |
2014-06-22 | 9 | 9 | 1 | 1 |
2014-06-23 | 11 | 11 | 1 | 1 |
2014-06-24 | 13 | 13 | 1 | 1 |
2014-06-25 | 15 | 15 | 1 | 1 |
# Or this:
# Get every other entry in the DataFrame, in reverse.
combined.iloc[::-2]
sid_1_sum | sid_2_sum | sid_1_stddev | sid_2_stddev | |
---|---|---|---|---|
2014-06-26 | 17 | 17 | 1 | 1 |
2014-06-24 | 13 | 13 | 1 | 1 |
2014-06-22 | 9 | 9 | 1 | 1 |
2014-06-20 | 5 | 5 | 1 | 1 |
2014-06-18 | 1 | 1 | NaN | NaN |
# Or this:
# Get the entries for June 20th and June 21st.
combined.loc[[pd.Timestamp('2014-06-20'), pd.Timestamp('2014-06-21')]]
sid_1_sum | sid_2_sum | sid_1_stddev | sid_2_stddev | |
---|---|---|---|---|
2014-06-20 | 5 | 5 | 1 | 1 |
2014-06-21 | 7 | 7 | 1 | 1 |
# Or this:
# Get a specific column (this returns a Series)
combined['sid_2_sum']
2014-06-17 NaN 2014-06-18 1 2014-06-19 3 2014-06-20 5 2014-06-21 7 2014-06-22 9 2014-06-23 11 2014-06-24 13 2014-06-25 15 2014-06-26 17 Freq: D, Name: sid_2_sum, dtype: float64
# Or this:
# Get multiple specific columns (This returns a DataFrame)
combined[['sid_2_sum', 'sid_1_stddev']]
sid_2_sum | sid_1_stddev | |
---|---|---|
2014-06-17 | NaN | NaN |
2014-06-18 | 1 | NaN |
2014-06-19 | 3 | 1 |
2014-06-20 | 5 | 1 |
2014-06-21 | 7 | 1 |
2014-06-22 | 9 | 1 |
2014-06-23 | 11 | 1 |
2014-06-24 | 13 | 1 |
2014-06-25 | 15 | 1 |
2014-06-26 | 17 | 1 |