Working with time-series data is an important part of data analysis.
Starting with v0.8, the pandas library has included a rich API for time-series manipulations.
The pandas time-series API includes:
# Environment setup for the tutorial: record interpreter/library versions
# and configure the default figure size.
from datetime import datetime, date, time
import sys
sys.version
# numpy was missing from the original transcript although np.__version__
# (below) and randn (later cells) are used — import it here.
import numpy as np
from numpy.random import randn
import pandas as pd
from pandas import Series, DataFrame, Panel
pd.__version__
np.__version__
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('figure', figsize=(10, 8))
mpl.__version__
Sample trade ticks from 2011-11-01 to 2011-11-03 for a single security
# Peek at the raw CSV: the first line is the header row, the second is the
# first data row.  Parenthesized print is valid in both Python 2 (single
# argument) and Python 3; the original used Py2-only print statements and
# had lost the block indentation.
with open('data.csv', 'r') as fh:
    print(fh.readline())  # headers
    print(fh.readline())  # first row
parse_dates: use a list or dict for flexible (possibly multi-column) date parsing
# Load the tick file, merging the separate 'Date' and 'Time' columns into a
# single parsed 'Timestamp' column and using it as the index.
data = pd.read_csv('data.csv',
parse_dates={'Timestamp': ['Date', 'Time']},
index_col='Timestamp')
data
# Keep only the Price and Volume columns.  .ix is the pandas-0.x indexer
# (later versions spell label-based indexing .loc).
ticks = data.ix[:, ['Price', 'Volume']]
ticks.head()
resample: regularization and frequency conversion
bars = ticks.Price.resample('1min', how='ohlc')
bars
# Intrabar trading range: high minus low within each 1-minute bar.
minute_range = bars.high - bars.low
minute_range.describe()
# Intrabar return: close relative to open within each bar.
minute_return = bars.close / bars.open - 1
minute_return.describe()
Compute a VWAP using resample
# Total traded volume per minute ('how' is the pandas-0.x aggregation arg).
volume = ticks.Volume.resample('1min', how='sum')
# Row-wise product Price * Volume per tick, summed per minute.
value = ticks.prod(axis=1).resample('1min', how='sum')
# Volume-weighted average price for each minute.
vwap = value / volume
# Label-based slicing on a DatetimeIndex is end-inclusive.
vwap.ix['2011-11-01 09:27':'2011-11-01 09:32']
at_time: same (b)at_time (same bat channel)
bars.open.at_time('9:30')
bars.close.at_time('16:00')
between_time: intraday time range
filtered = vwap.between_time('10:00', '16:00')
filtered.head(20)
# Restrict volume to the same intraday session window.
vol = volume.between_time('10:00', '16:00')
vol.head(20)
fillna: handling missing data
filtered.ix['2011-11-03':'2011-11-04'].head(20)
# Forward-fill gaps in the VWAP, but carry each value at most 1 step.
filled = filtered.fillna(method='pad', limit=1)
filled.ix['2011-11-03':'2011-11-04'].head(20)
# Minutes with no trades genuinely had zero volume, so fill with 0.
vol = vol.fillna(0.)
vol.head(20)
# Compare filled vs. raw VWAP over the same two sessions.
filled.ix['2011-11-03':'2011-11-04'].plot()
plt.ylim(103.5, 104.5)
vwap.ix['2011-11-03':'2011-11-04'].plot()
plt.ylim(103.5, 104.5)
# Overlay volume on a secondary y-axis in red.
vol.ix['2011-11-03':'2011-11-04'].plot(secondary_y=True, style='r')
ticks.head()
shift: realigns values
# shift(1) moves the values down one row against a fixed index,
# introducing a NaN in the first row.
ticks.shift(1).head()
# shift(-1) moves the values up one row.
ticks.shift(-1).head()
tshift: manipulates index values
# tshift moves the index itself forward by 1 minute; the values are untouched.
ticks.tshift(1, 'min').head()
minute_return.head()
# Session-only minute returns.
mr = minute_return.between_time('9:30', '16:00')
mr.head()
# Naive lag via shift(1).
lagged = mr.shift(1)
lagged.head()
We shouldn't use shift here because:
# ...shift leaks across day boundaries: 9:30 picks up the previous day's
# 16:00 value instead of being missing.
lagged.at_time('9:30')
mr.at_time('16:00')
# Correct lag: move the index by 1 minute first, then window — so each
# day's 9:30 has no (spurious) lagged observation.
lagged = minute_return.tshift(1, 'min').between_time('9:30', '16:00')
lagged.at_time('9:30')
Let's play
# Regress minute returns on their own 1-minute lag (pd.ols is the
# pandas-0.x OLS API, removed in later versions).
pd.ols(y=mr, x=lagged)
# Redefine returns off the VWAP rather than bar close/open.
mr = vwap / bars.open - 1
mr = mr.between_time('9:30', '16:00')
lagged = mr.tshift(1, 'min').between_time('9:30', '16:00')
pd.ols(y=mr, x=lagged)
# Interaction term: return times volume.
inter = mr * vol
inter = inter.between_time('9:30', '16:00')
lagged_inter = inter.tshift(1, 'min').between_time('9:30', '16:00')
pd.ols(y=mr, x=lagged_inter)
Convert to percentage volume
# Normalize volume within each calendar day to its fraction of that
# day's total.
vol = vol.groupby(vol.index.day).transform(lambda x: x/x.sum())
vol.head()
Verify
# Each day should now sum to 1.
vol.resample('D', how='sum')
# Re-run the interaction regression with the normalized volume.
inter = mr * vol
inter = inter.between_time('9:30', '16:00')
lagged_inter = inter.tshift(1, 'min').between_time('9:30', '16:00')
pd.ols(y=mr, x=lagged_inter)
Vivaldi FTW
# Average (normalized) volume by hour of day.
hour = vol.index.hour
hourly_volume = vol.groupby(hour).mean()
hourly_volume.plot(kind='bar')
Expanding window of hourly means for volume
# Hourly-frequency view of the volume series.
hourly = vol.resample('H')
def calc_mean(hr):
    """Expanding (cumulative) mean of the module-level `hourly` volume
    series, taken at the single wall-clock hour `hr` across days."""
    stamp = time(hour=hr)
    subset = hourly.at_time(stamp)
    return pd.expanding_mean(subset)
# Expanding hourly means for each trading hour 10:00-15:00, interleaved
# back into one chronologically sorted series.
df = pd.concat([calc_mean(hr) for hr in range(10, 16)])
df = df.sort_index()
df
Compute deviations from the hourly means
clean_vol = vol.between_time('10:00', '15:59')
# Pad-reindex aligns each minute with the latest available hourly mean.
dev = clean_vol - df.reindex(clean_vol.index, method='pad') # be careful over day boundaries
dev
# Regress returns on the lagged return-times-deviation interaction.
inter = mr * dev
inter = inter.between_time('10:00', '15:59')
pd.ols(y=mr, x=inter.tshift(1, 'min'))
pd.date_range
# Month-end timestamps between two endpoints...
rng = pd.date_range('2005', '2012', freq='M')
rng
# ...or a fixed number of periods anchored at the start...
pd.date_range('2005', periods=7*12, freq='M')
# ...or anchored at the end.
pd.date_range(end='2012', periods=7*12, freq='M')
Name | Description |
D | Calendar day |
B | Business day |
M | Calendar end of month |
MS | Calendar start of month |
BM | Business end of month |
BMS | Business start of month |
W-{MON, TUE,...} | Week ending on Monday, Tuesday, ... |
Q-{JAN, FEB,...} | Quarter end with year ending January, February... |
QS-{JAN, FEB,...} | Quarter start with year ending January, February... |
BQ-{JAN, FEB,...} | Business quarter end with year ending January, February... |
BQS-{JAN, FEB,...} | Business quarter start with year ending January, February... |
A-{JAN, FEB, ...} | Year end, anchored to the given month (default December) |
AS-{JAN, FEB, ...} | Year start, anchored to the given month (default December) |
BA-{JAN, FEB, ...} | Business year end, anchored to the given month (default December) |
BAS-{JAN, FEB, ...} | Business year start, anchored to the given month (default December) |
H | Hour |
T | Minute |
s | Second |
L, ms | Millisecond |
U | Microsecond |
Anchored offsets
# Default quarterly anchor vs. a quarter cycle ending in November.
pd.date_range('2005', periods=4, freq='Q')
pd.date_range('2005', periods=4, freq='Q-NOV')
Week anchor indicates end of week
wkrng = pd.date_range('2012-10-25', periods=3, freq='W')
wkrng
# dayofweek of the first generated date shows which weekday 'W' anchors on.
wkrng[0].dayofweek
Year anchor indicates year ending month
pd.date_range('2005', periods=3, freq='A-JUN')
DatetimeIndex
is a subclass of Index
# DatetimeIndex supports everything a plain Index does, e.g. slicing.
isinstance(rng, pd.Index)
rng[2:4]
Use it for Series/DataFrame
labelling
# randn here is numpy.random.randn — not imported earlier in this
# transcript; it must come from numpy.random.
s = Series(randn(len(rng)), rng)
s.head()
df = DataFrame(randn(len(rng), 3), rng, ['X', 'Y', 'Z'])
df.head()
# Label-based datetime slicing includes both endpoints.
s[datetime(2005, 1, 31) : datetime(2006, 12, 31)] #slice end inclusive
df['2005-1-31':'2006-12-31']
Partial indexing
# A bare year string selects the whole year.
s['2005':'2006']
positional indexing still works
df[:2] # slice end exclusive
Elements boxed as Timestamp
(subclass of datetime.datetime
)
# Individual elements come out boxed as Timestamp, a datetime subclass.
elm = rng[0]
elm
isinstance(elm, datetime)
Why do we need this subclass?
# Timestamp carries sub-microsecond precision that plain datetime lacks.
elm.nanosecond
Implemented internally using numpy.datetime64 (dtype='M8[ns]')
# The underlying storage is a numpy array of 64-bit nanosecond stamps.
val = rng.values
type(val)
val.dtype
Upgrade Numpy to 1.7b to fix repr issue
val[0]
Or use DatetimeIndex.asobject for workaround
# Object-dtype view boxes every element as a Timestamp.
rng.asobject.values[0]
rng.asobject
# Conversion to plain datetime.datetime objects.
rng.to_pydatetime()
rng.to_pydatetime()[0]
Integer representation
# asi8 exposes the raw int64 nanosecond representation.
type(rng.asi8)
rng.asi8.dtype
rng.asi8[0]
# Downsampling from the index's native monthly frequency to 30 days.
s.index.freqstr
s.resample('30D').head(10)
s.resample('30D', fill_method='ffill').head(10)
Upsampling
# Upsampling monthly data to weekly leaves gaps unless a fill method
# is given.
s.ix[:3].resample('W')
s.ix[:3].resample('W', fill_method='ffill')
asfreq
# asfreq picks the value at each new label; resample(..., 'last')
# aggregates to the same effect here.
s.asfreq('Q').head()
s.resample('Q', 'last').head()
closed: 'left' or 'right' bin edge is closed (default is 'right')
s.resample('Q').head()
# Mean of rows 3..5 matches the default (right-closed) quarterly bin.
s.ix[3:6].mean()
s.resample('Q', closed='left').head()
# Mean of rows 2..4 matches the left-closed bin.
s.ix[2:5].mean()
label: label the bin with 'left' or 'right' edge (default is 'right')
s.resample('Q').head()
s.resample('Q', label='left').head()
loffset: shift the result index
# loffset nudges every result label, here back one day.
s.resample('Q', label='left', loffset='-1D').head()
# The range is naive: no tz attached to the index or its elements.
rng.tz
d = rng[0]
d
d.tz
# tz_localize attaches a zone to naive times without converting them.
localized = rng.tz_localize('US/Eastern')
Localization assumes naive time is local (and not UTC)
localized[0]
# The underlying int64 stamps shift: localized Eastern wall time maps to
# a different UTC instant than the naive value.
localized.asi8[0]
rng.asi8[0]
d_utc = d.tz_localize('UTC')
d_utc
# NOTE(review): localizing an already tz-aware timestamp — presumably this
# demonstrates the resulting error; tz_convert is the correct call.
d_utc.tz_localize('US/Eastern')
# tz_convert translates aware times between zones.
localized.tz_convert('UTC')
d_ny = d_utc.tz_convert('US/Eastern')
d_ny
rng.tz_convert('US/Eastern')
A lot of time series data is better represented as intervals of time rather than points in time.
This is represented in pandas as Period and PeriodIndex
# A Period represents a span of time (here the year 2005) rather than an
# instant.
p = pd.Period('2005', 'A')
p
# Fiscal quarter: Q1 of a year ending in March.
pd.Period('2006Q1', 'Q-MAR')
pd.Period('2007-1-1', 'B')
No xxx-start frequencies
# 'AS' (year-start) is a date_range frequency only — presumably this line
# raises to demonstrate that Periods reject *-start aliases.
pd.Period('2005', 'AS')
pd.period_range('2005', '2012', freq='A')
prng = pd.period_range('2005', periods=7, freq='A')
prng
p
# Period -> Timestamp conversion; 's'/'e' pick the start or end of the
# span, and the first argument sets the timestamp frequency.
p.to_timestamp()
p.to_timestamp('M', 's')
p.to_timestamp('M', 'e')
prng.to_timestamp(how='e')
prng.to_timestamp('M', 'e')
# Timestamp index -> Period index, at its own or a coarser/finer frequency.
rng
rng.to_period()
rng.to_period('D')
p
# end_time should be the last instant of 2005; the comparison below
# exposes a pandas-0.9 bug (a mid-day time compares as after end_time).
p.end_time
datetime(2005, 12, 31, 10, 0, 0) < p.end_time # WAT?!
Look for a 0.9.1 bugfix release next week