Working with time-series data is an important part of data analysis.
Starting with v0.8, the pandas library has included a rich API for time-series manipulations.
The pandas time-series API includes:
from datetime import datetime, date, time
import sys
sys.version
'2.7.3 |EPD 7.3-2 (64-bit)| (default, Apr 12 2012, 11:14:05) \n[GCC 4.0.1 (Apple Inc. build 5493)]'
import pandas as pd
from pandas import Series, DataFrame, Panel
pd.__version__
'0.9.1.dev-88c9ef5'
np.__version__
'1.6.1'
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('figure', figsize=(10, 8))
mpl.__version__
'1.1.0'
Sample trade ticks from 2011-11-01 to 2011-11-03 for a single security
with open('data.csv', 'r') as fh:
print fh.readline() # headers
print fh.readline() # first row
Date,Time,Price,Volume,Exchange Code,Sales Condition,Correction Indicator,Sequence Number,Trade Stop Indicator,Source of Trade,MDS 127 / TRF (Trade Reporting Facility) (*),Exclude Record Flag,Filtered Price
11/01/2011,08:04:56.364,104.55,100,P,T,00,196,N,C,,,
parse_dates
: use a list or dict for flexible (possibly multi-column) date parsing
data = pd.read_csv('data.csv',
parse_dates={'Timestamp': ['Date', 'Time']},
index_col='Timestamp')
data
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 14632 entries, 2011-11-01 08:04:56.364000 to 2011-11-03 16:09:47.834000 Data columns: Price 14632 non-null values Volume 14632 non-null values Exchange Code 14632 non-null values Sales Condition 14632 non-null values Correction Indicator 14632 non-null values Sequence Number 14632 non-null values Trade Stop Indicator 14632 non-null values Source of Trade 14632 non-null values MDS 127 / TRF (Trade Reporting Facility) (*) 2421 non-null values Exclude Record Flag 28 non-null values Filtered Price 0 non-null values dtypes: float64(2), int64(3), object(6)
ticks = data.ix[:, ['Price', 'Volume']]
ticks.head()
Price | Volume | |
---|---|---|
Timestamp | ||
2011-11-01 08:04:56.364000 | 104.55 | 100 |
2011-11-01 08:53:07.459000 | 104.43 | 150 |
2011-11-01 08:53:09.284000 | 104.43 | 100 |
2011-11-01 08:54:03.921000 | 104.50 | 100 |
2011-11-01 08:58:06.162000 | 104.46 | 1000 |
resample
: regularization and frequency conversion
bars = ticks.Price.resample('1min', how='ohlc')
bars
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 3366 entries, 2011-11-01 08:05:00 to 2011-11-03 16:10:00 Freq: T Data columns: open 1104 non-null values high 1104 non-null values low 1104 non-null values close 1104 non-null values dtypes: float64(4)
minute_range = bars.high - bars.low
minute_range.describe()
count 1104.000000 mean 0.017898 std 0.018568 min 0.000000 25% 0.000875 50% 0.013900 75% 0.029000 max 0.150000
minute_return = bars.close / bars.open - 1
minute_return.describe()
count 1104.000000 mean 0.000003 std 0.000191 min -0.001442 25% -0.000096 50% 0.000000 75% 0.000096 max 0.000962
Compute a VWAP using resample
volume = ticks.Volume.resample('1min', how='sum')
value = ticks.prod(axis=1).resample('1min', how='sum')
vwap = value / volume
vwap.ix['2011-11-01 09:27':'2011-11-01 09:32']
Timestamp 2011-11-01 09:27:00 NaN 2011-11-01 09:28:00 NaN 2011-11-01 09:29:00 104.170000 2011-11-01 09:30:00 103.300800 2011-11-01 09:31:00 104.231368 2011-11-01 09:32:00 104.236679 Freq: T
at_time
: same (b)at_time (same bat channel)
bars.open.at_time('9:30')
Timestamp 2011-11-01 09:30:00 103.3008 2011-11-02 09:30:00 103.8600 2011-11-03 09:30:00 NaN Name: open
bars.close.at_time('16:00')
Timestamp 2011-11-01 16:00:00 104.56 2011-11-02 16:00:00 104.32 2011-11-03 16:00:00 103.82 Name: close
between_time
: intraday time range
filtered = vwap.between_time('10:00', '16:00')
filtered.head(20)
Timestamp 2011-11-01 10:00:00 104.145000 2011-11-01 10:01:00 104.245387 2011-11-01 10:02:00 104.170771 2011-11-01 10:03:00 104.170000 2011-11-01 10:04:00 104.160952 2011-11-01 10:05:00 104.110870 2011-11-01 10:06:00 104.093576 2011-11-01 10:07:00 104.145132 2011-11-01 10:08:00 104.157924 2011-11-01 10:09:00 104.154646 2011-11-01 10:10:00 104.150756 2011-11-01 10:11:00 104.182591 2011-11-01 10:12:00 104.116803 2011-11-01 10:13:00 104.092174 2011-11-01 10:14:00 104.067768 2011-11-01 10:15:00 104.070000 2011-11-01 10:16:00 104.073333 2011-11-01 10:17:00 104.080906 2011-11-01 10:18:00 104.082566 2011-11-01 10:19:00 104.085140
vol = volume.between_time('10:00', '16:00')
vol.head(20)
Timestamp 2011-11-01 10:00:00 200 2011-11-01 10:01:00 18767 2011-11-01 10:02:00 67287 2011-11-01 10:03:00 2000 2011-11-01 10:04:00 2100 2011-11-01 10:05:00 2300 2011-11-01 10:06:00 11509 2011-11-01 10:07:00 3526 2011-11-01 10:08:00 734 2011-11-01 10:09:00 1106 2011-11-01 10:10:00 1800 2011-11-01 10:11:00 1760 2011-11-01 10:12:00 2961 2011-11-01 10:13:00 2300 2011-11-01 10:14:00 5818 2011-11-01 10:15:00 200 2011-11-01 10:16:00 600 2011-11-01 10:17:00 1040 2011-11-01 10:18:00 1576 2011-11-01 10:19:00 2813
fillna
: handling missing data
filtered.ix['2011-11-03':'2011-11-04'].head(20)
Timestamp 2011-11-03 10:00:00 NaN 2011-11-03 10:01:00 103.968667 2011-11-03 10:02:00 NaN 2011-11-03 10:03:00 103.997500 2011-11-03 10:04:00 104.044295 2011-11-03 10:05:00 NaN 2011-11-03 10:06:00 104.135118 2011-11-03 10:07:00 104.130000 2011-11-03 10:08:00 104.120000 2011-11-03 10:09:00 104.101250 2011-11-03 10:10:00 NaN 2011-11-03 10:11:00 NaN 2011-11-03 10:12:00 NaN 2011-11-03 10:13:00 NaN 2011-11-03 10:14:00 104.086667 2011-11-03 10:15:00 104.049200 2011-11-03 10:16:00 104.028333 2011-11-03 10:17:00 103.976438 2011-11-03 10:18:00 103.912492 2011-11-03 10:19:00 NaN
filled = filtered.fillna(method='pad', limit=1)
filled.ix['2011-11-03':'2011-11-04'].head(20)
Timestamp 2011-11-03 10:00:00 104.305099 2011-11-03 10:01:00 103.968667 2011-11-03 10:02:00 103.968667 2011-11-03 10:03:00 103.997500 2011-11-03 10:04:00 104.044295 2011-11-03 10:05:00 104.044295 2011-11-03 10:06:00 104.135118 2011-11-03 10:07:00 104.130000 2011-11-03 10:08:00 104.120000 2011-11-03 10:09:00 104.101250 2011-11-03 10:10:00 104.101250 2011-11-03 10:11:00 NaN 2011-11-03 10:12:00 NaN 2011-11-03 10:13:00 NaN 2011-11-03 10:14:00 104.086667 2011-11-03 10:15:00 104.049200 2011-11-03 10:16:00 104.028333 2011-11-03 10:17:00 103.976438 2011-11-03 10:18:00 103.912492 2011-11-03 10:19:00 103.912492
vol = vol.fillna(0.)
vol.head(20)
Timestamp 2011-11-01 10:00:00 200 2011-11-01 10:01:00 18767 2011-11-01 10:02:00 67287 2011-11-01 10:03:00 2000 2011-11-01 10:04:00 2100 2011-11-01 10:05:00 2300 2011-11-01 10:06:00 11509 2011-11-01 10:07:00 3526 2011-11-01 10:08:00 734 2011-11-01 10:09:00 1106 2011-11-01 10:10:00 1800 2011-11-01 10:11:00 1760 2011-11-01 10:12:00 2961 2011-11-01 10:13:00 2300 2011-11-01 10:14:00 5818 2011-11-01 10:15:00 200 2011-11-01 10:16:00 600 2011-11-01 10:17:00 1040 2011-11-01 10:18:00 1576 2011-11-01 10:19:00 2813
filled.ix['2011-11-03':'2011-11-04'].plot()
plt.ylim(103.5, 104.5)
(103.5, 104.5)
vwap.ix['2011-11-03':'2011-11-04'].plot()
plt.ylim(103.5, 104.5)
vol.ix['2011-11-03':'2011-11-04'].plot(secondary_y=True, style='r')
<matplotlib.axes.AxesSubplot at 0x10d6ba690>
ticks.head()
Price | Volume | |
---|---|---|
Timestamp | ||
2011-11-01 08:04:56.364000 | 104.55 | 100 |
2011-11-01 08:53:07.459000 | 104.43 | 150 |
2011-11-01 08:53:09.284000 | 104.43 | 100 |
2011-11-01 08:54:03.921000 | 104.50 | 100 |
2011-11-01 08:58:06.162000 | 104.46 | 1000 |
shift
realigns values
ticks.shift(1).head()
Price | Volume | |
---|---|---|
Timestamp | ||
2011-11-01 08:04:56.364000 | NaN | NaN |
2011-11-01 08:53:07.459000 | 104.55 | 100 |
2011-11-01 08:53:09.284000 | 104.43 | 150 |
2011-11-01 08:54:03.921000 | 104.43 | 100 |
2011-11-01 08:58:06.162000 | 104.50 | 100 |
ticks.shift(-1).head()
Price | Volume | |
---|---|---|
Timestamp | ||
2011-11-01 08:04:56.364000 | 104.43 | 150 |
2011-11-01 08:53:07.459000 | 104.43 | 100 |
2011-11-01 08:53:09.284000 | 104.50 | 100 |
2011-11-01 08:54:03.921000 | 104.46 | 1000 |
2011-11-01 08:58:06.162000 | 104.46 | 1000 |
tshift
manipulates index values
ticks.tshift(1, 'min').head()
Price | Volume | |
---|---|---|
2011-11-01 08:05:56.364000 | 104.55 | 100 |
2011-11-01 08:54:07.459000 | 104.43 | 150 |
2011-11-01 08:54:09.284000 | 104.43 | 100 |
2011-11-01 08:55:03.921000 | 104.50 | 100 |
2011-11-01 08:59:06.162000 | 104.46 | 1000 |
minute_return.head()
Timestamp 2011-11-01 08:05:00 0 2011-11-01 08:06:00 NaN 2011-11-01 08:07:00 NaN 2011-11-01 08:08:00 NaN 2011-11-01 08:09:00 NaN Freq: T
mr = minute_return.between_time('9:30', '16:00')
mr.head()
Timestamp 2011-11-01 09:30:00 0.000000 2011-11-01 09:31:00 0.000192 2011-11-01 09:32:00 -0.000192 2011-11-01 09:33:00 0.000000 2011-11-01 09:34:00 -0.000096
lagged = mr.shift(1)
lagged.head()
Timestamp 2011-11-01 09:30:00 NaN 2011-11-01 09:31:00 0.000000 2011-11-01 09:32:00 0.000192 2011-11-01 09:33:00 -0.000192 2011-11-01 09:34:00 0.000000
We shouldn't use shift here because it shifts values across day boundaries: each day's 9:30 lagged value is the previous day's 16:00 return, as the next two cells show:
lagged.at_time('9:30')
Timestamp 2011-11-01 09:30:00 NaN 2011-11-02 09:30:00 0.000191 2011-11-03 09:30:00 0.000192
mr.at_time('16:00')
Timestamp 2011-11-01 16:00:00 0.000191 2011-11-02 16:00:00 0.000192 2011-11-03 16:00:00 -0.000193
lagged = minute_return.tshift(1, 'min').between_time('9:30', '16:00')
lagged.at_time('9:30')
Timestamp 2011-11-01 09:30:00 0 2011-11-02 09:30:00 0 2011-11-03 09:30:00 NaN
Let's play
pd.ols(y=mr, x=lagged)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 1009 Number of Degrees of Freedom: 2 R-squared: 0.0011 Adj R-squared: 0.0001 Rmse: 0.0002 F-stat (1, 1007): 1.1007, p-value: 0.2944 Degrees of Freedom: model 1, resid 1007 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x 0.0338 0.0322 1.05 0.2944 -0.0293 0.0969 intercept 0.0000 0.0000 0.50 0.6148 -0.0000 0.0000 ---------------------------------End of Summary---------------------------------
mr = vwap / bars.open - 1
mr = mr.between_time('9:30', '16:00')
lagged = mr.tshift(1, 'min').between_time('9:30', '16:00')
pd.ols(y=mr, x=lagged)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 1007 Number of Degrees of Freedom: 2 R-squared: 0.0024 Adj R-squared: 0.0014 Rmse: 0.0001 F-stat (1, 1005): 2.4262, p-value: 0.1196 Degrees of Freedom: model 1, resid 1005 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x 0.0503 0.0323 1.56 0.1196 -0.0130 0.1136 intercept 0.0000 0.0000 0.66 0.5120 -0.0000 0.0000 ---------------------------------End of Summary---------------------------------
inter = mr * vol
inter = inter.between_time('9:30', '16:00')
lagged_inter = inter.tshift(1, 'min').between_time('9:30', '16:00')
pd.ols(y=mr, x=lagged_inter)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 919 Number of Degrees of Freedom: 2 R-squared: 0.0055 Adj R-squared: 0.0044 Rmse: 0.0001 F-stat (1, 917): 5.0453, p-value: 0.0249 Degrees of Freedom: model 1, resid 917 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x -0.0000 0.0000 -2.25 0.0249 -0.0000 -0.0000 intercept 0.0000 0.0000 0.98 0.3251 -0.0000 0.0000 ---------------------------------End of Summary---------------------------------
Convert to percentage volume
vol = vol.groupby(vol.index.day).transform(lambda x: x/x.sum())
vol.head()
Timestamp 2011-11-01 10:00:00 0.000115 2011-11-01 10:01:00 0.010772 2011-11-01 10:02:00 0.038624 2011-11-01 10:03:00 0.001148 2011-11-01 10:04:00 0.001205
Verify
vol.resample('D', how='sum')
Timestamp 2011-11-02 1 2011-11-03 1 2011-11-04 1 Freq: D
inter = mr * vol
inter = inter.between_time('9:30', '16:00')
lagged_inter = inter.tshift(1, 'min').between_time('9:30', '16:00')
pd.ols(y=mr, x=lagged_inter)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 919 Number of Degrees of Freedom: 2 R-squared: 0.0050 Adj R-squared: 0.0039 Rmse: 0.0001 F-stat (1, 917): 4.6031, p-value: 0.0322 Degrees of Freedom: model 1, resid 917 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x -4.1357 1.9276 -2.15 0.0322 -7.9139 -0.3576 intercept 0.0000 0.0000 0.98 0.3283 -0.0000 0.0000 ---------------------------------End of Summary---------------------------------
Vivaldi FTW
hour = vol.index.hour
hourly_volume = vol.groupby(hour).mean()
hourly_volume.plot(kind='bar')
<matplotlib.axes.AxesSubplot at 0x10ffe38d0>
Expanding window of hourly means for volume
hourly = vol.resample('H')
def calc_mean(hr):
    """Expanding mean of the module-level `hourly` volume series at clock hour `hr`.

    `hr` is an integer hour (e.g. 10 for 10:00); the series is filtered to
    that time of day, then an expanding (cumulative) mean is returned so
    each day's value averages all prior days' observations at that hour.
    """
    clock = time(hour=hr)
    selected = hourly.at_time(clock)
    return pd.expanding_mean(selected)
df = pd.concat([calc_mean(hr) for hr in range(10, 16)])
df = df.sort_index()
df
Timestamp 2011-11-01 10:00:00 0.000115 2011-11-01 11:00:00 0.002510 2011-11-01 12:00:00 0.001253 2011-11-01 13:00:00 0.000628 2011-11-01 14:00:00 0.003013 2011-11-01 15:00:00 0.004419 2011-11-02 10:00:00 0.000742 2011-11-02 11:00:00 0.002986 2011-11-02 12:00:00 0.002283 2011-11-02 13:00:00 0.001088 2011-11-02 14:00:00 0.002224 2011-11-02 15:00:00 0.003979 2011-11-03 10:00:00 0.000495 2011-11-03 11:00:00 0.002537 2011-11-03 12:00:00 0.002102 2011-11-03 13:00:00 0.002193 2011-11-03 14:00:00 0.003233 2011-11-03 15:00:00 0.003228
Compute deviations from the hourly means
clean_vol = vol.between_time('10:00', '15:59')
dev = clean_vol - df.reindex(clean_vol.index, method='pad') # be careful over day boundaries
dev
Timestamp 2011-11-01 10:00:00 0.000000 2011-11-01 10:01:00 0.010658 2011-11-01 10:02:00 0.038509 2011-11-01 10:03:00 0.001033 2011-11-01 10:04:00 0.001091 2011-11-01 10:05:00 0.001205 2011-11-01 10:06:00 0.006492 2011-11-01 10:07:00 0.001909 2011-11-01 10:08:00 0.000307 2011-11-01 10:09:00 0.000520 2011-11-01 10:10:00 0.000918 2011-11-01 10:11:00 0.000895 2011-11-01 10:12:00 0.001585 2011-11-01 10:13:00 0.001205 2011-11-01 10:14:00 0.003225 ... 2011-11-03 15:45:00 -0.002881 2011-11-03 15:46:00 -0.002411 2011-11-03 15:47:00 -0.002708 2011-11-03 15:48:00 -0.002881 2011-11-03 15:49:00 -0.001083 2011-11-03 15:50:00 -0.003228 2011-11-03 15:51:00 -0.001469 2011-11-03 15:52:00 -0.003054 2011-11-03 15:53:00 -0.003054 2011-11-03 15:54:00 -0.003054 2011-11-03 15:55:00 -0.002303 2011-11-03 15:56:00 0.012786 2011-11-03 15:57:00 -0.003054 2011-11-03 15:58:00 -0.002521 2011-11-03 15:59:00 -0.000219 Length: 1080
inter = mr * dev
inter = inter.between_time('10:00', '15:59')
pd.ols(y=mr, x=inter.tshift(1, 'min'))
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 919 Number of Degrees of Freedom: 2 R-squared: 0.0059 Adj R-squared: 0.0048 Rmse: 0.0001 F-stat (1, 917): 5.3977, p-value: 0.0204 Degrees of Freedom: model 1, resid 917 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x -4.6532 2.0029 -2.32 0.0204 -8.5789 -0.7276 intercept 0.0000 0.0000 0.97 0.3299 -0.0000 0.0000 ---------------------------------End of Summary---------------------------------
pd.date_range
rng = pd.date_range('2005', '2012', freq='M')
rng
<class 'pandas.tseries.index.DatetimeIndex'> [2005-10-31 00:00:00, ..., 2012-09-30 00:00:00] Length: 84, Freq: M, Timezone: None
pd.date_range('2005', periods=7*12, freq='M')
<class 'pandas.tseries.index.DatetimeIndex'> [2005-10-31 00:00:00, ..., 2012-09-30 00:00:00] Length: 84, Freq: M, Timezone: None
pd.date_range(end='2012', periods=7*12, freq='M')
<class 'pandas.tseries.index.DatetimeIndex'> [2005-10-31 00:00:00, ..., 2012-09-30 00:00:00] Length: 84, Freq: M, Timezone: None
Name | Description |
D | Calendar day |
B | Business day |
M | Calendar end of month |
MS | Calendar start of month |
BM | Business end of month |
BMS | Business start of month |
W-{MON, TUE,...} | Week ending on Monday, Tuesday, ... |
Q-{JAN, FEB,...} | Quarter end with year ending January, February... |
QS-{JAN, FEB,...} | Quarter start with year ending January, February... |
BQ-{JAN, FEB,...} | Business quarter end with year ending January, February... |
BQS-{JAN, FEB,...} | Business quarter start with year ending January, February... |
A-{JAN, FEB, ...} | Year end (December) |
AS-{JAN, FEB, ...} | Year start (December) |
BA-{JAN, FEB, ...} | Business year end (December) |
BAS-{JAN, FEB, ...} | Business year start (December) |
H | Hour |
T | Minute |
s | Second |
L, ms | Millisecond |
U | Microsecond |
Anchored offsets
pd.date_range('2005', periods=4, freq='Q')
<class 'pandas.tseries.index.DatetimeIndex'> [2005-12-31 00:00:00, ..., 2006-09-30 00:00:00] Length: 4, Freq: Q-DEC, Timezone: None
pd.date_range('2005', periods=4, freq='Q-NOV')
<class 'pandas.tseries.index.DatetimeIndex'> [2005-11-30 00:00:00, ..., 2006-08-31 00:00:00] Length: 4, Freq: Q-NOV, Timezone: None
Week anchor indicates end of week
wkrng = pd.date_range('2012-10-25', periods=3, freq='W')
wkrng
<class 'pandas.tseries.index.DatetimeIndex'> [2012-10-28 00:00:00, ..., 2012-11-11 00:00:00] Length: 3, Freq: W-SUN, Timezone: None
wkrng[0].dayofweek
6
Year anchor indicates year ending month
pd.date_range('2005', periods=3, freq='A-JUN')
<class 'pandas.tseries.index.DatetimeIndex'> [2006-06-30 00:00:00, ..., 2008-06-30 00:00:00] Length: 3, Freq: A-JUN, Timezone: None
DatetimeIndex
is a subclass of Index
isinstance(rng, pd.Index)
True
rng[2:4]
<class 'pandas.tseries.index.DatetimeIndex'> [2005-12-31 00:00:00, 2006-01-31 00:00:00] Length: 2, Freq: M, Timezone: None
Use it for Series/DataFrame
labelling
s = Series(randn(len(rng)), rng)
s.head()
2005-10-31 1.026565 2005-11-30 -0.306949 2005-12-31 -1.838600 2006-01-31 -1.325459 2006-02-28 1.642384 Freq: M
df = DataFrame(randn(len(rng), 3), rng, ['X', 'Y', 'Z'])
df.head()
X | Y | Z | |
---|---|---|---|
2005-10-31 | -0.256664 | 0.255390 | -0.765260 |
2005-11-30 | 2.035113 | 1.694331 | 1.436986 |
2005-12-31 | 1.094142 | -1.098397 | 0.643988 |
2006-01-31 | 0.714353 | 0.429468 | 0.702067 |
2006-02-28 | 0.318068 | 0.656602 | 1.701144 |
s[datetime(2005, 1, 31) : datetime(2006, 12, 31)] #slice end inclusive
2005-10-31 1.026565 2005-11-30 -0.306949 2005-12-31 -1.838600 2006-01-31 -1.325459 2006-02-28 1.642384 2006-03-31 -0.274467 2006-04-30 -0.433675 2006-05-31 -0.963449 2006-06-30 1.112873 2006-07-31 -0.089778 2006-08-31 0.046948 2006-09-30 0.035094 2006-10-31 1.113169 2006-11-30 -0.401821 2006-12-31 0.723640 Freq: M
df['2005-1-31':'2006-12-31']
X | Y | Z | |
---|---|---|---|
2005-10-31 | -0.256664 | 0.255390 | -0.765260 |
2005-11-30 | 2.035113 | 1.694331 | 1.436986 |
2005-12-31 | 1.094142 | -1.098397 | 0.643988 |
2006-01-31 | 0.714353 | 0.429468 | 0.702067 |
2006-02-28 | 0.318068 | 0.656602 | 1.701144 |
2006-03-31 | 0.192433 | 1.127046 | -2.496724 |
2006-04-30 | 1.417775 | 0.845626 | -0.713990 |
2006-05-31 | 0.057333 | 0.016464 | -0.311196 |
2006-06-30 | -0.535289 | 0.131742 | 0.459589 |
2006-07-31 | -0.787894 | -1.482390 | -0.341181 |
2006-08-31 | 0.131652 | 2.037986 | -0.547023 |
2006-09-30 | 0.883980 | 1.178719 | -1.960677 |
2006-10-31 | 0.096330 | -0.034388 | 0.944521 |
2006-11-30 | -0.559477 | 0.178275 | -0.637401 |
2006-12-31 | -0.730724 | 0.363436 | -1.048955 |
Partial indexing
s['2005':'2006']
2005-10-31 1.026565 2005-11-30 -0.306949 2005-12-31 -1.838600 2006-01-31 -1.325459 2006-02-28 1.642384 2006-03-31 -0.274467 2006-04-30 -0.433675 2006-05-31 -0.963449 2006-06-30 1.112873 2006-07-31 -0.089778 2006-08-31 0.046948 2006-09-30 0.035094 2006-10-31 1.113169 2006-11-30 -0.401821 2006-12-31 0.723640 Freq: M
positional indexing still works
df[:2] # slice end exclusive
X | Y | Z | |
---|---|---|---|
2005-10-31 | -0.256664 | 0.255390 | -0.765260 |
2005-11-30 | 2.035113 | 1.694331 | 1.436986 |
Elements boxed as Timestamp
(subclass of datetime.datetime
)
elm = rng[0]
elm
<Timestamp: 2005-10-31 00:00:00>
isinstance(elm, datetime)
True
Why do we need this subclass?
elm.nanosecond
0
Implemented internally using numpy.datetime64 (dtype='M8[ns]')
val = rng.values
type(val)
numpy.ndarray
val.dtype
dtype('datetime64[ns]')
Upgrade Numpy to 1.7b to fix repr issue
val[0]
1970-01-14 40:00:00
Or use DatetimeIndex.asobject for workaround
rng.asobject.values[0]
<Timestamp: 2005-10-31 00:00:00>
rng.asobject
Index([2005-10-31 00:00:00, 2005-11-30 00:00:00, 2005-12-31 00:00:00, 2006-01-31 00:00:00, 2006-02-28 00:00:00, 2006-03-31 00:00:00, 2006-04-30 00:00:00, 2006-05-31 00:00:00, 2006-06-30 00:00:00, 2006-07-31 00:00:00, 2006-08-31 00:00:00, 2006-09-30 00:00:00, 2006-10-31 00:00:00, 2006-11-30 00:00:00, 2006-12-31 00:00:00, 2007-01-31 00:00:00, 2007-02-28 00:00:00, 2007-03-31 00:00:00, 2007-04-30 00:00:00, 2007-05-31 00:00:00, 2007-06-30 00:00:00, 2007-07-31 00:00:00, 2007-08-31 00:00:00, 2007-09-30 00:00:00, 2007-10-31 00:00:00, 2007-11-30 00:00:00, 2007-12-31 00:00:00, 2008-01-31 00:00:00, 2008-02-29 00:00:00, 2008-03-31 00:00:00, 2008-04-30 00:00:00, 2008-05-31 00:00:00, 2008-06-30 00:00:00, 2008-07-31 00:00:00, 2008-08-31 00:00:00, 2008-09-30 00:00:00, 2008-10-31 00:00:00, 2008-11-30 00:00:00, 2008-12-31 00:00:00, 2009-01-31 00:00:00, 2009-02-28 00:00:00, 2009-03-31 00:00:00, 2009-04-30 00:00:00, 2009-05-31 00:00:00, 2009-06-30 00:00:00, 2009-07-31 00:00:00, 2009-08-31 00:00:00, 2009-09-30 00:00:00, 2009-10-31 00:00:00, 2009-11-30 00:00:00, 2009-12-31 00:00:00, 2010-01-31 00:00:00, 2010-02-28 00:00:00, 2010-03-31 00:00:00, 2010-04-30 00:00:00, 2010-05-31 00:00:00, 2010-06-30 00:00:00, 2010-07-31 00:00:00, 2010-08-31 00:00:00, 2010-09-30 00:00:00, 2010-10-31 00:00:00, 2010-11-30 00:00:00, 2010-12-31 00:00:00, 2011-01-31 00:00:00, 2011-02-28 00:00:00, 2011-03-31 00:00:00, 2011-04-30 00:00:00, 2011-05-31 00:00:00, 2011-06-30 00:00:00, 2011-07-31 00:00:00, 2011-08-31 00:00:00, 2011-09-30 00:00:00, 2011-10-31 00:00:00, 2011-11-30 00:00:00, 2011-12-31 00:00:00, 2012-01-31 00:00:00, 2012-02-29 00:00:00, 2012-03-31 00:00:00, 2012-04-30 00:00:00, 2012-05-31 00:00:00, 2012-06-30 00:00:00, 2012-07-31 00:00:00, 2012-08-31 00:00:00, 2012-09-30 00:00:00], dtype=object)
rng.to_pydatetime()
array([2005-10-31 00:00:00, 2005-11-30 00:00:00, 2005-12-31 00:00:00, 2006-01-31 00:00:00, 2006-02-28 00:00:00, 2006-03-31 00:00:00, 2006-04-30 00:00:00, 2006-05-31 00:00:00, 2006-06-30 00:00:00, 2006-07-31 00:00:00, 2006-08-31 00:00:00, 2006-09-30 00:00:00, 2006-10-31 00:00:00, 2006-11-30 00:00:00, 2006-12-31 00:00:00, 2007-01-31 00:00:00, 2007-02-28 00:00:00, 2007-03-31 00:00:00, 2007-04-30 00:00:00, 2007-05-31 00:00:00, 2007-06-30 00:00:00, 2007-07-31 00:00:00, 2007-08-31 00:00:00, 2007-09-30 00:00:00, 2007-10-31 00:00:00, 2007-11-30 00:00:00, 2007-12-31 00:00:00, 2008-01-31 00:00:00, 2008-02-29 00:00:00, 2008-03-31 00:00:00, 2008-04-30 00:00:00, 2008-05-31 00:00:00, 2008-06-30 00:00:00, 2008-07-31 00:00:00, 2008-08-31 00:00:00, 2008-09-30 00:00:00, 2008-10-31 00:00:00, 2008-11-30 00:00:00, 2008-12-31 00:00:00, 2009-01-31 00:00:00, 2009-02-28 00:00:00, 2009-03-31 00:00:00, 2009-04-30 00:00:00, 2009-05-31 00:00:00, 2009-06-30 00:00:00, 2009-07-31 00:00:00, 2009-08-31 00:00:00, 2009-09-30 00:00:00, 2009-10-31 00:00:00, 2009-11-30 00:00:00, 2009-12-31 00:00:00, 2010-01-31 00:00:00, 2010-02-28 00:00:00, 2010-03-31 00:00:00, 2010-04-30 00:00:00, 2010-05-31 00:00:00, 2010-06-30 00:00:00, 2010-07-31 00:00:00, 2010-08-31 00:00:00, 2010-09-30 00:00:00, 2010-10-31 00:00:00, 2010-11-30 00:00:00, 2010-12-31 00:00:00, 2011-01-31 00:00:00, 2011-02-28 00:00:00, 2011-03-31 00:00:00, 2011-04-30 00:00:00, 2011-05-31 00:00:00, 2011-06-30 00:00:00, 2011-07-31 00:00:00, 2011-08-31 00:00:00, 2011-09-30 00:00:00, 2011-10-31 00:00:00, 2011-11-30 00:00:00, 2011-12-31 00:00:00, 2012-01-31 00:00:00, 2012-02-29 00:00:00, 2012-03-31 00:00:00, 2012-04-30 00:00:00, 2012-05-31 00:00:00, 2012-06-30 00:00:00, 2012-07-31 00:00:00, 2012-08-31 00:00:00, 2012-09-30 00:00:00], dtype=object)
rng.to_pydatetime()[0]
datetime.datetime(2005, 10, 31, 0, 0)
Integer representation
type(rng.asi8)
numpy.ndarray
rng.asi8.dtype
dtype('int64')
rng.asi8[0]
1130716800000000000
s.index.freqstr
'M'
s.resample('30D').head(10)
2005-10-31 1.026565 2005-11-30 -0.306949 2005-12-30 NaN 2006-01-29 NaN 2006-02-28 1.642384 2006-03-30 NaN 2006-04-29 NaN 2006-05-29 NaN 2006-06-28 NaN 2006-07-28 NaN Freq: 30D
s.resample('30D', fill_method='ffill').head(10)
2005-10-31 1.026565 2005-11-30 -0.306949 2005-12-30 -0.306949 2006-01-29 -1.838600 2006-02-28 1.642384 2006-03-30 1.642384 2006-04-29 -0.274467 2006-05-29 -0.433675 2006-06-28 -0.963449 2006-07-28 1.112873 Freq: 30D
Upsampling
s.ix[:3].resample('W')
2005-11-06 NaN 2005-11-13 NaN 2005-11-20 NaN 2005-11-27 NaN 2005-12-04 NaN 2005-12-11 NaN 2005-12-18 NaN 2005-12-25 NaN 2006-01-01 NaN Freq: W-SUN
s.ix[:3].resample('W', fill_method='ffill')
2005-11-06 1.026565 2005-11-13 1.026565 2005-11-20 1.026565 2005-11-27 1.026565 2005-12-04 -0.306949 2005-12-11 -0.306949 2005-12-18 -0.306949 2005-12-25 -0.306949 2006-01-01 -1.838600 Freq: W-SUN
asfreq
s.asfreq('Q').head()
2005-12-31 -1.838600 2006-03-31 -0.274467 2006-06-30 1.112873 2006-09-30 0.035094 2006-12-31 0.723640 Freq: Q-DEC
s.resample('Q', 'last').head()
2005-12-31 -1.838600 2006-03-31 -0.274467 2006-06-30 1.112873 2006-09-30 0.035094 2006-12-31 0.723640 Freq: Q-DEC
closed: 'left' or 'right' bin edge is closed (default is 'right')
s.resample('Q').head()
2005-12-31 -0.372995 2006-03-31 0.014153 2006-06-30 -0.094750 2006-09-30 -0.002579 2006-12-31 0.478330 Freq: Q-DEC
s.ix[3:6].mean()
0.014152740332844638
s.resample('Q', closed='left').head()
2005-12-31 0.359808 2006-03-31 -0.507225 2006-06-30 -0.557197 2006-09-30 0.356681 2006-12-31 0.248814 Freq: Q-DEC
s.ix[2:5].mean()
-0.50722477768944352
label: label the bin with 'left' or 'right' edge (default is 'right')
s.resample('Q').head()
2005-12-31 -0.372995 2006-03-31 0.014153 2006-06-30 -0.094750 2006-09-30 -0.002579 2006-12-31 0.478330 Freq: Q-DEC
s.resample('Q', label='left').head()
2005-09-30 -0.372995 2005-12-31 0.014153 2006-03-31 -0.094750 2006-06-30 -0.002579 2006-09-30 0.478330 Freq: Q-DEC
loffset: shift the result index
s.resample('Q', label='left', loffset='-1D').head()
2005-09-29 -0.372995 2005-12-30 0.014153 2006-03-30 -0.094750 2006-06-29 -0.002579 2006-09-29 0.478330
rng.tz
d = rng[0]
d
<Timestamp: 2005-10-31 00:00:00>
d.tz
localized = rng.tz_localize('US/Eastern')
Localization assumes naive time is local (and not UTC)
localized[0]
<Timestamp: 2005-10-31 00:00:00-0500 EST, tz=US/Eastern>
localized.asi8[0]
1130734800000000000
rng.asi8[0]
1130716800000000000
d_utc = d.tz_localize('UTC')
d_utc
<Timestamp: 2005-10-31 00:00:00+0000 UTC, tz=UTC>
d_utc.tz_localize('US/Eastern')
--------------------------------------------------------------------------- Exception Traceback (most recent call last) <ipython-input-95-5747806c09b7> in <module>() ----> 1 d_utc.tz_localize('US/Eastern') /Users/changshe/code/pandas/pandas/lib.so in pandas.lib.Timestamp.tz_localize (pandas/src/tseries.c:32388)() Exception: Cannot localize tz-aware Timestamp, use tz_convert for conversions
localized.tz_convert('UTC')
<class 'pandas.tseries.index.DatetimeIndex'> [2005-10-31 05:00:00, ..., 2012-09-30 04:00:00] Length: 84, Freq: M, Timezone: UTC
d_ny = d_utc.tz_convert('US/Eastern')
d_ny
<Timestamp: 2005-10-30 19:00:00-0500 EST, tz=US/Eastern>
rng.tz_convert('US/Eastern')
--------------------------------------------------------------------------- Exception Traceback (most recent call last) <ipython-input-98-ce1650e7ae36> in <module>() ----> 1 rng.tz_convert('US/Eastern') /Users/changshe/code/pandas/pandas/tseries/index.pyc in tz_convert(self, tz) 1275 if self.tz is None: 1276 # tz naive, use tz_localize -> 1277 raise Exception('Cannot convert tz-naive timestamps, use ' 1278 'tz_localize to localize') 1279 Exception: Cannot convert tz-naive timestamps, use tz_localize to localize
A lot of time series data is better represented as intervals of time rather than points in time.
This is represented in pandas as Period and PeriodIndex
p = pd.Period('2005', 'A')
p
Period('2005', 'A-DEC')
pd.Period('2006Q1', 'Q-MAR')
Period('2006Q1', 'Q-MAR')
pd.Period('2007-1-1', 'B')
Period('2007-01-01', 'B')
No xxx-start frequencies
pd.Period('2005', 'AS')
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-102-fafe81610ca8> in <module>() ----> 1 pd.Period('2005', 'AS') /Users/changshe/code/pandas/pandas/tseries/period.pyc in __init__(self, value, freq, ordinal, year, month, quarter, day, hour, minute, second) 114 raise ValueError(msg) 115 --> 116 base, mult = _gfc(freq) 117 if mult != 1: 118 raise ValueError('Only mult == 1 supported') /Users/changshe/code/pandas/pandas/tseries/frequencies.pyc in get_freq_code(freqstr) 71 72 base, stride = _base_and_stride(freqstr) ---> 73 code = _period_str_to_code(base) 74 75 return code, stride /Users/changshe/code/pandas/pandas/tseries/frequencies.pyc in _period_str_to_code(freqstr) 694 return _period_code_map[freqstr] 695 except: --> 696 alias = _period_alias_dict[freqstr] 697 return _period_code_map[alias] 698 KeyError: 'AS-JAN'
pd.period_range('2005', '2012', freq='A')
<class 'pandas.tseries.period.PeriodIndex'> freq: A-DEC [2005, ..., 2012] length: 8
prng = pd.period_range('2005', periods=7, freq='A')
prng
<class 'pandas.tseries.period.PeriodIndex'> freq: A-DEC [2005, ..., 2011] length: 7
p
Period('2005', 'A-DEC')
p.to_timestamp()
<Timestamp: 2005-01-01 00:00:00>
p.to_timestamp('M', 's')
<Timestamp: 2005-01-31 00:00:00>
p.to_timestamp('M', 'e')
<Timestamp: 2005-12-31 00:00:00>
prng.to_timestamp(how='e')
<class 'pandas.tseries.index.DatetimeIndex'> [2005-12-31 00:00:00, ..., 2011-12-31 00:00:00] Length: 7, Freq: A-DEC, Timezone: None
prng.to_timestamp('M', 'e')
<class 'pandas.tseries.index.DatetimeIndex'> [2005-12-31 00:00:00, ..., 2011-12-31 00:00:00] Length: 7, Freq: A-DEC, Timezone: None
rng
<class 'pandas.tseries.index.DatetimeIndex'> [2005-10-31 00:00:00, ..., 2012-09-30 00:00:00] Length: 84, Freq: M, Timezone: None
rng.to_period()
<class 'pandas.tseries.period.PeriodIndex'> freq: M [2005-10, ..., 2012-09] length: 84
rng.to_period('D')
<class 'pandas.tseries.period.PeriodIndex'> freq: D [2005-10-31, ..., 2012-09-30] length: 84
p
Period('2005', 'A-DEC')
p.end_time
<Timestamp: 2005-12-31 00:00:00>
datetime(2005, 12, 31, 10, 0, 0) < p.end_time # WAT?!
False
Look for a 0.9.1 bugfix release next week