import pandas as pd
import pandas.util.testing as tm
from pytz import timezone
import numpy as np
from pandas.tseries.tools import normalize_date
import datetime
import quantreactor as qr
%load_ext Cython
index = pd.date_range(start="2000/01/01", freq="MIN", periods=1001000, tz='UTC')
index = index[(index.hour >= 8) & (index.hour < 16)] # make market hours 8am => 4pm
short_wed = index[(index.dayofweek != 2) | (index.hour < 12)]
data = np.random.randn(len(short_wed), 2)
df = pd.DataFrame(data, index=short_wed, columns=['AAPL', 'MSFT'])
%%scoped pivot
prices = df.copy()
prices['date'] = prices.index.date
prices['time'] = prices.index.time
prices_pivot = {} # dict of pivot tables, keyed to sid
for stock in ['AAPL', 'MSFT']:
prices_pivot[stock] = prices.pivot(index='date', columns='time',values=stock)
Run pivot: CPU time: 1.84 s Wall time: 1.85 s
%%scoped grouped_apply
# naive reindex by day
def align_time(df):
if len(df) == 480:
return df
# not a full day, reindex
start = datetime.datetime.combine(df.index[0].date(), datetime.time(8))
end = datetime.datetime.combine(df.index[0].date(), datetime.time(16))
ind = pd.date_range(start=start, end=end, freq="MIN")
return df.reindex(ind)
Run grouped_apply: CPU time: 2.68 ms Wall time: 2.68 ms
%%scoped grouped_apply
grouped = df.groupby(pd.TimeGrouper('D'))
out = grouped.apply(align_time)
Run grouped_apply: CPU time: 2 s Wall time: 2 s
%%scoped reindex
# need dates
grouped = df.groupby(pd.TimeGrouper('D'))
grouper = grouped.grouper
# build up full normal index
start = normalize_date(df.index[0])
end = normalize_date(df.index[-1]) + pd.tseries.frequencies.Day(1) - pd.tseries.frequencies.Minute(1)
ind = pd.date_range(start=start, end=end, freq="MIN")
# cut back down to market hours
temp = df.reindex(ind).between_time("8:00", "16:00", include_end=False)
# 480 is number of minutes in our market
data = temp.values.T.reshape(2, len(grouped.grouper.binlabels), 480)
times = temp.index[:480].time
out = pd.Panel(data, items=['AAPL', 'MSFT'], minor_axis=times, major_axis=grouper.binlabels.date)
Run reindex: CPU time: 168 ms Wall time: 167 ms
%%scoped binned
df = df.between_time("8:00", "16:00", include_end=False)
market_time = pd.date_range(start="2000-01-01 8:00", end="2000-01-01 16:00",
freq="min")[:-1].time
start_open = normalize_date(df.index[0]) + pd.Timedelta(hours=8)
start_close = normalize_date(df.index[0]) + pd.Timedelta(hours=16)
open_times = pd.date_range(start=start_open, end=df.index[-1].date(), freq="D").values
close_times = pd.date_range(start=start_close, end=df.index[-1].date(), freq="D").values
grouped = df.groupby(pd.TimeGrouper('D'))
grouper = grouped.grouper
bins = grouper.bins
values = df.values.T
index_values = df.index.values
out = np.empty((2, len(grouper.binlabels), 480))
out[:] = np.nan
last = 0
for i in range(out.shape[1]):
# full day
if bins[i] - last == 480:
out[:, i] = values[:, last:bins[i]]
last = bins[i]
continue
val_day = values[:, last:bins[i]]
day_times = index_values[last:bins[i]]
obs = bins[i] - last
current_time = open_times[i]
last_day_time = 0
for j in range(480):
if j >= obs:
continue
if current_time == day_times[last_day_time]:
out[:, i, j] = val_day[:, last_day_time]
last_day_time += 1
current_time += 60000000000
last = bins[i]
out = pd.Panel(out, items=['AAPL', 'MSFT'], major_axis=grouper.binlabels, minor_axis=market_time)
Run binned: CPU time: 336 ms Wall time: 336 ms
%%petri
%%cython
import bottleneck as bn
cimport numpy as np
import numpy as np
from numpy cimport *
cimport cython
import_array()
cdef double NaN = <double> np.NaN
from numpy cimport (double_t)
def time_table(double_t[:, :] input, int64_t[:] index_values, int64_t[:] bins, int64_t[:] open_times):
cdef int last = 0, i, last_day_time, obs
cdef int64_t current_time
cdef int days = len(bins)
cdef int items = input.shape[0]
cdef np.ndarray[double_t, ndim=3] output = np.empty((items, days, 480), dtype=float)
output[:] = np.nan
for i from 0 <= i < days:
# full day
if bins[i] - last == 480:
output[:, i] = input[:, last:bins[i]]
last = bins[i]
continue
val_day = input[:, last:bins[i]]
day_times = index_values[last:bins[i]]
obs = bins[i] - last
current_time = open_times[i]
last_day_time = 0
for j from 0 <= j < 480:
if j >= obs:
continue
if current_time == day_times[last_day_time]:
output[:, i, j] = val_day[:, last_day_time]
last_day_time += 1
current_time += 60000000000
last = bins[i]
return output
%%scoped binned_cython
df = df.between_time("8:00", "16:00", include_end=False)
market_time = pd.date_range(start="2000-01-01 8:00", end="2000-01-01 16:00",
freq="min")[:-1].time
start_open = normalize_date(df.index[0]) + pd.Timedelta(hours=8)
start_close = normalize_date(df.index[0]) + pd.Timedelta(hours=16)
open_times = pd.date_range(start=start_open, end=df.index[-1].date(), freq="D").values
grouped = df.groupby(pd.TimeGrouper('D'))
grouper = grouped.grouper
bins = grouper.bins
values = df.values.T
index_values = df.index.values
out = time_table(values, index_values.astype(int), np.array(bins), open_times.astype(int))
out = pd.Panel(np.array(out), items=['AAPL', 'MSFT'], major_axis=grouper.binlabels, minor_axis=market_time)
Run binned_cython: CPU time: 147 ms Wall time: 147 ms
out = qr.scoped_namespaces['binned_cython']['out']
out.iloc[0].head()
08:00:00 | 08:01:00 | 08:02:00 | 08:03:00 | 08:04:00 | 08:05:00 | 08:06:00 | 08:07:00 | 08:08:00 | 08:09:00 | ... | 15:50:00 | 15:51:00 | 15:52:00 | 15:53:00 | 15:54:00 | 15:55:00 | 15:56:00 | 15:57:00 | 15:58:00 | 15:59:00 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2000-01-01 00:00:00+00:00 | -0.207255 | 0.267061 | 1.065043 | 0.492873 | 1.801605 | 1.635603 | 0.268815 | 1.694796 | -0.815680 | 1.921499 | ... | 0.095201 | -0.074996 | 0.239721 | 0.178841 | -0.927879 | -0.027830 | -0.049922 | -0.056138 | 0.362360 | -0.276897 |
2000-01-02 00:00:00+00:00 | -0.850649 | -0.713958 | -0.172411 | -0.375508 | 1.410634 | -0.439250 | -1.701368 | -0.324816 | 0.364080 | 1.958639 | ... | -0.264066 | 0.121890 | 1.369458 | -0.039989 | 0.889648 | -0.683334 | -0.118875 | -0.536274 | -1.015804 | -1.321273 |
2000-01-03 00:00:00+00:00 | 0.597315 | -1.068466 | 0.451337 | 0.177858 | -0.895664 | 0.159911 | -0.154640 | -0.921625 | 1.908262 | 0.727127 | ... | -0.462427 | -1.282839 | -0.104688 | -0.059753 | -2.471115 | 0.302470 | -0.949837 | -0.040999 | 0.557134 | 1.200767 |
2000-01-04 00:00:00+00:00 | 0.639688 | 1.055032 | 0.194147 | 0.099894 | -0.108194 | 2.204129 | 0.332998 | 1.181519 | 1.594374 | 0.468094 | ... | -1.030670 | 1.751952 | 1.100828 | 0.199484 | 1.006872 | -0.707579 | -0.271732 | 1.194524 | -0.040274 | 0.612135 |
2000-01-05 00:00:00+00:00 | -0.099197 | 0.136847 | -1.959596 | 0.685271 | 0.554076 | -0.320950 | -2.394725 | -0.880683 | -0.781639 | -0.556718 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 480 columns
out = qr.scoped_namespaces['binned']['out']
out.iloc[0].head()
08:00:00 | 08:01:00 | 08:02:00 | 08:03:00 | 08:04:00 | 08:05:00 | 08:06:00 | 08:07:00 | 08:08:00 | 08:09:00 | ... | 15:50:00 | 15:51:00 | 15:52:00 | 15:53:00 | 15:54:00 | 15:55:00 | 15:56:00 | 15:57:00 | 15:58:00 | 15:59:00 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2000-01-01 00:00:00+00:00 | -0.207255 | 0.267061 | 1.065043 | 0.492873 | 1.801605 | 1.635603 | 0.268815 | 1.694796 | -0.815680 | 1.921499 | ... | 0.095201 | -0.074996 | 0.239721 | 0.178841 | -0.927879 | -0.027830 | -0.049922 | -0.056138 | 0.362360 | -0.276897 |
2000-01-02 00:00:00+00:00 | -0.850649 | -0.713958 | -0.172411 | -0.375508 | 1.410634 | -0.439250 | -1.701368 | -0.324816 | 0.364080 | 1.958639 | ... | -0.264066 | 0.121890 | 1.369458 | -0.039989 | 0.889648 | -0.683334 | -0.118875 | -0.536274 | -1.015804 | -1.321273 |
2000-01-03 00:00:00+00:00 | 0.597315 | -1.068466 | 0.451337 | 0.177858 | -0.895664 | 0.159911 | -0.154640 | -0.921625 | 1.908262 | 0.727127 | ... | -0.462427 | -1.282839 | -0.104688 | -0.059753 | -2.471115 | 0.302470 | -0.949837 | -0.040999 | 0.557134 | 1.200767 |
2000-01-04 00:00:00+00:00 | 0.639688 | 1.055032 | 0.194147 | 0.099894 | -0.108194 | 2.204129 | 0.332998 | 1.181519 | 1.594374 | 0.468094 | ... | -1.030670 | 1.751952 | 1.100828 | 0.199484 | 1.006872 | -0.707579 | -0.271732 | 1.194524 | -0.040274 | 0.612135 |
2000-01-05 00:00:00+00:00 | -0.099197 | 0.136847 | -1.959596 | 0.685271 | 0.554076 | -0.320950 | -2.394725 | -0.880683 | -0.781639 | -0.556718 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 480 columns
out = qr.scoped_namespaces['reindex']['out']
out.iloc[0].head()
08:00:00 | 08:01:00 | 08:02:00 | 08:03:00 | 08:04:00 | 08:05:00 | 08:06:00 | 08:07:00 | 08:08:00 | 08:09:00 | ... | 15:50:00 | 15:51:00 | 15:52:00 | 15:53:00 | 15:54:00 | 15:55:00 | 15:56:00 | 15:57:00 | 15:58:00 | 15:59:00 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2000-01-01 | -0.207255 | 0.267061 | 1.065043 | 0.492873 | 1.801605 | 1.635603 | 0.268815 | 1.694796 | -0.815680 | 1.921499 | ... | 0.095201 | -0.074996 | 0.239721 | 0.178841 | -0.927879 | -0.027830 | -0.049922 | -0.056138 | 0.362360 | -0.276897 |
2000-01-02 | -0.850649 | -0.713958 | -0.172411 | -0.375508 | 1.410634 | -0.439250 | -1.701368 | -0.324816 | 0.364080 | 1.958639 | ... | -0.264066 | 0.121890 | 1.369458 | -0.039989 | 0.889648 | -0.683334 | -0.118875 | -0.536274 | -1.015804 | -1.321273 |
2000-01-03 | 0.597315 | -1.068466 | 0.451337 | 0.177858 | -0.895664 | 0.159911 | -0.154640 | -0.921625 | 1.908262 | 0.727127 | ... | -0.462427 | -1.282839 | -0.104688 | -0.059753 | -2.471115 | 0.302470 | -0.949837 | -0.040999 | 0.557134 | 1.200767 |
2000-01-04 | 0.639688 | 1.055032 | 0.194147 | 0.099894 | -0.108194 | 2.204129 | 0.332998 | 1.181519 | 1.594374 | 0.468094 | ... | -1.030670 | 1.751952 | 1.100828 | 0.199484 | 1.006872 | -0.707579 | -0.271732 | 1.194524 | -0.040274 | 0.612135 |
2000-01-05 | -0.099197 | 0.136847 | -1.959596 | 0.685271 | 0.554076 | -0.320950 | -2.394725 | -0.880683 | -0.781639 | -0.556718 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 480 columns
pivot_aapl = qr.scoped_namespaces['pivot']['prices_pivot']['AAPL']
pivot_aapl.head()
time | 08:00:00 | 08:01:00 | 08:02:00 | 08:03:00 | 08:04:00 | 08:05:00 | 08:06:00 | 08:07:00 | 08:08:00 | 08:09:00 | ... | 15:50:00 | 15:51:00 | 15:52:00 | 15:53:00 | 15:54:00 | 15:55:00 | 15:56:00 | 15:57:00 | 15:58:00 | 15:59:00 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | |||||||||||||||||||||
2000-01-01 | -0.207255 | 0.267061 | 1.065043 | 0.492873 | 1.801605 | 1.635603 | 0.268815 | 1.694796 | -0.815680 | 1.921499 | ... | 0.095201 | -0.074996 | 0.239721 | 0.178841 | -0.927879 | -0.027830 | -0.049922 | -0.056138 | 0.362360 | -0.276897 |
2000-01-02 | -0.850649 | -0.713958 | -0.172411 | -0.375508 | 1.410634 | -0.439250 | -1.701368 | -0.324816 | 0.364080 | 1.958639 | ... | -0.264066 | 0.121890 | 1.369458 | -0.039989 | 0.889648 | -0.683334 | -0.118875 | -0.536274 | -1.015804 | -1.321273 |
2000-01-03 | 0.597315 | -1.068466 | 0.451337 | 0.177858 | -0.895664 | 0.159911 | -0.154640 | -0.921625 | 1.908262 | 0.727127 | ... | -0.462427 | -1.282839 | -0.104688 | -0.059753 | -2.471115 | 0.302470 | -0.949837 | -0.040999 | 0.557134 | 1.200767 |
2000-01-04 | 0.639688 | 1.055032 | 0.194147 | 0.099894 | -0.108194 | 2.204129 | 0.332998 | 1.181519 | 1.594374 | 0.468094 | ... | -1.030670 | 1.751952 | 1.100828 | 0.199484 | 1.006872 | -0.707579 | -0.271732 | 1.194524 | -0.040274 | 0.612135 |
2000-01-05 | -0.099197 | 0.136847 | -1.959596 | 0.685271 | 0.554076 | -0.320950 | -2.394725 | -0.880683 | -0.781639 | -0.556718 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 480 columns