import pandas as pd import pandas.util.testing as tm from pytz import timezone import numpy as np from pandas.tseries.tools import normalize_date import datetime import quantreactor as qr %load_ext Cython index = pd.date_range(start="2000/01/01", freq="MIN", periods=1001000, tz='UTC') index = index[(index.hour >= 8) & (index.hour < 16)] # make market hours 8am => 4pm short_wed = index[(index.dayofweek != 2) | (index.hour < 12)] data = np.random.randn(len(short_wed), 2) df = pd.DataFrame(data, index=short_wed, columns=['AAPL', 'MSFT']) %%scoped pivot prices = df.copy() prices['date'] = prices.index.date prices['time'] = prices.index.time prices_pivot = {} # dict of pivot tables, keyed to sid for stock in ['AAPL', 'MSFT']: prices_pivot[stock] = prices.pivot(index='date', columns='time',values=stock) %%scoped grouped_apply # naive reindex by day def align_time(df): if len(df) == 480: return df # not a full day, reindex start = datetime.datetime.combine(df.index[0].date(), datetime.time(8)) end = datetime.datetime.combine(df.index[0].date(), datetime.time(16)) ind = pd.date_range(start=start, end=end, freq="MIN") return df.reindex(ind) %%scoped grouped_apply grouped = df.groupby(pd.TimeGrouper('D')) out = grouped.apply(align_time) %%scoped reindex # need dates grouped = df.groupby(pd.TimeGrouper('D')) grouper = grouped.grouper # build up full normal index start = normalize_date(df.index[0]) end = normalize_date(df.index[-1]) + pd.tseries.frequencies.Day(1) - pd.tseries.frequencies.Minute(1) ind = pd.date_range(start=start, end=end, freq="MIN") # cut back down to market hours temp = df.reindex(ind).between_time("8:00", "16:00", include_end=False) # 480 is number of minutes in our market data = temp.values.T.reshape(2, len(grouped.grouper.binlabels), 480) times = temp.index[:480].time out = pd.Panel(data, items=['AAPL', 'MSFT'], minor_axis=times, major_axis=grouper.binlabels.date) %%scoped binned df = df.between_time("8:00", "16:00", include_end=False) market_time = pd.date_range(start="2000-01-01 8:00", end="2000-01-01 16:00", freq="min")[:-1].time start_open = normalize_date(df.index[0]) + pd.Timedelta(hours=8) start_close = normalize_date(df.index[0]) + pd.Timedelta(hours=16) open_times = pd.date_range(start=start_open, end=df.index[-1].date(), freq="D").values close_times = pd.date_range(start=start_close, end=df.index[-1].date(), freq="D").values grouped = df.groupby(pd.TimeGrouper('D')) grouper = grouped.grouper bins = grouper.bins values = df.values.T index_values = df.index.values out = np.empty((2, len(grouper.binlabels), 480)) out[:] = np.nan last = 0 for i in range(out.shape[1]): # full day if bins[i] - last == 480: out[:, i] = values[:, last:bins[i]] last = bins[i] continue val_day = values[:, last:bins[i]] day_times = index_values[last:bins[i]] obs = bins[i] - last current_time = open_times[i] last_day_time = 0 for j in range(480): if j >= obs: continue if current_time == day_times[last_day_time]: out[:, i, j] = val_day[:, last_day_time] last_day_time += 1 current_time += 60000000000 last = bins[i] out = pd.Panel(out, items=['AAPL', 'MSFT'], major_axis=grouper.binlabels, minor_axis=market_time) %%petri %%cython import bottleneck as bn cimport numpy as np import numpy as np from numpy cimport * cimport cython import_array() cdef double NaN = np.NaN from numpy cimport (double_t) def time_table(double_t[:, :] input, int64_t[:] index_values, int64_t[:] bins, int64_t[:] open_times): cdef int last = 0, i, last_day_time, obs cdef int64_t current_time cdef int days = len(bins) cdef int items = input.shape[0] cdef np.ndarray[double_t, ndim=3] output = np.empty((items, days, 480), dtype=float) output[:] = np.nan for i from 0 <= i < days: # full day if bins[i] - last == 480: output[:, i] = input[:, last:bins[i]] last = bins[i] continue val_day = input[:, last:bins[i]] day_times = index_values[last:bins[i]] obs = bins[i] - last current_time = open_times[i] last_day_time = 0 for j from 0 <= j < 480: if j >= obs: continue if current_time == day_times[last_day_time]: output[:, i, j] = val_day[:, last_day_time] last_day_time += 1 current_time += 60000000000 last = bins[i] return output %%scoped binned_cython df = df.between_time("8:00", "16:00", include_end=False) market_time = pd.date_range(start="2000-01-01 8:00", end="2000-01-01 16:00", freq="min")[:-1].time start_open = normalize_date(df.index[0]) + pd.Timedelta(hours=8) start_close = normalize_date(df.index[0]) + pd.Timedelta(hours=16) open_times = pd.date_range(start=start_open, end=df.index[-1].date(), freq="D").values grouped = df.groupby(pd.TimeGrouper('D')) grouper = grouped.grouper bins = grouper.bins values = df.values.T index_values = df.index.values out = time_table(values, index_values.astype(int), np.array(bins), open_times.astype(int)) out = pd.Panel(np.array(out), items=['AAPL', 'MSFT'], major_axis=grouper.binlabels, minor_axis=market_time) out = qr.scoped_namespaces['binned_cython']['out'] out.iloc[0].head() out = qr.scoped_namespaces['binned']['out'] out.iloc[0].head() out = qr.scoped_namespaces['reindex']['out'] out.iloc[0].head() pivot_aapl = qr.scoped_namespaces['pivot']['prices_pivot']['AAPL'] pivot_aapl.head()