import pandas as pd from pandas import Series, DataFrame, Panel values = [1, 2, 3] labels = ['Cashews', 'Almonds', 'Peanuts'] s = Series(values, labels) s s.index s.values type(s.values) s['Peanuts'] s['Almonds'] s['Cashews'] d = dict(zip(labels, values)) Series(d) df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'b', 'c'], columns=labels) df df.index df.columns df.values df['Cashews'] type(df['Cashews']) df.Cashews df.ix['a', :] p = Panel({'x' : df, 'y' : df**2}) p p.index p.items p.major_axis p.minor_axis p['x'] type(p['x']) p.ix['x', :, :] p.ix[:, :, 'Cashews'] #items, major, minor s s[0] s[-1] s[:2] s[:-1] s[['Cashews', 'Peanuts', 'Almonds']] s[0] = 5 df df.ix[0] df.ix['a', :] df.ix[:, 'Cashews'] df.ix[0, 1] df.ix[[0, 1], :-1] df.ix[df.Almonds > 4] df[df > 3] # new in 0.9.1 df.ix[:, 0] = 1 df.ix['b', :] = Series([6, 1, 5], df.columns) df path = 'https://dl.dropbox.com/u/22164876/data.csv' df = pd.read_csv(path) df df.shape df.index df.head() df = pd.read_csv(path, index_col=['Date', 'Time']) df df.index[:5] df.index[0] ticks = pd.read_csv(path, parse_dates={'ts': ['Date', 'Time']}, index_col='ts') ticks ticks.index ticks.index[0] isinstance(ticks.index[0], datetime.datetime) ticks.index[0].nanosecond sq_price = ticks.Price**2 np.sqrt(sq_price.mean()) sq_vol = ticks.Volume**2 np.sqrt(sq_vol.mean()) ticks['Price'].std() mean = ticks.Price.mean() std = ticks.Price.std(ddof=0) uncentered = np.sqrt(mean**2 + std**2) uncentered std = ticks.Price.std() ticks['Price'].std() ticks['Price'].std() ticks.to_csv('tmp.csv') pd.read_csv('tmp.csv', index_col='ts') import pandas.io.sql as sql import sqlite3 con = sqlite3.connect(':memory:') sql.write_frame(ticks, 'ticks', con) sql.read_frame('select * from ticks', con) store = pd.HDFStore('ticks.h5') store['ticks'] = ticks store['ticks'] store.close() df = ticks.ix[:1000, ['Price', 'Volume']] df df.index[0] df.describe() df.count() df.mean() df.std() df.min() df.max() df.quantile(0.50) df.median() df.mean(axis=1) df.ix[:10, :].std(axis=1) df.sum() df.apply(lambda x: x.sum()) df.applymap(lambda x: x.sum()).head() # no effect df.head() df.head().applymap(lambda x: x**2) df.ix[:5] df.ix[:5] * 10 df.ix[:5] + 100 df.ix[:5] - df.ix[:5] means = df.mean() means df.ix[:5] - means result = df - means result.mean() df.cov() df.dot(df.T) # oops, wrong orientation df.T.dot(df) df.corr() df.ix[:5].abs() df.kurt() ticks.ix[:, :3].head() df.cov() demeaned = df - df.mean() numer = demeaned.T.dot(demeaned) denom = demeaned.count() numer / denom numer / (denom - 1) def cov(df): demeaned = df - df.mean() numer = demeaned.T.dot(demeaned) denom = demeaned.count() return numer / (denom - 1) df2 = df.copy() df2.ix[:10, 0] = np.nan df2.ix[-10:, 1] = np.nan cov(df2) cov(df2.dropna()) df2.cov() df.ix[:5] df.ix[6:11] df.ix[:5] + df.ix[6:11] df.head() df.head().shift() df.Price.pct_change() every_other = df.ix[[0, 1, 2, 5, 7]] every_other missing = every_other.reindex(df.ix[:20].index) missing df = ticks.ix[:, ['Price', 'Volume']] df df = ticks.reindex(columns=['Price', 'Volume']) df missing missing.mean() missing.values missing.values.mean(axis=0) missing.Price missing.Price.fillna(method='ffill') missing.Price.fillna(method='bfill') missing.Price.fillna(method='ffill', limit=3) missing.Price.fillna(missing.Price.mean()) missing.Price.interpolate() missing.apply(Series.interpolate) missing.dropna() ticks = pd.read_csv(path, parse_dates=['Date']) ticks df = ticks.ix[:, ['Price', 'Volume']] df df['Returns'] = df.Price.pct_change() grouped = df.groupby(ticks.Date) grouped grouped.Returns.mean() grouped.Volume.sum() grouped.Price.std() grouped.Returns.agg(lambda x: (1 + x).prod() - 1) grouped.agg(lambda x: (x.Price * x.Volume).sum() / x.Volume.sum()) grouped.agg(lambda x: x.irow(-1) / x.irow(0) - 1) grouped.last() / grouped.first() - 1 rs = grouped.transform(lambda x: (x - x.mean()) / x.std()) rs rs.min() rs.max() rs.mean() rs.std() ticks df = ticks.ix[:, ['Date', 'Time', 'Price', 'Volume']] df df = df.set_index(['Date', 'Time']) df rets = df.Price / df.Price.shift(5) - 1 rets grouped = df.groupby(level=0) rs = grouped.transform(lambda x: (x / x.shift(5) - 1).cumsum()) returns = rs.Price returns def raw_var(df, dates): return df.groupby(dates).agg(lambda x: (x**2).sum() / (x.count() - 1)) raw_var(df, ticks.Date)