# --- Notebook-style setup (this file is an IPython session transcript) ---
import numpy as np
import pandas as pd
from datetime import time
# NOTE(review): 'html' is not a valid option key in modern pandas
# (the display option is 'display.notebook_repr_html'); confirm the
# targeted pandas version before running.
pd.set_option('html', False)
from IPython.core.display import Image
# Render the book cover inline via IPython's rich display.
Image('http://akamaicovers.oreilly.com/images/0636920023784/lrg.jpg')
# Redundant re-imports left over from separate notebook cells; harmless.
import pandas as pd
import numpy as np
# '%s'-style filename template for per-ticker minute-bar CSVs
# (path is specific to the original author's machine).
temp = '/Users/wesm/Downloads/minutebars/%s.csv'
path = temp % 'AAPL'
# IPython shell escape (not plain Python): line count of the AAPL file.
!wc -l $path
aapl_bars = pd.read_csv(temp % 'AAPL')
aapl_bars
# IPython %time magic (not plain Python): time a re-parse of the same file.
%time _ = pd.read_csv(path)
# Peek at the raw 'dt' timestamp column (attribute access to column 'dt').
aapl_bars.dt
# Promote the parsed timestamps to the index, dropping the 'dt' column.
aapl_bars.index = pd.to_datetime(aapl_bars.pop('dt'))
aapl_bars.head()
def load_bars(ticker, template=None):
    """Load one-minute bars for *ticker* from a CSV file.

    Parameters
    ----------
    ticker : str
        Symbol interpolated into the '%s'-style filename template.
    template : str, optional
        Path template; defaults to the module-level ``temp`` so existing
        ``load_bars('IBM')`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        Bars indexed by the parsed 'dt' timestamp column.
    """
    if template is None:
        template = temp  # module-level path template defined above
    bars = pd.read_csv(template % ticker)
    # Move 'dt' into a DatetimeIndex, mirroring the AAPL setup above.
    bars.index = pd.to_datetime(bars.pop('dt'))
    return bars
# --- Time-of-day selection, resampling, and rolling stats on AAPL bars ---
# All rows stamped exactly 15:00 (one per trading day).
aapl_bars.at_time(time(15, 0)).head(10)
# Minute-level closes for a single session, selected by date string.
aapl_bars.close_price['2009-10-15']
aapl_bars.close_price
# Monthly summary statistics of the close price.
# Modern API: resample(...).agg(...) replaces the removed how= keyword.
mth_mean = aapl_bars.close_price.resample('M').agg(['mean', 'median', 'std'])
mth_mean
mth_mean.plot()
# Minute-over-minute simple returns, computed two equivalent ways.
close = aapl_bars.close_price
close / close.shift(1) - 1
minute_returns = aapl_bars.close_price.pct_change()
# Rolling std over ~10 trading days (390 minutes per session).
# Modern API: Series.rolling(...).std() replaces the removed pd.rolling_std.
std_10day = minute_returns.rolling(390 * 10).std()
# Downsample to business days (mean, the old resample default) and plot.
std_10day.resample('B').mean().plot()
# --- Data alignment demo with two partially-overlapping time series ---
ts1 = pd.Series(np.random.randn(10),
                index=pd.date_range('1/1/2000', periods=10))
ts1
# Positional subset (7 of the 10 dates). Use .iloc: plain [] with an
# integer list is label-based on a DatetimeIndex in modern pandas.
ts2 = ts1.iloc[[0, 2, 4, 5, 6, 7, 8]]
ts2
# Arithmetic aligns on the union of the indexes; dates missing from ts2
# produce NaN in the sum.
ts1 + ts2
# DataFrame construction aligns too: column B is NaN on the 3 missing dates.
df = pd.DataFrame({'A': ts1, 'B': ts2})
df
# Load IBM minute bars with the helper defined above.
ibm_bars = load_bars('IBM')
def subsample(frame, pct=0.9):
    """Return a random ~``pct`` fraction of *frame*'s rows, in original order.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to sample from (positional take, so any index type works).
    pct : float, default 0.9
        Fraction of rows to keep.

    Returns
    -------
    pandas.DataFrame
        ``int(pct * len(frame))`` rows with the original row order preserved.
    """
    n_rows = len(frame)
    # int() cast: slicing with a float (pct * n_rows) is a TypeError on
    # Python 3 / modern NumPy.
    keep = int(pct * n_rows)
    # Sort the kept positions so the sampled rows stay in original order.
    indexer = np.sort(np.random.permutation(n_rows)[:keep])
    return frame.take(indexer)
# --- Combine subsampled IBM and AAPL bars; explore missing-data handling ---
f1 = subsample(ibm_bars)
f2 = subsample(aapl_bars)
f1
# Side-by-side concat with a top-level column key per ticker; minutes
# dropped from one frame but not the other become NaN.
both = pd.concat([f1, f2], axis=1, keys=['IBM', 'AAPL'])
both.head(20)
df
# Non-null counts per column.
df.count()
both.count()
# NaNs are skipped in reductions (column sums, row means).
df.sum()
df.mean(1)
# Drop rows containing any NaN / fill NaN with a constant.
df.dropna()
df.fillna(0)
# Forward-fill. Modern API: DataFrame.ffill() replaces the deprecated
# fillna(method='ffill').
df.ffill()
# Reindex to a fixed 4-hour frequency, then forward-fill at most 3 gaps.
df.asfreq('4h')
df.asfreq('4h').ffill(limit=3)
import random, string
import matplotlib as mpl
def rands(n):
    """Return a random string of *n* ASCII letters (upper and lower case).

    Not cryptographically secure — uses the ``random`` module, which is
    fine for generating fake ticker symbols.
    """
    choices = string.ascii_letters
    # range(), not Python-2 xrange(); join a generator directly.
    return ''.join(random.choice(choices) for _ in range(n))
# Default to larger matplotlib figures for the plots below.
mpl.rc('figure', figsize=(12, 8))
# --- Build a random universe of tickers with industry / currency labels ---
ind_names = np.array(['ENERGY', 'FINANCIAL', 'TECH',
                      'CONSDUR', 'SERVICES', 'UTILITIES'], dtype='O')
ccys = np.array(['USD', 'EUR'], dtype='O')
Nfull = 2000
# range(), not Python-2 xrange(). Random 5-letter names can collide, so
# dedupe below; np.unique also returns them sorted.
tickers = np.array(sorted(rands(5).upper() for _ in xrange(Nfull)), dtype='O')
tickers = np.unique(tickers)
# Random industry / currency per ticker. Size the draws by len(tickers),
# not Nfull: after np.unique the index can be shorter than Nfull, and a
# length mismatch would raise here.
industries = pd.Series(ind_names.take(np.random.randint(0, 6, len(tickers))),
                       index=tickers, name='industry')
ccy = pd.Series(ccys.take(np.random.randint(0, len(ccys), len(tickers))),
                index=tickers, name='ccy')
ccy
# --- Random factor exposures for 1000 tickers; group stats by label ---
df = pd.DataFrame({'Momentum' : np.random.randn(1000) / 200 + 0.03,
                   'Value' : np.random.randn(1000) / 200 + 0.08,
                   'ShortInterest' : np.random.randn(1000) / 200 - 0.02},
                  # Permute over the actual ticker count: after np.unique,
                  # len(tickers) can be < Nfull and permutation(Nfull)
                  # positions would be out of bounds for take().
                  index=tickers.take(np.random.permutation(len(tickers))[:1000]))
df.head()
# Mean exposure per industry; groupby aligns the label Series on df's index.
means = df.groupby(industries).mean()
means
means.plot(kind='barh')
# Two-level grouping: (industry, currency).
means = df.groupby([industries, ccy]).mean()
means
keys = [industries, ccy]
# Standardize each factor within its (industry, ccy) group...
zscore = lambda x: (x - x.mean()) / x.std()
normed = df.groupby(keys).apply(zscore)
# ...and confirm: per-group mean ~0, std ~1.
normed.groupby(keys).agg(['mean', 'std'])
means
means['Momentum']
# Row selection on the first index level. Modern API: .loc replaces the
# removed .ix indexer.
means.loc['TECH']
means.stack()
means.stack().unstack('industry')
# --- MovieLens 1M: load the three '::'-delimited tables and join them ---
# (path is specific to the original author's machine)
base = '/Users/wesm/Dropbox/book/svn/book_scripts/movielens/ml-1m'
get_path = lambda x: '%s/%s.dat' % (base, x)
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(get_path('users'), sep='::', header=None, names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(get_path('ratings'), sep='::', header=None, names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(get_path('movies'), sep='::', header=None, names=mnames)
movies.head()
ratings.head()
users.head()
# Inner-join ratings -> users on user_id, then -> movies on movie_id
# (merge infers the join keys from the shared column names).
data = pd.merge(pd.merge(ratings, users), movies)
data
# Restrict to titles with more than 1000 ratings, then take the 20 with
# the highest mean rating. Modern API: .sort_values() replaces the
# removed Series.order().
rating_counts = data.groupby('title').size()
freq_titles = rating_counts.index[rating_counts > 1000]
freq_titles
highest_rated = data.groupby('title').rating.mean()[freq_titles].sort_values()[-20:]
highest_rated
# Work on an explicit copy so truncating the titles does not trigger
# chained-assignment warnings on a view of `data`.
filtered = data[data.title.isin(highest_rated.index)].copy()
filtered['title'] = filtered.title.str[:25]
filtered.groupby(['title', 'gender']).rating.count().unstack()
# Mean rating per (title, gender). Modern API: index=/columns= replace
# the removed rows=/cols= keywords.
mean_ratings = data.pivot_table('rating', index='title',
                                columns='gender', aggfunc='mean')
mean_ratings.tail(20)
# summary, value_counts, etc.
# Frequency of each title across all ratings, most common first.
data.title.value_counts()
# Distribution summary of the rating column.
data.rating.describe()
# Per-gender rating summaries.
# NOTE(review): modern groupby(...).describe() returns a DataFrame rather
# than the stacked Series older pandas produced, so unstack(0) below may
# behave differently — verify against the pandas version in use.
by_gender = data.groupby('gender').rating.describe()
by_gender
by_gender.unstack(0)