# Tell IPython to display matplotlib plots inline.
%matplotlib inline

# Set default font attributes.
import matplotlib
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 13}
matplotlib.rc('font', **font)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

randn = np.random.randn

pd.set_option('display.mpl_style', 'default')
pd.set_option('display.max_rows', 15)

# Make a default figure size for later use.
DEFAULT_FIGSIZE = (12, 6)

s = pd.Series([3, 5, 7, 2])
s

# An important concept to understand when working with a `Series` is that it's
# actually composed of two pieces: an index array and a data array.
print "The index is {0}.".format(s.index)
print "The values are {0}.".format(s.values)

# You can explicitly pass your own labels to use as an index. If you don't,
# Pandas will construct a default index with integer labels.
pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd'])

# You can also construct a Series from a dictionary. The keys are used as the
# index, and the values are used as the Series' values.
pd.Series(
    {
        'a': 1,
        'b': 2,
        'c': 3,
    }
)

# You get performance (and code clarity!) benefits if your Series'
# labels/values are homogeneously-typed, but mixed-type arrays are supported.
pd.Series(
    [1, 2.6, 'a', {'a': 'b'}],
    index=[1, 'a', 2, 2.5],
)

s = pd.Series(range(10), index=list('ABCDEFGHIJ'))
s

# Lookups by key work as you'd expect.
s['E']

# We can look up multiple values at a time by passing a list of keys.
# The resulting value is a new `Series`.
s[['E', 'I', 'B']]

# Because the Index is ordered, we can use Python's slicing syntax.
s['E':]

# Label-based slicing is inclusive of both endpoints.
s[:'I']
s['E':'I']

# Step arguments work just like Python lists.
s['E':'I':2]

# If you don't know the label you want, but you do know the position, you can
# use `iloc`.
print "The first entry is: %d" % s.iloc[0]
print "The last entry is: %d" % s.iloc[-1]

# Slicing works with `iloc` as well.
# Note that, unlike with label-based slicing, integer-based slices are
# right-open intervals, i.e. doing s.iloc[X:Y] gives you elements with indices
# in [X, Y). This is the same as the semantics for list slicing.
s.iloc[5:]
print s.iloc[:5]
s.iloc[-3:]

# Create two Series objects containing 100 samples each of sine and cosine.
sine = pd.Series(np.sin(np.linspace(0, 3.14 * 2, 100)), name='sine')
cosine = pd.Series(np.cos(np.linspace(0, 3.14 * 2, 100)), name='cosine')
sine
cosine

# Multiplying two Series objects produces a new Series by multiplying values
# that have the same keys.
product = cosine * sine
product

# Adding or multiplying a Series by a scalar applies that operation to each
# value in the Series.
cosine_plus_one = cosine + 1
cosine_plus_one

# Other binary operators work as you'd expect. Note how much cleaner and
# clearer this is compared to looping over two containers and performing
# multiple operations on elements from each.
identity = (sine ** 2) + (cosine ** 2)
identity

# Plot our sine values.
trigplot = sine.plot(
    ylim=(-1.2, 1.2),
    legend=True,
    figsize=DEFAULT_FIGSIZE,
    linewidth=3,
    label='sine',
)

# Add our other Series to the same plot.
cosine.plot(ax=trigplot, legend=True, linewidth=3)
product.plot(ax=trigplot, legend=True, linewidth=3, label='product')
identity.plot(ax=trigplot, legend=True, linewidth=3, label='identity')

# Return the character in the tenths place of N's decimal representation.
def tenths_place(N):
    s = str(N)
    return s[s.find('.') + 1]

product.apply(tenths_place)
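# A quick, illustrative sanity check of tenths_place on a literal value:
# str(3.14159).find('.') + 1 points at the character just after the decimal
# point, so this should return '1'.
tenths_place(3.14159)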
# A major problem when working with real-world data is handling missing
# entries. Pandas handles missing data by aligning on index labels: a label
# that appears in only one operand produces NaN in the result.
s1 = pd.Series({'a': 1, 'b': 2, 'c': 3})
# s2 is missing an entry for 'b'.
s2 = pd.Series({'a': 4, 'c': 5})
s1 + s2

s1 = pd.Series(
    {
        'A': 1,
        'B': 2,
        'C': 3,
        'D': 4,
        'E': 3,
        'F': 2,
        'G': 1,
    }
)

# You can create a constant Series by passing a scalar value and an index.
s2 = pd.Series(2, index=s1.index)

greater = s1 > s2
greater
less = s1 < s2
less
equal = s1 == s2
equal

# Comparisons against scalars also work.
s1_equal_to_3 = s1 == 3
s1_equal_to_3

# Collect all of the comparison results into a single DataFrame for display.
pd.DataFrame({
    's1': s1,
    's2': s2,
    's1 > s2': greater,
    's1 == s2': equal,
    's1 < s2': less,
    's1 == 3': s1_equal_to_3,
}, columns=['s1', 's2', 's1 > s2', 's1 == s2', 's1 < s2', 's1 == 3'])

# Indexing into a Series with a boolean Series masks away the values that were
# False in the passed Series.
s1[s1 > s2]

# We can combine these operators to concisely express complex
# computations/filters.
s1[(s1 > 1) & ~(s1 > s2)]

# Pandas has a special index class, `DatetimeIndex`, for representing
# time series data.
start = pd.Timestamp('2014-01-01', tz='UTC')
end = pd.Timestamp('2014-01-09', tz='UTC')

# date_range is an easy way to construct a DatetimeIndex.
daily_index = pd.date_range(start, end)
daily_index

# A DatetimeIndex has a notion of its frequency.
from pandas.tseries.offsets import Day, Hour, BDay, Minute

hourly_index = pd.date_range(
    pd.Timestamp('2014-01-01', tz='UTC'),
    pd.Timestamp('2014-01-09', tz='UTC'),
    freq=Hour(),
)
hourly_index

bihourly_index = pd.date_range(
    pd.Timestamp('2014-01-01', tz='UTC'),
    pd.Timestamp('2014-01-09', tz='UTC'),
    freq=Hour(2),
)
bihourly_index

weekday_index = pd.date_range(
    pd.Timestamp('2014-01-01', tz='UTC'),
    pd.Timestamp('2014-01-09', tz='UTC'),
    freq=BDay(),
)
print weekday_index
[i for i in weekday_index]

ts = pd.Series(
    np.arange(30) ** 2,
    pd.date_range(
        start=pd.Timestamp('2014-01-01', tz='UTC'),
        freq='1D',
        periods=30,
    )
)
ts.plot()

# By default, resampling to a lower frequency takes the mean of the entries
# that were downsampled.
resampled = ts.resample('5D')
resampled

# We can customize this behavior, though.
resampled_first = ts.resample('5D', how='first')
resampled_first
resampled_last = ts.resample('5D', how='last')
resampled_last

# We can even define our own custom sampling methods.
def geometric_mean(subseries):
    return np.product(subseries.values) ** (1.0 / len(subseries))

resampled_geometric = ts.resample('5D', how=geometric_mean)
print resampled_geometric

pd.DataFrame(
    {
        "resampled": resampled,
        "resampled_first": resampled_first,
        "resampled_last": resampled_last,
        "resampled_geometric": resampled_geometric,
    }
).plot(linewidth=2, figsize=DEFAULT_FIGSIZE)

# Upsampling creates missing data, which is represented by numpy.nan.
ts.resample('6H')

# We can fill empty values with fillna.
zero_filled = ts.resample('6H').fillna(0)
print zero_filled

# We can forward-fill with the last known prior value.
ffilled = ts.resample('6H').ffill()
print ffilled

# We can backfill with the earliest known next value.
bfilled = ts.resample('6H').bfill()
print bfilled

# We can interpolate between known values.
# Note: `interpolate` is new as of pandas 0.14.0. Quantopian is currently on
# pandas 0.12.0 due to breaking changes in the pandas API in 0.13.0.
linear_interpolated = ts.resample('6H').interpolate()
linear_interpolated

quadratic_interpolated = ts.resample('6H').interpolate('polynomial', order=2)
quadratic_interpolated
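# A quick illustrative check: the first two daily values of ts are 0 ** 2 = 0
# and 1 ** 2 = 1, so linear interpolation at 6-hour steps between them should
# yield 0.25, 0.5, and 0.75.
linear_interpolated.head(5)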
pd.DataFrame(
    {
        "linear_interpolated": linear_interpolated,
        "quadratic_interpolated": quadratic_interpolated,
        "bfilled": bfilled,
        "ffilled": ffilled,
        "zero_filled": zero_filled,
    }
).plot(linewidth=2, figsize=DEFAULT_FIGSIZE)

# Oftentimes we have more than one axis on which we want to store data.
from pandas.io.data import get_data_yahoo

spy = get_data_yahoo(
    symbols='SPY',
    start=pd.Timestamp('2011-01-01'),
    end=pd.Timestamp('2014-01-01'),
    adjust_price=True,
)
spy

# Just plotting this DataFrame with the default arguments isn't very useful,
# because the scale of Volume is so much greater than that of the other
# columns.
spy.plot(figsize=DEFAULT_FIGSIZE)

# Let's make a more interesting plot.
# Create a figure.
fig = plt.figure()

# Add a subplot for price.
price_subplot = fig.add_subplot('311', xlabel='Date', ylabel='Price')
spy['Close'].plot(ax=price_subplot, lw=2)  # lw means "line width".

# Add another subplot for each day's spread.
spread_subplot = fig.add_subplot('312', xlabel='Date', ylabel='Spread')
spread = spy['High'] - spy['Low']
spread.plot(ax=spread_subplot, lw=2, color='r')

# And add a third plot for volume.
volume_subplot = fig.add_subplot('313', xlabel='Date', ylabel='Volume')
spy['Volume'].plot(ax=volume_subplot, lw=2)

# matplotlib.pyplot.gcf is short for "get current figure". It provides an easy
# way to modify the last drawn plot.
plt.gcf().set_size_inches(*DEFAULT_FIGSIZE)

# Unsurprisingly, spread is strongly correlated with daily volume.
spread.corr(spy['Volume'])

# Default indexing acts on column labels.
# Passing a scalar value drops the dimension by one.
spy['Close']  # Returns a Series.

# Passing a list filters the columns down to the supplied values.
spy[['Close', 'Volume']]

# Using .loc with one argument takes a slice of rows based on label.
spy.loc[pd.Timestamp('2013-02-01'):pd.Timestamp('2013-02-28')]

# Using .loc with two arguments takes a slice of rows based on label, then a
# slice of columns based on name.
# Note the comma between the first slice and the second slice!
spy.loc[pd.Timestamp('2013-02-01'):pd.Timestamp('2013-02-28'), 'Open':'Low']

# We can use iloc when we want lookups by position.
spy.iloc[-20:-10, [0, 2]]

# Get the days on which SPY closed higher than it opened.
up_days = spy['Close'] > spy['Open']
up_days
spy[up_days]

# We can use .ix when we want mixed lookups.
spy.ix[-20:-10, 'Open':'High']

five_day_returns = spy['Close'].pct_change(5)
five_day_returns

# Checking floating-point numbers for exact equality is a bad idea because of
# roundoff error. `numpy.allclose` does an appropriate epsilon test.
test_return = (spy['Close'].iloc[5] - spy['Close'].iloc[0]) / spy['Close'].iloc[0]
np.allclose(five_day_returns.iloc[5], test_return)

thirty_day_forward_returns = (spy['Close'].shift(-30) - spy['Close']) / spy['Close']
test_return = (spy['Close'].iloc[30] - spy['Close'].iloc[0]) / spy['Close'].iloc[0]
np.allclose(thirty_day_forward_returns.iloc[0], test_return)

returns = pd.DataFrame(
    {
        'forward_30Day': thirty_day_forward_returns,
        'backA_2Day': spy['Close'].pct_change(2),
        'backB_5Day': spy['Close'].pct_change(5),
        'backD_50Day': spy['Close'].pct_change(50),
        'backE_100Day': spy['Close'].pct_change(100),
        'backF_200Day': spy['Close'].pct_change(200),
        'backG_300Day': spy['Close'].pct_change(300),
    }
).dropna(how='any')

returns.plot(figsize=DEFAULT_FIGSIZE)
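# As a quick illustrative check before computing the full matrix below:
# DataFrame.corr computes pairwise correlations of columns, so any single
# entry of the matrix should match the corresponding Series.corr call.
returns['backB_5Day'].corr(returns['forward_30Day'])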
# Pairwise correlation of forward and backward returns.
corr = returns.corr()
corr

corr.ix['forward_30Day', :-1].plot(kind='bar', position=.5, xlim=(-1, 6))
plt.gcf().set_size_inches(9, 6)

# Load data for Pepsi and Coca-Cola from Yahoo.
symbols = [
    'PEP',
    'KO',
]
cola_data = get_data_yahoo(symbols, adjust_price=True)
cola_data

# Compute the 1-day log returns for both securities' close prices.
closes = cola_data['Close']
yesterday_closes = cola_data['Close'].shift(1)
cola_log_returns = (closes / yesterday_closes).apply(np.log)
cola_raw_returns = closes.pct_change(1)

# Look at the data we just calculated by throwing it into a Panel and
# pulling out just the DataFrame for KO.
pd.Panel({
    'closes': closes,
    'prev_closes': yesterday_closes,
    'log_returns': cola_log_returns,
    'raw_returns': cola_raw_returns,
}).loc[:, :, 'KO']

# Pull the raw returns and the log returns into a single DataFrame using
# DataFrame.join.
closes.join(cola_log_returns, rsuffix='_lr')\
      .join(cola_raw_returns, rsuffix='_rr')\
      .dropna(how='any')

# Create a figure with three 'slots' for subplots.
fig = plt.figure()

# 311 here means "put the subplot in the 1st slot of a 3 x 1 grid".
# 312 and 313 tell matplotlib to place the subsequent plots in the 2nd and
# 3rd slots.
price_subplot = fig.add_subplot('311', xlabel='Date', ylabel='Price')
return_subplot_pep = fig.add_subplot('312', xlabel='Date', ylabel='PEP Log Returns')
return_subplot_ko = fig.add_subplot('313', xlabel='Date', ylabel='KO Log Returns')

cola_data['Close'].plot(ax=price_subplot, color=['purple', 'red'])
cola_log_returns['PEP'].plot(ax=return_subplot_pep, color='red')
cola_log_returns['KO'].plot(ax=return_subplot_ko, color='purple')

# Set the size of the whole plot array. gcf stands for "get current figure".
plt.gcf().set_size_inches(14, 10)

# Compute the correlation of our log returns.
correlation = (cola_log_returns['PEP']).corr(cola_log_returns['KO'])
correlation

# Compute the column-wise standard deviation of daily returns and multiply by
# sqrt(252) (the approximate number of trading days in a year) to get
# annualized volatility.
volatility = cola_log_returns.std() * np.sqrt(252)
volatility
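# The sqrt(252) scaling rests on the (rough) assumption of independent,
# identically-distributed daily returns: variance then grows linearly with the
# number of periods, so standard deviation grows with its square root. As an
# illustrative example, a 1% daily standard deviation corresponds to roughly
# 0.01 * sqrt(252) ~= 15.9% annualized volatility.
print 0.01 * np.sqrt(252)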