# Tell IPython to display matplotlib plots inline.
%matplotlib inline

# Set default font attributes.
import matplotlib
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 13}
matplotlib.rc('font', **font)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

randn = np.random.randn

pd.set_option('display.mpl_style', 'default')
pd.set_option('display.max_rows', 15)

# Make a default figure size for later use.
DEFAULT_FIGSIZE = (12, 6)

s = pd.Series([3, 5, 7, 2])
s

# An important concept to understand when working with a `Series` is that it's
# actually composed of two pieces: an index array and a data array.
print "The index is {0}.".format(s.index)
print "The values are {0}.".format(s.values)

# You can explicitly pass your own labels to use as an index. If you don't,
# Pandas will construct a default index with integer labels.
pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd'])

# You can also construct a Series from a dictionary. The keys are used as the
# index, and the values are used as the Series' values.
pd.Series(
    {
        'a': 1,
        'b': 2,
        'c': 3,
    }
)

# You get performance (and code clarity!) benefits if your Series'
# labels/values are homogeneously-typed, but mixed-type arrays are supported.
pd.Series(
    [1, 2.6, 'a', {'a': 'b'}],
    index=[1, 'a', 2, 2.5],
)

s = pd.Series(range(10), index=list('ABCDEFGHIJ'))
s

# Lookups by key work as you'd expect.
s['E']

# We can look up multiple values at a time by passing a list of keys.
# The resulting value is a new `Series`.
s[['E', 'I', 'B']]

# Because the Index is ordered, we can use Python's slicing syntax.
s['E':]

# Label-based slicing is inclusive of both endpoints.
s[:'I']
s['E':'I']

# Step arguments work just like Python lists.
s['E':'I':2]

# If you don't know the label you want, but you do know the position, you can
# use `iloc`.
print "The first entry is: %d" % s.iloc[0]
print "The last entry is: %d" % s.iloc[-1]

# Slicing works with `iloc` as well.
# Note that, unlike with label-based slicing, integer-based slices are
# right-open intervals, i.e. doing s.iloc[X:Y] gives you elements with indices
# in [X, Y). This is the same as the semantics for list slicing.
s.iloc[5:]
print s.iloc[:5]
s.iloc[-3:]

# Create two Series objects containing 100 samples each of sine and cosine.
sine = pd.Series(np.sin(np.linspace(0, 3.14 * 2, 100)), name='sine')
cosine = pd.Series(np.cos(np.linspace(0, 3.14 * 2, 100)), name='cosine')
sine
cosine

# Multiplying two Series objects produces a new Series by multiplying values
# that have the same keys.
product = cosine * sine
product

# Adding or multiplying a Series by a scalar applies that operation to each
# value in the Series.
cosine_plus_one = cosine + 1
cosine_plus_one

# Other binary operators work as you'd expect. Note how much cleaner and
# clearer this is compared to looping over two containers and performing
# multiple operations on elements from each.
identity = (sine ** 2) + (cosine ** 2)
identity

# Plot our sine values.
trigplot = sine.plot(
    ylim=(-1.2, 1.2),
    legend=True,
    figsize=DEFAULT_FIGSIZE,
    linewidth=3,
    label='sine',
)

# Add our other Series to the same plot.
cosine.plot(ax=trigplot, legend=True, linewidth=3)
product.plot(ax=trigplot, legend=True, linewidth=3, label='product')
identity.plot(ax=trigplot, legend=True, linewidth=3, label='identity')

# Return the character in the tenths place of N's decimal representation.
def tenths_place(N):
    s = str(N)
    return s[s.find('.') + 1]

product.apply(tenths_place)
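# A quick, illustrative sanity check of tenths_place on a literal value:
# str(3.14159).find('.') + 1 points at the character just after the decimal
# point, so this should return '1'.
tenths_place(3.14159)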
# A major problem when working with real-world data is handling missing
# entries. Pandas handles missing data by aligning on index labels: a label
# that appears in only one operand produces NaN in the result.
s1 = pd.Series({'a': 1, 'b': 2, 'c': 3})
# s2 is missing an entry for 'b'.
s2 = pd.Series({'a': 4, 'c': 5})
s1 + s2

s1 = pd.Series(
    {
        'A': 1,
        'B': 2,
        'C': 3,
        'D': 4,
        'E': 3,
        'F': 2,
        'G': 1,
    }
)

# You can create a constant Series by passing a scalar value and an index.
s2 = pd.Series(2, index=s1.index)

greater = s1 > s2
greater
less = s1 < s2
less
equal = s1 == s2
equal

# Comparisons against scalars also work.
s1_equal_to_3 = s1 == 3
s1_equal_to_3

# Collect all of the comparison results into a single DataFrame for display.
pd.DataFrame({
    's1': s1,
    's2': s2,
    's1 > s2': greater,
    's1 == s2': equal,
    's1 < s2': less,
    's1 == 3': s1_equal_to_3,
}, columns=['s1', 's2', 's1 > s2', 's1 == s2', 's1 < s2', 's1 == 3'])

# Indexing into a Series with a boolean Series masks away the values that were
# False in the passed Series.
s1[s1 > s2]

# We can combine these operators to concisely express complex
# computations/filters.
s1[(s1 > 1) & ~(s1 > s2)]

# Pandas has a special index class, `DatetimeIndex`, for representing
# time series data.
start = pd.Timestamp('2014-01-01', tz='UTC')
end = pd.Timestamp('2014-01-09', tz='UTC')

# date_range is an easy way to construct a DatetimeIndex.
daily_index = pd.date_range(start, end)
daily_index

# A DatetimeIndex has a notion of its frequency.
from pandas.tseries.offsets import Day, Hour, BDay, Minute

hourly_index = pd.date_range(
    pd.Timestamp('2014-01-01', tz='UTC'),
    pd.Timestamp('2014-01-09', tz='UTC'),
    freq=Hour(),
)
hourly_index

bihourly_index = pd.date_range(
    pd.Timestamp('2014-01-01', tz='UTC'),
    pd.Timestamp('2014-01-09', tz='UTC'),
    freq=Hour(2),
)
bihourly_index

weekday_index = pd.date_range(
    pd.Timestamp('2014-01-01', tz='UTC'),
    pd.Timestamp('2014-01-09', tz='UTC'),
    freq=BDay(),
)
print weekday_index
[i for i in weekday_index]

ts = pd.Series(
    np.arange(30) ** 2,
    pd.date_range(
        start=pd.Timestamp('2014-01-01', tz='UTC'),
        freq='1D',
        periods=30,
    )
)
ts.plot()

# By default, resampling to a lower frequency takes the mean of the entries
# that were downsampled.
resampled = ts.resample('5D')
resampled

# We can customize this behavior, though.
resampled_first = ts.resample('5D', how='first')
resampled_first
resampled_last = ts.resample('5D', how='last')
resampled_last

# We can even define our own custom sampling methods.
def geometric_mean(subseries):
    return np.product(subseries.values) ** (1.0 / len(subseries))

resampled_geometric = ts.resample('5D', how=geometric_mean)
print resampled_geometric

pd.DataFrame(
    {
        "resampled": resampled,
        "resampled_first": resampled_first,
        "resampled_last": resampled_last,
        "resampled_geometric": resampled_geometric,
    }
).plot(linewidth=2, figsize=DEFAULT_FIGSIZE)

# Upsampling creates missing data, which is represented by numpy.nan.
ts.resample('6H')

# We can fill empty values with fillna.
zero_filled = ts.resample('6H').fillna(0)
print zero_filled

# We can forward-fill with the last known prior value.
ffilled = ts.resample('6H').ffill()
print ffilled

# We can backfill with the earliest known next value.
bfilled = ts.resample('6H').bfill()
print bfilled

# We can interpolate between known values.
# Note: `interpolate` is new as of pandas 0.14.0. Quantopian is currently on
# pandas 0.12.0 due to breaking changes in the pandas API in 0.13.0.
linear_interpolated = ts.resample('6H').interpolate()
linear_interpolated

quadratic_interpolated = ts.resample('6H').interpolate('polynomial', order=2)
quadratic_interpolated
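# A quick illustrative check: the first two daily values of ts are 0 ** 2 = 0
# and 1 ** 2 = 1, so linear interpolation at 6-hour steps between them should
# yield 0.25, 0.5, and 0.75.
linear_interpolated.head(5)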
pd.DataFrame(
    {
        "linear_interpolated": linear_interpolated,
        "quadratic_interpolated": quadratic_interpolated,
        "bfilled": bfilled,
        "ffilled": ffilled,
        "zero_filled": zero_filled,
    }
).plot(linewidth=2, figsize=DEFAULT_FIGSIZE)

# Oftentimes we have more than one axis on which we want to store data.
from pandas.io.data import get_data_yahoo

spy = get_data_yahoo(
    symbols='SPY',
    start=pd.Timestamp('2011-01-01'),
    end=pd.Timestamp('2014-01-01'),
    adjust_price=True,
)
spy

# Just plotting this DataFrame with the default arguments isn't very useful,
# because the scale of Volume is so much greater than that of the other
# columns.
spy.plot(figsize=DEFAULT_FIGSIZE)

# Let's make a more interesting plot.
# Create a figure.
fig = plt.figure()

# Add a subplot for price.
price_subplot = fig.add_subplot('311', xlabel='Date', ylabel='Price')
spy['Close'].plot(ax=price_subplot, lw=2)  # lw means "line width".

# Add another subplot for each day's spread.
spread_subplot = fig.add_subplot('312', xlabel='Date', ylabel='Spread')
spread = spy['High'] - spy['Low']
spread.plot(ax=spread_subplot, lw=2, color='r')

# And add a third plot for volume.
volume_subplot = fig.add_subplot('313', xlabel='Date', ylabel='Volume')
spy['Volume'].plot(ax=volume_subplot, lw=2)

# matplotlib.pyplot.gcf is short for "get current figure". It provides an easy
# way to modify the last drawn plot.
plt.gcf().set_size_inches(*DEFAULT_FIGSIZE)

# Unsurprisingly, spread is strongly correlated with daily volume.
spread.corr(spy['Volume'])

# Default indexing acts on column labels.
# Passing a scalar value drops the dimension by one.
spy['Close']  # Returns a Series.

# Passing a list filters the columns down to the supplied values.
spy[['Close', 'Volume']]

# Using .loc with one argument takes a slice of rows based on label.
spy.loc[pd.Timestamp('2013-02-01'):pd.Timestamp('2013-02-28')]

# Using .loc with two arguments takes a slice of rows based on label, then a
# slice of columns based on name.
# Note the comma between the first slice and the second slice!
spy.loc[pd.Timestamp('2013-02-01'):pd.Timestamp('2013-02-28'), 'Open':'Low']

# We can use iloc when we want lookups by position.
spy.iloc[-20:-10, [0, 2]]

# Get the days on which SPY closed higher than it opened.
up_days = spy['Close'] > spy['Open']
up_days
spy[up_days]

# We can use .ix when we want mixed lookups.
spy.ix[-20:-10, 'Open':'High']

five_day_returns = spy['Close'].pct_change(5)
five_day_returns

# Checking floating-point numbers for exact equality is a bad idea because of
# roundoff error. `numpy.allclose` does an appropriate epsilon test.
test_return = (spy['Close'].iloc[5] - spy['Close'].iloc[0]) / spy['Close'].iloc[0]
np.allclose(five_day_returns.iloc[5], test_return)

thirty_day_forward_returns = (spy['Close'].shift(-30) - spy['Close']) / spy['Close']
test_return = (spy['Close'].iloc[30] - spy['Close'].iloc[0]) / spy['Close'].iloc[0]
np.allclose(thirty_day_forward_returns.iloc[0], test_return)

returns = pd.DataFrame(
    {
        'forward_30Day': thirty_day_forward_returns,
        'backA_2Day': spy['Close'].pct_change(2),
        'backB_5Day': spy['Close'].pct_change(5),
        'backD_50Day': spy['Close'].pct_change(50),
        'backE_100Day': spy['Close'].pct_change(100),
        'backF_200Day': spy['Close'].pct_change(200),
        'backG_300Day': spy['Close'].pct_change(300),
    }
).dropna(how='any')

returns.plot(figsize=DEFAULT_FIGSIZE)
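# As a quick illustrative check before computing the full matrix below:
# DataFrame.corr computes pairwise correlations of columns, so any single
# entry of the matrix should match the corresponding Series.corr call.
returns['backB_5Day'].corr(returns['forward_30Day'])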
# Pairwise correlation of forward and backward returns.
corr = returns.corr()
corr

corr.ix['forward_30Day', :-1].plot(kind='bar', position=.5, xlim=(-1, 6))
plt.gcf().set_size_inches(9, 6)

# Load data for Pepsi and Coca-Cola from Yahoo.
symbols = [
    'PEP',
    'KO',
]
cola_data = get_data_yahoo(symbols, adjust_price=True)
cola_data

# Compute the 1-day log returns for both securities' close prices.
closes = cola_data['Close']
yesterday_closes = cola_data['Close'].shift(1)
cola_log_returns = (closes / yesterday_closes).apply(np.log)
cola_raw_returns = closes.pct_change(1)

# Look at the data we just calculated by throwing it into a Panel and
# pulling out just the DataFrame for KO.
pd.Panel({
    'closes': closes,
    'prev_closes': yesterday_closes,
    'log_returns': cola_log_returns,
    'raw_returns': cola_raw_returns,
}).loc[:, :, 'KO']

# Pull the raw returns and the log returns into a single DataFrame using
# DataFrame.join.
closes.join(cola_log_returns, rsuffix='_lr')\
      .join(cola_raw_returns, rsuffix='_rr')\
      .dropna(how='any')

# Create a figure with three 'slots' for subplots.
fig = plt.figure()

# 311 here means "put the subplot in the 1st slot of a 3 x 1 grid".
# 312 and 313 tell matplotlib to place the subsequent plots in the 2nd and
# 3rd slots.
price_subplot = fig.add_subplot('311', xlabel='Date', ylabel='Price')
return_subplot_pep = fig.add_subplot('312', xlabel='Date', ylabel='PEP Log Returns')
return_subplot_ko = fig.add_subplot('313', xlabel='Date', ylabel='KO Log Returns')

cola_data['Close'].plot(ax=price_subplot, color=['purple', 'red'])
cola_log_returns['PEP'].plot(ax=return_subplot_pep, color='red')
cola_log_returns['KO'].plot(ax=return_subplot_ko, color='purple')

# Set the size of the whole plot array. gcf stands for "get current figure".
plt.gcf().set_size_inches(14, 10)

# Compute the correlation of our log returns.
correlation = (cola_log_returns['PEP']).corr(cola_log_returns['KO'])
correlation

# Compute the column-wise standard deviation of daily returns and multiply by
# sqrt(252) (the approximate number of trading days in a year) to get
# annualized volatility.
volatility = cola_log_returns.std() * np.sqrt(252)
volatility
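# The sqrt(252) scaling rests on the (rough) assumption of independent,
# identically-distributed daily returns: variance then grows linearly with the
# number of periods, so standard deviation grows with its square root. As an
# illustrative example, a 1% daily standard deviation corresponds to roughly
# 0.01 * sqrt(252) ~= 15.9% annualized volatility.
print 0.01 * np.sqrt(252)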