This is a Python implementation of the MATLAB demo code from the course Monte Carlo Methods in Finance, offered by iversity.org:
https://iversity.org/my/courses/monte-carlo-methods-in-finance/
Random numbers can be generated using either the random module or the numpy.random submodule. Here we use the latter.
from __future__ import division
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
The function numpy.random.rand generates random numbers on the interval [0,1) with uniform probability (note that the interval is half-open).
print random.rand()
0.0926675357494
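As a quick aside (our addition, not part of the original demo): samples on an arbitrary interval [a, b) are obtained by shifting and scaling the output of rand. The bounds a and b below are just illustrative.
a, b = 2.0, 5.0 # illustrative interval bounds
print a + (b - a) * random.rand(5) # uniform samples on [a, b)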
An array of random numbers is generated by passing the shape of the array as an argument.
print random.rand(5,4)
[[ 0.16573073  0.30142696  0.9401207   0.31052939]
 [ 0.95662973  0.1095877   0.00377109  0.04902205]
 [ 0.82546732  0.11588215  0.82999227  0.6284355 ]
 [ 0.68406566  0.14488838  0.53180516  0.97465852]
 [ 0.38534993  0.24335456  0.61923684  0.04330545]]
In NumPy, (pseudo-)random numbers are generated using a deterministic algorithm, so the numbers are not truly random. By fixing the seed we can reproduce the same sequence.
(Note that numpy.random.seed is not thread-safe, but it is sufficient for our purposes.)
random.seed(seed=123)
print random.rand(2,3)
random.seed(seed=123)
print random.rand(2,3) # gives the same output
[[ 0.69646919  0.28613933  0.22685145]
 [ 0.55131477  0.71946897  0.42310646]]
[[ 0.69646919  0.28613933  0.22685145]
 [ 0.55131477  0.71946897  0.42310646]]
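If thread safety or several independent streams matter, a common alternative (our aside, not from the course code) is a dedicated numpy.random.RandomState instance, which carries its own state:
rng = random.RandomState(123) # independent generator with its own state
print rng.rand(2,3) # reproduces the sequence above, since it uses the same seed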
Finally, we plot the PDF and CDF of the uniform distribution.
nPlots = 1000
xPlot = np.linspace(-0.5, 1.5, nPlots)
uniformPDF = np.where((xPlot >= 0) & (xPlot <= 1), 1, 0)
# This is one of many ways in which you can define a piecewise function.
# np.where(A, b, c) takes a boolean array A (or converts A to a boolean array) and checks the truth value of each element A[i,j].
# The output is an array B of the same shape as A:
# where A[i,j] = True, B[i,j] = b, and where A[i,j] = False, B[i,j] = c.
# Note that here b and c are scalars, but they can be arrays as well
uniformCDF = uniformPDF * xPlot + np.where(xPlot >= 1, 1, 0)
plt.figure()
ax = plt.subplot(211)
plt.plot(xPlot, uniformPDF);
ax.set_title('Uniform PDF')
ax.set_xlabel('x')
ax.set_ylabel('UniformPDF(x)')
ax.set_ybound(-.25,1.25)
ax = plt.subplot(212)
ax.set_title('Uniform CDF')
ax.set_xlabel('x')
ax.set_ylabel('UniformCDF(x)')
plt.plot(xPlot, uniformCDF);
ax.set_ybound(-.25,1.25)
plt.tight_layout()
Alternatively, we can also define uniformPDF using boolean indexing.
uniformPDF = np.zeros_like(xPlot)
uniformPDF[(xPlot >= 0) & (xPlot <= 1)] = 1
fig = plt.figure()
ax2 = plt.plot(xPlot, uniformPDF)
ax = fig.get_axes()[0]
ax.set_title('Uniform PDF')
ax.set_xlabel('x')
ax.set_ylabel('UniformPDF(x)')
ax.set_ybound(-.1, 1.1)
plt.tight_layout()
Here we look at numpy.random.randn, which generates random numbers from a Gaussian distribution. This function draws random samples from the normal distribution with mean zero and standard deviation 1. We write this as S∼N(0,1).
random.seed(50)
print random.randn()
S = random.randn(3,4)
print S
-1.56035210868
[[-0.0309776  -0.62092842 -1.46458049  1.41194612]
 [-0.47673214 -0.78046921  1.07026774 -1.2822926 ]
 [-1.3274789   0.12633764  0.86219372  0.69673696]]
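Samples from a general normal distribution N(mu, sigma) are obtained by shifting and scaling standard normal draws, a pattern we will use repeatedly below; a minimal sketch:
mu, sigma = 4.0, 2.0
X = mu + sigma * random.randn(1000) # X ~ N(mu, sigma)
print X.mean(), X.std() # should be close to mu and sigma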
The PDF and CDF of the normal and other distributions are available in the scipy.stats module. Another excellent statistics module is statsmodels, but we will stick to scipy.stats.
The class scipy.stats.norm (which we import below) holds different functions associated with the normal distribution, such as the PDF and the CDF. See http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html for more info.
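For example (a small check of ours), norm exposes the PDF, the CDF and, as norm.ppf, the inverse CDF that we will later approximate ourselves with Newton-Raphson:
from scipy.stats import norm
print norm.pdf(0)   # 1/sqrt(2*pi), about 0.3989
print norm.cdf(0)   # 0.5, by symmetry
print norm.ppf(0.5) # 0.0, the inverse of the CDF at p = 0.5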
nPlots = 1000
alpha = 3
xPlot = np.linspace(-alpha, alpha, nPlots)
from scipy.stats import norm
normPDF = norm.pdf(xPlot)
normCDF = norm.cdf(xPlot)
plt.figure()
ax = plt.subplot(211)
plt.plot(xPlot, normPDF);
ax.set_title('Normal PDF')
ax.set_xlabel('x')
ax.set_ylabel('NormalPDF(x)')
ax.set_ybound(-.1,.5)
ax = plt.subplot(212)
ax.set_title('Normal CDF')
ax.set_xlabel('x')
ax.set_ylabel('NormalCDF(x)')
plt.plot(xPlot, normCDF);
ax.set_ybound(-.2,1.2)
plt.tight_layout()
We consider a normal distribution with mean mu and standard deviation sigma.
mu = 4
sigma = 2
alpha = 4
xMin = mu - alpha*sigma
xMax = mu + alpha*sigma
nPlot = 1000
xPlot = np.linspace(xMin,xMax,nPlot)
normPDF = norm.pdf(xPlot,mu,sigma) # the mean and standard deviation are passed as arguments to the pdf and cdf functions.
normCDF = norm.cdf(xPlot,mu,sigma)
def plotFigures(): # We use this figure again below
    plt.figure()
    ax = plt.subplot(211)
    ax.set_title('Normal CDF')
    ax.set_xlabel('x')
    ax.set_ylabel('NormalCDF(x)')
    plt.plot(xPlot, normCDF);
    ax.set_ybound(0,1.02)
    ax = plt.subplot(212)
    plt.plot(xPlot, normPDF)
    ax.set_title('Normal PDF')
    ax.set_xlabel('x')
    ax.set_ylabel('NormalPDF(x)')
    ax.set_ybound(0,1.1*norm.pdf(mu,mu,sigma))
    plt.tight_layout()
plotFigures()
Next we plot symmetric intervals around the mean for the PDF and CDF: [μ−ασ,μ+ασ] with α integer.
mu = 4
sigma = 2
plotFigures()
for alpha in range(0,4):
    xLower = mu - alpha * sigma
    xUpper = mu + alpha * sigma
    cdfLower = norm.cdf(xLower, mu, sigma)
    cdfUpper = norm.cdf(xUpper, mu, sigma)
    plt.subplot(211)
    plt.plot([xLower, xLower], [0, cdfLower], 'k:')
    plt.plot([xMin, xLower], [cdfLower, cdfLower], 'k:')
    plt.plot([xUpper, xUpper], [0, cdfUpper], 'k:')
    plt.plot([xMin, xUpper], [cdfUpper, cdfUpper], 'k:')
    plt.subplot(212)
    plt.plot([xLower, xLower], [0, norm.pdf(xLower,mu,sigma)], 'k:')
    plt.plot([xUpper, xUpper], [0, norm.pdf(xUpper,mu,sigma)], 'k:')
plt.tight_layout()
This demo again plots symmetric intervals around the mean.
mu = 4
sigma = 2
alpha = 4
xMin = mu - alpha*sigma
xMax = mu + alpha*sigma
nPlot = 1000
xPlot = np.linspace(xMin,xMax,nPlot)
normPDF = norm.pdf(xPlot,mu,sigma)
fig = plt.figure()
plt.plot(xPlot, normPDF)
ax = fig.get_axes()[0]
ax.set_title('Normal PDF')
ax.set_xlabel('x')
ax.set_ylabel('NormalPDF(x)')
ax.set_ybound(0,1.1*norm.pdf(mu,mu,sigma))
for alpha in [0,1,2,3]:
    xLower = mu-alpha*sigma
    xUpper = mu+alpha*sigma
    plt.plot([xLower, xLower],[0, norm.pdf(xLower,mu,sigma)],'k:')
    plt.plot([xUpper, xUpper],[0, norm.pdf(xUpper,mu,sigma)],'k:')
If log(S) is normally distributed we write log(S)∼N(μ,σ). Then the distribution of S is lognormal, denoted S∼logN(μ,σ).
We generate lognormal data as
np.random.seed(99)
mu = 1
sigma = 0.5
M = 1000 # sample size
S = np.exp(mu + sigma*random.randn(M))
Sample estimates of input parameters of the distribution are
hat_mu = np.mean(np.log(S))
hat_sigma = np.std(np.log(S))
print hat_mu, hat_sigma
1.02927514347 0.501400277218
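One caveat worth flagging here (our remark, not from the course notes): np.std divides by M by default (ddof=0), whereas MATLAB's std divides by M-1; pass ddof=1 if you want the unbiased sample estimate.
print np.std(np.log(S))          # divides by M (ddof=0, the default)
print np.std(np.log(S), ddof=1)  # divides by M-1, as MATLAB's std does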
Next we graphically compare the sample to the true distribution. For this we define a function graphicalComparisonPdf.
def graphicalComparisonPdf(X, modelPdf, scale = True, xMin = None, xMax = None):
    _X = X[np.logical_not(np.isnan(X))] # get rid of possible nan's
    if xMax is None:
        xMax = np.max(_X) # default value of xMax
    if xMin is None:
        xMin = np.min(_X) # default value of xMin
    nPlot = 1000
    xPlot = np.linspace(xMin, xMax, nPlot)
    yPlot = modelPdf(xPlot)
    nBins = int(np.min([np.sqrt(_X.size), 40])) # plt.hist expects an integer number of bins
    widthHistogram = np.max(_X) - np.min(_X)
    averageHeightHistogram = _X.size/nBins
    areaHistogram = widthHistogram*averageHeightHistogram
    pdfScaleFactor = areaHistogram if not scale else 1
    # if scale = False we rescale modelPdf(x) by the area of the histogram
    # if scale = True the histogram is scaled so that its area is 1 (as is the case for modelPdf(x))
    fig = plt.figure()
    ax = fig.add_subplot(111)
    _, _, p = plt.hist(_X, bins=nBins, normed = scale)
    l, = plt.plot(xPlot, yPlot * pdfScaleFactor, 'r', linewidth=3)
    ax.set_xlabel('x')
    ax.set_ylabel('pdf(x)')
    if scale:
        plt.legend([l, p[0]], ['pdf(x)', 'scaled histogram'])
    else:
        plt.legend([l, p[0]], ['scaled pdf(x)', 'histogram'])
    plt.show()
We now want to compare our sample distribution to the exact lognormal distribution. scipy.stats again comes with a built-in lognormal distribution:
from scipy.stats import lognorm
However, this distribution is not parameterized like the MATLAB function lognpdf (or rather, lognorm.pdf is not the same as MATLAB's lognpdf). In fact, the lognormal distribution in scipy.stats is a bit tricky; see here for more info:
http://nbviewer.ipython.org/url/xweb.geos.ed.ac.uk/~jsteven5/blog/lognormal_distributions.ipynb
The NumPy version of MATLAB's lognpdf can be defined in two ways: in terms of elementary functions, or by rescaling lognorm.pdf.
def lognpdf_1(x,mean,sig):
    return np.where((x > 0) & np.isfinite(x), np.exp(-(np.log(x)-mean)**2/(2.*sig**2)) / (x*sig*np.sqrt(2*np.pi)), 0)
def lognpdf_2(x,mean,sig):
    return lognorm.pdf(x, scale = np.exp(mean), s = sig)
These functions create the same pdf.
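We can verify the agreement numerically (a sanity check we added; the grid and parameter values are arbitrary):
xs = np.linspace(0.01, 10, 100)
print np.allclose(lognpdf_1(xs, 1.0, 0.5), lognpdf_2(xs, 1.0, 0.5)) # True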
def lognpdf_fixed_mu_sigma_1(x):
    return lognpdf_1(x, hat_mu, hat_sigma)
def lognpdf_fixed_mu_sigma_2(x):
    return lognpdf_2(x, hat_mu, hat_sigma)
graphicalComparisonPdf(S, lognpdf_fixed_mu_sigma_1)
plt.show()
graphicalComparisonPdf(S, lognpdf_fixed_mu_sigma_2)
np.random.seed(55)
#parameters
mu = -4
sigma = 2
# sample size
M = 10000
# X ~ N(mu,sigma)
X = mu + sigma*np.random.randn(M)
plt.figure()
nBins = 40 # number of bins for the histogram
plt.hist(X,nBins) # generate histogram
# base of histogram
xMin = np.min(X)
xMax = np.max(X)
# area of histogram
areaHistogram = (xMax-xMin)*M/nBins
# compute values of the pdf in [xMin,xMax]
nPlot = 1000
xPlot = np.linspace(xMin,xMax,nPlot)
yPlot = norm.pdf(xPlot,mu,sigma)
# plot scaled pdf
fig = plt.figure()
_, _, p = plt.hist(X,nBins)
l, = plt.plot(xPlot,yPlot*areaHistogram,'r',linewidth=1.5)
ax = fig.get_axes()[0]
ax.set_xlabel('x')
ax.set_ylabel('count')
plt.legend([l, p[0]], ['scaled pdf(x)', 'histogram']);
# Same figure but now with the histogram scaled
# plot scaled histogram
fig = plt.figure()
_, _, p = plt.hist(X,nBins, normed = True)
l, = plt.plot(xPlot,yPlot,'r',linewidth=1.5)
ax = fig.get_axes()[0]
ax.set_xlabel('x')
ax.set_ylabel('pdf(x)')
plt.legend([l, p[0]], ['pdf(x)', 'scaled histogram']);
We consider a sample of normally distributed random variables.
np.random.seed(501)
mu = -4
sigma = 2
M = 100 # sample size
X = mu + sigma*np.random.randn(M)
We can define an empirical CDF from this sample. It appears that scipy.stats does not come with a built-in empirical CDF function, so we import one from statsmodels instead. This ECDF automatically sorts the data.
from statsmodels.distributions import ECDF as empiricalCDF
eCDF = empiricalCDF(X)
eCDF holds both the x and empirical CDF(x) values in eCDF.x and eCDF.y. To plot the CDF as a stair-like plot we use plt.step.
plt.step(eCDF.x, eCDF.y);
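If statsmodels is not available, essentially the same step function can be built by hand (a sketch of ours: a right-continuous step with jumps of 1/n at the sorted data points):
xs = np.sort(X)
ys = np.arange(1, xs.size + 1, dtype=np.float) / xs.size
plt.step(xs, ys); # essentially the same curve as eCDF above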
Next we compare this graphically to the true CDF. For this we define graphicalComparisonCdf
def graphicalComparisonCdf(X, modelCdf, xMin = None, xMax = None):
    _X = X[np.logical_not(np.isnan(X))] # get rid of possible nan's
    if xMin is None:
        xMin = np.min(_X)
    if xMax is None:
        xMax = np.max(_X)
    nPlot = 1000
    fig = plt.figure()
    ax = plt.subplot(111)
    xPlot = np.linspace(xMin, xMax, nPlot)
    yPlot = modelCdf(xPlot)
    ecdf = empiricalCDF(X)
    plt.step(ecdf.x, ecdf.y, label = 'ecdf(x)')
    plt.plot(xPlot, yPlot, 'r', label = 'modelCdf(x)')
    ax.set_xlabel('x')
    ax.set_ylabel('probability')
    plt.legend(loc='lower right')
    plt.show()
And the plot.
def cdf(x):
    return norm.cdf(x, mu, sigma)
graphicalComparisonCdf(X, cdf, mu - 3 * sigma, mu + 3 * sigma)
We now perform a similar analysis for the lognormal distribution
np.random.seed(999)
mu = 1
sigma = .5
M = 100
S = np.exp(mu + sigma * np.random.randn(M))
Sample estimates are
hat_mu = np.mean(np.log(S))
hat_sigma = np.std(np.log(S))
print hat_mu, hat_sigma
1.09480438278 0.485396888756
The empirical CDF is
eCDF_lognorm = empiricalCDF(S)
Next we define the exact CDF of the lognormal distribution. Again, the CDF provided by scipy.stats.lognorm needs to be used with care.
from scipy.stats import lognorm
def lognormCDF(x):
    return lognorm.cdf(x, scale = np.exp(hat_mu), s = hat_sigma)
... and compare this with the empirical CDF
graphicalComparisonCdf(S, lognormCDF) # note that graphicalComparisonCdf determines the ECDF itself
We again define the pdf and cdf of the normal distribution, with fixed parameters mu and sigma.
mu = 1
sigma = 3
def pdf(x):
    return norm.pdf(x, mu, sigma)
def cdf(x):
    return norm.cdf(x, mu, sigma)
Now we define a function cdfinvNewtonRaphson, which computes the inverse of the CDF using the Newton-Raphson method.
def cdfinvNewtonRaphson(p, pdf, cdf, x0):
    """
    Input:
        p   : probability, 0 <= p <= 1
        pdf : probability density function
        cdf : cumulative distribution function
        x0  : seed
    Output:
        x : where x solves cdf(x) = p
    """
    MAXITER = 100
    TOLABS = 1e-6
    x = np.ones_like(p, dtype = np.float)
    x.fill(x0) # initial seed
    dx = np.ones_like(p, dtype = np.float)
    dx.fill(10 * TOLABS)
    nIter = 0
    while (nIter < MAXITER) and (np.any(np.abs(dx) > TOLABS)):
        nIter += 1
        dx = (cdf(x) - p) / pdf(x)
        x = x - dx
    if nIter == MAXITER:
        print "Maximum number of iterations in NR reached"
    return x
M = 10000
p = np.arange(1, 2*M, 2, dtype = np.float) / (2*M)
X = cdfinvNewtonRaphson(p, pdf, cdf, mu)
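scipy.stats already ships the inverse CDF as norm.ppf, so we can check the Newton-Raphson result against it (our sanity check):
print np.max(np.abs(X - norm.ppf(p, mu, sigma))) # should be tiny, of the order of TOLABS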
We compare this to the true pdf using the earlier defined graphicalComparisonPdf.
graphicalComparisonPdf(X, pdf)
It's time to apply some of the techniques to real data. The data is loaded using pandas.
import pandas as pd
from pandas.io.data import DataReader
from datetime import datetime
import matplotlib.pyplot as plt
S = DataReader(["IBM", "GOOG"], "yahoo", datetime(2007,7,1), datetime(2013,6,30))['Adj Close']
Stocks = S.join(np.log(S).diff(), rsuffix='_log_diff' )
Stocks.head()
              GOOG    IBM  GOOG_log_diff  IBM_log_diff
Date
2007-07-02  530.38  93.52            NaN           NaN
2007-07-03  534.34  94.92       0.007439      0.014859
2007-07-05  541.63  96.23       0.013551      0.013707
2007-07-06  539.40  97.10      -0.004126      0.009000
2007-07-09  542.56  97.05       0.005841     -0.000515
We create a list of all log returns for easy access.
returns = [colname for colname in Stocks.columns if colname.endswith('_diff')]
mu = Stocks[returns].mean()
sigma = Stocks[returns].std()
print mu
print
print sigma
GOOG_log_diff    0.000336
IBM_log_diff     0.000467
dtype: float64

GOOG_log_diff    0.020984
IBM_log_diff     0.015495
dtype: float64
matplotlib does not have a built-in function for QQ plots, so we import probplot from scipy.stats.
from scipy.stats import probplot
for s in returns:
    print s
    mu = Stocks[s].mean()
    sigma = Stocks[s].std()
    def pdf(x): return norm.pdf(x, mu, sigma)
    def cdf(x): return norm.cdf(x, mu, sigma)
    graphicalComparisonPdf(Stocks[s].values, pdf)
    graphicalComparisonCdf(Stocks[s].values, cdf)
    _S = Stocks[s].values
    _S = _S[np.logical_not(np.isnan(_S))] # probplot cannot handle nan values, so we first get rid of them
    probplot(_S, dist = "norm", plot = plt)
GOOG_log_diff
IBM_log_diff
The tails of the data are usually too fat for a normal distribution, so we consider a Student's t-distribution instead.
from scipy.stats import t as student_t
The Student's t-distribution is defined in terms of three parameters: the location, the scale and the degrees of freedom nu. Inferring these from the data requires some special techniques, so we use the built-in fit method instead.
_S = Stocks[returns[0]].values
_S = _S[np.logical_not(np.isnan(_S))]
hat_nu, hat_mean, hat_sig = student_t.fit(_S)
for s in returns:
    print s
    _S = Stocks[s].values # note: fit the column s, not returns[0]
    _S = _S[np.logical_not(np.isnan(_S))]
    hat_nu, hat_mean, hat_sig = student_t.fit(_S)
    def pdf(x): return student_t.pdf(x, hat_nu, hat_mean, hat_sig)
    def cdf(x): return student_t.cdf(x, hat_nu, hat_mean, hat_sig)
    graphicalComparisonPdf(Stocks[s].values, pdf)
    graphicalComparisonCdf(Stocks[s].values, cdf)
    _S = Stocks[s].values
    _S = _S[np.logical_not(np.isnan(_S))] # probplot cannot handle nan values, so we first get rid of them
    probplot(_S, sparams=(hat_nu, hat_mean, hat_sig), dist = "t", plot = plt)
GOOG_log_diff
IBM_log_diff
A function required for the last two demos of this week computes the expected value of a function g(X), where X is distributed with pdf f_X.
from scipy.integrate import quad
def expectedValue(g, f_X, xLow, xUpp):
    """
    Input:
        g   : handle to the function g(X)
        f_X : X ~ f_X (the pdf of X)
        xLow, xUpp : integration limits
    Output:
        E[g(X) | xLow <= X <= xUpp]
    """
    TOL = 1e-10
    def integrand(x):
        return f_X(x) * g(x)
    return quad(integrand, xLow, xUpp, epsabs=TOL, epsrel=TOL)[0] / quad(f_X, xLow, xUpp, epsabs=TOL, epsrel=TOL)[0]
Parameters of the distribution
mu = 3.
sigma = 2.
Empirical support
R = 10.
xLow = mu - R * sigma
xUpp = mu + R * sigma
Generate B samples of size M
B = 10000
M = 200
X = mu + sigma * np.random.randn(M,B)
Sample mean
sampleMean = np.mean(X, axis = 0)
Expected value, variance and standard deviation.
(The notation lambda x: g(x) is a short way of defining a function g.)
def f_X(x):
    return norm.pdf(x, mu, sigma)
E_X = expectedValue(lambda x: x, f_X, xLow, xUpp)
var_X = expectedValue(lambda x: (x-E_X)**2, f_X, xLow, xUpp)
std_X = np.sqrt(var_X)
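Since R = 10 makes the truncation negligible, these values should essentially reproduce the true parameters; a quick check (ours):
print E_X, mu      # E_X should be very close to mu = 3
print std_X, sigma # std_X should be very close to sigma = 2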
The central limit theorem predicts that the mean of our B samples of data (each sample of size M) follows a normal distribution with mean μ and standard deviation σ/√M. Here we check this statement graphically.
def modelPdf(x): return norm.pdf(x, E_X, std_X/np.sqrt(M) )
graphicalComparisonPdf(sampleMean, modelPdf, scale = True)
Now we look at the Black-Scholes model.
S0 = 100 # initial asset price
mu = 0.1 # drift
sigma = 0.4 # volatility
T = 2.0 # time in the future
The Black-Scholes model is given by

S(T, X) = S0 exp((μ − σ²/2) T + σ√T X), where X ∼ N(0,1)
def f_ST(x):
    return S0 * np.exp((mu - .5*sigma**2) * T + sigma * np.sqrt(T) * x)
R = 10
E_ST = expectedValue(f_ST, norm.pdf, -R, R) # note that norm.pdf has default parameters of mean = 0, sigma = 1
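For this model the expectation is also known in closed form: since E[exp(σ√T X)] = exp(σ²T/2) for X∼N(0,1), we have E[S(T)] = S0 exp(μT). We can use this to check the quadrature (our addition):
print E_ST, S0 * np.exp(mu * T) # both should be close to 100*exp(0.2), about 122.14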
Sample mean
B = 10000
M = 200
X = np.random.randn(M,B)
ST = f_ST(X)
# B estimates of the sample mean
E_ST_MC = np.mean(ST, axis = 0) # each estimate is over a sample of size M
Monte Carlo error
error_MC = np.std(ST) / np.sqrt(M)
error_MC
5.3069481020105558
def modelPDF(x):
    return norm.pdf(x, E_ST, error_MC)
graphicalComparisonPdf(E_ST_MC, modelPDF)
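As a final check (ours, not in the original demo), the spread of the B independent sample means should itself be close to the Monte Carlo error estimate error_MC:
print np.std(E_ST_MC), error_MC # both estimate the standard deviation of the sample mean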