#!/usr/bin/env python
# coding: utf-8

# Parametric non-parametric inference
# ===================
# 
# Suppose you have a physical model of an output variable, which takes the form of a parametric model. You now want to model the random effects of the data by a non-parametric (better: infinite parametric) model, such as a Gaussian process as described in [BayesianLinearRegression](../background/BayesianLinearRegression.ipynb). We can do inference in both worlds, the parametric and the infinite parametric one, by extending the features to a mix between the two:
# 
# \begin{align}
# p(\mathbf{y}|\boldsymbol{\Phi}, \alpha, \sigma) &= \int p(\mathbf{y}|\boldsymbol{\Phi}, \mathbf{w}, \sigma)p(\mathbf{w}|\alpha) \,\mathrm{d}\mathbf{w}\\
# &= \langle\mathcal{N}(\mathbf{y}|\boldsymbol{\Phi}\mathbf{w}, \sigma^2\mathbf{I})\rangle_{\mathcal{N}(\mathbf{0}, \alpha\mathbf{I})}\\
# &= \mathcal{N}(\mathbf{y}|\mathbf{0}, \alpha\boldsymbol{\Phi}\boldsymbol{\Phi}^\top + \sigma^2\mathbf{I})
# \end{align}
# 
# Thus, we can maximize this marginal likelihood w.r.t. the hyperparameters $\alpha, \sigma$ by log transforming and maximizing:
# 
# \begin{align}
# \hat\alpha, \hat\sigma = \mathop{\arg\max}_{\alpha, \sigma}\log p(\mathbf{y}|\boldsymbol{\Phi}, \alpha, \sigma)
# \end{align}
# 
# So we will define a mixed inference model, combining parametric and non-parametric parts. One part is described by a parametric feature space mapping $\boldsymbol{\Phi}\mathbf{w}$, the other part is a non-parametric function $\mathbf{f}_\text{n}$. For this we define the underlying function $\mathbf{f}$ as
# 
# $$
# \begin{align}
# p(\mathbf{f}) &= p\left(
# \underbrace{
# \begin{bmatrix}
# \delta(t-T) & \boldsymbol{\Phi}
# \end{bmatrix}
# }_{=:\mathbf{A}}
# \left.
# \begin{bmatrix}
# \mathbf{f}_{\text{n}}\\
# \mathbf{w}
# \end{bmatrix}
# \right|
# \mathbf{0},
# \mathbf{A}
# \underbrace{
# \begin{bmatrix}
# \mathbf{K}_{\mathbf{f}} & \\
#  & \mathbf{K}_{\mathbf{w}}
# \end{bmatrix}
# }_{=:\boldsymbol{\Sigma}}
# \mathbf{A}^\top
# \right)\enspace,
# \end{align}
# $$
# 
# where $\mathbf{K}_{\mathbf{f}}$ is the covariance describing the non-parametric part $\mathbf{f}_\text{n}\sim\mathcal{N}(\mathbf{0}, \mathbf{K}_\mathbf{f})$ and $\mathbf{K}_{\mathbf{w}}$ is the covariance of the prior over the weights $\mathbf{w}\sim\mathcal{N}(\mathbf{w}|\mathbf{0}, \mathbf{K}_{\mathbf{w}})$.
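# Reading $\delta(t-T)$ as the identity matrix at the training inputs, $\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top$ collapses to the sum of the two covariances, $\mathbf{K}_\mathbf{f} + \boldsymbol{\Phi}\mathbf{K}_\mathbf{w}\boldsymbol{\Phi}^\top$. The following cell is a minimal numpy sketch checking this identity; all names in it (`Phi_demo`, `K_f_demo`, ...) are illustrative only and not part of the model code below.

# In[ ]:

import numpy as np

N, Q = 5, 2                                     # inputs and parametric features
t = np.linspace(0, 1, N)[:, None]
Phi_demo = np.hstack([np.ones((N, 1)), t])      # bias + slope features
K_f_demo = np.exp(-.5 * (t - t.T)**2 / .1**2)   # non-parametric covariance K_f
K_w_demo = 3. * np.eye(Q)                       # prior covariance K_w of the weights

# A stacks the identity (evaluating f_n at the inputs) next to Phi:
A_demo = np.hstack([np.eye(N), Phi_demo])
Sigma_demo = np.block([[K_f_demo, np.zeros((N, Q))],
                       [np.zeros((Q, N)), K_w_demo]])
assert np.allclose(A_demo.dot(Sigma_demo).dot(A_demo.T),
                   K_f_demo + Phi_demo.dot(K_w_demo).dot(Phi_demo.T))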
# Thus we can now predict the different parts, and even the parameters $\mathbf{w}$ themselves, using (Note: if someone is willing to write down the proper derivation of this, contributions are very welcome. Thanks to Philipp Hennig for his ideas on this.)
# 
# $$
# \begin{align}
# p(\mathbf{f}|\mathbf{y}) &=
# \mathcal{N}(\mathbf{f} |
# \boldsymbol{\Sigma}\mathbf{A}^\top
# \underbrace{
# (\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top + \sigma^2\mathbf{I})^{-1}}_{=:\mathbf{K}^{-1}}\mathbf{y},
# \boldsymbol{\Sigma}-\boldsymbol{\Sigma}\mathbf{A}^\top\mathbf{K}^{-1}\mathbf{A}\boldsymbol{\Sigma})
# \\
# p(\mathbf{w}|\mathbf{y}) &= \mathcal{N}(\mathbf{w} | \mathbf{K}_\mathbf{w}\boldsymbol{\Phi}^\top\mathbf{K}^{-1}\mathbf{y},
# \mathbf{K}_{\mathbf{w}}-\mathbf{K}_{\mathbf{w}}\boldsymbol{\Phi}^\top\mathbf{K}^{-1}\boldsymbol{\Phi}\mathbf{K}_{\mathbf{w}})
# \\
# p(\mathbf{f}_\text{n}|\mathbf{y}) &= \mathcal{N}(\mathbf{f}_\text{n}| \mathbf{K}_\mathbf{f}\mathbf{K}^{-1}\mathbf{y},
# \mathbf{K}_{\mathbf{f}}-\mathbf{K}_{\mathbf{f}}\mathbf{K}^{-1}\mathbf{K}_{\mathbf{f}})
# \end{align}
# $$

# In[1]:

import GPy, numpy as np, pandas as pd
from GPy.kern import LinearSlopeBasisFuncKernel, DomainKernel, ChangePointBasisFuncKernel
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt

# We will create some data with a non-linear function, strongly driven by piecewise linear trends:

# In[2]:

np.random.seed(12345)

# In[3]:

x = np.random.uniform(0, 10, 40)[:, None]
x.sort(0)
starts, stops = np.arange(0, 10, 3), np.arange(1, 11, 3)
k_lin = LinearSlopeBasisFuncKernel(1, starts, stops, variance=1., ARD=1)
Phi = k_lin.phi(x)
_ = plt.plot(x, Phi)

# We will sample the weights as $w_i\sim\mathcal{N}(0, \alpha_i^2)$ with scales $\alpha_i\sim\text{Gamma}(3, 1)$, and assume a Matern32 structure in the non-parametric part. Additionally, we add a semi-parametric part, which is a periodic effect only active between $x\in[1,5]$ (the range of the DomainKernel below):

# In[4]:

k = GPy.kern.Matern32(1, .3)
Kf = k.K(x)

k_per = GPy.kern.PeriodicMatern32(1, variance=100, period=1)
k_per.period.fix()
k_dom = DomainKernel(1, 1., 5.)
k_perdom = k_per * k_dom
Kpd = k_perdom.K(x)

# In[5]:

np.random.seed(1234)
alpha = np.random.gamma(3, 1, Phi.shape[1])
w = np.random.normal(0, alpha)[:, None]

f_SE = np.random.multivariate_normal(np.zeros(x.shape[0]), Kf)[:, None]
f_perdom = np.random.multivariate_normal(np.zeros(x.shape[0]), Kpd)[:, None]
f_w = Phi.dot(w)

f = f_SE + f_w + f_perdom
y = f + np.random.normal(0, .1, f.shape)

# In[6]:

plt.plot(x, f_w)
_ = plt.plot(x, y)

# Make sure the function is strongly driven by the linear trend; otherwise the parametric and the non-parametric parts can be hard to distinguish (identifiability issues).
# 
# With this data, we can fit a model using the basis functions as the parametric part. If you want to implement your own basis function kernel, see GPy.kern._src.basis_funcs.BasisFuncKernel and implement the necessary parts. Usually it is enough to implement the phi(X) method, returning the higher dimensional mapping of the inputs X.
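# As a concrete illustration, below is a minimal sketch of such a custom kernel with a single quadratic basis function. The class name QuadraticBasisFuncKernel is purely hypothetical; also note that, depending on your GPy version, the import path may be GPy.kern.src instead of GPy.kern._src, and the hook to override may be the underscored _phi(X) rather than phi(X):

# In[ ]:

from GPy.kern._src.basis_funcs import BasisFuncKernel

class QuadraticBasisFuncKernel(BasisFuncKernel):
    """Hypothetical example: one quadratic basis function, phi(x) = x**2."""
    def _phi(self, X):
        # Return the (n_samples, n_basis_functions) feature mapping of X:
        return X**2

k_quad = QuadraticBasisFuncKernel(1, variance=1., name='quadratic')
print(k_quad.phi(x[:3]))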
# In[7]:

k = (GPy.kern.Bias(1)                  # constant offset
     + GPy.kern.Matern52(1)            # non-parametric part
     + LinearSlopeBasisFuncKernel(1, ARD=1, start=starts, stop=stops,
                                  variance=.1, name='linear_slopes')
     + k_perdom.copy()                 # periodic effect on its domain
    )
k.randomize()
m = GPy.models.GPRegression(x, y, k)

# In[8]:

m.checkgrad()

# In[9]:

m.optimize()

# In[10]:

m.plot()

# The posterior of each additive component can be predicted separately by handing the corresponding kernel part to the prediction:

# In[11]:

x_pred = np.linspace(0, 10, 500)[:, None]
pred_SE, var_SE = m._raw_predict(x_pred, kern=m.kern.Mat52)
pred_per, var_per = m._raw_predict(x_pred, kern=m.kern.mul)
pred_bias, var_bias = m._raw_predict(x_pred, kern=m.kern.bias)
pred_lin, var_lin = m._raw_predict(x_pred, kern=m.kern.linear_slopes)

# In[12]:

m.plot_f(resolution=500, predict_kw=dict(kern=m.kern.Mat52), plot_data=False)
plt.plot(x, f_SE)

# In[13]:

m.plot_f(resolution=500, predict_kw=dict(kern=m.kern.mul), plot_data=False)
plt.plot(x, f_perdom)

# In[14]:

m.plot_f(resolution=500, predict_kw=dict(kern=m.kern.linear_slopes), plot_data=False)
plt.plot(x, f_w)

# The posterior over the weights $\mathbf{w}$ of the parametric part can be read off the basis function kernel directly:

# In[15]:

w_pred, w_var = m.kern.linear_slopes.posterior_inf()

# In[16]:

df = pd.DataFrame(w, columns=['truth'], index=np.arange(Phi.shape[1]))
df['mean'] = w_pred
df['std'] = np.sqrt(w_var.diagonal())

# In[17]:

np.round(df, 2)
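# To see how well the weights were recovered, here is a short illustrative sketch (matplotlib only, not part of the original analysis) plotting the posterior means with two-standard-deviation error bars against the true weights:

# In[ ]:

plt.errorbar(df.index, df['mean'], yerr=2 * df['std'],
             fmt='o', label='posterior of w')
plt.plot(df.index, df['truth'], 'x', label='truth')
plt.xlabel('weight index')
_ = plt.legend()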