#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd  # for display and clear_output
import time  # for sleep


# In[2]:


# Make some training data
n = 20
X = np.linspace(0., 20.0, n).reshape((n, 1)) - 10
T = 0.2 + 0.05 * (X + 10) + 0.4 * np.sin(X + 10) + 0.2 * np.random.normal(size=(n, 1))

# Make some testing data
Xtest = X + 0.1 * np.random.normal(size=(n, 1))
Ttest = 0.2 + 0.05 * (Xtest + 10) + 0.4 * np.sin(Xtest + 10) + 0.2 * np.random.normal(size=(n, 1))

nSamples = X.shape[0]
nOutputs = T.shape[1]


# In[3]:


plt.plot(X, T, label='Training Data')
plt.plot(Xtest, Ttest, label='Testing Data')
plt.legend();


# In[6]:


import torch
torch.__version__


# In[7]:


X


# In[8]:


T


# In[11]:


torch.from_numpy(X).float()


# In[13]:


V = np.random.uniform(-0.1, 0.1, size=(1, 10))
V.shape


# In[17]:


nHiddens = 10  # number of units in the single hidden layer
nnet = torch.nn.Sequential(torch.nn.Linear(1, nHiddens),
                           torch.nn.Tanh(),
                           torch.nn.Linear(nHiddens, 1))
nnet


# In[18]:


nnet(torch.tensor([10.0]))


# In[47]:


for layer in nnet:
    print(layer)


# In[54]:


len(nnet)


# In[63]:


# Set parameters of neural network
nHiddens = 10
learning_rate = 0.01

# rhoh = 0.1
# rhoo = 0.4
# rh = rhoh / (nSamples * nOutputs)
# ro = rhoo / (nSamples * nOutputs)

# Initialize weights to small uniformly distributed values between -0.1 and 0.1
# V = 0.1 * 2 * (np.random.uniform(size=(1 + 1, nHiddens)) - 0.5)
# W = 0.1 * 2 * (np.random.uniform(size=(1 + nHiddens, nOutputs)) - 0.5)

Xt = torch.from_numpy(X).float()
Tt = torch.from_numpy(T).float()
Xtestt = torch.from_numpy(Xtest).float()
Ttestt = torch.from_numpy(Ttest).float()

# Add constant column of 1's
# def addOnes(A):
#     return np.insert(A, 0, 1, axis=1)
# X1 = addOnes(X)
# Xtest1 = addOnes(Xtest)

# Take nSteps steepest-descent steps in the gradient descent search
# of the mean-squared-error function
nSteps = 100000

# collect training and testing errors for plotting
errorTrace = np.zeros((nSteps, 2))

# Three hidden layers of 10, 20, and 10 tanh units, and a linear output layer
nnet = torch.nn.Sequential(torch.nn.Linear(1, 10), torch.nn.Tanh(),
                           torch.nn.Linear(10, 20), torch.nn.Tanh(),
                           torch.nn.Linear(20, 10), torch.nn.Tanh(),
                           torch.nn.Linear(10, 1))

mse_f = torch.nn.MSELoss()
optimizer = torch.optim.SGD(nnet.parameters(), lr=learning_rate)

fig = plt.figure(figsize=(10, 12))

def forward_all_layers(X):
    # Apply each layer in turn, collecting the output of every layer
    Ys = [X]
    for layer in nnet:
        Ys.append(layer(Ys[-1]))
    return Ys[1:]

for step in range(nSteps):

    # Forward pass on training data
    # Z = np.tanh(X1 @ V)
    # Z1 = addOnes(Z)
    # Y = Z1 @ W
    Y = nnet(Xt)

    # Error in output
    # error = T - Y
    mse = mse_f(Y, Tt)

    # Backward pass - the backpropagation and weight update steps
    # V = V + rh * X1.T @ ((error @ W[1:, :].T) * (1 - Z**2))
    # W = W + ro * Z1.T @ error
    optimizer.zero_grad()
    mse.backward()
    optimizer.step()

    # error traces for plotting
    errorTrace[step, 0] = mse.sqrt().item()

    # Ytest = addOnes(np.tanh(Xtest1 @ V)) @ W  #!! Forward pass in one line
    Ytest = nnet(Xtestt)
    mse_test = mse_f(Ytest, Ttestt)
    errorTrace[step, 1] = mse_test.sqrt().item()

    if step % 1000 == 0 or step == nSteps - 1:
        plt.clf()

        n_hidden_layers = (len(nnet) - 1) // 2
        nplots = 2 + n_hidden_layers

        # Plot the trace of the root-mean-squared error on training and testing data
        plt.subplot(nplots, 1, 1)
        plt.plot(errorTrace[:step, :])
        plt.ylim(0, 0.7)
        plt.xlabel('Epochs')
        plt.ylabel('RMSE')
        plt.legend(('Train', 'Test'), loc='upper left')

        # Plot the training and testing data, and
        # the output of our neural network model on the test data
        plt.subplot(nplots, 1, 2)
        plt.plot(X, T, 'o-', Xtest, Ttest, 'o-', Xtest, Ytest.detach(), 'o-')
        plt.xlim(-10, 10)
        plt.legend(('Training', 'Testing', 'Model'), loc='upper left')
        plt.xlabel('$x$')
        plt.ylabel('Actual and Predicted $f(x)$')

        # Plot the outputs of each hidden layer (the Tanh outputs) versus x
        Ys = forward_all_layers(Xt)
        Z = Ys[:-1]
        ploti = 2
        for layeri in range(n_hidden_layers, 0, -1):
            ploti += 1
            plt.subplot(nplots, 1, ploti)
            plt.plot(X, Z[layeri * 2 - 1].detach())
            plt.ylim(-1.1, 1.1)
            plt.xlabel('$x$')
            plt.ylabel(f'Hidden Layer {layeri}')

        ipd.clear_output(wait=True)
        ipd.display(fig)

ipd.clear_output(wait=True)


# In[ ]:
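

# A minimal sketch of using the trained `nnet` for prediction, assuming the
# training cell above has been run.  `Xdense` and `Ydense` are new names
# introduced here only for illustration: evaluate the fitted network on a
# dense grid of inputs and plot its predictions against the data.

Xdense = np.linspace(-10, 10, 200).reshape((-1, 1))
with torch.no_grad():  # no gradients needed for inference
    Ydense = nnet(torch.from_numpy(Xdense).float())

plt.plot(X, T, 'o', label='Training Data')
plt.plot(Xtest, Ttest, 'o', label='Testing Data')
plt.plot(Xdense, Ydense.numpy(), label='Model')
plt.xlabel('$x$')
plt.ylabel('$f(x)$')
plt.legend();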