import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import warnings
warnings.filterwarnings('ignore')
As a warmup, let's write a solver for linear regression using PyTorch.
# define feature matrix and labels
x = torch.randn(10, 5, requires_grad=False)
y = torch.randn(10, 3, requires_grad=False)
# a weight matrix
w = torch.randn(5, 3, requires_grad=True)
print(w.grad) # None
print(w)
The heart of the matter is the definition of our loss function, which for linear regression is the mean squared error:
loss = torch.mean((y - x @ w) ** 2)
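Written out, with $n$ rows in x and $k$ columns in y, this computes
\begin{align}\text{loss} = \frac{1}{nk}\sum_{i=1}^{n}\sum_{j=1}^{k}\big(y_{ij} - (xw)_{ij}\big)^2.\end{align}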
All we need to do next is compute its gradient, which autograd gives us for free for any PyTorch tensor:
# calculate the gradient
loss.backward(retain_graph=True)
# manually apply the gradient
w.data -= 0.01 * w.grad.data
# manually zero gradients after update
w.grad.zero_()
print(w)
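A single update only nudges the weights; repeating the same step drives the loss down. Here is a minimal sketch of the full manual loop (the 100 steps and the 0.01 step size are arbitrary choices):
# repeat: recompute the loss, backprop, step, zero the gradient
for step in range(100):
    loss = torch.mean((y - x @ w) ** 2)
    loss.backward()
    with torch.no_grad():
        w -= 0.01 * w.grad
    w.grad.zero_()
    if step % 20 == 0:
        print(step, loss.item())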
We can make the computation easier using the built-in loss functions and optimizers (this time also adding a bias term):
x = torch.randn(10, 5, requires_grad=False)
y = torch.randn(10, 3, requires_grad=False)
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
learning_rate = 0.1
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD([w,b], lr=learning_rate)
for step in range(10000):
    # compute model predictions
    pred = x @ w + b
    # compute the loss
    loss = loss_fn(pred, y)
    if step % 1000 == 0:
        print(loss.item())
    # manually zero all previous gradients
    optimizer.zero_grad()
    # calculate new gradients
    loss.backward()
    # perform an optimization step
    optimizer.step()
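As a sanity check, we can compare the trained parameters to the closed-form least-squares solution. A minimal sketch using torch.linalg.lstsq (available in recent PyTorch releases); the tolerance is an arbitrary choice:
# stack a column of ones onto x so the bias is part of the solve
X_aug = torch.cat([x, torch.ones(x.shape[0], 1)], dim=1)
sol = torch.linalg.lstsq(X_aug, y).solution   # shape (6, 3)
print(torch.allclose(sol[:5], w, atol=1e-2))  # weights, should be close if training converged
print(torch.allclose(sol[5], b, atol=1e-2))   # bias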
So far we managed the parameters by hand. The nn module packages the weight matrix and bias of a layer into a single object:
# a linear mapping from R^5 to R^3
linear = nn.Linear(5, 3)
# some data...
X = torch.randn(2, 5)
# apply the mapping...
linear(X)
tensor([[ 0.4830,  0.6686,  0.4467],
        [-1.0320,  1.1213,  1.3144]], grad_fn=<ThAddmmBackward>)
linear.parameters()
<generator object Module.parameters at 0x10654aeb8>
linear.bias
Parameter containing:
tensor([0.1062, 0.3992, 0.0269], requires_grad=True)
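Note that parameters() returns a generator, so we have to iterate over it to see the actual tensors; named_parameters() also reports each attribute's name. nn.Linear stores the weight with shape (out_features, in_features):
for name, param in linear.named_parameters():
    print(name, param.shape)
# weight torch.Size([3, 5])
# bias torch.Size([3])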
Next we need some nonlinearities...
PyTorch provides the standard nonlinearities: $\tanh(x)$, $\sigma(x)$, $\text{ReLU}(x)$.
data = torch.randn(2, 2)
print(data)
print(F.relu(data))
tensor([[-1.0269, -0.8036],
        [ 0.2149,  0.0842]])
tensor([[0.0000, 0.0000],
        [0.2149, 0.0842]])
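tanh and sigmoid work the same way and are also available as tensor functions:
print(torch.tanh(data))     # values squashed into (-1, 1)
print(torch.sigmoid(data))  # values squashed into (0, 1)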
The $\text{softmax}(x)$ function is another nonlinearity, but it is special in that it is usually the last operation in a network: it takes a vector of real numbers and returns a probability distribution, which makes it useful for multi-class classification. Its definition is as follows. Let $x$ be a vector of real numbers. Then the $i$th component of $\text{softmax}(x)$ is
\begin{align}\frac{\exp(x_i)}{\sum_j \exp(x_j)}.\end{align}
It should be clear that the output is a probability distribution: each element is non-negative and the sum over all components is 1.
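We can check both properties numerically with F.softmax; the dim argument selects the axis that should sum to 1:
vec = torch.randn(5)
probs = F.softmax(vec, dim=0)
print(probs)               # every entry is non-negative
print(probs.sum().item())  # 1.0 (up to floating-point rounding)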
Before creating a simple classifier in PyTorch we need some data. It's easy to convert one of the scikit-learn datasets into PyTorch tensors:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
X_train = torch.tensor(X_train, dtype=torch.float)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float)
y_test = torch.tensor(y_test, dtype=torch.long)
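A side note: the breast-cancer features live on very different scales (some around 0.1, some in the hundreds), which is why the learning rates below are so small. An optional sketch that standardizes using training-set statistics, in case you want to experiment with larger learning rates (the runs below use the raw features):
# optional: rescale each feature to zero mean and unit variance
mean = X_train.mean(dim=0, keepdim=True)
std = X_train.std(dim=0, keepdim=True)
X_train_scaled = (X_train - mean) / std
X_test_scaled = (X_test - mean) / std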
num_features = X_train.shape[1]
num_classes = 2
# hyper-parameters
num_epochs = 5000
learning_rate = 0.00001
model = nn.Sequential(
    nn.Linear(num_features, num_classes),
    nn.LogSoftmax(dim=1)
)
loss_function = nn.NLLLoss()
for epoch in range(num_epochs):
    pred = model(X_train)
    loss = loss_function(pred, y_train)
    # backward and optimize
    model.zero_grad()
    loss.backward()
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
    if epoch % 500 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch, num_epochs, loss.item()))
Epoch [0/5000], Loss: 79.6137
Epoch [500/5000], Loss: 0.3970
Epoch [1000/5000], Loss: 0.2464
Epoch [1500/5000], Loss: 0.2113
Epoch [2000/5000], Loss: 0.1980
Epoch [2500/5000], Loss: 0.1915
Epoch [3000/5000], Loss: 0.1876
Epoch [3500/5000], Loss: 0.1850
Epoch [4000/5000], Loss: 0.1830
Epoch [4500/5000], Loss: 0.1815
with torch.no_grad():
    output = model(X_test)
    predicted = torch.argmax(output.data, dim=1)

(y_test == predicted).sum().item() / len(y_test)
0.9181286549707602
class LogisticRegression(nn.Module):
    def __init__(self, num_features, num_classes):
        # call the __init__ of nn.Module; don't be confused by the syntax
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(num_features, num_classes)

    def forward(self, x):
        # pass the input through the linear layer,
        # then pass that through log-softmax
        return F.log_softmax(self.linear(x), dim=1)
num_features = X_train.shape[1]
num_classes = 2
# hyper-parameters
num_epochs = 5000
learning_rate = 0.00001
model = LogisticRegression(num_features, num_classes)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
#optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
    pred = model(X_train)
    loss = loss_function(pred, y_train)
    # backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 500 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch, num_epochs, loss.item()))
Epoch [0/5000], Loss: 160.8974
Epoch [500/5000], Loss: 0.4091
Epoch [1000/5000], Loss: 0.3101
Epoch [1500/5000], Loss: 0.2537
Epoch [2000/5000], Loss: 0.2196
Epoch [2500/5000], Loss: 0.2066
Epoch [3000/5000], Loss: 0.2018
Epoch [3500/5000], Loss: 0.1989
Epoch [4000/5000], Loss: 0.1968
Epoch [4500/5000], Loss: 0.1951
with torch.no_grad():
    output = model(X_test)
    predicted = torch.argmax(output.data, dim=1)

(y_test == predicted).sum().item() / len(y_test)
0.9064327485380117
class NeuralNetwork(nn.Module):
    def __init__(self, num_features, num_hidden, num_classes):
        super(NeuralNetwork, self).__init__()
        self.linear1 = nn.Linear(num_features, num_hidden)
        self.linear2 = nn.Linear(num_hidden, num_classes)

    def forward(self, x):
        hidden = F.relu(self.linear1(x))
        return F.log_softmax(self.linear2(hidden), dim=1)
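Because nn.Module registers every parameter, counting them is a one-liner. For 30 input features, 30 hidden units, and 2 classes this gives (30*30 + 30) + (30*2 + 2) = 992:
# count the trainable parameters of the network defined above
net = NeuralNetwork(30, 30, 2)
print(sum(p.numel() for p in net.parameters()))  # 992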
num_features = X_train.shape[1]
num_classes = 2
# hyper-parameters
num_epochs = 5000
learning_rate = 0.00001
num_hidden = 30
model = NeuralNetwork(num_features, num_hidden, num_classes)
loss_function = nn.NLLLoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
    pred = model(X_train)
    loss = loss_function(pred, y_train)
    # backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 500 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch, num_epochs, loss.item()))
Epoch [0/5000], Loss: 28.8451
Epoch [500/5000], Loss: 10.0051
Epoch [1000/5000], Loss: 0.6491
Epoch [1500/5000], Loss: 0.5519
Epoch [2000/5000], Loss: 0.4772
Epoch [2500/5000], Loss: 0.4080
Epoch [3000/5000], Loss: 0.3500
Epoch [3500/5000], Loss: 0.3043
Epoch [4000/5000], Loss: 0.2656
Epoch [4500/5000], Loss: 0.2523
with torch.no_grad():
    output = model(X_test)
    predicted = torch.argmax(output.data, dim=1)

(y_test == predicted).sum().item() / len(y_test)
0.9181286549707602