In this notebook, I will go over the steps to retrain a VGG16 network on the skin cancer dataset.
import os
from glob import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.preprocessing.image import ImageDataGenerator
from keras import layers
from keras import Model
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
%matplotlib inline
import matplotlib.pyplot as plt
Using TensorFlow backend.
X_train = np.load("/floyd/input/skin_cancer_192_256/256_192_train.npy")
y_train = np.load("/floyd/input/skin_cancer_192_256/train_labels.npy")
X_val = np.load("/floyd/input/skin_cancer_192_256/256_192_val.npy")
y_val = np.load("/floyd/input/skin_cancer_192_256/val_labels.npy")
X_train.shape, X_val.shape
((8111, 192, 256, 3), (902, 192, 256, 3))
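So we have 8,111 training images and 902 validation images, each a 192x256 RGB array.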
y_train.shape, y_val.shape
((8111,), (902,))
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
y_train.shape, y_val.shape
((8111, 7), (902, 7))
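As a quick illustration of what to_categorical does (a hypothetical mini-example, not part of the pipeline): a label k becomes a length-7 vector with a 1 at index k.

# Labels 0 and 2 become one-hot rows of length 7.
to_categorical([0, 2], num_classes=7)
# array([[1., 0., 0., 0., 0., 0., 0.],
#        [0., 0., 1., 0., 0., 0., 0.]])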
pre_trained_model = VGG16(input_shape=(192, 256, 3), include_top=False, weights="imagenet")
# Freeze all layers of the pretrained model, printing each name as we go
for layer in pre_trained_model.layers:
    print(layer.name)
    layer.trainable = False
print(len(pre_trained_model.layers))
input_1
block1_conv1
block1_conv2
block1_pool
block2_conv1
block2_conv2
block2_pool
block3_conv1
block3_conv2
block3_conv3
block3_pool
block4_conv1
block4_conv2
block4_conv3
block4_pool
block5_conv1
block5_conv2
block5_conv3
block5_pool
19
last_layer = pre_trained_model.get_layer('block5_pool')
print('last layer output shape:', last_layer.output_shape)
last_output = last_layer.output
last layer output shape: (None, 6, 8, 512)
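This shape follows from VGG16's five 2x2 max-pooling layers: the 192x256 input is downsampled by a factor of 2^5 = 32, giving 192/32 = 6 by 256/32 = 8 spatial positions over 512 channels.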
# Pool the 6x8x512 feature maps down to a single 512-dimensional vector
x = layers.GlobalMaxPooling2D()(last_output)
# Add a fully connected layer with 512 hidden units and ReLU activation
x = layers.Dense(512, activation='relu')(x)
# Add a dropout rate of 0.5
x = layers.Dropout(0.5)(x)
# Add a final softmax layer for 7-class classification
x = layers.Dense(7, activation='softmax')(x)
# Configure and compile the model
model = Model(pre_trained_model.input, x)
optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
input_1 (InputLayer)         (None, 192, 256, 3)       0
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 192, 256, 64)      1792
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 192, 256, 64)      36928
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 96, 128, 64)       0
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 96, 128, 128)      73856
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 96, 128, 128)      147584
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 48, 64, 128)       0
_________________________________________________________________
block3_conv1 (Conv2D)        (None, 48, 64, 256)       295168
_________________________________________________________________
block3_conv2 (Conv2D)        (None, 48, 64, 256)       590080
_________________________________________________________________
block3_conv3 (Conv2D)        (None, 48, 64, 256)       590080
_________________________________________________________________
block3_pool (MaxPooling2D)   (None, 24, 32, 256)       0
_________________________________________________________________
block4_conv1 (Conv2D)        (None, 24, 32, 512)       1180160
_________________________________________________________________
block4_conv2 (Conv2D)        (None, 24, 32, 512)       2359808
_________________________________________________________________
block4_conv3 (Conv2D)        (None, 24, 32, 512)       2359808
_________________________________________________________________
block4_pool (MaxPooling2D)   (None, 12, 16, 512)       0
_________________________________________________________________
block5_conv1 (Conv2D)        (None, 12, 16, 512)       2359808
_________________________________________________________________
block5_conv2 (Conv2D)        (None, 12, 16, 512)       2359808
_________________________________________________________________
block5_conv3 (Conv2D)        (None, 12, 16, 512)       2359808
_________________________________________________________________
block5_pool (MaxPooling2D)   (None, 6, 8, 512)         0
_________________________________________________________________
global_max_pooling2d_1 (Glob (None, 512)               0
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 3591
=================================================================
Total params: 14,980,935
Trainable params: 266,247
Non-trainable params: 14,714,688
_________________________________________________________________
Fine-tuning directly would produce a huge gradient, because the loss is dominated by the randomly initialized head. It's better to first run 3 epochs of feature extraction (convolutional base frozen) so that the weights of the final fully connected layers are no longer random; otherwise the large gradients would change the pretrained weights too much.
train_datagen = ImageDataGenerator(rotation_range=60, width_shift_range=0.2, height_shift_range=0.2,
                                   shear_range=0.2, zoom_range=0.2, fill_mode='nearest')
train_datagen.fit(X_train)
val_datagen = ImageDataGenerator()
val_datagen.fit(X_val)
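Note that preprocess_input is imported above but never applied. ImageNet-pretrained VGG16 normally expects Keras's VGG preprocessing (channel-wise mean subtraction); if the .npy arrays hold raw RGB pixels, the generators could instead be built as in this sketch (an assumption about the data, with hypothetical names train_datagen_pp / val_datagen_pp):

# Sketch, assuming the .npy arrays are raw, unpreprocessed RGB images.
train_datagen_pp = ImageDataGenerator(rotation_range=60, width_shift_range=0.2, height_shift_range=0.2,
                                      shear_range=0.2, zoom_range=0.2, fill_mode='nearest',
                                      preprocessing_function=preprocess_input)
val_datagen_pp = ImageDataGenerator(preprocessing_function=preprocess_input)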
batch_size = 64
epochs = 3
history = model.fit_generator(train_datagen.flow(X_train, y_train, batch_size=batch_size),
                              epochs=epochs, validation_data=val_datagen.flow(X_val, y_val),
                              verbose=1, steps_per_epoch=(X_train.shape[0] // batch_size),
                              validation_steps=(X_val.shape[0] // batch_size))
Epoch 1/3
126/126 [==============================] - 91s 720ms/step - loss: 1.2934 - acc: 0.6207 - val_loss: 1.0926 - val_acc: 0.6652
Epoch 2/3
126/126 [==============================] - 84s 667ms/step - loss: 1.0835 - acc: 0.6666 - val_loss: 0.9895 - val_acc: 0.6674
Epoch 3/3
126/126 [==============================] - 85s 676ms/step - loss: 1.0268 - acc: 0.6686 - val_loss: 1.0094 - val_acc: 0.6588
Let's fine-tune the last convolutional block of the VGG16 net. I use a small learning rate of 0.0001 with Adam (beta_1 = 0.9, i.e., high momentum on the first-moment estimate) and train for only 30 epochs, so the original pretrained VGG weights won't be changed too much. A ReduceLROnPlateau callback halves the learning rate (1e-4 → 5e-5 → 2.5e-5 → …, floored at 1e-6) whenever the validation accuracy plateaus for 3 epochs.
# Unfreeze block5 (layers 15 onward) and keep everything before it frozen
for layer in model.layers[:15]:
    layer.trainable = False
for layer in model.layers[15:]:
    layer.trainable = True
optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
input_1 (InputLayer)         (None, 192, 256, 3)       0
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 192, 256, 64)      1792
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 192, 256, 64)      36928
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 96, 128, 64)       0
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 96, 128, 128)      73856
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 96, 128, 128)      147584
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 48, 64, 128)       0
_________________________________________________________________
block3_conv1 (Conv2D)        (None, 48, 64, 256)       295168
_________________________________________________________________
block3_conv2 (Conv2D)        (None, 48, 64, 256)       590080
_________________________________________________________________
block3_conv3 (Conv2D)        (None, 48, 64, 256)       590080
_________________________________________________________________
block3_pool (MaxPooling2D)   (None, 24, 32, 256)       0
_________________________________________________________________
block4_conv1 (Conv2D)        (None, 24, 32, 512)       1180160
_________________________________________________________________
block4_conv2 (Conv2D)        (None, 24, 32, 512)       2359808
_________________________________________________________________
block4_conv3 (Conv2D)        (None, 24, 32, 512)       2359808
_________________________________________________________________
block4_pool (MaxPooling2D)   (None, 12, 16, 512)       0
_________________________________________________________________
block5_conv1 (Conv2D)        (None, 12, 16, 512)       2359808
_________________________________________________________________
block5_conv2 (Conv2D)        (None, 12, 16, 512)       2359808
_________________________________________________________________
block5_conv3 (Conv2D)        (None, 12, 16, 512)       2359808
_________________________________________________________________
block5_pool (MaxPooling2D)   (None, 6, 8, 512)         0
_________________________________________________________________
global_max_pooling2d_1 (Glob (None, 512)               0
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 3591
=================================================================
Total params: 14,980,935
Trainable params: 7,345,671
Non-trainable params: 7,635,264
_________________________________________________________________
By setting the last convolutional block to trainable, we are now retraining roughly half of the model's parameters (7,345,671 of 14,980,935).
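To double-check which layers ended up trainable, a quick sketch that enumerates them (indices 15 and up, block5 plus the new head, should print True):

# Print each layer's index, name, and trainable flag.
for i, layer in enumerate(model.layers):
    print(i, layer.name, layer.trainable)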
learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=3, verbose=1, factor=0.5,
                                            min_lr=0.000001, cooldown=3)
batch_size = 64
epochs = 30
history = model.fit_generator(train_datagen.flow(X_train, y_train, batch_size=batch_size),
                              epochs=epochs, validation_data=val_datagen.flow(X_val, y_val),
                              verbose=1, steps_per_epoch=(X_train.shape[0] // batch_size),
                              validation_steps=(X_val.shape[0] // batch_size),
                              callbacks=[learning_rate_reduction])
Epoch 1/30
126/126 [==============================] - 89s 707ms/step - loss: 0.8724 - acc: 0.6997 - val_loss: 0.8149 - val_acc: 0.6964
Epoch 2/30
126/126 [==============================] - 85s 673ms/step - loss: 0.7467 - acc: 0.7335 - val_loss: 0.7273 - val_acc: 0.7455
Epoch 3/30
126/126 [==============================] - 85s 675ms/step - loss: 0.6952 - acc: 0.7509 - val_loss: 0.7164 - val_acc: 0.7204
Epoch 4/30
126/126 [==============================] - 85s 673ms/step - loss: 0.6655 - acc: 0.7574 - val_loss: 0.6854 - val_acc: 0.7321
Epoch 5/30
126/126 [==============================] - 85s 671ms/step - loss: 0.6438 - acc: 0.7679 - val_loss: 0.7434 - val_acc: 0.7204
Epoch 00005: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
Epoch 6/30
126/126 [==============================] - 85s 675ms/step - loss: 0.5844 - acc: 0.7890 - val_loss: 0.6247 - val_acc: 0.7679
Epoch 7/30
126/126 [==============================] - 85s 674ms/step - loss: 0.5567 - acc: 0.7952 - val_loss: 0.6581 - val_acc: 0.7630
Epoch 8/30
126/126 [==============================] - 85s 673ms/step - loss: 0.5384 - acc: 0.8118 - val_loss: 0.6658 - val_acc: 0.7433
Epoch 9/30
126/126 [==============================] - 85s 672ms/step - loss: 0.5195 - acc: 0.8128 - val_loss: 0.6387 - val_acc: 0.7512
Epoch 10/30
126/126 [==============================] - 85s 671ms/step - loss: 0.4959 - acc: 0.8204 - val_loss: 0.6632 - val_acc: 0.7656
Epoch 00010: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 11/30
126/126 [==============================] - 84s 668ms/step - loss: 0.4635 - acc: 0.8326 - val_loss: 0.5599 - val_acc: 0.8081
Epoch 12/30
126/126 [==============================] - 85s 677ms/step - loss: 0.4443 - acc: 0.8375 - val_loss: 0.5841 - val_acc: 0.7768
Epoch 13/30
126/126 [==============================] - 85s 672ms/step - loss: 0.4209 - acc: 0.8483 - val_loss: 0.6649 - val_acc: 0.7630
Epoch 14/30
126/126 [==============================] - 85s 673ms/step - loss: 0.4305 - acc: 0.8447 - val_loss: 0.6284 - val_acc: 0.7679
Epoch 15/30
126/126 [==============================] - 85s 671ms/step - loss: 0.4073 - acc: 0.8541 - val_loss: 0.5224 - val_acc: 0.8057
Epoch 00015: ReduceLROnPlateau reducing learning rate to 1.249999968422344e-05.
Epoch 16/30
126/126 [==============================] - 85s 675ms/step - loss: 0.3870 - acc: 0.8643 - val_loss: 0.6117 - val_acc: 0.7835
Epoch 17/30
126/126 [==============================] - 84s 670ms/step - loss: 0.3837 - acc: 0.8614 - val_loss: 0.6959 - val_acc: 0.7441
Epoch 18/30
126/126 [==============================] - 85s 671ms/step - loss: 0.3686 - acc: 0.8670 - val_loss: 0.6199 - val_acc: 0.7879
Epoch 19/30
126/126 [==============================] - 84s 669ms/step - loss: 0.3640 - acc: 0.8684 - val_loss: 0.5730 - val_acc: 0.7938
Epoch 20/30
126/126 [==============================] - 85s 672ms/step - loss: 0.3728 - acc: 0.8668 - val_loss: 0.6003 - val_acc: 0.7879
Epoch 00020: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
Epoch 21/30
126/126 [==============================] - 84s 667ms/step - loss: 0.3528 - acc: 0.8719 - val_loss: 0.6376 - val_acc: 0.7867
Epoch 22/30
126/126 [==============================] - 84s 669ms/step - loss: 0.3489 - acc: 0.8782 - val_loss: 0.5459 - val_acc: 0.7924
Epoch 23/30
126/126 [==============================] - 84s 669ms/step - loss: 0.3394 - acc: 0.8805 - val_loss: 0.6392 - val_acc: 0.7915
Epoch 24/30
126/126 [==============================] - 85s 672ms/step - loss: 0.3297 - acc: 0.8820 - val_loss: 0.6104 - val_acc: 0.7835
Epoch 25/30
126/126 [==============================] - 85s 671ms/step - loss: 0.3306 - acc: 0.8801 - val_loss: 0.5705 - val_acc: 0.7986
Epoch 00025: ReduceLROnPlateau reducing learning rate to 3.12499992105586e-06.
Epoch 26/30
126/126 [==============================] - 85s 672ms/step - loss: 0.3186 - acc: 0.8865 - val_loss: 0.6038 - val_acc: 0.7902
Epoch 27/30
126/126 [==============================] - 84s 669ms/step - loss: 0.3278 - acc: 0.8864 - val_loss: 0.5933 - val_acc: 0.7891
Epoch 28/30
126/126 [==============================] - 84s 670ms/step - loss: 0.3245 - acc: 0.8833 - val_loss: 0.5795 - val_acc: 0.7812
Epoch 29/30
126/126 [==============================] - 84s 667ms/step - loss: 0.3197 - acc: 0.8885 - val_loss: 0.5836 - val_acc: 0.7915
Epoch 30/30
126/126 [==============================] - 84s 669ms/step - loss: 0.3140 - acc: 0.8892 - val_loss: 0.6024 - val_acc: 0.7946
Epoch 00030: ReduceLROnPlateau reducing learning rate to 1.56249996052793e-06.
loss_val, acc_val = model.evaluate(X_val, y_val, verbose=1)
print("Validation: accuracy = %f ; loss_v = %f" % (acc_val, loss_val))
902/902 [==============================] - 8s 9ms/step
Validation: accuracy = 0.798226 ; loss_v = 0.650197
Our model clearly overfits the training set (final training accuracy ~0.89 vs. validation accuracy ~0.80). Still, the validation accuracy is clearly better than the baseline model, and a roughly 3% improvement justifies the extra training time. It suggests that weights pretrained on ImageNet generalize to our dataset, even though skin lesion images are quite different from ImageNet's classes.
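Overall accuracy can also hide per-class behavior in a 7-class problem. A minimal sketch for inspecting the validation confusion matrix (not part of the original pipeline; uses scikit-learn's confusion_matrix):

from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
y_pred = model.predict(X_val, batch_size=64)
print(confusion_matrix(np.argmax(y_val, axis=1), np.argmax(y_pred, axis=1)))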
X_test = np.load("/floyd/input/skin_cancer_192_256/256_192_test.npy")
y_test = np.load("/floyd/input/skin_cancer_192_256/test_labels.npy")
y_test = to_categorical(y_test)
loss_test, acc_test = model.evaluate(X_test, y_test, verbose=1)
print("Test: accuracy = %f ; loss = %f" % (acc_test, loss_test))
1002/1002 [==============================] - 9s 9ms/step
Test: accuracy = 0.796407 ; loss = 0.708095
model.save("VGG16.h5")
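The saved file can later be restored, architecture, weights, and optimizer state included; a minimal sketch:

from keras.models import load_model

# Reload the trained model from disk.
restored_model = load_model("VGG16.h5")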
# Retrieve a list of accuracy results on the training and
# validation sets for each training epoch
acc = history.history['acc']
val_acc = history.history['val_acc']
# Retrieve a list of loss results on the training and
# validation sets for each training epoch
loss = history.history['loss']
val_loss = history.history['val_loss']
# Get number of epochs
epochs = range(len(acc))
# Plot training and validation accuracy per epoch
plt.plot(epochs, acc, label = "training")
plt.plot(epochs, val_acc, label = "validation")
plt.legend(loc="upper left")
plt.title('Training and validation accuracy')
plt.figure()
# Plot training and validation loss per epoch
plt.plot(epochs, loss, label = "training")
plt.plot(epochs, val_loss, label = "validation")
plt.legend(loc="upper right")
plt.title('Training and validation loss')
Text(0.5,1,'Training and validation loss')