TensorFlow GradientTape returns exploding gradients for model.trainable_variables
I'm trying to train my deep learning model with a TensorFlow GradientTape loop, but the accuracy does not change over the epochs. I have also checked that I reset my loss and accuracy metrics each epoch.
For the MNIST dataset my code looks like the following:
import tensorflow as tf
import tensorflow_datasets as tfds

(mnist_train, mnist_test), mnist_info = tfds.load('mnist', split=['train', 'test'], as_supervised=True, with_info=True)

def prepare(ds, batch_size=128):
    ds = ds.cache()
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds

def split_tasks(ds, predicate):
    return ds.filter(predicate), ds.filter(lambda img, label: not predicate(img, label))

# Task A: even digits, Task B: odd digits
task_A_train, task_B_train = split_tasks(mnist_train, lambda img, label: label % 2 == 0)
task_A_train, task_B_train = prepare(task_A_train), prepare(task_B_train)
task_A_test, task_B_test = split_tasks(mnist_test, lambda img, label: label % 2 == 0)
task_A_test, task_B_test = prepare(task_A_test), prepare(task_B_test)
def evaluate(model, test_set):
    acc = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
    for i, (imgs, labels) in enumerate(test_set):
        preds = model.predict_on_batch(imgs)
        acc.update_state(labels, preds)
    return acc.result().numpy()
multi_task_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10)
])
multi_task_model.compile(optimizer='adam',
                         loss=tf.keras.losses.sparse_categorical_crossentropy,
                         metrics=['accuracy'])
def l2_penalty(model, theta_A):
    # Penalize the distance between the current parameters and the Task A parameters
    penalty = 0
    for i, theta_i in enumerate(model.trainable_variables):
        _penalty = tf.norm(theta_i - theta_A[i])
        penalty += _penalty
    return 0.5 * penalty
def train_with_l2(model, task_A_train, task_B_train, task_A_test, task_B_test, epochs=6):
    # First we're going to fit to task A and retain a copy of parameters trained on Task A
    model.fit(task_A_train, epochs=epochs)
    theta_A = {n: p.value() for n, p in enumerate(model.trainable_variables.copy())}
    print("Task A accuracy after training on Task A: {}".format(evaluate(model, task_A_test)))

    # Metrics for the custom training loop
    accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    loss = tf.keras.metrics.SparseCategoricalCrossentropy('loss')

    for epoch in range(epochs):
        accuracy.reset_states()
        loss.reset_states()
        for batch, (imgs, labels) in enumerate(task_B_train):
            with tf.GradientTape() as tape:
                preds = model(imgs)
                # Loss is crossentropy loss with a regularization term for each parameter
                total_loss = model.loss(labels, preds) + l2_penalty(model, theta_A)
            grads = tape.gradient(total_loss, model.trainable_variables)
            model.optimizer.apply_gradients(zip(grads, model.trainable_variables))

            accuracy.update_state(labels, preds)
            loss.update_state(labels, preds)
            print("\rEpoch: {}, Batch: {}, Loss: {:.3f}, Accuracy: {:.3f}".format(
                epoch+1, batch+1, loss.result().numpy(), accuracy.result().numpy()), flush=True, end=''
            )
        print("")

    print("Task B accuracy after training trained model on Task B: {}".format(evaluate(model, task_B_test)))
    print("Task A accuracy after training trained model on Task B: {}".format(evaluate(model, task_A_test)))
Does anybody see what I'm doing wrong with the training inside the GradientTape?
EDIT: I re-checked my gradients and it turns out they are exploding and eventually become NaN. However, I cannot see why this is happening.
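For reference, this is roughly the diagnostic I used to check the gradients, called right after grads = tape.gradient(...) in the loop above. report_grad_norm is just a helper name I made up for this check; it is not part of the model:

import tensorflow as tf  # already imported above; repeated so the snippet stands alone

def report_grad_norm(grads, epoch, batch):
    # Global norm over all gradients returned by tape.gradient; None entries are skipped
    grad_norm = tf.linalg.global_norm([g for g in grads if g is not None])
    if not tf.math.is_finite(grad_norm):
        print("Non-finite gradient norm at epoch {}, batch {}".format(epoch + 1, batch + 1))
    else:
        print("Gradient norm at epoch {}, batch {}: {:.3f}".format(epoch + 1, batch + 1, grad_norm.numpy()))

The norm grows from batch to batch and eventually the printout reports non-finite values, which is what I mean by exploding gradients.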
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
