TensorFlow GradientTape returns exploding gradients for model.trainable_variables

I'm trying to train my deep learning model with a TensorFlow GradientTape training loop, but the accuracy does not change across epochs. I have also checked that I reset my loss and accuracy metrics every epoch.

For the MNIST dataset my code looks like the following:


import tensorflow as tf
import tensorflow_datasets as tfds

(mnist_train, mnist_test), mnist_info = tfds.load('mnist', split=['train', 'test'], as_supervised=True, with_info=True)

def prepare(ds, batch_size=128):
  ds = ds.cache()
  ds = ds.batch(batch_size)
  ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  return ds

def split_tasks(ds, predicate):
  return ds.filter(predicate), ds.filter(lambda img, label: tf.logical_not(predicate(img, label)))

task_A_train, task_B_train = split_tasks(mnist_train, lambda img, label: label % 2 == 0)
task_A_train, task_B_train = prepare(task_A_train), prepare(task_B_train)
task_A_test, task_B_test = split_tasks(mnist_test, lambda img, label: label % 2 == 0)
task_A_test, task_B_test = prepare(task_A_test), prepare(task_B_test)
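
To verify that the split itself behaves as expected, a quick sanity check like the following can be run (purely illustrative, not part of the training code):

# Peek at one batch per task: Task A should only contain even labels,
# Task B should only contain odd labels.
for imgs, labels in task_A_train.take(1):
  print("Task A labels:", labels.numpy())
for imgs, labels in task_B_train.take(1):
  print("Task B labels:", labels.numpy())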

def evaluate(model, test_set):
    acc = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
    for i, (imgs, labels) in enumerate(test_set):
        preds = model.predict_on_batch(imgs)
        acc.update_state(labels, preds)
    return acc.result().numpy()

multi_task_model = tf.keras.Sequential([
   tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
   tf.keras.layers.Dense(128, activation='relu'),
   tf.keras.layers.Dense(10)
])

multi_task_model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])


def l2_penalty(model, theta_A):
  # Accumulate the L2 norm of the difference between each current parameter
  # tensor and the corresponding parameter saved after training on Task A.
  penalty = 0
  for i, theta_i in enumerate(model.trainable_variables):
    _penalty = tf.norm(theta_i - theta_A[i])
    penalty += _penalty
  return 0.5*penalty
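
In other words, the regularization term I am adding to the loss is 0.5 * sum_i ||theta_i - theta_A_i||_2, i.e. half the sum of the (unsquared) L2 norms of the differences between the current parameters and the saved Task-A parameters.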


def train_with_l2(model, task_A_train, task_B_train, task_A_test, task_B_test, epochs=6):
  # First we're going to fit to task A and retain a copy of parameters trained on Task A
  model.fit(task_A_train, epochs=epochs)
  theta_A = [tf.identity(v) for v in model.trainable_variables]  # snapshot of the Task-A parameters
 
  print("Task A accuracy after training on Task A: {}".format(evaluate(model, task_A_test)))
   
  # Metrics for the custom training loop
  accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
  loss = tf.keras.metrics.SparseCategoricalCrossentropy('loss')
 
  for epoch in range(epochs):
    accuracy.reset_states()
    loss.reset_states()
    for batch, (imgs, labels) in enumerate(task_B_train):
      with tf.GradientTape() as tape:
        preds = model(imgs)
        # Loss is crossentropy loss with regularization term for each parameter
        total_loss = model.loss(labels, preds) + l2_penalty(model, theta_A)
      grads = tape.gradient(total_loss, model.trainable_variables)
      model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
       
      accuracy.update_state(labels, preds)
      loss.update_state(labels, preds)
      print("\rEpoch: {}, Batch: {}, Loss: {:.3f}, Accuracy: {:.3f}".format(
          epoch+1, batch+1, loss.result().numpy(), accuracy.result().numpy()), flush=True, end=''
         )
    print("")
   
  print("Task B accuracy after training trained model on Task B: {}".format(evaluate(model, task_B_test)))
  print("Task A accuracy after training trained model on Task B: {}".format(evaluate(model, task_A_test)))

Does anybody see what I'm doing wrong in the training step inside GradientTape?

EDIT: I rechecked my gradients and it seems that they are exploding and thus become NaN. However, I cannot see why this is happening.
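
For reference, this is roughly how I checked the gradients inside the batch loop, right after computing them:

grads = tape.gradient(total_loss, model.trainable_variables)
# Debug check: tf.linalg.global_norm gives the combined norm of all gradient tensors;
# this value grows rapidly from batch to batch and eventually becomes nan.
tf.print("Batch", batch, "gradient norm:", tf.linalg.global_norm(grads))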



Source: Stack Overflow, licensed under CC BY-SA 3.0.