No gradients provided for any variable: TensorFlow error on TPU

I am trying to train my text-summarization model on a Colab TPU because training on the Colab CPU is very slow, but I am getting a "No gradients provided for any variable" error. The error does not appear when training on the CPU, with or without the tf.function() decorator. Can anyone help me with this? If you need any more information, feel free to ask. Thank you.
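
For reference, the TPU runtime in Colab is brought up with the standard pattern below (a minimal sketch; my exact setup cell is not shown above, and note that the code further down constructs a MirroredStrategy, which is what the traceback reflects):

import tensorflow as tf

# Standard Colab TPU bring-up (sketch): resolve the TPU address, connect,
# initialize the TPU system, and build a distribution strategy from it.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
tpu_strategy = tf.distribute.TPUStrategy(resolver)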

class MaskedLoss(tf.keras.losses.Loss):
  def __init__(self):
    self.name = 'masked_loss'
    self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  
  def __call__(self, y_true, y_pred):
    loss = self.loss(y_true, y_pred)
    mask = tf.cast(y_true != 0, tf.float32)
    loss *= mask
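    # tf.nn.compute_average_loss sums the per-example loss and divides by the
    # global batch size, so summing across replicas yields the true batch mean.
    # Sanity check with hypothetical values:
    #   tf.nn.compute_average_loss(tf.constant([2.0, 4.0]), global_batch_size=4)  # -> 1.5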
    return tf.nn.compute_average_loss(loss, global_batch_size=GLOBAL_BATCH_SIZE_TPU)

class BatchLogs(tf.keras.callbacks.Callback):
  def __init__(self, key):
    self.key = key
    self.logs = []

  def on_train_batch_end(self, n, logs):
    self.logs.append(logs[self.key])

class CoverageLoss(tf.keras.losses.Loss):
  def __init__(self):
    self.name = "coverage_loss"
  
  def __call__(self, coverage_vector, attention_weights):
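    # Coverage penalty in the spirit of pointer-generator networks
    # (See et al., 2017): min(coverage, attention) grows when the decoder
    # re-attends to positions it has already covered.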
    loss = tf.math.minimum(coverage_vector, attention_weights)
    return tf.nn.compute_average_loss(loss, global_batch_size=GLOBAL_BATCH_SIZE_TPU)


class AttentionEncoderDecoder(tf.keras.Model):
  def __init__(self, units, embedding_dims, input_preprocessor, output_preprocessor, batch_size, use_tf_function=False, LAMBDA = 1):
    super(AttentionEncoderDecoder, self).__init__()
    self.units = units
    self.LAMBDA = LAMBDA
    self.BATCH_SIZE = batch_size
    self.embedding = embedding_dims
    self.use_tf_function = use_tf_function
    self.input_preprocessor = input_preprocessor
    self.output_preprocessor = output_preprocessor
    self.MAX_TARGET_VOCAB = self.output_preprocessor.vocabulary_size()
    self.MAX_SOURCE_VOCAB = self.input_preprocessor.vocabulary_size()
    self.encoder = Encoder(self.units, self.embedding, self.MAX_SOURCE_VOCAB)
    self.decoder = Decoder(self.units, self.embedding, self.MAX_TARGET_VOCAB)


  def overflow_tokens_to_rel_input_pos(self, inputs):
    out, inp = inputs
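    # Remap out-of-vocabulary target ids (>= MAX_TARGET_VOCAB) to
    # MAX_TARGET_VOCAB + the position of their first occurrence in the input.
    # Worked example with hypothetical ids and MAX_TARGET_VOCAB = 3000:
    #   out = [5, 3100], inp = [7, 3100, 9]  ->  [5, 3001]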
    return tf.where(out >= self.MAX_TARGET_VOCAB, self.MAX_TARGET_VOCAB + tf.argmax(inp == tf.expand_dims(out, axis= 1), axis=1), out)


  def _preprocess(self, input_text, output_text):
    input_tokens = self.input_preprocessor(input_text)
    target_tokens = self.output_preprocessor(output_text)
    target_tokens_on_input_vocab = self.input_preprocessor(output_text)
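    # TextVectorization reserves id 0 for padding and id 1 for OOV ('[UNK]') by
    # default, so id 1 below marks tokens absent from the target vocabulary.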
    target_tokens_mapped = tf.where(target_tokens == 1, target_tokens_on_input_vocab, target_tokens)
    target_tokens_mapped = tf.vectorized_map(self.overflow_tokens_to_rel_input_pos, (target_tokens_mapped, input_tokens))
    input_mask = (input_tokens != 0)
    target_mask = (target_tokens != 0)

    return input_tokens, input_mask, target_tokens, target_tokens_mapped, target_mask


  def train_step(self, inputs):
    if self.use_tf_function:
      return self._tf_train_step(inputs)
    else:
      return self._train_step(inputs)

  # @tf.function(input_signature = [[tf.TensorSpec(dtype = tf.string, shape = [None]),
  #                                  tf.TensorSpec(dtype = tf.string, shape = [None])]])
  def _tf_train_step(self, inputs):
      return self._train_step(inputs)


  def _one_step(self, input_token, target_token, enc_output, input_mask, dec_state, coverage_vector, training = True):
    decoder_input = DecoderInput(input_token, enc_output, input_mask)
    dec_result, dec_state, coverage_vector = self.decoder(decoder_input, coverage_vector, state=dec_state)

    y_true = target_token #(batch, 1)
    y_pred = dec_result.logits #(batch, t_step, outvocab + encoder_output_steps)
    step_loss = self.loss['MaskedLoss'](y_true, y_pred) + self.LAMBDA * self.loss['CoverageLoss'](coverage_vector, dec_result.attention_weights)

    return dec_result, dec_state, coverage_vector, step_loss

  def do_one_step(self, i, max_target_sen_length, input_text, input_tokens, input_mask, target_tokens, target_tokens_mapped, target_mask, coverage_vector, enc_output, enc_state, dec_state, loss):
    input_token = tf.slice(target_tokens, [0, i], [-1, 1])
    target_token = tf.slice(target_tokens, [0, i+1], [-1, 1])
    dec_result, dec_state, coverage_vector, step_loss = self._one_step(input_token, target_token, enc_output, input_mask, dec_state, coverage_vector, True)
    i = i + 1
    return i, max_target_sen_length, input_text, input_tokens, input_mask, target_tokens, target_tokens_mapped, target_mask, coverage_vector, enc_output, enc_state, dec_state, loss


  def _train_step(self, inputs):
    input_text, target_text = inputs
    input_tokens, input_mask, target_tokens, target_tokens_mapped, target_mask = self._preprocess(input_text, target_text)
    target_mask = tf.cast(target_mask, tf.float32)
    input_tokens = tf.cast(input_tokens, tf.float32)
    target_tokens = tf.cast(target_tokens, tf.float32)
    target_tokens_mapped = tf.cast(target_tokens_mapped, tf.float32)
    coverage_vector = tf.zeros(shape = (self.BATCH_SIZE, 1, tf.shape(input_tokens)[1]))
    max_target_sen_length = tf.shape(target_tokens)[1]

    with tf.GradientTape() as tape:
      enc_output, enc_state = self.encoder(input_tokens)
      dec_state = enc_state
      loss = tf.constant(0.0)
      i = tf.constant(0)
      def cond(i, max_target_sen_length, input_text, input_tokens, input_mask,
               target_tokens, target_tokens_mapped, target_mask, coverage_vector,
               enc_output, enc_state, dec_state, loss):
        return tf.less(i, max_target_sen_length - 1)
      loop_variables = (i, max_target_sen_length, input_text, input_tokens, input_mask,
                        target_tokens, target_tokens_mapped, target_mask, coverage_vector,
                        enc_output, enc_state, dec_state, loss)
      loop_variables = tf.while_loop(cond, self.do_one_step, loop_vars = loop_variables)
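      # tf.while_loop above returns the final values of loop_vars as a tuple.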
      avg_loss = loss/(tf.reduce_sum(target_mask) * (1. / self.BATCH_SIZE))

    variables = self.trainable_variables
    gradients = tape.gradient(avg_loss, variables)
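    # Debug print; its output (the loss tensor plus all-None gradients and the
    # variable list) is reproduced under "Error" below.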
    print(gradients, variables)
    self.optimizer.apply_gradients(zip(gradients, variables))
    return {'batch_loss': avg_loss}


strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
BATCH_SIZE_PER_REPLICA_TPU = 64
GLOBAL_BATCH_SIZE_TPU = BATCH_SIZE_PER_REPLICA_TPU * strategy.num_replicas_in_sync
with strategy.scope():
  tboard_callback = tf.keras.callbacks.TensorBoard(log_dir = logs)
  text_summarizer = AttentionEncoderDecoder(UNITS, EMBEDDING, input_preprocessing, output_preprocessing, GLOBAL_BATCH_SIZE_TPU, True, 1)
  text_summarizer.compile(optimizer = "Adam", loss = {"MaskedLoss": MaskedLoss(), "CoverageLoss" : CoverageLoss()}, steps_per_execution = 50)
text_summarizer.fit(profiler_dist_dataset, epochs = 1, callbacks = [batch_loss, tboard_callback], steps_per_epoch=GLOBAL_BATCH_SIZE_TPU)
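
profiler_dist_dataset is the distributed dataset; it is built along these lines (a sketch with placeholder names; input_texts and target_texts stand in for my actual data):

# Sketch of the distributed input pipeline (placeholder names):
dataset = tf.data.Dataset.from_tensor_slices((input_texts, target_texts))
dataset = dataset.shuffle(1024).batch(GLOBAL_BATCH_SIZE_TPU, drop_remainder=True)
profiler_dist_dataset = strategy.experimental_distribute_dataset(dataset)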

Error

The print(gradients, variables) call in _train_step prints the loss tensor, followed by the gradients (all None) and the model variables:

Tensor("truediv:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None] 
[MirroredVariable:{
  0: <tf.Variable 'encoder_1/embedding_2/embeddings:0' shape=(112338, 256) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/kernel:0' shape=(256, 3072) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/bias:0' shape=(2, 3072) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/dense_6/kernel:0' shape=(1024, 1024) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/custom_additive_attention_2/scale:0' shape=(1024,) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/embedding_3/embeddings:0' shape=(3000, 256) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/kernel:0' shape=(256, 3072) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/bias:0' shape=(2, 3072) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/dense_8/kernel:0' shape=(2048, 1024) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/dense_9/kernel:0' shape=(1024, 3000) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/dense_9/bias:0' shape=(3000,) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1, 1024) dtype=float32>
}, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024,) dtype=float32>
}]

INFO:tensorflow:Error reported to Coordinator: No gradients provided for any variable: [...] (the full message is repeated verbatim in the ValueError at the end of the traceback below)


Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/training/coordinator.py", line 293, in stop_on_exception
    yield
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/mirrored_run.py", line 342, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 689, in wrapper
    return converted_call(f, args, kwargs, options=options)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 377, in converted_call
    return _call_unconverted(f, args, kwargs, options)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 458, in _call_unconverted
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1000, in run_step
    outputs = model.train_step(data)
  File "<ipython-input-46-7fc5bd50cffa>", line 36, in train_step
    return self._tf_train_step(inputs)
  File "<ipython-input-46-7fc5bd50cffa>", line 43, in _tf_train_step
    return self._train_step(inputs)
  File "<ipython-input-46-7fc5bd50cffa>", line 88, in _train_step
    self.optimizer.apply_gradients(zip(gradients, variables))
  File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/optimizer_v2.py", line 633, in apply_gradients
    grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
  File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/utils.py", line 73, in filter_empty_gradients
    raise ValueError(f"No gradients provided for any variable: {variable}. "
ValueError: No gradients provided for any variable: (['encoder_1/embedding_2/embeddings:0', 'encoder_1/gru_2/gru_cell_2/kernel:0', 'encoder_1/gru_2/gru_cell_2/recurrent_kernel:0', 'encoder_1/gru_2/gru_cell_2/bias:0', 'while/decoder_1/bahdanau_attention_2/dense_6/kernel:0', 'while/decoder_1/bahdanau_attention_2/custom_additive_attention_2/scale:0', 'while/decoder_1/embedding_3/embeddings:0', 'while/decoder_1/gru_3/gru_cell_3/kernel:0', 'while/decoder_1/gru_3/gru_cell_3/recurrent_kernel:0', 'while/decoder_1/gru_3/gru_cell_3/bias:0', 'while/decoder_1/dense_8/kernel:0', 'while/decoder_1/dense_9/kernel:0', 'while/decoder_1/dense_9/bias:0', 'while/decoder_1/generation_probability_layer_2/Variable:0', 'while/decoder_1/generation_probability_layer_2/Variable:0', 'while/decoder_1/generation_probability_layer_2/Variable:0', 'while/decoder_1/generation_probability_layer_2/Variable:0'],). Provided `grads_and_vars` is ((None, MirroredVariable:{
  0: <tf.Variable 'encoder_1/embedding_2/embeddings:0' shape=(112338, 256) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/kernel:0' shape=(256, 3072) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/bias:0' shape=(2, 3072) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/dense_6/kernel:0' shape=(1024, 1024) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/custom_additive_attention_2/scale:0' shape=(1024,) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/embedding_3/embeddings:0' shape=(3000, 256) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/kernel:0' shape=(256, 3072) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/bias:0' shape=(2, 3072) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/dense_8/kernel:0' shape=(2048, 1024) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/dense_9/kernel:0' shape=(1024, 3000) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/dense_9/bias:0' shape=(3000,) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1, 1024) dtype=float32>
}), (None, MirroredVariable:{
  0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024,) dtype=float32>
})).
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-50-c32868b98bed> in <module>()
----> 1 text_summarizer.fit(profiler_dist_dataset, epochs = 1, callbacks = [batch_loss, tboard_callback], steps_per_epoch=GLOBAL_BATCH_SIZE_TPU)

1 frames
/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
     65     except Exception as e:  # pylint: disable=broad-except
     66       filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67       raise e.with_traceback(filtered_tb) from None
     68     finally:
     69       del filtered_tb

/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
   1145           except Exception as e:  # pylint:disable=broad-except
   1146             if hasattr(e, "ag_error_metadata"):
-> 1147               raise e.ag_error_metadata.to_exception(e)
   1148             else:
   1149               raise

ValueError: in user code:

    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1057, in 
train_function  *
        for _ in tf.range(self._steps_per_execution):

    ValueError: 'outputs' must be defined before the loop.
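
(For what it's worth, the "for _ in tf.range(self._steps_per_execution)" frame comes from passing steps_per_execution = 50 to compile(); with the default steps_per_execution = 1, Keras does not wrap train_step in that loop.)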


Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow
