No gradients provided for any variable, TensorFlow error on TPU
I am trying to train my text summarization model on a Colab TPU, since training on the Colab CPU is very slow, but I am getting a "No gradients provided for any variable" error. The error does not appear when training on the CPU, with or without the tf.function() decorator. Can anyone help me with this? If you need any more information, feel free to ask. Thank you.
class MaskedLoss(tf.keras.losses.Loss):
    def __init__(self):
        self.name = 'masked_loss'
        self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

    def __call__(self, y_true, y_pred):
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(y_true != 0, tf.float32)
        loss *= mask
        return tf.nn.compute_average_loss(loss, global_batch_size=GLOBAL_BATCH_SIZE_TPU)
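For anyone reading along, the masking can be sanity-checked in isolation; this is a standalone sketch with made-up shapes, not part of the model:

import tensorflow as tf

# Hypothetical example: batch of 1, three timesteps, vocab of 10.
y_true = tf.constant([[2, 5, 0]])       # trailing 0 is a padding token
y_pred = tf.random.uniform((1, 3, 10))  # raw logits
per_token = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')(y_true, y_pred)   # shape (1, 3)
masked = per_token * tf.cast(y_true != 0, tf.float32)     # padded step zeroed out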
class BatchLogs(tf.keras.callbacks.Callback):
    def __init__(self, key):
        self.key = key
        self.logs = []

    def on_train_batch_end(self, n, logs):
        self.logs.append(logs[self.key])
class CoverageLoss(tf.keras.losses.Loss):
    def __init__(self):
        self.name = "coverage_loss"

    def __call__(self, coverage_vector, attention_weights):
        loss = tf.math.minimum(coverage_vector, attention_weights)
        return tf.nn.compute_average_loss(loss, global_batch_size=GLOBAL_BATCH_SIZE_TPU)
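For intuition, the coverage penalty is the element-wise minimum of the running coverage and the current attention, summed over source positions; a made-up numeric sketch (assuming tf is imported as above):

c = tf.constant([[0.9, 0.1, 0.0, 0.0]])  # coverage: attention mass so far
a = tf.constant([[0.7, 0.1, 0.1, 0.1]])  # current attention distribution
tf.reduce_sum(tf.minimum(c, a))          # -> 0.8: re-attending already-covered tokens is penalized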
class AttentionEncoderDecoder(tf.keras.Model):
    def __init__(self, units, embedding_dims, input_preprocessor, output_preprocessor,
                 batch_size, use_tf_function=False, LAMBDA=1):
        super(AttentionEncoderDecoder, self).__init__()
        self.units = units
        self.LAMBDA = LAMBDA
        self.BATCH_SIZE = batch_size
        self.embedding = embedding_dims
        self.use_tf_function = use_tf_function
        self.input_preprocessor = input_preprocessor
        self.output_preprocessor = output_preprocessor
        self.MAX_TARGET_VOCAB = self.output_preprocessor.vocabulary_size()
        self.MAX_SOURCE_VOCAB = self.input_preprocessor.vocabulary_size()
        self.encoder = Encoder(self.units, self.embedding, self.MAX_SOURCE_VOCAB)
        self.decoder = Decoder(self.units, self.embedding, self.MAX_TARGET_VOCAB)
    def overflow_tokens_to_rel_input_pos(self, inputs):
        out, inp = inputs
        # Remap ids that overflow the target vocab to (target vocab size +
        # first matching position in the source sequence).
        return tf.where(out >= self.MAX_TARGET_VOCAB,
                        self.MAX_TARGET_VOCAB + tf.argmax(inp == tf.expand_dims(out, axis=1), axis=1),
                        out)
    def _preprocess(self, input_text, output_text):
        input_tokens = self.input_preprocessor(input_text)
        target_tokens = self.output_preprocessor(output_text)
        target_tokens_on_input_vocab = self.input_preprocessor(output_text)
        # Target tokens with id 1 (presumably the OOV id) fall back to their
        # input-vocab id, then get remapped onto the extended vocabulary.
        target_tokens_mapped = tf.where(target_tokens == 1, target_tokens_on_input_vocab, target_tokens)
        target_tokens_mapped = tf.vectorized_map(self.overflow_tokens_to_rel_input_pos,
                                                 (target_tokens_mapped, input_tokens))
        input_mask = (input_tokens != 0)
        target_mask = (target_tokens != 0)
        return input_tokens, input_mask, target_tokens, target_tokens_mapped, target_mask
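The overflow remap implements a pointer-style extended vocabulary: a target id falling outside the target vocab is re-indexed as the target vocab size plus its first position in the source. A tiny standalone sketch with hypothetical ids and a target vocab size of 3000:

out = tf.constant([5, 3050, 7])       # 3050 >= 3000, so it is an OOV word
inp = tf.constant([9, 42, 3050, 17])  # source ids on the input vocab
pos = tf.argmax(inp == out[:, None], axis=1, output_type=tf.int32)
tf.where(out >= 3000, 3000 + pos, out)  # -> [5, 3002, 7]: 3000 + source position 2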
    def train_step(self, inputs):
        if self.use_tf_function:
            return self._tf_train_step(inputs)
        else:
            return self._train_step(inputs)

    # @tf.function(input_signature=[[tf.TensorSpec(dtype=tf.string, shape=[None]),
    #                                tf.TensorSpec(dtype=tf.string, shape=[None])]])
    def _tf_train_step(self, inputs):
        return self._train_step(inputs)
    def _one_step(self, input_token, target_token, enc_output, input_mask,
                  dec_state, coverage_vector, training=True):
        decoder_input = DecoderInput(input_token, enc_output, input_mask)
        dec_result, dec_state, coverage_vector = self.decoder(decoder_input, coverage_vector, state=dec_state)
        y_true = target_token        # (batch, 1)
        y_pred = dec_result.logits   # (batch, t_step, out_vocab + encoder_output_steps)
        step_loss = (self.loss['MaskedLoss'](y_true, y_pred)
                     + self.LAMBDA * self.loss['CoverageLoss'](coverage_vector, dec_result.attention_weights))
        return dec_result, dec_state, coverage_vector, step_loss
    def do_one_step(self, i, max_target_sen_length, input_text, input_tokens, input_mask,
                    target_tokens, target_tokens_mapped, target_mask, coverage_vector,
                    enc_output, enc_state, dec_state, loss):
        input_token = tf.slice(target_tokens, [0, i], [-1, 1])
        target_token = tf.slice(target_tokens, [0, i + 1], [-1, 1])
        dec_result, dec_state, coverage_vector, step_loss = self._one_step(
            input_token, target_token, enc_output, input_mask, dec_state, coverage_vector, True)
        i = i + 1
        return (i, max_target_sen_length, input_text, input_tokens, input_mask, target_tokens,
                target_tokens_mapped, target_mask, coverage_vector, enc_output, enc_state,
                dec_state, loss)
    def _train_step(self, inputs):
        input_text, target_text = inputs
        input_tokens, input_mask, target_tokens, target_tokens_mapped, target_mask = self._preprocess(input_text, target_text)
        target_mask = tf.cast(target_mask, tf.float32)
        input_tokens = tf.cast(input_tokens, tf.float32)
        target_tokens = tf.cast(target_tokens, tf.float32)
        target_tokens_mapped = tf.cast(target_tokens_mapped, tf.float32)
        coverage_vector = tf.zeros(shape=(self.BATCH_SIZE, 1, tf.shape(input_tokens)[1]))
        max_target_sen_length = tf.shape(target_tokens)[1]
        with tf.GradientTape() as tape:
            enc_output, enc_state = self.encoder(input_tokens)
            dec_state = enc_state
            loss = tf.constant(0.0)
            i = tf.constant(0)
            cond = (lambda i, max_target_sen_length, input_text, input_tokens, input_mask,
                           target_tokens, target_tokens_mapped, target_mask, coverage_vector,
                           enc_output, enc_state, dec_state, loss:
                        tf.less(i, max_target_sen_length - 1))
            loop_variables = (i, max_target_sen_length, input_text, input_tokens, input_mask,
                              target_tokens, target_tokens_mapped, target_mask, coverage_vector,
                              enc_output, enc_state, dec_state, loss)
            loop_variables = tf.while_loop(cond, self.do_one_step, loop_vars=loop_variables)
            avg_loss = loss / (tf.reduce_sum(target_mask) * (1. / self.BATCH_SIZE))
        variables = self.trainable_variables
        gradients = tape.gradient(avg_loss, variables)
        print(gradients, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return {'batch_loss': avg_loss}
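One observation worth flagging here (an editor's note, not confirmed as the author's intent): do_one_step computes step_loss but returns loss unchanged, and avg_loss reads the pre-loop loss tensor rather than the tuple returned by tf.while_loop, so the loss can end up with no path back to any model variable, which is consistent with tape.gradient returning all Nones. A minimal sketch of a while-loop loss that does keep a gradient path, using a hypothetical variable w:

w = tf.Variable(2.0)
n = tf.constant(3)
with tf.GradientTape() as tape:
    def body(i, loss):
        step_loss = tf.square(w * tf.cast(i, tf.float32))
        return i + 1, loss + step_loss  # accumulate into the loop variable
    _, loss = tf.while_loop(lambda i, loss: i < n, body,
                            (tf.constant(0), tf.constant(0.0)))
print(tape.gradient(loss, w))  # a real gradient (20.0), not None; reading the
                               # pre-loop `loss` instead would give a constant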
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

BATCH_SIZE_PER_REPLICA_TPU = 64
GLOBAL_BATCH_SIZE_TPU = BATCH_SIZE_PER_REPLICA_TPU * strategy.num_replicas_in_sync

with strategy.scope():
    tboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logs)
    text_summarizer = AttentionEncoderDecoder(UNITS, EMBEDDING, input_preprocessing,
                                              output_preprocessing, GLOBAL_BATCH_SIZE_TPU, True, 1)
    text_summarizer.compile(optimizer="Adam",
                            loss={"MaskedLoss": MaskedLoss(), "CoverageLoss": CoverageLoss()},
                            steps_per_execution=50)

text_summarizer.fit(profiler_dist_dataset, epochs=1, callbacks=[batch_loss, tboard_callback],
                    steps_per_epoch=GLOBAL_BATCH_SIZE_TPU)
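As an aside, this snippet builds a MirroredStrategy, whereas a TPU run on Colab is normally driven through a TPUStrategy. A minimal sketch of the usual TPU initialization, assuming TF 2.x on Colab:

resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)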
Error
Tensor("truediv:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[MirroredVariable:{
0: <tf.Variable 'encoder_1/embedding_2/embeddings:0' shape=(112338, 256) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/kernel:0' shape=(256, 3072) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/bias:0' shape=(2, 3072) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/dense_6/kernel:0' shape=(1024, 1024) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/custom_additive_attention_2/scale:0' shape=(1024,) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/embedding_3/embeddings:0' shape=(3000, 256) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/kernel:0' shape=(256, 3072) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/bias:0' shape=(2, 3072) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/dense_8/kernel:0' shape=(2048, 1024) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/dense_9/kernel:0' shape=(1024, 3000) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/dense_9/bias:0' shape=(3000,) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1, 1024) dtype=float32>
}, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024,) dtype=float32>
}]
INFO:tensorflow:Error reported to Coordinator: No gradients provided for any variable: (['encoder_1/embedding_2/embeddings:0', 'encoder_1/gru_2/gru_cell_2/kernel:0', 'encoder_1/gru_2/gru_cell_2/recurrent_kernel:0', 'encoder_1/gru_2/gru_cell_2/bias:0', 'while/decoder_1/bahdanau_attention_2/dense_6/kernel:0', 'while/decoder_1/bahdanau_attention_2/custom_additive_attention_2/scale:0', 'while/decoder_1/embedding_3/embeddings:0', 'while/decoder_1/gru_3/gru_cell_3/kernel:0', 'while/decoder_1/gru_3/gru_cell_3/recurrent_kernel:0', 'while/decoder_1/gru_3/gru_cell_3/bias:0', 'while/decoder_1/dense_8/kernel:0', 'while/decoder_1/dense_9/kernel:0', 'while/decoder_1/dense_9/bias:0', 'while/decoder_1/generation_probability_layer_2/Variable:0', 'while/decoder_1/generation_probability_layer_2/Variable:0', 'while/decoder_1/generation_probability_layer_2/Variable:0', 'while/decoder_1/generation_probability_layer_2/Variable:0'],). Provided `grads_and_vars` is ((None, MirroredVariable:{
0: <tf.Variable 'encoder_1/embedding_2/embeddings:0' shape=(112338, 256) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/kernel:0' shape=(256, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/bias:0' shape=(2, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/dense_6/kernel:0' shape=(1024, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/custom_additive_attention_2/scale:0' shape=(1024,) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/embedding_3/embeddings:0' shape=(3000, 256) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/kernel:0' shape=(256, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/bias:0' shape=(2, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/dense_8/kernel:0' shape=(2048, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/dense_9/kernel:0' shape=(1024, 3000) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/dense_9/bias:0' shape=(3000,) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024,) dtype=float32>
})).
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/training/coordinator.py", line 293, in stop_on_exception
yield
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/mirrored_run.py", line 342, in run
self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 689, in wrapper
return converted_call(f, args, kwargs, options=options)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 377, in converted_call
return _call_unconverted(f, args, kwargs, options)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 458, in _call_unconverted
return f(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1000, in run_step
outputs = model.train_step(data)
File "<ipython-input-46-7fc5bd50cffa>", line 36, in train_step
return self._tf_train_step(inputs)
File "<ipython-input-46-7fc5bd50cffa>", line 43, in _tf_train_step
return self._train_step(inputs)
File "<ipython-input-46-7fc5bd50cffa>", line 88, in _train_step
self.optimizer.apply_gradients(zip(gradients, variables))
File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/optimizer_v2.py", line 633, in apply_gradients
grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
File "/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/utils.py", line 73, in filter_empty_gradients
raise ValueError(f"No gradients provided for any variable: {variable}. "
ValueError: No gradients provided for any variable: (['encoder_1/embedding_2/embeddings:0', 'encoder_1/gru_2/gru_cell_2/kernel:0', 'encoder_1/gru_2/gru_cell_2/recurrent_kernel:0', 'encoder_1/gru_2/gru_cell_2/bias:0', 'while/decoder_1/bahdanau_attention_2/dense_6/kernel:0', 'while/decoder_1/bahdanau_attention_2/custom_additive_attention_2/scale:0', 'while/decoder_1/embedding_3/embeddings:0', 'while/decoder_1/gru_3/gru_cell_3/kernel:0', 'while/decoder_1/gru_3/gru_cell_3/recurrent_kernel:0', 'while/decoder_1/gru_3/gru_cell_3/bias:0', 'while/decoder_1/dense_8/kernel:0', 'while/decoder_1/dense_9/kernel:0', 'while/decoder_1/dense_9/bias:0', 'while/decoder_1/generation_probability_layer_2/Variable:0', 'while/decoder_1/generation_probability_layer_2/Variable:0', 'while/decoder_1/generation_probability_layer_2/Variable:0', 'while/decoder_1/generation_probability_layer_2/Variable:0'],). Provided `grads_and_vars` is ((None, MirroredVariable:{
0: <tf.Variable 'encoder_1/embedding_2/embeddings:0' shape=(112338, 256) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/kernel:0' shape=(256, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'encoder_1/gru_2/gru_cell_2/bias:0' shape=(2, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/dense_6/kernel:0' shape=(1024, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/bahdanau_attention_2/custom_additive_attention_2/scale:0' shape=(1024,) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/embedding_3/embeddings:0' shape=(3000, 256) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/kernel:0' shape=(256, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/recurrent_kernel:0' shape=(1024, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/gru_3/gru_cell_3/bias:0' shape=(2, 3072) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/dense_8/kernel:0' shape=(2048, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/dense_9/kernel:0' shape=(1024, 3000) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/dense_9/bias:0' shape=(3000,) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1, 1024) dtype=float32>
}), (None, MirroredVariable:{
0: <tf.Variable 'while/decoder_1/generation_probability_layer_2/Variable:0' shape=(1024,) dtype=float32>
})).
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-50-c32868b98bed> in <module>()
----> 1 text_summarizer.fit(profiler_dist_dataset, epochs = 1, callbacks = [batch_loss, tboard_callback], steps_per_epoch=GLOBAL_BATCH_SIZE_TPU)
1 frames
/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
65 except Exception as e: # pylint: disable=broad-except
66 filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67 raise e.with_traceback(filtered_tb) from None
68 finally:
69 del filtered_tb
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
1145 except Exception as e: # pylint:disable=broad-except
1146 if hasattr(e, "ag_error_metadata"):
-> 1147 raise e.ag_error_metadata.to_exception(e)
1148 else:
1149 raise
ValueError: in user code:
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1057, in
train_function *
for _ in tf.range(self._steps_per_execution):
ValueError: 'outputs' must be defined before the loop.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow