How to properly initialize a TensorFlow GRU layer with noisy states?
I want to experiment with noisy GRU states instead of resetting them to zero for each batch, and I try an implementation below. My original code reset the initial states to zero (states = None); I changed train_step to
  noisy_states = tf.convert_to_tensor(np.random.random([BATCH_SIZE, RNN_UNITS]).astype(np.float32))
  predictions, states = self(inputs, states=noisy_states, return_state=True, training=True)
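Note that because train_step is a tf.function, the np.random call only runs at trace time, so the same noise tensor is baked into the graph and reused for every step. A graph-native variant that draws fresh noise on each step would look like this (a sketch, assuming the same BATCH_SIZE and RNN_UNITS constants):
  noisy_states = tf.random.uniform([BATCH_SIZE, RNN_UNITS], dtype=tf.float32)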
The model class, which inherits from tf.keras.Model, now looks like this:
class MyModel(tf.keras.Model):
  
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   stateful=True,
                                   return_sequences=True,
                                   return_state=True,
                                   activation='tanh',
                                   recurrent_activation='sigmoid',  
                                   recurrent_dropout=0.2,
                                   dropout=0.2,
                                   reset_after=True  
                                   )
    self.dense = tf.keras.layers.Dense(vocab_size)
  def call(self, inputs, states=None, return_state=False, training=False):        
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)          
    x, states = self.gru(x, initial_state=states, training=training)    
    x = self.dense(x, training=training)
    if return_state:
      return x, states
    else:
      return x
  @tf.function
  def train_step(self, inputs):
    inputs, labels = inputs      
    with tf.GradientTape() as tape:          
      noisy_states = tf.convert_to_tensor(np.random.random([BATCH_SIZE, RNN_UNITS]).astype(np.float32))
      predictions, states = self(inputs, states=noisy_states, return_state=True, training=True)     
      loss = self.compiled_loss(labels, predictions, regularization_losses=self.losses)
    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
    self.compiled_metrics.update_state(labels, predictions)
    return {m.name: m.result() for m in self.metrics}
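For context, the model is created and trained roughly like this (a sketch, not the exact code; the hyper-parameter values and the dataset pipeline are assumptions, apart from the batch size of 64 that shows up in the error below):

VOCAB_SIZE = 66        # assumption: size of the character vocabulary
EMBEDDING_DIM = 256    # matches the feature size 256 seen in the error trace
RNN_UNITS = 1024       # assumption
BATCH_SIZE = 64        # training batches are built with drop_remainder=True

model = MyModel(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS)
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
model.fit(dataset, epochs=20)  # dataset yields (input_ids, target_ids) batches of size 64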
Training runs with no error, but inference fails with
ValueError: in user code:
    train.py:239 generate_one_step  *
        predicted_logits, states = self.model(inputs=input_ids, states=states,
    train-v4.py:133 call  *
        x, states = self.gru(x, initial_state=states, training=training)
    /usr/local/lib/python3.6/dist-packages/keras/layers/recurrent.py:716 __call__  **
        return super(RNN, self).__call__(inputs, **kwargs)
 [...]
    ValueError: Input 0 is incompatible with layer gru: expected shape=(64, None, 256), found shape=(1, None, 256)
The generator looks like this
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
  [initialize stuff]  
  [...]
  @tf.function
  def generate_one_step(self, inputs, states=None):
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()
  
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    predicted_logits = predicted_logits + self.prediction_mask
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)
    predicted_chars = self.chars_from_ids(predicted_ids)
    return predicted_chars, states
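The generator is instantiated from the trained model roughly like this (a sketch; chars_from_ids and ids_from_chars are the lookup layers passed to __init__):

one_step_model = OneStep(model, chars_from_ids, ids_from_chars)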
and the code throwing the error is
for n in range(10000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)
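Before the loop, the state and the seed character are initialized along these lines (a sketch; the seed string is an assumption):

states = None
next_char = tf.constant(['A'])
result = [next_char]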
In my understanding, we initialize the states with some noise instead of zeros to avoid overfitting. The model trains better than before, and the weights are saved for inference. Should the inference model be changed as well, i.e. should the states behavior also be updated in the generator?
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow