How to properly initialize a TensorFlow GRU layer with noisy states?

I want to experiment with noisy GRU states instead of resetting them to zero for each batch; my attempt is below. My initial code reset the initial states to zero (states=None); I changed train_step to use

  noisy_states = tf.random.uniform([BATCH_SIZE, RNN_UNITS])  # tf.random, not np.random: numpy noise would be frozen at graph-trace time
  predictions, states = self(inputs, states=noisy_states, return_state=True, training=True)

The model class inherited from Tensorflow Model now looks like

class MyModel(tf.keras.Model):
  
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   stateful=True,
                                   return_sequences=True,
                                   return_state=True,
                                   activation='tanh',
                                   recurrent_activation='sigmoid',  
                                   recurrent_dropout=0.2,
                                   dropout=0.2,
                                   reset_after=True  
                                   )
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):        
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)  # defaults to a zero state
    x, states = self.gru(x, initial_state=states, training=training)    
    x = self.dense(x, training=training)
    if return_state:
      return x, states
    else:
      return x

  @tf.function
  def train_step(self, inputs):
    inputs, labels = inputs      
    with tf.GradientTape() as tape:          
      # tf.random rather than np.random: inside a @tf.function, numpy noise
      # would be drawn once at trace time and baked into the graph as a constant.
      noisy_states = tf.random.uniform([BATCH_SIZE, RNN_UNITS])
      predictions, states = self(inputs, states=noisy_states, return_state=True, training=True)
      loss = self.compiled_loss(labels, predictions, regularization_losses=self.losses)

    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
    self.compiled_metrics.update_state(labels, predictions)

    return {m.name: m.result() for m in self.metrics}
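
A side note on stateful=True (my reading of the Keras stateful-RNN behavior, not something stated above): a stateful layer both carries its final state across batches and pins its input spec to the batch size it was first built with. Since train_step passes a fresh initial_state on every call, the carry-over is never actually used, so a non-stateful GRU would behave the same during training while accepting any batch size; a minimal sketch, with the remaining constructor arguments unchanged:

    self.gru = tf.keras.layers.GRU(rnn_units,
                                   stateful=False,   # accepts any batch size; no cross-batch state
                                   return_sequences=True,
                                   return_state=True)  # dropout/activation arguments as above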

Training runs with no error, but inference fails with

ValueError: in user code:

    train.py:239 generate_one_step  *
        predicted_logits, states = self.model(inputs=input_ids, states=states,
    train-v4.py:133 call  *
        x, states = self.gru(x, initial_state=states, training=training)
    /usr/local/lib/python3.6/dist-packages/keras/layers/recurrent.py:716 __call__  **
        return super(RNN, self).__call__(inputs, **kwargs)
 [...]

    ValueError: Input 0 is incompatible with layer gru: expected shape=(64, None, 256), found shape=(1, None, 256)
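
The mismatch comes from stateful=True: a stateful RNN fixes its batch size when it is first built (64 during training), and its input spec then rejects the batch of 1 used at inference. A common workaround, sketched below, is to build a second instance of the model with batch size 1 and load the trained weights into it; the 'ckpt' checkpoint path is hypothetical, and the constructor arguments are the same hyperparameters used for training.

inference_model = MyModel(vocab_size, embedding_dim, rnn_units)
# Calling once on a dummy batch of size 1 builds the stateful GRU with that batch size.
inference_model(tf.zeros([1, 1], dtype=tf.int32))
inference_model.load_weights('ckpt')  # hypothetical path to the saved training weights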

The generator looks like this

class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
  [initialize stuff]  
  [...]

  @tf.function
  def generate_one_step(self, inputs, states=None):
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()
  
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)

    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    predicted_logits = predicted_logits + self.prediction_mask
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)
    predicted_chars = self.chars_from_ids(predicted_ids)

    return predicted_chars, states

and the code throwing the error is

for n in range(10000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

In my understanding, we initialize the states with noise instead of zeros to avoid overfitting. The model trains better than before, and the weights are saved for inference. Should the inference model be changed as well? Should the states behavior be updated in the generator too?
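
If that understanding is right, the generator would need no structural change: the noise is a training-time regularizer, and at inference the only hard requirement is that any supplied state matches the inference batch size. A minimal sketch for a single sequence (RNN_UNITS as in training):

states = None                               # zeros on the first call, as before
# or, to mirror the training-time noise:
# states = tf.random.uniform([1, RNN_UNITS])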



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow
