Actor Critic model returns NaN for action probabilities

I am new to RL and walking through the Keras implementation of Actor Critic.

As a variant of it, I am trying to learn a strategy for WORDLE. However, after a few runs, all of my action probabilities become NaN.

actions = [nan nan nan ... nan nan nan]

I am not sure what is happening. Does anyone have any insights or pointers?

Attaching my code for reference.

Thanks

import pandas as pd
import numpy as np
import random
import string
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Configuration parameters for the whole setup
gamma = 0.9  # Discount factor for past rewards
max_runs = 10000
eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0

# Load the candidate words, one per line. The context manager closes the file
# handle once the data is read, and empty entries (e.g. the one produced by a
# trailing newline) are dropped so that an empty string can never be chosen as
# a secret word or counted as an action.
with open("<wordle set of words data path>", "r") as my_file:
    content = [word for word in my_file.read().split('\n') if word]

# The 26 lowercase letters 'a'..'z'; a letter's index here is its slot in the
# state vectors.
lower_alphabet = list(string.ascii_lowercase)

def get_secret_word():
    """Return one randomly chosen entry of the module-level word list ``content``."""
    secret = random.choice(content)
    return secret

def reset_available_action_space():
    """Return a fresh availability mask with one ``1`` per word in ``content``.

    A ``1`` marks the word at that index as still selectable.
    """
    return [1] * len(content)

def reset_guessed_alphabet_state():
    """Return a zeroed list with one slot per letter of ``lower_alphabet``.

    Slot ``i`` is later set to 1 once letter ``i`` has appeared in a guess.
    """
    return [0] * len(lower_alphabet)

# One flag per letter of the alphabet: set to 1 once a guessed letter is known
# to occur somewhere in the secret word.
def reset_contains_alphabet_state():
    """Return the all-zero "letter is in the word" flags, one per letter."""
    return [0] * len(lower_alphabet)

# Five consecutive groups of one flag per letter (26 * 5 entries for a-z).
# Group k (k = 0..4) records which letter was guessed correctly at word
# position k: the first group covers the first slot, the second group the
# second slot, and so on through the fifth.
def reset_correct_alphabet_pos_state():
    """Return the all-zero "correct letter at position" flags for 5 positions."""
    word_length = 5
    return [0] * (len(lower_alphabet) * word_length)

def select_and_update_AVAILABLE_ACTION_SPACE(actions):
    """Return the first still-available action and mark it as used.

    ``actions`` is an ordered sequence of candidate action indices. The
    candidates are scanned in order; the first one whose entry in the
    module-level ``AVAILABLE_ACTION_SPACE`` mask is truthy is set to 0 in that
    mask (so it cannot be picked again) and returned.

    Raises:
        IndexError: if every candidate in ``actions`` has already been used.
    """
    for candidate in actions:
        if AVAILABLE_ACTION_SPACE[candidate]:
            AVAILABLE_ACTION_SPACE[candidate] = 0
            return candidate
    # Same exception type as running off the end of ``actions`` by index, but
    # with a message that says what actually went wrong.
    raise IndexError("none of the candidate actions is still available")

def env_reset():
    """Start a new episode.

    Returns:
        A 3-tuple ``(state, secret_word, available_action_space)`` where
        ``state`` is the concatenation of the zeroed guessed-letter,
        contained-letter and correct-position flag lists, ``secret_word`` is a
        freshly drawn target word, and ``available_action_space`` is a fresh
        all-ones availability mask.
    """
    initial_state = (
        reset_guessed_alphabet_state()
        + reset_contains_alphabet_state()
        + reset_correct_alphabet_pos_state()
    )
    return initial_state, get_secret_word(), reset_available_action_space()

def env_step(action, state):
    """Apply one guess and return ``(next_state, reward, done)``.

    ``action`` indexes the guessed word in ``content``; ``state`` is the flat
    flag list laid out as [26 guessed | 26 contained | remaining correct-position
    flags]. The guess is compared with the module-level ``SECRET_WORD``.
    The reward is 10 (and ``done`` is True) for an exact match, otherwise -10.
    """
    guess = content[action]

    # Slicing copies each segment, so the caller's ``state`` is left untouched.
    guessed_flags = state[:26]
    contained_flags = state[26:52]
    position_flags = state[52:]

    solved = SECRET_WORD == guess
    reward = 10 if solved else -10

    for slot, letter in enumerate(guess):
        letter_idx = lower_alphabet.index(letter)
        guessed_flags[letter_idx] = 1
        if letter not in SECRET_WORD:
            continue
        contained_flags[letter_idx] = 1
        if SECRET_WORD[slot] == letter:
            # Position flags are grouped per slot, 26 letters per group.
            position_flags[26 * slot + letter_idx] = 1

    next_state = guessed_flags + contained_flags + position_flags
    return next_state, reward, solved

# Network dimensions: 182 state flags in (26 + 26 + 26 * 5), one output per
# candidate word.
num_inputs = 182
num_actions = len(content)
num_hidden_1 = 256
num_hidden_2 = 128

# Shared trunk: two ReLU layers feeding both output heads.
inputs = layers.Input(shape=(num_inputs,))
common = inputs
for hidden_units in (num_hidden_1, num_hidden_2):
    common = layers.Dense(hidden_units, activation="relu")(common)

# Actor head: softmax distribution over every word. Critic head: one scalar.
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs=inputs, outputs=[action, critic])

optimizer = keras.optimizers.Adam(learning_rate=0.001)
huber_loss = keras.losses.Huber()

# Per-episode buffers; the training loop fills and then clears them.
action_probs_history, critic_value_history, rewards_history = [], [], []
running_reward = 0
episode_count = 0

# Training loop: play one Wordle episode per run (at most 6 guesses), then do a
# single actor-critic gradient update from that episode's history.
for runs in range(max_runs):
    max_steps_per_episode = 6
    state, SECRET_WORD, AVAILABLE_ACTION_SPACE = env_reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(max_steps_per_episode):
            # Convert the list of 0/1 flags to a float tensor and add the batch
            # axis. (Previously the converted tensor was discarded and the raw
            # list was expanded instead.)
            state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
            state_tensor = tf.expand_dims(state_tensor, 0)

            action_probs, critic_value = model(state_tensor)
            critic_value_history.append(critic_value[0, 0])

            # Draw a few distinct candidates from the policy so that at least
            # one of them has not already been guessed in this episode.
            actions = np.random.choice(num_actions, size=max_steps_per_episode+1, replace = False, p=np.squeeze(action_probs))
            action = select_and_update_AVAILABLE_ACTION_SPACE(actions)

            # With a softmax over this many words an individual probability can
            # underflow to exactly 0, and log(0) = -inf turns the loss and the
            # gradients into inf/NaN. Clip to [eps, 1] before taking the log.
            action_probs_history.append(
                tf.math.log(tf.clip_by_value(action_probs[0, action], eps, 1.0))
            )

            state, reward, done = env_step(action, state)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize (eps keeps the division finite when all returns are equal)
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log probability
            # of `log_prob` and ended up receiving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))
wordle set of words data path => https://gist.github.com/cfreshman/a03ef2cba789d8cf00c08f767e0fad7b

My State Space is [guessed alphabets, alphabets contained in the secret word, alphabet in the correct position]

guessed alphabets => Array of 26 size (a-z)

alphabets contained in the secret word => Array of 26 size (a-z)

alphabet in the correct position => Array of 26 * 5 [(a-z), (a-z), (a-z), (a-z), (a-z)] (as each word is 5 letters)

The Available action spaces get updated after every action. The previously taken actions are no longer available for future actions.

I have tried both relu and tanh for activation

Observation: The critic value keeps increasing to extremely large values.

reinforcement-learning

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution	Source

Actor Critic model returns NaN for action probabilities

Sources

Related Questions