RuntimeError: Found dtype Double but expected Float - PyTorch RL

I am trying to get an actor-critic variant running on the Pendulum environment, but I keep running into a particular problem.

RuntimeError: Found dtype Double but expected Float

I saw this error has come up multiple times before, so I have been through those questions and attempted to change the data type of my loss (my attempts are still in the code below), but it is still not working. Could anyone point out how to resolve this so that I can learn from it?

Full code below

import gym, os
import numpy as np
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from collections import namedtuple

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20

class ActorCritic(nn.Module):
    """
    Implementing both heads of the actor critic model
    """
    def __init__(self, state_space, action_space):
        super(ActorCritic, self).__init__()

        self.state_space = state_space
        self.action_space = action_space
        
        # HL 1
        self.linear1 = nn.Linear(self.state_space, 128)
        # HL 2
        self.linear2 = nn.Linear(128, 256)

        # Outputs
        self.critic_head = nn.Linear(256, 1)
        self.action_mean = nn.Linear(256, self.action_space)
        self.action_std = nn.Linear(256, self.action_space)

        # Saving
        self.saved_actions = []
        self.rewards = []

        # Optimizer
        self.optimizer = optim.Adam(self.parameters(), lr = 1e-3)
        self.eps = np.finfo(np.float32).eps.item()

    def forward(self, state):
        """
        Forward pass for both actor and critic
        """

        # State to Layer 1
        l1_output = F.relu(self.linear1(state))

        # Layer 1 to Layer 2
        l2_output = F.relu(self.linear2(l1_output))

        # Layer 2 to Action
        mean = self.action_mean(l2_output)
        std = self.action_std(l2_output)
        std = torch.clamp(std, min=LOG_SIG_MIN, max = LOG_SIG_MAX)
        std = std.exp()

        # Layer 2 to Value
        value_est = self.critic_head(l2_output)

        return value_est, mean, std

    def select_action(self,state):
        state = torch.from_numpy(state).float().unsqueeze(0)

        value_est, mean, std = self.forward(state)
        value_est = value_est.reshape(-1)

        # Make prob Normal dist
        dist = Normal(mean, std)

        action = dist.sample()
        action = torch.tanh(action)

        ln_prob = dist.log_prob(action)
        ln_prob = ln_prob.sum()     

        self.saved_actions.append(SavedAction(ln_prob, value_est))

        action = action.numpy()

        return action[0]


    def compute_returns(self, gamma): # This is the error causing code
        """
        Calculate losses and do backprop
        """

        R = 0

        saved_actions = self.saved_actions

        policy_losses = []
        value_losses = []
        returns = []

        for r in self.rewards[::-1]:
            # Discount value
            R = r + gamma*R
            returns.insert(0,R)

        returns = torch.tensor(returns)
        returns = (returns - returns.mean())/(returns.std()+self.eps)

        for (log_prob, value), R in zip(saved_actions, returns):
            advantage = R - value.item()
            advantage = advantage.type(torch.FloatTensor)
            policy_losses.append(-log_prob*advantage)

            value_losses.append(F.mse_loss(value, torch.tensor([R])))

        self.optimizer.zero_grad()

        loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

        loss = loss.type(torch.FloatTensor)
        loss.backward()
        self.optimizer.step()

        del self.rewards[:]
        del self.saved_actions[:]

env = gym.make("Pendulum-v0")

state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]

# Train Expert AC

model = ActorCritic(state_space, action_space)

train = True
if train == True:

    # Main loop
    window = 50
    reward_history = []

    for ep in count():

        state = env.reset()

        ep_reward = 0

        for t in range(1,1000):

            if ep%50 == 0:
                env.render()

            action = model.select_action(state)

            state, reward, done, _ = env.step(action)

            model.rewards.append(reward)
            ep_reward += reward

            if done:
                break
        print(reward)
        model.compute_returns(0.99) # Error begins here
        reward_history.append(ep_reward)

        # Result information
        if ep % 50 == 0:
            mean = np.mean(reward_history[-window:])
            print(f"Episode: {ep} Last Reward: {ep_reward} Rolling Mean: {mean}")

        if np.mean(reward_history[-100:])>199:
            print(f"Environment solved at episode {ep}, average run length > 200")
            break

Complete error log below, with some paths redacted for privacy. Originally the actor-critic class and the main loop lived in separate files; comments mark the error-causing lines.

Traceback (most recent call last):
  File "pendulum.py", line 59, in <module>
    model.compute_returns(0.99)
  File "/home/x/Software/git/x/x/solvers/actorcritic_cont.py", line 121, in compute_returns
    loss.backward()
  File "/home/x/.local/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/x/.local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: Found dtype Double but expected Float
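
For context, a minimal sketch that reproduces the same error, assuming the root cause is a float32 prediction paired with a float64 target in the value loss:

import torch
import torch.nn.functional as F

pred = torch.zeros(1, requires_grad=True)     # float32, like the critic's output
target = torch.zeros(1, dtype=torch.float64)  # float64, like a return built from numpy rewards

loss = F.mse_loss(pred, target)               # the forward pass promotes the result to double
loss.backward()                               # RuntimeError: Found dtype Double but expected Float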


Solution 1:

Answering here in case anyone has similar issues in the future. The reward returned by OpenAI Gym's Pendulum-v0 is a double (numpy float64), so the returns computed over the episode end up as a float64 tensor while the network outputs are float32. You need to cast the returns to a float tensor.
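
As a rough sketch of where the double dtype sneaks in, assuming the rewards arrive as numpy float64:

import numpy as np
import torch

rewards = [np.float64(-1.0), np.float64(-0.5)]  # Pendulum-v0 rewards are numpy float64
R, returns = 0, []
for r in rewards[::-1]:
    R = r + 0.99 * R                            # the discounted return stays float64
    returns.insert(0, R)

print(torch.tensor(returns).dtype)              # torch.float64, not the float32 the network uses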

I did this just by:

returns = torch.tensor(returns)
returns = (returns - returns.mean())/(returns.std()+self.eps)
returns = returns.type(torch.FloatTensor)
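
An equivalent option, sketched here as an alternative, is to set the dtype when the returns tensor is created, or to cast the target passed into the value loss:

# Option A: create the returns tensor as float32 up front
returns = torch.tensor(returns, dtype=torch.float32)
returns = (returns - returns.mean())/(returns.std() + self.eps)

# Option B: cast the target inside the value loss instead
value_losses.append(F.mse_loss(value, torch.tensor([R], dtype=torch.float32)))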

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow (Solution 1 by EmptySet)