Actor-Critic Reinforcement Learning: network can't learn
I have some Actor-Critic reinforcement learning questions to ask. In my code, I try to use two networks (actor and critic) to play the Pong game, but I don't know whether my "learn()" function has a problem. "Loss1" is the actor-network loss (-logP(a|s) * TDerror), and "Loss2" is the critic-network loss ((pre - TDtarget) ** 2 / 2).
class Actor_Critic():
    """One-step Advantage Actor-Critic agent for Pong.

    Holds two Keras models:
      - actor: pi(a|s), a softmax policy over ``n_actions`` actions;
      - critic: Q(s, a), fed the state concatenated with a one-hot action.

    ``learn`` performs one TD(0) update from a single transition:
      actor loss  = -log pi(a|s) * advantage
      critic loss = advantage**2 / 2
    where advantage = TD target - current value estimate.
    """

    def __init__(self, input_dim=6400, n_actions=2):
        self.input_dim = input_dim
        self.n_action = n_actions  # Pong game action space (e.g. UP / DOWN)
        self.lr = 0.0005
        self.gamma = 0.99  # discount factor
        self.critic_network = self.__build_critic_network()
        self.actor_network = self.__build_actor_network()

    def __build_critic_network(self):
        """Build Q(s, a): input is state concatenated with a one-hot action."""
        model_input = layers.Input(shape=(self.input_dim + self.n_action,))
        layer1 = layers.Dense(128, activation='relu')(model_input)
        layer2 = layers.Dense(32, activation='relu')(layer1)
        model_output = layers.Dense(1, activation=None)(layer2)  # raw scalar value
        model = Model(model_input, model_output)
        model.compile(optimizer=Adam(learning_rate=self.lr))
        return model

    def __build_actor_network(self):
        """Build pi(a|s): softmax over the discrete action set."""
        model_input = layers.Input(shape=(self.input_dim,))
        layer1 = layers.Dense(128, activation='relu')(model_input)
        layer2 = layers.Dense(32, activation='relu')(layer1)
        model_output = layers.Dense(self.n_action, activation='softmax')(layer2)
        model = Model(model_input, model_output)
        model.compile(optimizer=Adam(learning_rate=self.lr))
        return model

    def choose_action(self, state):
        """Sample an action from the current policy.

        Returns (Categorical distribution, sampled action index).
        """
        # float32 instead of float16: a 6400-dim half-precision input easily
        # under/overflows during training and destabilises the gradients.
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        probability = self.actor_network(state)
        action_probs = tfp.distributions.Categorical(probs=probability)
        action = action_probs.sample()
        return action_probs, action.numpy()[0]  # distribution and action

    def action_hot_code(self, action):
        """Return a (1, n_action) one-hot tensor for ``action``."""
        action_hot_code = np.zeros(self.n_action, dtype=np.float32)
        action_hot_code[action] = 1.0
        return tf.convert_to_tensor([action_hot_code], dtype=tf.float32)

    def learn(self, state, reward, next_state, done):
        """One TD(0) actor-critic update from a single transition."""
        with tf.GradientTape(persistent=True) as tape:
            action_probs, action = self.choose_action(state)
            next_action_probs, next_action = self.choose_action(next_state)
            action_hot_code = self.action_hot_code(action)
            next_action_hot_code = self.action_hot_code(next_action)
            state = tf.convert_to_tensor([state], dtype=tf.float32)
            next_state = tf.convert_to_tensor([next_state], dtype=tf.float32)
            state_action = tf.concat([state, action_hot_code], axis=1)
            next_state_action = tf.concat([next_state, next_action_hot_code], axis=1)
            state_value = self.critic_network(state_action)
            next_state_value = self.critic_network(next_state_action)
            # Bootstrap target. stop_gradient so the critic loss does not also
            # push gradients through next_state_value (chasing its own target).
            TDtarget = tf.stop_gradient(
                reward + self.gamma * next_state_value * (1 - done))
            # BUG FIX: advantage is (target - V), not (V - target). The original
            # sign flip made gradient descent *lower* the probability of
            # better-than-expected actions, so the policy could never improve.
            TDerror = TDtarget - state_value
            Loss1 = tf.squeeze(-TDerror * action_probs.log_prob(action))  # actor
            Loss2 = tf.squeeze((TDerror ** 2) / 2)                        # critic
        # Gradients taken outside the tape context; the persistent tape is
        # needed because we differentiate two losses from one recording.
        gradient1 = tape.gradient(Loss1, self.actor_network.trainable_variables)
        gradient2 = tape.gradient(Loss2, self.critic_network.trainable_variables)
        self.actor_network.optimizer.apply_gradients(
            zip(gradient1, self.actor_network.trainable_variables))
        self.critic_network.optimizer.apply_gradients(
            zip(gradient2, self.critic_network.trainable_variables))
        del tape  # a persistent tape holds its resources until explicitly freed
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
