I’m currently working on converting the snake-dqn project (the TensorFlow.js demo at tfjs-examples/snake-dqn at master · tensorflow/tfjs-examples · GitHub) from TensorFlow.js to Python TensorFlow.
The code below is my Python conversion of the trainOnReplayBatch function from snake-dqn’s agent.js.
def train_on_replay_batch(self, batch_size, gamma, optimizer):
    """
    :param batch_size: number of replay examples to sample
    :type batch_size: int
    :param gamma: reward discount factor
    :type gamma: float
    :param optimizer: optimizer used for the gradient update
    :type optimizer: tf.keras.optimizers.Optimizer
    :return: None
    :rtype: None
    """
    batch = self.replay_memory.sample(batch_size)
    # I edited the get_state_tensor function because I don't need the h and w parameters.
    # I don't think that change is causing the error.
    state_tensor = get_state_tensor([example[0] for example in batch])
    action_tensor = tf.constant([example[1] for example in batch], dtype='int32')
    reward_tensor = tf.constant([example[2] for example in batch], dtype=tf.int32)
    next_state_tensor = get_state_tensor([example[4] for example in batch])
    with tf.GradientTape() as tape:
        qs = tf.Variable(tf.cast(tf.math.reduce_sum(
            self.online_network.__call__(state_tensor, True) * (tf.one_hot(action_tensor, SQUARE_SIZE)), axis=-1),
            dtype=tf.int32))
        result = tf.Variable(self.target_network(next_state_tensor))
        next_max_q_tensor = tf.math.reduce_max(result, axis=-1)
        tape.watch(next_max_q_tensor)
        done_mask = tf.cast(
            tf.Variable(1, dtype=tf.int32) - tf.Variable([example[3] for example in batch], dtype=tf.int32),
            dtype=tf.float32)
        tape.watch(done_mask)
        target_qs = reward_tensor + tf.cast(next_max_q_tensor * done_mask * gamma, dtype=tf.int32)
        loss_fn = keras.losses.MeanSquaredError()
        loss = tf.reduce_mean(loss_fn(target_qs, qs))
    grads = tape.gradient(target=tf.cast(loss, dtype=tf.float32), sources=self.target_network.trainable_variables)
    optimizer.apply_gradients(zip(grads, self.target_network.trainable_variables))
    tf.dispose(grads)
When I run this code, I get ValueError: No gradients provided for any variable.
I asked about this error on other sites and got the following suggestions:
- tape.watch(next_max_q_tensor) and tape.watch(done_mask) turn next_max_q_tensor and done_mask into leaf variables, so remove those calls and use tf.stop_gradient(next_max_q_tensor) instead.
- The code tries to update the parameters of target_network through next_max_q_tensor (a leaf variable), so change it to update online_network instead (see the sketch after this list).
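If I read those suggestions correctly, the gradient step would end up looking roughly like this. This is only a minimal sketch of the affected lines, reusing the tensors (state_tensor, action_tensor, reward_tensor, next_state_tensor, done_mask) and loss_fn already defined in my function, and assuming reward_tensor and done_mask are float32; it may not match the original agent.js exactly:

with tf.GradientTape() as tape:
    # Keep Q(s, a) as a plain float tensor so the tape can trace it back to
    # the online network's weights (no tf.Variable wrapper, no int cast).
    qs = tf.math.reduce_sum(
        self.online_network(state_tensor, training=True)
        * tf.one_hot(action_tensor, SQUARE_SIZE),
        axis=-1)
    # The bootstrap term comes from the target network and is cut out of the
    # gradient with tf.stop_gradient instead of tape.watch.
    next_max_q_tensor = tf.math.reduce_max(
        self.target_network(next_state_tensor), axis=-1)
    # Assumes reward_tensor and done_mask are float32 so the arithmetic stays in float.
    target_qs = reward_tensor + gamma * next_max_q_tensor * done_mask
    loss = tf.reduce_mean(loss_fn(tf.stop_gradient(target_qs), qs))
# Differentiate with respect to the online network, and update the online network.
grads = tape.gradient(loss, self.online_network.trainable_variables)
optimizer.apply_gradients(zip(grads, self.online_network.trainable_variables))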
Based on that, I rewrote the code as shown below.
def train_on_replay_batch(self, batch_size, gamma, optimizer):
    """
    :param batch_size: number of replay examples to sample
    :type batch_size: int
    :param gamma: reward discount factor
    :type gamma: float
    :param optimizer: optimizer used for the gradient update
    :type optimizer: tf.keras.optimizers.Optimizer
    :return: None
    :rtype: None
    """
    batch = self.replay_memory.sample(batch_size)
    # I edited the get_state_tensor function because I don't need the h and w parameters.
    # I don't think that change is causing the error.
    state_tensor = get_state_tensor([example[0] for example in batch])
    action_tensor = tf.constant([example[1] for example in batch], dtype=tf.int32)
    reward_tensor = tf.constant([example[2] for example in batch], dtype=tf.float32)
    next_state_tensor = get_state_tensor([example[4] for example in batch])
    loss_fn = keras.losses.MeanSquaredError()
    with tf.GradientTape() as tape:
        qs = tf.Variable(tf.cast(tf.math.reduce_sum(
            self.online_network.__call__(state_tensor, training=True) * (tf.one_hot(action_tensor, SQUARE_SIZE)),
            axis=-1),
            dtype=tf.float32))
        result = tf.Variable(self.target_network(next_state_tensor))
        next_max_q_tensor = tf.math.reduce_max(result, axis=-1)
        done_mask = tf.Variable(1, dtype=tf.float32) - tf.Variable([example[3] for example in batch], dtype=tf.float32)
        target_qs = reward_tensor + next_max_q_tensor * done_mask * gamma
        loss = tf.reduce_mean(loss_fn(target_qs, qs))
    grads = tape.gradient(target=tf.cast(loss, dtype=tf.float32), sources=self.target_network.trainable_variables)
    optimizer.apply_gradients(zip(grads, self.online_network.trainable_variables))
    tf.dispose(grads)
But I still get the same error. What should I do to fix it?