Custom attention layer raises a TensorFlow warning: "Gradients do not exist for variables"

Sorry, I just started learning to build neural networks with TensorFlow; the version I am using is 2.3. I want to use a custom attention layer to associate the output of the encoding layer with another input. This is my main code:


    import tensorflow as tf
    from tensorflow.keras.layers import Conv2D

    tf.config.experimental_run_functions_eagerly(True)

    class InteractionLayer(tf.keras.Model):

        def __init__(self, hidden_units, num_neighbors, step_input):
            super(InteractionLayer, self).__init__()
            self.hidden_units = hidden_units
            self.step_input = step_input

        def build(self, input_shape):
            # Query/key/value projections
            self.wq = tf.keras.layers.Dense(self.step_input, trainable=True, name='wq')
            self.wk = tf.keras.layers.Dense(self.step_input, trainable=True, name='wk')
            self.wv = tf.keras.layers.Dense(self.step_input, trainable=True, name='wv')
            self.reshape = tf.keras.layers.Reshape(target_shape=(self.step_input, self.hidden_units * 2), trainable=True)
            self.reshape1 = tf.keras.layers.Reshape(target_shape=(self.step_input, 1, 2, self.hidden_units), trainable=True)
            self.dense1 = tf.keras.layers.Dense(self.hidden_units * 2, trainable=True)
            self.reshape_weight = tf.keras.layers.Reshape(target_shape=(self.step_input, 1, self.step_input), trainable=True)
            self.dense_weight = tf.keras.layers.Dense(2, trainable=True)

        def call(self, hidden, risk, **kwargs):
            hidden_reshape = self.reshape(hidden)

            current_hidden = self.wq(hidden_reshape)   # query, from the encoder output
            neighbor_hidden = self.wk(risk)            # key, from the risk input
            v_hidden = self.wv(hidden_reshape)         # value, from the encoder output

            # Dot-product attention: scores -> softmax weights -> weighted sum of values
            similarity_scores = tf.linalg.matmul(current_hidden, neighbor_hidden, transpose_b=True)
            interaction_weights = tf.nn.softmax(similarity_scores)
            fused_representation = tf.linalg.matmul(interaction_weights, v_hidden)

            fused_representation_1 = self.dense1(fused_representation)
            fused_representation_2 = self.reshape1(fused_representation_1)

            attention_weights = self.reshape_weight(interaction_weights)
            attention_weights_out = self.dense_weight(attention_weights)

            return fused_representation_2, attention_weights_out

    def Seq2Seq(hidden_units, step_input, feature, num_neighbors):
        encoder_inputs = tf.keras.Input(shape=(step_input, 1, feature, 1), name="encode_input")
        decoder_inputs = tf.keras.Input(shape=(step_input, 1, feature, 1), name="decode_input")
        risk_input = tf.keras.Input(shape=(step_input, 6), name="risk_input")
        encoder = Encoder(hidden_units, step_input, num_neighbors)
        enc_outputs, enc_state_h, enc_state_c = encoder(encoder_inputs, risk_input)
        interaction = InteractionLayer(hidden_units, num_neighbors, step_input)
        interaction_output, attention_weights = interaction(enc_outputs, risk_input)
        dec_states_inputs = [enc_state_h, enc_state_c]
        decoder = Decoder(hidden_units, step_input, num_neighbors)
        attention_output, dec_state_h, dec_state_c = decoder(interaction_output, decoder_inputs, dec_states_inputs)
        conv2d_outputs = Conv2D(filters=1, kernel_size=(1, 2), activation='relu', padding='same', name="conv2d")(attention_output)
        model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs, risk_input], outputs=conv2d_outputs)
        return model, attention_weights

    model, myloss = Seq2Seq(hidden_units, step_input, feature, Surrounding_Risk_Quantity)

    class MyLoss(tf.keras.losses.Loss):

        def __init__(self, attention):
            super(MyLoss, self).__init__()
            self.risk = attention

        def call(self, y_true, y_pred):
            loss = tf.reduce_sum(tf.square(y_true - y_pred) * self.risk)
            return loss

    model.compile(optimizer=tf.keras.optimizers.Adam(0.001, clipnorm=1.0),
                  loss=MyLoss(myloss),
                  experimental_run_tf_function=False)

    history = model.fit([x_train_1, x_train_eval, x_train_risk], y_train,
                        batch_size=16, epochs=2,
                        validation_data=([x_test_1, x_test_eval, x_test_risk], y_test),
                        validation_freq=1, callbacks=[cp_callback])

When I run this code, the following warning appears:

    WARNING:tensorflow:Gradients do not exist for variables ['interaction_layer/wq/kernel:0', 'interaction_layer/wq/bias:0', 'interaction_layer/wk/kernel:0', 'interaction_layer/wk/bias:0', 'interaction_layer/wv/kernel:0', 'interaction_layer/wv/bias:0', 'interaction_layer/dense/kernel:0', 'interaction_layer/dense/bias:0', 'interaction_layer/dense_1/kernel:0', 'interaction_layer/dense_1/bias:0'] when minimizing the loss.
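To see exactly which variables end up without gradients, one can push a single batch through the model manually with `tf.GradientTape` (a minimal sketch, assuming the same training arrays passed to `model.fit` above; a plain MSE loss is used here purely for the check, not `MyLoss`):

    with tf.GradientTape() as tape:
        preds = model([x_train_1[:16], x_train_eval[:16], x_train_risk[:16]], training=True)
        # Plain MSE, used only to trace where gradients flow
        y_true = tf.convert_to_tensor(y_train[:16], dtype=preds.dtype)
        check_loss = tf.reduce_mean(tf.square(y_true - preds))
    grads = tape.gradient(check_loss, model.trainable_variables)
    for var, grad in zip(model.trainable_variables, grads):
        if grad is None:
            print("No gradient for", var.name)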

Although it is only a warning, I suspect it will hurt the accuracy of the model. My guess is that the problem is related to the reshape operation, so I used `tf.debugging.assert_all_finite(hidden_reshape, "Tensor contains non-finite values.")` to check whether its backpropagation is behaving normally, and no error was reported.
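For reference, the assertion sits inside `InteractionLayer.call`, right after the reshape, roughly like this:

    def call(self, hidden, risk, **kwargs):
        hidden_reshape = self.reshape(hidden)
        # Raises an InvalidArgumentError if any element is NaN or Inf
        tf.debugging.assert_all_finite(hidden_reshape, "Tensor contains non-finite values.")
        ...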

I also used `tf.print` to print the outputs of wq, wk and wv; the printout shows that their weights do have values and that the bias terms are 0.
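This is roughly how the layer's weights can be inspected once the model has been built (a hypothetical snippet; the layer name comes from the warning message above):

    interaction = model.get_layer("interaction_layer")
    tf.print(interaction.wq.kernel)  # has values
    tf.print(interaction.wq.bias)    # all zeros (the default bias initializer)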

And `interaction_layer` does show up in my `model.summary()` output.
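A quick way to double-check that its variables are tracked by the model (the name filter simply follows the warning message above):

    print([v.name for v in model.trainable_variables
           if v.name.startswith("interaction_layer")])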

My questions are:

1. Is there any structural error in my custom attention mechanism?
2. How should I get rid of this warning?
3. Is my custom loss function correct? If I want to incorporate the attention weights into it, how should I define the loss function?

I would appreciate any advice you could give me.