Is there any way to perform automatic hyperparameter tuning with a custom TensorFlow model that uses a manual training loop?

I took the TF_Transformer_xl model from Hugging Face and tried to perform automatic hyperparameter tuning on it, but I keep getting errors.
The method I’m currently using is Hyperopt.
The problem is that once the first training run finishes, the hyperparameters are changed, and training starts again, the following error (full traceback shown after the code below) occurs in the function decorated with @tf.function.


@tf.function
def train_step(model, data1, data2, target, mems, optimizer):
    with tf.GradientTape() as tape:
        outputs = model(concepts=data1, responses=data2, labels=target, mems=mems)
        logit = outputs.logit
        mems = outputs.mems
        logit_mx = target != -100
        logit_value = logit[logit_mx]
        logit_value = tf.reshape(logit_value, [-1, config_xl.R_vocab_size])
        labels = target[logit_mx]

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logit_value)
        # batch_loss = tf.reduce_sum(loss) / valid_samples
        mean_loss = tf.reduce_mean(loss)
        train_loss(loss)
        train_accuracy(labels, logit_value)
        predictions = tf.nn.softmax(logit_value)
        train_auc(tf.one_hot(labels, depth=predictions.shape[1]), predictions)

    gradients = tape.gradient(mean_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return mems, mean_loss


def evaluate(model, test_dataset, config_xl):
    total_loss = 0.0
    num_batches = 0
    evaluation_metrics = []
    test_mems = None

    for input_data, masked_responses, responses in tqdm(test_dataset, desc='eval'):
        outputs = model(concepts=input_data, responses=masked_responses, labels=responses, mems=test_mems, training=False)
        logit = outputs.logit
        test_mems = outputs.mems

        logit_mx = responses != -100
        logit_value = logit[logit_mx]
        logit_value = tf.reshape(logit_value, [-1, config_xl.R_vocab_size])
        labels = responses[logit_mx]

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logit_value)
        mean_loss = tf.reduce_mean(loss)

        # Update the streaming metrics
        predicted_labels = tf.argmax(logit_value, axis=1)
        predictions = tf.nn.softmax(logit_value)

        test_auc(tf.one_hot(labels, depth=predictions.shape[1]), predictions)
        test_precision(labels, predicted_labels)
        test_recall(labels, predicted_labels)
        test_accuracy(labels, logit_value)
        test_loss(loss)

        precision = test_precision.result().numpy()
        recall = test_recall.result().numpy()
        f1_score = 2 * (precision * recall) / (precision + recall + 1e-7)

        evaluation_metrics.append(test_accuracy.result().numpy())

        total_loss += mean_loss.numpy()
        num_batches += 1

    return (test_loss.result().numpy(), test_accuracy.result().numpy(),
            precision, recall, f1_score)


def train(train_dataset, config_xl):
    try:
        learning_rate = CustomSchedule(config_xl.d_model)
        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        model = TFTransfoXLMLMHeadModel(config=config_xl)

        loss_values = []
        num_batches = 0

        for epoch in range(config_xl.epoch):
            start = time.time()
            total_loss = 0.0
            mems = None
            for input_data, masked_responses, responses in tqdm(train_dataset, desc='train'):
                mems, loss_value = train_step(model, input_data, masked_responses, responses, mems, optimizer)
                num_batches += 1
                total_loss += loss_value.numpy()

        return model
    except Exception as e:
        logging.error('Error: %s', e)



def main(config_xl):
    train_dataset, test_dataset, dkeyid2idx = load_TFdataset(config_xl)
    model = train(train_dataset.take(10), config_xl)
    test_loss, test_acc, test_precision, test_recall, test_f1_score = evaluate(model, test_dataset, config_xl)


if __name__ == "__main__":

config_xl = TransfoXLConfig(
        d_embed=args.d_embed,
        d_head=args.d_head,
        d_model=args.d_model,
        mem_len=args.mem_len,
        n_head=args.n_head,
        n_layer=args.n_layer,
        eos_token=args.eos_token,
        mask_token=args.mask_token,
        batch_size=args.batch_size,
        tgt_len=args.tgt_len,
        C_vocab_size=args.C_vocab_size,
        Q_vocab_size=args.Q_vocab_size,
        R_vocab_size=args.R_vocab_size,
        epoch=args.epoch,
        mode=args.mode,  # concepts or questions
        tf_data_dir=args.tf_data_dir,
        tensorboard_log_dir=args.tensorboard_log_dir,
        tensorboard_emb_log_dir=args.tensorboard_emb_log_dir,
        model_save_dir=args.model_save_dir,
    )
    
    # Hyperparameter search space
    # d_inner, n_layer, n_head, and dropout may vary freely; d_embed and d_model must match.
    space = {
        'num_layer': hp.quniform('num_layer', low=4, high=12, q=2),  # values from 4 to 12 in steps of 2
        'n_head': hp.quniform('n_head', low=6, high=12, q=2),        # values from 6 to 12 in steps of 2
    }
    


    logging.info('config_xl:  %s',config_xl)
    

    # Create a new MLflow Experiment
    mlflow.set_experiment("MLflow Test")

    # Start an MLflow run
    with mlflow.start_run():
        #set a run name
        mlflow.set_tag("mlflow.runName", '{}ep_{}mem_{}'.format(args.epoch,args.mem_len, args.mode))
        
        # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", '{}ep_{}mem_{}'.format(args.epoch,args.mem_len, args.mode))

        # Log the hyperparameters
        mlflow.log_params(config_xl.to_dict())
        # mlflow.tensorflow.autolog()


        # main(config_xl)
        trials = Trials()
        best = fmin(
            fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=8,
            trials=trials,
        )
        train_dataset,test_dataset,dkeyid2idx=load_TFdataset(config_xl)
        input_data, masked_responses, responses = next(iter(test_dataset))
   
        input_schema = Schema([
            TensorSpec(np.dtype(np.int32), (-1, len(input_data[1].numpy())), "input_data"),
            TensorSpec(np.dtype(np.int32), (-1, len(masked_responses[1].numpy())), "responses"),
        ])


        signature = ModelSignature(input_schema)

      
        best_run = sorted(trials.results, key=lambda x: x["loss"])[0]
ERROR:root:Error: in user code:

    File "train_args_mlflows.py", line 113, in train_step  *
        outputs = model(concepts=data1,responses=data2, labels=target, mems=mems)
    File "/home/jun/miniconda3/envs/new1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_filej0_kgex_.py", line 26, in tf__call
        transformer_outputs = ag__.converted_call(ag__.ld(self).transformer, (ag__.ld(inputs)['concepts'], ag__.ld(inputs)['responses'], ag__.ld(inputs)['mems'], ag__.ld(inputs)['head_mask'], ag__.ld(inputs)['inputs_embeds'], ag__.ld(inputs)['output_attentions'], ag__.ld(inputs)['output_hidden_states'], ag__.ld(inputs)['return_dict']), dict(training=ag__.ld(inputs)['training']), fscope)
    File "/tmp/__autograph_generated_filekwj7svil.py", line 126, in tf__call
        ag__.if_stmt((ag__.ld(inputs)['inputs_embeds'] is not None), if_body_6, else_body_6, get_state_6, set_state_6, ('word_emb',), 1)
    File "/tmp/__autograph_generated_filekwj7svil.py", line 120, in else_body_6
        word_emb_C = ag__.converted_call(ag__.ld(self).word_emb_C, (ag__.ld(inputs)['concepts'],), None, fscope)

    ValueError: Exception encountered when calling layer 'tf_transfo_xlmlm_head_model_1' (type TFTransfoXLMLMHeadModel).
    
    in user code:
    
        File "/home/jun/workspace/KT/models/model_for_kt.py", line 1782, in call  *
            transformer_outputs = self.transformer(
        File "/home/jun/miniconda3/envs/new1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/tmp/__autograph_generated_filekwj7svil.py", line 126, in tf__call
            ag__.if_stmt((ag__.ld(inputs)['inputs_embeds'] is not None), if_body_6, else_body_6, get_state_6, set_state_6, ('word_emb',), 1)
        File "/tmp/__autograph_generated_filekwj7svil.py", line 120, in else_body_6
            word_emb_C = ag__.converted_call(ag__.ld(self).word_emb_C, (ag__.ld(inputs)['concepts'],), None, fscope)
    
        ValueError: Exception encountered when calling layer 'transformer' (type TFTransfoXLMLMMainLayer).
        
        in user code:
        
            File "/home/jun/workspace/KT/models/model_for_kt.py", line 1171, in call  *
                word_emb_C = self.word_emb_C(inputs["concepts"])
            File "/home/jun/miniconda3/envs/new1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
                raise e.with_traceback(filtered_tb) from None
        
            ValueError: tf.function only supports singleton tf.Variables created on the first call. Make sure the tf.Variable is only created once or created outside tf.function. See https://www.tensorflow.org/guide/function#creating_tfvariables for more information.
        
        
        Call arguments received by layer 'transformer' (type TFTransfoXLMLMMainLayer):
          • concepts=tf.Tensor(shape=(65, 140), dtype=int32)
          • responses=tf.Tensor(shape=(65, 140), dtype=int32)
          • mems=None
          • head_mask=None
          • inputs_embeds=None
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • labels=None
          • training=False
          • kwargs=<class 'inspect._empty'>
    
    
    Call arguments received by layer 'tf_transfo_xlmlm_head_model_1' (type TFTransfoXLMLMHeadModel):
      • concepts=tf.Tensor(shape=(65, 140), dtype=int32)
      • responses=tf.Tensor(shape=(65, 140), dtype=int32)
      • mems=None
      • head_mask=None
      • inputs_embeds=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • labels=tf.Tensor(shape=(65, 140), dtype=int32)
      • training=False
      • kwargs=<class 'inspect._empty'>

Is there any automated hyperparameter tuning method that works with the code I provided, even if it isn’t Hyperopt?

[Google DeepMind Assisted]

The error you’re encountering suggests that there’s an issue with the way TensorFlow variables are being handled within a tf.function. The error message “tf.function only supports singleton tf.Variables created on the first call” implies that TensorFlow variables are being created on subsequent calls to the tf.function, which is not supported.

This typically happens with models that create their variables lazily on their first call (as the Hugging Face Transformer-XL layers do), combined with dynamic workflows like hyperparameter tuning: a new model is built for each trial, but it is driven through a train_step whose tf.function was already traced for the previous model, so the new model’s variables end up being created on a non-first call.
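To make the failure mode concrete, here is a minimal illustration (this is not your model; a lazily built Keras layer simply stands in for TFTransfoXLMLMHeadModel, and the exact wording of the error varies between TF versions):

import tensorflow as tf

@tf.function
def step(model, x):
    return model(x)

x = tf.zeros([2, 3])
step(tf.keras.layers.Dense(4), x)  # first call: the layer builds its variables here, which is allowed
step(tf.keras.layers.Dense(4), x)  # a *new* layer after the function has already run once:
                                   # its variables would be created on a non-first call,
                                   # raising the same ValueError as a new model built for
                                   # the next tuning trial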

To address this issue and still perform automated hyperparameter tuning with your TensorFlow custom model, consider the following approaches:

1. Use Keras Tuner’s Hyperband or RandomSearch:

The Keras Tuner library (keras_tuner) includes Hyperband and RandomSearch tuners that are designed to work seamlessly with TensorFlow/Keras models and integrate cleanly with TensorFlow’s execution model. You would need to define a model-building function that takes hyperparameters as input and returns a compiled model, which you then pass to the tuner (see the Keras Tuner example below; a variant for custom training loops follows it).

2. Modify Your Training Loop:

Ensure that the model and its variables are created outside of any tf.function-decorated functions, and give each tuning trial its own tf.function instead of reusing one that was already traced (and had variables created) for a previous model. You can then pass the model into the tf.function as an argument. This ensures each model’s variables are created only once per function, avoiding the issue highlighted by the error message.
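A sketch of what one tuning trial could look like with a manual loop (build_model and the toy dataset below are hypothetical stand-ins; in your code, build_model would construct a TransfoXLConfig from the trial’s parameters and return TFTransfoXLMLMHeadModel(config=...)):

import tensorflow as tf

def build_model(params):
    # Hypothetical stand-in for TFTransfoXLMLMHeadModel(config=...);
    # only its lazily created variables matter for this sketch.
    d_model = int(params['num_layer']) * 16   # toy use of a tuned hyperparameter
    return tf.keras.Sequential([
        tf.keras.layers.Embedding(1000, d_model),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(10),
    ])

def make_train_step():
    # A brand-new tf.function for every trial, so a freshly built model never
    # reuses a function that was already traced (and created variables) for an
    # earlier model.
    @tf.function
    def train_step(model, optimizer, x, y):
        with tf.GradientTape() as tape:
            logits = model(x, training=True)
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss
    return train_step

def train_one_trial(params, dataset):
    model = build_model(params)       # fresh variables for every trial
    x0, y0 = next(iter(dataset))
    model(x0, training=False)         # build the model eagerly, outside any tf.function
    optimizer = tf.keras.optimizers.Adam(1e-3)
    train_step = make_train_step()    # fresh tf.function, used by this trial only
    for x, y in dataset:
        loss = train_step(model, optimizer, x, y)
    return float(loss.numpy())

This also covers point 5 below: every trial gets its own freshly initialized model, optimizer, and tf.function, so no state carries over from the previous trial.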

3. Simplify the Use of tf.function:

If possible, reduce the complexity within the tf.function decorated functions or limit their use to only the most performance-critical parts of your code. This can sometimes help avoid issues related to variable creation and state management within tf.function.
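If tracing keeps getting in the way during the search itself, one low-effort (but slower) option is to run all tf.functions eagerly while tuning and switch graph execution back on for the final training run:

import tensorflow as tf

tf.config.run_functions_eagerly(True)    # @tf.function bodies now execute eagerly
# ... run the hyperparameter search here ...
tf.config.run_functions_eagerly(False)   # restore traced execution for the final, full training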

4. Use a Different Hyperparameter Tuning Library:

If the integration with Hyperopt is causing issues, you might consider other hyperparameter tuning libraries that might have better or different integrations with TensorFlow. Libraries such as Optuna or Ray Tune offer flexible and powerful hyperparameter optimization frameworks that can be adapted to work with TensorFlow models.
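For instance, a sketch with Optuna, reusing the hypothetical train_one_trial helper from point 2 (train_dataset is assumed to come from load_TFdataset, as in your main()):

import optuna

def objective(trial):
    params = {
        'num_layer': trial.suggest_int('num_layer', 4, 12, step=2),
        'n_head': trial.suggest_int('n_head', 6, 12, step=2),
    }
    return train_one_trial(params, train_dataset)   # the value Optuna minimizes

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=8)
print(study.best_params)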

5. Ensure Proper Model Re-Initialization:

When performing hyperparameter tuning, ensure that the model is properly re-initialized or reconstructed from scratch in each tuning iteration. This can help prevent carryover state or variables from one iteration to the next, which might be causing the error.

Example with Keras Tuner:


import keras_tuner as kt  # the package is now distributed as keras_tuner (formerly kerastuner)

def model_builder(hp):
    # Define hyperparameters
    num_layers = hp.Int('num_layers', min_value=2, max_value=20, step=2)
    d_model = hp.Int('d_model', min_value=64, max_value=512, step=64)
    # Model architecture
    model = ...  # Your model definition using num_layers, d_model, etc.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     directory='my_dir',
                     project_name='intro_to_kt')

tuner.search(train_dataset, validation_data=val_dataset, epochs=50)
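Because your training loop is manual (no model.compile/model.fit), also note that Keras Tuner can drive a custom loop: with a reasonably recent keras_tuner you can subclass a tuner and override run_trial to return the value to minimize. A sketch, again reusing the hypothetical train_one_trial helper from point 2:

import keras_tuner as kt

class CustomLoopTuner(kt.RandomSearch):
    def run_trial(self, trial, dataset):
        hp = trial.hyperparameters
        params = {
            'num_layer': hp.Int('num_layer', 4, 12, step=2),
            'n_head': hp.Int('n_head', 6, 12, step=2),
        }
        # Returning a single float tells the tuner to minimize it.
        return train_one_trial(params, dataset)

tuner = CustomLoopTuner(max_trials=8, overwrite=True,
                        directory='my_dir', project_name='custom_loop')
tuner.search(train_dataset)
best_hps = tuner.get_best_hyperparameters(1)[0]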

Remember, when integrating hyperparameter tuning with complex models like TF_Transformer_xl, it’s crucial that the model’s construction and the tuning loop fit together and that TensorFlow’s execution model is respected, particularly regarding variable creation and reuse inside tf.function-decorated functions.

Thank you. Now I clearly understand which parts to modify.