Challenges in Achieving Convergence of the Objective Function During Hyperparameter Optimization

I am trying to optimize the hyperparameters of a deep learning model. I am using Bayesian optimization to find the optimal hyperparameters. The optimization algorithm works well and produces values that result in good calibration and validation scores. However, it never converges to that point, even after 2000 iterations. As you can see from the figure, the values keep fluctuating between 0 and 7, and sometimes even higher, without converging to values like 0.02 or 0.03. Any advice on changes I could make to the code would be helpful. The data is available here: X_train = array([[18.2, 45.4, 31.2], [51.6, 65.4, 40. ], [37.6, - Pastebin.com

import tensorflow as tf
import numpy as np
import random
import pandas as pd
import os
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam, RMSprop, SGD
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from bayes_opt import UtilityFunction
import matplotlib.pyplot as plt
from bayes_opt import SequentialDomainReductionTransformer
from matplotlib import gridspec
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestRegressor 
from sklearn.gaussian_process import GaussianProcessRegressor

# Set seeds for reproducibility
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# # Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
scaler1 = StandardScaler()
y_train = scaler1.fit_transform(y_train.reshape(-1, 1))
y_val = scaler1.transform(y_val.reshape(-1, 1))

def create_model(neurons, activation, learning_rate, layers, dropout_rate, optimizer,clipvalue):
    model = Sequential()
    model.add(Dense(neurons, activation=activation, input_shape=(X_train.shape[1],)))
    for _ in range(int(layers) - 1):
        model.add(Dense(neurons, activation=activation))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1))   
    if optimizer == 'adam':
        opt = Adam(learning_rate=learning_rate, clipvalue=clipvalue)
    elif optimizer == 'rmsprop':
        opt = RMSprop(learning_rate=learning_rate, clipvalue=clipvalue)
    else:
        opt = SGD(learning_rate=learning_rate, clipvalue=clipvalue)    
    model.compile(optimizer=opt, loss='mse', metrics=['mse'])
    return model

def model_evaluate(neurons, activation, learning_rate, layers, epochs, dropout_rate, optimizer, clipvalue):
    neurons = int(neurons)
    layers = int(layers)
    epochs = int(epochs)
    clipvalue = float(clipvalue)
    ac = activation_function(activation)
    op = optimizer_function(optimizer) 
    SEED = 42
    tf.random.set_seed(SEED)
    np.random.seed(SEED)
    random.seed(SEED)
    model = create_model(neurons, ac, learning_rate, layers, dropout_rate, op, clipvalue)
    model.fit(X_train, y_train, epochs=epochs, batch_size=16, verbose=0, validation_data=(X_val, y_val))    
    predictions = model.predict(X_val, verbose=0)
    mse = mean_squared_error(y_val, predictions)
    return -mse  # Minimize mean squared error

# Define the hyperparameter space
pbounds = {
    'neurons': (1, 20),
    'activation': (0, 2),  # Will be used to choose between 'relu' and 'tanh'
    'learning_rate': (0.01, 0.1),
    'layers': (1, 7),
    'epochs': (30, 100),
    'dropout_rate': (0.0, 0.3),
    'optimizer': (0, 2),  # Will be used to choose between 'adam', 'rmsprop', and 'sgd'
    'clipvalue': (1.0, 5.0)  # Clip gradients to the range [-clipvalue, clipvalue]
}

def activation_function(x):
    if x <= 1:
        return 'relu'
    else:
        return 'tanh'
    # elif x <= 2 and x > 1:
    #     return 'tanh'
    # else:
    #     return 'sigmoid'
    #return 'relu' if x < 0.5 else 'tanh'

def optimizer_function(x):
    if x <= 1:
        return 'adam'
    elif x > 1 and x < 2:
        return 'rmsprop'
    else:
        return 'sgd'

    
bounds_transformer = SequentialDomainReductionTransformer(minimum_window=0)
optimizer = BayesianOptimization(
    f=model_evaluate,
    pbounds=pbounds,
    random_state=SEED,
    verbose=2,
    bounds_transformer=bounds_transformer,
    allow_duplicate_points=True
)
utility = UtilityFunction(kind="ucb", kappa=1e-35, xi=0.0)
optimizer._gp = GaussianProcessRegressor(random_state=SEED)
for il in range(500):
    next_point = optimizer.suggest(utility)
    target = model_evaluate(**next_point)
    optimizer.register(params=next_point, target=target) 
     
iterations = range(len(optimizer.res))
best_values = [-res['target'] for res in optimizer.res]  # Assuming we're minimizing
plt.plot(iterations, best_values, 'b-o')
plt.ylim(0,20)
plt.xlabel('Iteration')
plt.ylabel('Best Objective Value')
plt.title('Best Objective Value Over Iterations')

i) I tried to adjust the solution space.
ii) I tried different kappa values in Bayesian optimization.
iii) I tried different utility functions.
iv) I ran it for more than 2000 iterations.
But it is not converging at the best values.