I'm trying to deploy a TFT (Temporal Fusion Transformer) on my data, but even after trying 4 different approaches, I keep hitting the same error as soon as I reach the multi-head attention layer. Here are the shapes and the code I use.
Shapes for the data:
Before splitting the data:
df shape: (10896, 81)
df static features shape: (10896, 64)
Shapes after sequence creation and reshaping:
X_train_temporal shape: (10574, 28, 10), X_train_static shape: (10574, 28, 64), y_train shape: (10574,)
X_val_temporal shape: (119, 28, 10), X_val_static shape: (119, 28, 64), y_val shape: (119,)
X_test_temporal shape: (119, 28, 10), X_test_static shape: (119, 28, 64), y_test shape: (119,)
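For context, this is roughly how I build the sliding windows (a simplified sketch of my preprocessing; make_windows is just an illustrative name):

import numpy as np

def make_windows(temporal, static, target, window=28):
    # Slide a 28-step window over the aligned arrays; predict the next step.
    X_t, X_s, y = [], [], []
    for i in range(len(temporal) - window):
        X_t.append(temporal[i:i + window])  # (window, num_temporal_features)
        X_s.append(static[i:i + window])    # (window, num_static_features)
        y.append(target[i + window])        # next-step target
    return np.array(X_t), np.array(X_s), np.array(y)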
The data is a time series.
# ======================================================================================
# Hyperparameters setup
# ======================================================================================
params = {
    "learning_rate": 1e-3,  # Learning rate for the optimizer
    "epochs": 100,  # Number of epochs for training
    "batch_size": 128,  # Batch size for training
    "time_steps": 28,  # Number of time steps for the model
    "dropout": 0.1,  # Dropout rate for the transformer block
    "mlp_dropout": 0.1,  # Dropout rate for the MLP
    "dropout_rate": 0.1,  # Dropout rate passed to the transformer blocks
    "dropout_rate1": 0.2,  # Dropout rate for regularization
    "dropout_rate2": 0.2,  # Dropout rate for regularization
    "dropout_rate3": 0.2,  # Dropout rate for regularization
    "dropout_rate4": 0.2,  # Dropout rate for regularization
    "dropout_rate5": 0.2,  # Dropout rate for regularization
    "l1": 0.005,  # L1 regularization rate
    "l2": 0.001,  # L2 regularization rate
    "threshold": 10.0,  # Threshold for StopOnTooLargeLoss
    "patience_es": 300,  # Patience for EarlyStopping
    "factor_lr": 0.1,  # Factor for ReduceLROnPlateau
    "decay_factor": 0.9,  # Decay factor for learning rate decay
    "decay_step_multiplier": 10,  # Decay step multiplier for learning rate decay
    "patience_lr": 2,  # Patience for ReduceLROnPlateau
    "min_lr": 1e-9,  # Minimum learning rate for ReduceLROnPlateau
    "initial_lr": 1e-2,  # Initial learning rate for GradualLRDecay
    "use_multiprocessing": True,  # Use multiprocessing for parallelization
    "workers": 8,  # Number of workers for multiprocessing
    "max_queue_size": 10,  # Maximum queue size for multiprocessing
    "warmup_epochs": 5,  # Number of epochs for learning rate warmup
    "beta_1": 0.9,  # Beta1 for the Adam optimizer
    "beta_2": 0.999,  # Beta2 for the Adam optimizer
    "epsilon": 1e-7,  # Epsilon for the Adam optimizer
    "clipvalue": 0.5,  # Clip value for gradient clipping
    "clipnorm": 1.0,  # Clip norm for gradient clipping
    # Transformer-specific hyperparameters
    "num_static_features": X_train_static.shape[-1],
    "num_temporal_features": X_train_temporal.shape[-1],
    "static_encoder_units": 32,
    "temporal_encoder_units": 64,
    "temporal_conv_filters": 64,
    "temporal_conv_kernel": 3,
    "lstm_units": 32,
    "num_heads": 8,
    "head_size": 16,
    "ff_dim": 128,
    "num_transformer_blocks": 4,
}
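(Note: num_heads * head_size = 8 * 16 = 128, which is the embedding size the transformer blocks expect.)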
# ======================================================================================
# End of hyperparameters setup
# ======================================================================================
Model code:
def static_encoder(static_input, static_encoder_units):
    """Encodes static features using a dense layer.

    Args:
        static_input: A 3D tensor of shape (batch_size, time_steps, num_static_features)
            representing the static input.
        static_encoder_units: Number of units in the dense layer.

    Returns:
        A 2D tensor of shape (batch_size, time_steps * static_encoder_units)
        representing the flattened encoded static features.
    """
    # Input Shape Validation
    if len(static_input.shape) != 3:
        raise ValueError("Static input should be a 3D tensor with shape "
                         "(batch_size, time_steps, num_static_features)")
    # Encoding
    x = Dense(static_encoder_units, activation='relu')(static_input)
    x = Flatten()(x)  # Flatten for concatenation
    return x
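To sanity-check what this encoder emits, a quick standalone trace with my numbers (time_steps=28, 64 static features, 32 units) shows that Dense only maps the last axis, and Flatten then collapses time and features together:

from tensorflow.keras.layers import Input, Dense, Flatten

inp = Input(shape=(28, 64))
x = Dense(32, activation='relu')(inp)  # (None, 28, 32): Dense acts on the last axis only
x = Flatten()(x)                       # (None, 896): no time axis left after this
print(x.shape)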
def temporal_encoder(temporal_input, temporal_encoder_units, temporal_conv_filters, temporal_conv_kernel):
    """Encodes temporal features using an LSTM and a Conv1D layer.

    Args:
        temporal_input: A 3D tensor with shape (batch_size, time_steps, num_temporal_features)
            representing the temporal input.
        temporal_encoder_units: Number of units in the LSTM layer.
        temporal_conv_filters: Number of filters in the Conv1D layer.
        temporal_conv_kernel: Kernel size of the Conv1D layer.

    Returns:
        A 2D tensor of shape
        (batch_size, (time_steps - temporal_conv_kernel + 1) * temporal_conv_filters)
        representing the flattened encoded temporal features.
    """
    # Input Shape Validation (same as static_encoder)
    if len(temporal_input.shape) != 3:
        raise ValueError("Temporal input should be a 3D tensor with shape "
                         "(batch_size, time_steps, num_temporal_features)")
    # Temporal Encoding
    x = LSTM(temporal_encoder_units, return_sequences=True)(temporal_input)
    x = Conv1D(filters=temporal_conv_filters, kernel_size=temporal_conv_kernel, activation='relu')(x)
    x = Flatten()(x)
    return x
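Same kind of trace for the temporal branch (28 steps, 10 features, 64 units, kernel 3); Conv1D with its default 'valid' padding trims kernel_size - 1 = 2 steps:

from tensorflow.keras.layers import Input, LSTM, Conv1D, Flatten

inp = Input(shape=(28, 10))
x = LSTM(64, return_sequences=True)(inp)                     # (None, 28, 64)
x = Conv1D(filters=64, kernel_size=3, activation='relu')(x)  # (None, 26, 64)
x = Flatten()(x)                                             # (None, 1664): again 2D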
def variable_selection_network(encoders_concat, num_temporal_features, lstm_units):
    """Implements variable selection using an LSTM layer.

    Args:
        encoders_concat: A 2D tensor with shape (batch_size, num_features) representing
            the concatenated encoded features.
        num_temporal_features: The number of temporal features.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        A 2D tensor with shape (batch_size, num_temporal_features) representing
        variable selection weights.
    """
    x = Reshape((1, -1))(encoders_concat)  # Reshape for LSTM input: (batch_size, 1, num_features)
    x = LSTM(lstm_units)(x)  # Process with LSTM
    x = Dense(num_temporal_features, activation='sigmoid')(x)
    return x
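(The sigmoid output is meant as per-feature gates in [0, 1]; with my settings the shapes go (None, F) -> (None, 1, F) -> (None, 32) -> (None, 10), and the weights get multiplied onto the features in build_simple_tft_model below.)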
def fix_shape_mismatch(encoders_concat, num_heads, head_size):
    # Calculate the total dimension size required for the transformer block
    total_dim_size = num_heads * head_size
    # Padding needed to reach the next multiple of total_dim_size (0 if already aligned)
    padding_size = -encoders_concat.shape[-1] % total_dim_size
    # Zero-pad the feature axis of encoders_concat up to the required size
    if padding_size > 0:
        padding = tf.zeros((tf.shape(encoders_concat)[0], padding_size))
        encoders_concat = tf.concat([encoders_concat, padding], axis=-1)
    return encoders_concat
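(Note: I don't actually call fix_shape_mismatch in build_simple_tft_model below; I ended up adjusting the dimensionality with a Dense layer instead, so this helper is currently unused.)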
def transformer_block(inputs, num_heads, head_size, ff_dim, dropout_rate=0.1):
    """A single Transformer block with Multi-Head Attention, a feed-forward network, and shape checks.

    Args:
        inputs: A 2D tensor. Expects shape (batch_size, embedding_dim).
        num_heads: Number of attention heads.
        head_size: Dimensionality of each head.
        ff_dim: Hidden layer size in the feed-forward network.
        dropout_rate: Dropout rate for regularization.

    Returns:
        A 2D tensor of shape (batch_size, embedding_dim) representing the output.
    """
    print("Input shape to transformer block:", inputs.shape)
    # Input Shape Check
    if inputs.shape[-1] != num_heads * head_size:
        raise ValueError(f"Input to transformer block should have a final dimension "
                         f"equal to num_heads * head_size (Got shape {inputs.shape})")
    print("this print is before the attn_output")
    # Multi-Head Attention Layer (self-attention)
    attn_output, _ = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size
    )(inputs, inputs)
    print("attn_output shape:", attn_output.shape)
    # Shape Check after Attention
    if attn_output.shape[-1] != num_heads * head_size:
        raise ValueError(f"Output of MultiHeadAttention should have a final dimension "
                         f"equal to num_heads * head_size. Got: {attn_output.shape}")
    attn_output = tf.keras.layers.Dropout(dropout_rate)(attn_output)
    attn_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attn_output)
    # Feed-Forward Network
    ff_output = tf.keras.layers.Dense(ff_dim, activation='relu')(attn_output)
    ff_output = tf.keras.layers.Dropout(dropout_rate)(ff_output)
    ff_output = tf.keras.layers.Dense(inputs.shape[-1])(ff_output)
    ff_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attn_output + ff_output)
    return ff_output
def build_simple_tft_model(params):
    """Builds a Temporal Fusion Transformer (TFT) model.

    Args:
        params: A dictionary containing the following hyperparameters:
            - num_static_features: Number of static features.
            - num_temporal_features: Number of temporal features.
            - static_encoder_units: Number of units in the static encoder.
            - temporal_encoder_units: Number of units in the temporal encoder.
            - temporal_conv_filters: Number of filters in the Conv1D layer.
            - temporal_conv_kernel: Kernel size of the Conv1D layer.
            - lstm_units: Number of units in the LSTM-based variable selection layer.
            - num_heads: Number of attention heads in the transformer blocks.
            - head_size: Dimensionality of each attention head.
            - ff_dim: Hidden layer size in the transformer feed-forward network.
            - dropout_rate: Dropout rate for regularization.
            - num_transformer_blocks: Number of transformer blocks.

    Returns:
        A compiled Keras model.
    """
    # Inputs
    static_input = Input(shape=(params['time_steps'], params['num_static_features']))
    temporal_input = Input(shape=(params['time_steps'], params['num_temporal_features']))
    # Encoding
    static_encoded = static_encoder(static_input, params['static_encoder_units'])
    temporal_encoded = temporal_encoder(temporal_input, params['temporal_encoder_units'],
                                        params['temporal_conv_filters'], params['temporal_conv_kernel'])
    # Concatenate Encoded Features
    encoders_concat = Concatenate()([static_encoded, temporal_encoded])
    # Variable Selection
    variable_selection = variable_selection_network(encoders_concat,
                                                    params['num_temporal_features'],
                                                    params['lstm_units'])
    # Apply selection weights
    encoders_concat = tf.reshape(encoders_concat, (-1, params['num_temporal_features']))
    selected_features = tf.keras.layers.Multiply()([encoders_concat, variable_selection])
    print("Shape of encoders_concat before reshape:", encoders_concat.shape)
    print("Shape of selected_features before reshape:", selected_features.shape)
    # Reshape for transformer compatibility
    if selected_features.shape[-1] != params['num_heads'] * params['head_size']:
        # Apply a Dense layer to adjust the dimensionality to the expected size
        selected_features = tf.keras.layers.Dense(params['num_heads'] * params['head_size'], activation='relu')(selected_features)
    print("Shape of selected_features after reshape with dense:", selected_features.shape)
    # Transformer Blocks
    x = selected_features
    print("Input shape to transformer x:", x.shape)  # Should be (batch_size, embedding_dim)
    # Shape Check
    expected_shape = (None, params['num_heads'] * params['head_size'])
    if x.shape != expected_shape:
        raise ValueError(f"Input to transformer blocks should have shape {expected_shape}. Received shape: {x.shape}")
    for _ in range(params['num_transformer_blocks']):
        x = transformer_block(x, params['num_heads'], params['head_size'], params['ff_dim'], params['dropout_rate'])
    # Output Layer
    output = Dense(1)(x)
    # Model Creation
    model = Model(inputs=[static_input, temporal_input], outputs=output)
    # Compilation
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
Model compilation:
# Build the TFT Model
model = build_simple_tft_model(params)

# Updated optimizer configuration
optimizer = tf.keras.optimizers.Adam(learning_rate=params["learning_rate"],
                                     beta_1=params["beta_1"],
                                     beta_2=params["beta_2"],
                                     epsilon=params["epsilon"],
                                     clipvalue=params["clipvalue"],
                                     clipnorm=params["clipnorm"])

# Compilation with my metrics (r_squared is a custom metric I defined elsewhere)
model.compile(optimizer=optimizer, loss='mean_squared_error',
              metrics=[tf.keras.metrics.MeanAbsoluteError(),
                       tf.keras.metrics.RootMeanSquaredError(),
                       tf.keras.metrics.MeanAbsolutePercentageError(),
                       r_squared, 'accuracy'])

# Model Summary
model.summary()
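For completeness, this is roughly how I call fit afterwards (illustrative; the list order has to match Model(inputs=[static_input, temporal_input], ...)):

history = model.fit(
    [X_train_static, X_train_temporal], y_train,
    validation_data=([X_val_static, X_val_temporal], y_val),
    epochs=params["epochs"],
    batch_size=params["batch_size"],
)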
Now every time I get the same error from the MultiHeadAttention layer. Here is the resulting print output:
Shape of encoders_concat before reshape: (None, 10)
Shape of selected_features before reshape: (None, 10)
Shape of selected_features after reshape with dense: (None, 128)
Input shape to transformer x: (None, 128)
Input shape to transformer block: (None, 128)
this print is before the attn_output
And here is the error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In[66], line 59
2 params = {
3 "learning_rate": 1e-3, # Learning rate for the optimizer
4 "epochs": 100, # Number of epochs for training
(...)
55
56 }
58 # Build the TFT Model
---> 59 model = build_simple_tft_model(params)
61 # Updated optimizer configuration
62 optimizer = tf.keras.optimizers.Adam(learning_rate=params["learning_rate"],
63 beta_1=params["beta_1"],
64 beta_2=params["beta_2"],
65 epsilon=params["epsilon"],
66 clipvalue=params["clipvalue"],
67 clipnorm=params["clipnorm"])
Cell In[65], line 204
201 raise ValueError(f"Input to transformer blocks should have shape {expected_shape}. Received shape: {x.shape}")
203 for _ in range(params['num_transformer_blocks']):
--> 204 x = transformer_block(x, params['num_heads'], params['head_size'], params['ff_dim'], params['dropout_rate'])
206 # Output Layer
207 output = Dense(1)(x)
Cell In[65], line 119
117 print("this print is before the attn_output")
118 # Multi-Head Attention Layer (Masked for self-attention)
--> 119 attn_output, _ = tf.keras.layers.MultiHeadAttention(
120 num_heads=num_heads, key_dim=head_size
121 )(inputs, inputs)
122 print("attn_output shape:", attn_output.shape)
124 # Shape Check after Attention
File ~/miniconda3/envs/AIFlow_Lab/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
67 filtered_tb = _process_traceback_frames(e.__traceback__)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
File ~/miniconda3/envs/AIFlow_Lab/lib/python3.9/site-packages/keras/src/layers/activation/softmax.py:107, in Softmax.call(self, inputs, mask)
102 return tf.exp(
103 inputs
104 - tf.reduce_logsumexp(inputs, axis=self.axis, keepdims=True)
105 )
106 else:
--> 107 return backend.softmax(inputs, axis=self.axis[0])
108 return backend.softmax(inputs, axis=self.axis)
IndexError: Exception encountered when calling layer 'softmax' (type Softmax).
tuple index out of range
Call arguments received by layer 'softmax' (type Softmax):
• inputs=tf.Tensor(shape=(None, 8), dtype=float32)
• mask=None
Can someone please enlighten me on how I have to shape the data when it enters this code:
attn_output, _ = tf.keras.layers.MultiHeadAttention(
    num_heads=num_heads, key_dim=head_size
)(inputs, inputs)
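For reference, I can reproduce the exact same IndexError outside the model with a 2D input, while a 3D (batch, seq, features) tensor goes through fine, so I suspect the Flatten calls in my encoders (which remove the time axis) are the culprit:

import tensorflow as tf

mha = tf.keras.layers.MultiHeadAttention(num_heads=8, key_dim=16)

# 3D input (batch, seq_len, embed_dim): works
x3d = tf.random.normal((4, 28, 128))
print(mha(x3d, x3d).shape)  # (4, 28, 128)

# 2D input (batch, embed_dim): raises the same softmax IndexError
x2d = tf.random.normal((4, 128))
mha(x2d, x2d)  # IndexError: tuple index out of range

(I also notice the layer returns a single tensor unless return_attention_scores=True is passed, so my attn_output, _ = ... unpacking would presumably fail as well once the shape issue is fixed.)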
This is where I've been blocked since last Friday…
Thanks a lot to everyone who spends some time reading this.