Exception encountered when calling layer 'softmax' (type Softmax)

I'm trying to deploy a TFT (Temporal Fusion Transformer) on my data, but even after trying 4 different approaches, as soon as I reach the multi-head attention I keep getting the same error over and over. Here are the shapes and the code I use.

Shapes for the data:

Before splitting the data:
df shape is: (10896, 81)
df static features shape is: (10896, 64)

Shapes after sequence creation and reshaping:
X_train_temporal shape: (10574, 28, 10), X_train_static shape: (10574, 28, 64), y_train shape: (10574,)
X_val_temporal shape: (119, 28, 10), X_val_static shape: (119, 28, 64), y_val shape: (119,)
X_test_temporal shape: (119, 28, 10), X_test_static shape: (119, 28, 64), y_test shape: (119,)

This is a time-series problem. Here are the hyperparameters I use:

# ======================================================================================
# Hyperparameters setup
# ======================================================================================
params = {
    "learning_rate": 1e-3,  # Learning rate for the optimizer
    "epochs": 100,  # Number of epochs for training
    "batch_size": 128,  # Batch size for training
    "time_steps": 28,  # Number of time steps for the model
    "dropout": 0.1,  # Dropout rate for the transformer block
    "mlp_dropout": 0.1,  # Dropout rate for the MLP\    "dropout_rate": 0.1, 
    "dropout_rate": 0.1, 
    "dropout_rate1": 0.2,  # Dropout rate for regularization
    "dropout_rate2": 0.2,  # Dropout rate for regularization
    "dropout_rate3": 0.2,  # Dropout rate for regularization
    "dropout_rate4": 0.2,  # Dropout rate for regularization
    "dropout_rate5": 0.2,  # Dropout rate for regularization


    "l1": 0.005,  # L1 regularization rate
    "l2": 0.001,  # L2 regularization rate

    "threshold": 10.0,  # Threshold for StopOnTooLargeLoss
    "patience_es": 300,  # Patience for EarlyStopping
    "factor_lr": 0.1,  # Factor for ReduceLROnPlateau
    "decay_factor": 0.9,  # Decay factor for learning rate decay
    "decay_step_multiplier": 10,  # Decay step multiplier for learning rate decay
    "patience_lr": 2,  # Patience for ReduceLROnPlateau
    "min_lr": 1e-9,  # Minimum learning rate for ReduceLROnPlateau
    "initial_lr": 1e-2,  # Initial learning rate for GradualLRDecay


    "use_multiprocessing": True,  # Use multiprocessing for parallelization
    "workers": 8,  # Number of workers for multiprocessing
    "max_queue_size": 10,  # Maximum queue size for multiprocessing
    "warmup_epochs": 5,  # Number of epochs for learning rate warmup


    "beta_1": 0.9,  # Beta1 for the Adam optimizer
    "beta_2": 0.999,  # Beta2 for the Adam optimizer
    "epsilon": 1e-7,  # Epsilon for the Adam optimizer
    "clipvalue": 0.5,  # Clip value for gradient clipping
    "clipnorm": 1.0,  # Clip norm for gradient clipping

    # Transformer-specific hyperparameters
    
    "num_static_features": X_train_static.shape[-1],
    "num_temporal_features": X_train_temporal.shape[-1],
    "static_encoder_units": 32,
    "temporal_encoder_units": 64,
    "temporal_conv_filters": 64,
    "temporal_conv_kernel": 3,
    "lstm_units": 32,  
    "num_heads": 8, 
    "head_size": 16,  
    "ff_dim": 128,  
    "num_transformer_blocks": 4,
   
}
# ======================================================================================
# Hyperparameters setup
# ======================================================================================

Model code:

def static_encoder(static_input, static_encoder_units):
    """Encodes static features using a dense layer.

    Args:
        static_input: A 3D tensor of shape (batch_size, time_steps, num_static_features)
                      representing the static input.
        static_encoder_units: Number of units in the dense layer.

    Returns:
        A 2D tensor of shape (batch_size, static_encoder_units) representing the 
        encoded static features.
    """

    # Input Shape Validation
    if len(static_input.shape) != 3: 
        raise ValueError("Static input should be a 3D tensor with shape (batch_size, time_steps, num_static_features)")

    # Encoding
    x = Dense(static_encoder_units, activation='relu')(static_input)
    x = Flatten()(x)  # Flatten for concatenation 

    return x


def temporal_encoder(temporal_input, temporal_encoder_units, temporal_conv_filters, temporal_conv_kernel):
    """Encodes temporal features using an LSTM and a Conv1D layer.

    Args:
        temporal_input: A 3D tensor with shape (batch_size, time_steps, num_temporal_features),
                        representing the temporal input.
        temporal_encoder_units: Number of units in the LSTM layer.
        temporal_conv_filters: Number of filters in the Conv1D layer.
        temporal_conv_kernel: Kernel size of the Conv1D layer.

    Returns:
        A 2D tensor of shape (batch_size, temporal_encoder_units) representing the 
        encoded temporal features.
    """

    # Input Shape Validation (similar to static_encoder)
    if len(temporal_input.shape) != 3: 
        raise ValueError("Temporal input should be a 3D tensor with shape (batch_size, time_steps, num_temporal_features)")

     # Temporal Encoding
    x = LSTM(temporal_encoder_units, return_sequences=True)(temporal_input)
    x = Conv1D(filters=temporal_conv_filters, kernel_size=temporal_conv_kernel, activation='relu')(x)
    x = Flatten()(x)

    return x

def variable_selection_network(encoders_concat, num_temporal_features, lstm_units):
    """Implements variable selection using an LSTM layer.

    Args:
        encoders_concat: A 2D tensor with shape (batch_size, num_features) representing 
                         the concatenated encoded features.
        num_temporal_features: The number of temporal features.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        A 2D tensor with shape (batch_size, num_temporal_features) representing 
        variable selection weights.
    """

    x = Reshape((1, -1))(encoders_concat)  # Reshape for LSTM input (batch_size, 1, num_features)
    x = LSTM(lstm_units)(x)  # Process with LSTM
    x = Dense(num_temporal_features, activation='sigmoid')(x)  
    return x


def fix_shape_mismatch(encoders_concat, num_heads, head_size):
    # Calculate the total dimension size required for the transformer block
    total_dim_size = num_heads * head_size
    # Calculate the padding size
    padding_size = total_dim_size - (encoders_concat.shape[-1] % total_dim_size)
    # Pad the encoders_concat tensor to match the required dimension size for the transformer block
    if padding_size > 0:
        padding = tf.zeros((tf.shape(encoders_concat)[0], padding_size))
        encoders_concat = tf.concat([encoders_concat, padding], axis=-1)
    return encoders_concat


def transformer_block(inputs, num_heads, head_size, ff_dim, dropout_rate=0.1):
    """A single Transformer block with Multi-Head Attention, feed-forward network, and shape checks.

    Args:
        inputs: A 2D tensor. Expects shape (batch_size, embedding_dim).
        num_heads: Number of attention heads.
        head_size: Dimensionality of each head.
        ff_dim: Hidden layer size in the feed-forward network.
        dropout_rate: Dropout rate for regularization.

    Returns:
        A 2D tensor of shape (batch_size, embedding_dim) representing the output.
    """
    print("Input shape to transformer block:", inputs.shape)

    # Input Shape Check
    if inputs.shape[-1] != num_heads * head_size:
        raise ValueError(f"Input to transformer block should have a final dimension "
                         f"equal to num_heads * head_size (got shape {inputs.shape})")


    print("this print is before the attn_output")
    # Multi-Head Attention Layer (Masked for self-attention)
    attn_output, _ = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size
    )(inputs, inputs)
    print("attn_output shape:", attn_output.shape)  

    # Shape Check after Attention
    if attn_output.shape[-1] != num_heads * head_size:
        raise ValueError(f"Output of MultiHeadAttention should have a final dimension "
                         f"equal to num_heads * head_size. Got: {attn_output.shape}")

    attn_output = tf.keras.layers.Dropout(dropout_rate)(attn_output)
    attn_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attn_output)

    # Feed Forward Network
    ff_output = tf.keras.layers.Dense(ff_dim, activation='relu')(attn_output)
    ff_output = tf.keras.layers.Dropout(dropout_rate)(ff_output)
    ff_output = tf.keras.layers.Dense(inputs.shape[-1])(ff_output)
    ff_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attn_output + ff_output)

    return ff_output

def build_simple_tft_model(params):
    """Builds a Temporal Fusion Transformer (TFT) model.

    Args:
        params: A dictionary containing the following hyperparameters:
            - num_static_features: Number of static features.
            - num_temporal_features: Number of temporal features.
            - static_encoder_units: Number of units in the static encoder.
            - temporal_encoder_units: Number of units in the temporal encoder.
            - temporal_conv_filters: Number of filters in the Conv1D layer.
            - temporal_conv_kernel: Kernel size of the Conv1D layer.
            - lstm_units: Number of units in the LSTM-based variable selection layer.
            - num_heads: Number of attention heads in the transformer blocks.
            - head_size: Dimensionality of each attention head.
            - ff_dim: Hidden layer size in the transformer feed-forward network.
            - dropout_rate: Dropout rate for regularization.
            - num_transformer_blocks: Number of transformer blocks.

    Returns:
        A compiled Keras model.
    """

    # Inputs
    static_input = Input(shape=(params['time_steps'], params['num_static_features']))
    temporal_input = Input(shape=(params['time_steps'], params['num_temporal_features']))

    # Encoding
    static_encoded = static_encoder(static_input, params['static_encoder_units'])
    temporal_encoded = temporal_encoder(temporal_input, params['temporal_encoder_units'],
                                        params['temporal_conv_filters'], params['temporal_conv_kernel'])

    # Concatenate Encoded Features
    encoders_concat = Concatenate()([static_encoded, temporal_encoded])

    # Variable Selection
    variable_selection = variable_selection_network(encoders_concat, 
                                                    params['num_temporal_features'], 
                                                    params['lstm_units'])

    # Apply selection weights 
    encoders_concat = tf.reshape(encoders_concat, (-1, params['num_temporal_features'])) 
    selected_features = tf.keras.layers.Multiply()([encoders_concat, variable_selection]) 

    print("Shape of encoders_concat before reshape:", encoders_concat.shape)
    print("Shape of selected_features before reshape:", selected_features.shape)


    # # Reshape for transformer compatibility
    if selected_features.shape[-1] != params['num_heads'] * params['head_size']:
        # Apply a Dense layer to adjust the dimensionality to the expected size
        selected_features = tf.keras.layers.Dense(params['num_heads'] * params['head_size'], activation='relu')(selected_features)
    print("Shape of selected_features after reshape with dense:", selected_features.shape)

    # Transformer Blocks
    x = selected_features 

    print("Input shape to transformer x:", x.shape) # Should be (batch_size, embedding_dim)

    # Shape Check 
    expected_shape = (None, params['num_heads'] * params['head_size'])
    if x.shape != expected_shape:
        raise ValueError(f"Input to transformer blocks should have shape {expected_shape}. Received shape: {x.shape}")

    for _ in range(params['num_transformer_blocks']):
        x = transformer_block(x, params['num_heads'], params['head_size'], params['ff_dim'], params['dropout_rate'])

    # Output Layer
    output = Dense(1)(x)  

    # Model Creation
    model = Model(inputs=[static_input, temporal_input], outputs=output)

    # Compilation
    model.compile(loss='mean_squared_error', optimizer='adam')  

    return model

Model compilation:

# Build the TFT Model
model = build_simple_tft_model(params)

# Updated optimizer configuration
optimizer = tf.keras.optimizers.Adam(learning_rate=params["learning_rate"],
                                     beta_1=params["beta_1"],
                                     beta_2=params["beta_2"],
                                     epsilon=params["epsilon"],
                                     clipvalue=params["clipvalue"],
                                     clipnorm=params["clipnorm"])

# Compilation with custom metrics
model.compile(optimizer=optimizer, loss='mean_squared_error',
              metrics=[tf.keras.metrics.MeanAbsoluteError(),
                       tf.keras.metrics.RootMeanSquaredError(),
                       tf.keras.metrics.MeanAbsolutePercentageError(),
                       r_squared, 'accuracy'])

# Model Summary
model.summary() 

Every time, I get the same error from the MultiHeadAttention layer.

Here is the resulting print output:

Shape of encoders_concat before reshape: (None, 10)
Shape of selected_features before reshape: (None, 10)
Shape of selected_features after reshape with dense: (None, 128)
Input shape to transformer x: (None, 128)
Input shape to transformer block: (None, 128)
this print is before the attn_output

Here is the error:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[66], line 59
      2 params = {
      3     "learning_rate": 1e-3,  # Learning rate for the optimizer
      4     "epochs": 100,  # Number of epochs for training
   (...)
     55    
     56 }
     58 # Build the TFT Model
---> 59 model = build_simple_tft_model(params)
     61 # Updated optimizer configuration
     62 optimizer = tf.keras.optimizers.Adam(learning_rate=params["learning_rate"],
     63                                      beta_1=params["beta_1"],
     64                                      beta_2=params["beta_2"],
     65                                      epsilon=params["epsilon"],
     66                                      clipvalue=params["clipvalue"],
     67                                      clipnorm=params["clipnorm"])

Cell In[65], line 204
    201     raise ValueError(f"Input to transformer blocks should have shape {expected_shape}. Received shape: {x.shape}")
    203 for _ in range(params['num_transformer_blocks']):
--> 204     x = transformer_block(x, params['num_heads'], params['head_size'], params['ff_dim'], params['dropout_rate'])
    206 # Output Layer
    207 output = Dense(1)(x)  

Cell In[65], line 119
    117 print("this print is before the attn_output")
    118 # Multi-Head Attention Layer (Masked for self-attention)
--> 119 attn_output, _ = tf.keras.layers.MultiHeadAttention(
    120     num_heads=num_heads, key_dim=head_size
    121 )(inputs, inputs)
    122 print("attn_output shape:", attn_output.shape)  
    124 # Shape Check after Attention

File ~/miniconda3/envs/AIFlow_Lab/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
     67     filtered_tb = _process_traceback_frames(e.__traceback__)
     68     # To get the full stack trace, call:
     69     # `tf.debugging.disable_traceback_filtering()`
---> 70     raise e.with_traceback(filtered_tb) from None
     71 finally:
     72     del filtered_tb

File ~/miniconda3/envs/AIFlow_Lab/lib/python3.9/site-packages/keras/src/layers/activation/softmax.py:107, in Softmax.call(self, inputs, mask)
    102         return tf.exp(
    103             inputs
    104             - tf.reduce_logsumexp(inputs, axis=self.axis, keepdims=True)
    105         )
    106     else:
--> 107         return backend.softmax(inputs, axis=self.axis[0])
    108 return backend.softmax(inputs, axis=self.axis)

IndexError: Exception encountered when calling layer 'softmax' (type Softmax).

tuple index out of range

Call arguments received by layer 'softmax' (type Softmax):
  • inputs=tf.Tensor(shape=(None, 8), dtype=float32)
  • mask=None

Can someone please enlighten me on how I have to shape the data when it enters this code:

    attn_output, _ = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size
    )(inputs, inputs)

This is where I've been blocked since last Friday…
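
For comparison, here is a minimal standalone sketch of how I understand tf.keras.layers.MultiHeadAttention is supposed to be called, with a 3D (batch, time_steps, embedding) tensor. The shapes and the return_attention_scores flag here are my own assumptions taken from the Keras documentation, not part of my TFT code:

import tensorflow as tf

# Standalone sketch (assumed usage): MultiHeadAttention expects 3D tensors
# of shape (batch, sequence_length, embedding_dim), not 2D ones.
mha = tf.keras.layers.MultiHeadAttention(num_heads=8, key_dim=16)

x = tf.random.normal((4, 28, 128))    # (batch, time_steps, embedding_dim)
out = mha(x, x)                       # self-attention; returns a single tensor by default
print(out.shape)                      # (4, 28, 128)

# Only with return_attention_scores=True does the layer return a tuple
out, scores = mha(x, x, return_attention_scores=True)
print(scores.shape)                   # (4, 8, 28, 28) -> (batch, num_heads, query_len, key_len)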

Thanks a lot to everyone who takes the time to read this.
