Does masking completely eliminate the effect of zero padding for layers downstream of an LSTM?

I’m curious about how zero padding behaves downstream of an LSTM. I understand that when sequences are zero-padded and then masked, the LSTM itself handles the padded timesteps correctly.

My question is: if I place another layer after the LSTM, for example self-attention, will the zero padding have no effect on the attention weights, and therefore no effect on the model’s performance?
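For concreteness, the kind of downstream block I have in mind looks roughly like this (a sketch only, not code I have run; building the attention_mask explicitly from the padded input is my own guess at how the padding might have to be handled):

import tensorflow as tf

# Hypothetical variant: self-attention on top of the masked LSTM sequence output,
# with an explicit attention_mask so zero-padded timesteps cannot receive attention.
n_features = 5
inp = tf.keras.Input(shape=(None, n_features))
masked = tf.keras.layers.Masking(mask_value=0.0)(inp)
lstm_seq = tf.keras.layers.LSTM(128, return_sequences=True)(masked)

pad_mask = tf.reduce_any(tf.not_equal(inp, 0.0), axis=-1)    # True for real timesteps, shape (batch, time)
attn_mask = pad_mask[:, tf.newaxis, :]                       # broadcast over query positions
attn_out = tf.keras.layers.MultiHeadAttention(num_heads=4, key_dim=32)(
    lstm_seq, lstm_seq, attention_mask=attn_mask)            # (batch, time, 128)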

Example of the input fed into the model:

[ 2 3.2 0.2 4.2 1.7
  3 1.0 3.2 1.0 2.3
  1 0.3 3.1 0.4 3.8
  0  0   0   0   0
  0  0   0   0   0 
  0  0   0   0   0
  0  0   0   0   0
  0  0   0   0   0]
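To inspect what the masked LSTM actually passes downstream for those padded rows, a standalone check like this should show it (a minimal sketch using a truncated version of the example above):

import numpy as np
import tensorflow as tf

# Three real timesteps followed by two zero-padded rows (truncated for brevity)
x = np.array([[[2, 3.2, 0.2, 4.2, 1.7],
               [3, 1.0, 3.2, 1.0, 2.3],
               [1, 0.3, 3.1, 0.4, 3.8],
               [0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0]]], dtype=np.float32)

masked = tf.keras.layers.Masking(mask_value=0.0)(x)
print(masked._keras_mask.numpy())    # [[ True  True  True False False]]

seq = tf.keras.layers.LSTM(8, return_sequences=True)(masked)
print(seq._keras_mask.numpy())       # the same mask is still attached after the LSTM
print(seq.numpy()[0, 3:])            # what the padded timesteps look like to the next layer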

Here is the code:

import os

import tensorflow as tf
import tensorflow_addons as tfa


def LSTM_model(input_shape, total_classes):
    input_layer = tf.keras.Input(shape=input_shape, name="time_series_activity")
    # Mask all-zero timesteps so the LSTMs skip the padded rows
    input_mask = tf.keras.layers.Masking(mask_value=0.0)(input_layer)
    lstm_l5 = tf.keras.layers.LSTM(128, activation='tanh',
                                   recurrent_initializer=tf.keras.initializers.Orthogonal(), dropout=0.5,
                                   recurrent_dropout=0.5, return_sequences=True)(input_mask)
    lstm_l6 = tf.keras.layers.LSTM(128, activation='tanh',
                                   recurrent_initializer=tf.keras.initializers.Orthogonal(), dropout=0.9,
                                   recurrent_dropout=0.5)(lstm_l5)
    output_layer = tf.keras.layers.Dense(total_classes, activation="softmax")(lstm_l6)
    return tf.keras.models.Model(inputs=input_layer, outputs=output_layer)

train_generator = Generator(train_x, train_y, shuffle_input_segments=True)
test_generator = Generator(test_x, test_y)

input_shape = (None, train_x[0].shape[1])
model = LSTM_model(input_shape, total_classes)
model.summary()
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="categorical_crossentropy",
              optimizer=opt,
              metrics=[tfa.metrics.F1Score(num_classes=total_classes, average='macro')])

path_checkpoint = os.path.join(filepath,
                               dataset,
                               "best_model_checkpoint")
callback_model_saving = tf.keras.callbacks.ModelCheckpoint(filepath=path_checkpoint,
                                                           monitor='val_f1_score',
                                                           mode='max',
                                                           verbose=1,
                                                           save_best_only=True)
# model.fit accepts a Sequence directly (fit_generator is deprecated in TF 2.x)
history = model.fit(train_generator, epochs=total_epoch, steps_per_epoch=len(train_generator),
                    callbacks=[callbacks, callback_model_saving],
                    validation_data=test_generator, validation_steps=len(test_generator))
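
To check whether the padding really has no effect on the final output, my plan is to compare predictions for the same sequence with and without extra zero rows (a rough sketch; it assumes test_x[0] is a single unpadded sequence of shape (timesteps, features)):

import numpy as np

seq = test_x[0]                                     # one unpadded sequence (assumption)
padded = np.vstack([seq, np.zeros((5, seq.shape[1]), dtype=seq.dtype)])

p_plain = model.predict(seq[np.newaxis, ...])
p_padded = model.predict(padded[np.newaxis, ...])
print(np.allclose(p_plain, p_padded, atol=1e-6))    # True would mean the padding did not change the prediction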