Model does not train properly when explicitly applying the gradients

Hi all,
I’m trying to constrain the weights of my model by explicitly applying the gradients; however, this is not working and I can’t figure out why.

I’m defining the model with the following function:

import tensorflow as tf

def init_model(num_hidden_layers=2, num_neurons_per_layer=64):
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(2,)))
    for _ in range(num_hidden_layers):
        model.add(tf.keras.layers.Dense(num_neurons_per_layer,
                                        activation=tf.keras.layers.LeakyReLU(),
                                        kernel_initializer="glorot_uniform"))
    model.add(tf.keras.layers.Dense(1, kernel_initializer="glorot_uniform"))
    return model

When using the fit method, the loss function decreases and the model fits the data:

Nepochs = 1500
lr      = 0.001

def my_loss(u_true, u_pred):
    return tf.math.reduce_mean(tf.math.square(u_true - u_pred))

model_0 = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_0 = tf.keras.optimizers.Adam(learning_rate=lr)
model_0.compile(loss=my_loss, optimizer=optim_0)
model_0.summary()
history_0 = model_0.fit(X_train, u_train,
                        validation_data=(X_test.numpy(), u_test.numpy()),
                        epochs=Nepochs, batch_size=X_train.shape[0])

When I explicitly compute and apply the gradients, the loss function stagnates and the model output does not fit the data (it is uniform everywhere):

Nepochs = 1500
lr      = 0.001

def compute_loss(model, X_data, u_data):
    u_pred = model(X_data)
    loss = tf.math.reduce_mean(tf.math.square(u_data - u_pred))
    return loss

@tf.function
def training(model, optim, X_train, u_train, X_test=None, u_test=None):
    if X_test is not None:
        validation_loss  = compute_loss(model, X_test, u_test )
    else:
        validation_loss = None    
    with tf.GradientTape(persistent=True) as tape:
        tape.watch(model.trainable_variables)
        loss = compute_loss(model, X_train, u_train )
    grad_theta = tape.gradient(loss, model.trainable_variables)
    optim.apply_gradients(zip(grad_theta, model.trainable_variables))        
    return loss,validation_loss

model_G = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_G = tf.keras.optimizers.Adam(learning_rate=lr)
model_G.summary()

hist = {'val_loss': [], 'loss': []}
for i in range(Nepochs+1):
    loss, val_loss = training(model_G, optim_G, X_train, u_train, X_test, u_test)
    hist['loss'].append(loss.numpy())
    if val_loss is not None:
        hist['val_loss'].append(val_loss.numpy())
        print('It {:05d}: loss = {:10.8e}, validation loss = {:10.8e}'.format(i, loss, val_loss))
    else:
        print('It {:05d}: loss = {:10.8e}'.format(i, loss))

Why do the two versions provide different results?

Thanks for the help.

Cesare

I found that expanding the dimension of the targets as follows:

u_train = tf.expand_dims(u_train,axis=-1)
u_test = tf.expand_dims(u_test,axis=-1)

makes the model train properly, and the loss functions are evaluated correctly.
u_train and u_test previously had shape (N,), where N is the number of samples; after expanding the dimension, their shape is (N, 1).
With fit, the code works with both shapes; with the explicit gradient computation, it only works with targets of shape (N, 1).
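
For anyone hitting the same issue, I believe the cause is broadcasting inside the custom loss: the model output has shape (N, 1), so u_data - u_pred with u_data of shape (N,) broadcasts to an (N, N) matrix, and the reduced mean is no longer the per-sample mean squared error. As far as I understand, Keras matches the shapes of targets and predictions before calling the loss when you use fit, which is why the first version still trains. A minimal sketch illustrating the shape mismatch (the small tensors below are just placeholders, not the actual training data):

import tensorflow as tf

N = 4
u_true = tf.constant([1.0, 2.0, 3.0, 4.0])          # shape (N,), like the original targets
u_pred = tf.constant([[1.1], [2.1], [3.1], [4.1]])  # shape (N, 1), like the model output

diff = u_true - u_pred                        # broadcasts to shape (N, N)
print(diff.shape)                             # (4, 4)
print(tf.reduce_mean(tf.square(diff)))        # mixes every target with every prediction

u_true_col = tf.expand_dims(u_true, axis=-1)  # shape (N, 1), as in the fix above
print((u_true_col - u_pred).shape)            # (4, 1)
print(tf.reduce_mean(tf.square(u_true_col - u_pred)))  # the intended MSE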