Different results between fit and appy_grad

I’m trying to train a deep neural network (DNN) that approximates the following function, defined on the 2D square, that describes a circle with a radius equal to 0.25 :

lb = tf.constant([0,0], dtype=np.float32)
ub = tf.constant([1,1], dtype=np.float32)

def fun_u_0(xx):
    c_0  = 0.5*(lb+ub)
    rr   = 0.25*np.min(ub-lb)    
    dsq  = tf.math.reduce_sum( (xx-c_0)*(xx-c_0),axis=1)
    return(tf.where( dsq<=rr*rr, 1.0, 0.0) )

I’m using the following function to initialise the DNN:

def init_model(num_hidden_layers=2, num_neurons_per_layer=64):
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(2,)) )  
    for _ in range(num_hidden_layers):
        model.add(tf.keras.layers.Dense(num_neurons_per_layer, activation='relu')  )    
    return model

After sampling 4096 from uniform distributions on both coordinates:

N_0 = 4096
X_data = tf.random.uniform((N_0,2), lb, ub, dtype=np.float32)
u_data = fun_u_0(X_data)

I’m using 80% of them as the training set and 20% as a test set.

Using the fit function:

model_0 = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_0 = tf.keras.optimizers.Adam(learning_rate=lr)
model_0.compile(loss='mean_squared_error', optimizer=optim_0)
history = model_0.fit(X_data,u_data,validation_split=0.2, epochs=Nepochs)

the network is trained correctly (both the loss functions decrease; the output of the network is very close to the function I want to approximate)

Then, I defined the following functions to compute the loss, the gradient of the loss w.r.t. the training variables and to perform a training step:

def compute_loss(model, X_data, u_data):
    u_pred = model(X_data)
    loss = tf.math.reduce_mean(tf.math.square(u_data - u_pred))
    return loss

def get_grad(model, X_train, u_train,X_test=None,u_test=None):
    print('get_grad invoked')
    if X_test is not None:
        validation_loss  = compute_loss(model, X_test, u_test )
        validation_loss = None
    with tf.GradientTape(persistent=True) as tape:
        loss = compute_loss(model, X_train, u_train )
    g = tape.gradient(loss, model.trainable_variables)
    del tape
    return loss, g,validation_loss

def train_step():
    test_len = int(0.2*X_data.shape[0])
    X_train = X_data[:test_len]
    u_train = u_data[:test_len]
    X_test  = X_data[test_len:]
    u_test  = u_data[test_len:]
    loss, grad_theta,validation_loss = get_grad(model,  X_train, u_train,X_test,u_test)
    optim.apply_gradients(zip(grad_theta, model.trainable_variables))    
    return loss,validation_loss

Finally, I tried to train the DNN by explicitly applying the gradient:

hist = {'val_loss':[],'loss':[]}
for i in range(Nepochs+1):
    loss, val_loss = train_step()
    if i%50 == 0:
        print('It {:05d}: loss = {:10.8e}'.format(i,loss))

However, the loss function no longer decreases with the epochs. I also noticed that the gradient (function get_grad) is computed only twice (the output “gradient_invoked” appeared 2 times only at the beginning).

What I’m doing wrong? I tried to switch the attribute persistent to False but nothing happened.