TensorFlow Base and Keras Model Give Different Results

I have been trying to learn how to use TensorFlow base (the Core APIs) to build ANNs, as opposed to just using the Keras API. TensorFlow has a nice tutorial on getting started with this on the MNIST digits dataset:
Multilayer perceptrons for digit recognition with Core APIs  |  TensorFlow Core. I wanted to change this up a little and modify it to do regression instead of classification.

Basically, all the code is as follows:

import tensorflow as tf
import numpy as np
from tensorflow import keras
import pdb

def xavier_init(shape):
    # Computes the Xavier initialization values for a weight matrix
    in_dim, out_dim = shape
    xavier_lim = tf.sqrt(6.) / tf.sqrt(tf.cast(in_dim + out_dim, tf.float32))
    weight_vals = tf.random.uniform(shape=(in_dim, out_dim), minval=-xavier_lim, maxval=xavier_lim, seed=22)
    return weight_vals

class DenseLayer(tf.Module):
    def __init__(self, out_dim, weight_init=xavier_init, activation=tf.identity):
        super().__init__()
        self.out_dim = out_dim
        self.activation = activation
        self.built = False
        self.weight_init = weight_init

    def __call__(self, x):
        if not self.built:
            self.in_dim = x.shape[1]
            self.w = tf.Variable(xavier_init(shape=(self.in_dim, self.out_dim)))
            self.b = tf.Variable(tf.zeros(shape=(self.out_dim,)))
            self.built = True
        z = tf.add(tf.matmul(x, self.w), self.b)
        return self.activation(z)

class LinearLayer(tf.Module):
    def __init__(self, out_dim, weight_init=xavier_init):
        super().__init__()
        self.out_dim = out_dim
        self.built = False
        self.weight_init = weight_init

    def __call__(self, x):
        if not self.built:
            self.in_dim = x.shape[1]
            self.w = tf.Variable(xavier_init(shape=(self.in_dim, self.out_dim)))
            self.b = tf.Variable(tf.zeros(shape=(self.out_dim,)))
            self.built = True
        z = tf.add(tf.matmul(x, self.w), self.b)
        return z

class MLP_REG(tf.Module):
    def __init__(self, layers):
        self.layers = layers

    @tf.function
    def __call__(self, x, preds=False):
        for layer in self.layers:
            x = layer(x)
        return x

def mse_loss(ypred, y):
    return tf.reduce_mean(tf.square(ypred - y))

def accuracy(ypred, y):
    loss = tf.math.reduce_sum(tf.square(ypred - y))
    return loss

class Adam:

    def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999, ep=1e-7):
        # Initialize optimizer parameters and variable slots
        super().__init__()
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.learning_rate = learning_rate
        self.ep = ep
        self.t = 1.
        self.v_dvar, self.s_dvar = [], []
        self.built = False

    def apply_gradients(self, grads, vars):
        # Initialize variables on the first call
        if not self.built:
            for var in vars:
                v = tf.Variable(tf.zeros(shape=var.shape))
                s = tf.Variable(tf.zeros(shape=var.shape))
                self.v_dvar.append(v)
                self.s_dvar.append(s)
            self.built = True
        # Update the model variables given their gradients
        for i, (d_var, var) in enumerate(zip(grads, vars)):
            self.v_dvar[i].assign(self.beta_1*self.v_dvar[i] + (1-self.beta_1)*d_var)
            self.s_dvar[i].assign(self.beta_2*self.s_dvar[i] + (1-self.beta_2)*tf.square(d_var))
            v_dvar_bc = self.v_dvar[i]/(1-(self.beta_1**self.t))
            s_dvar_bc = self.s_dvar[i]/(1-(self.beta_2**self.t))
            var.assign_sub(self.learning_rate*(v_dvar_bc/(tf.sqrt(s_dvar_bc) + self.ep)))
        self.t += 1.
        return

def train_step(x_batch, y_batch, loss, acc, model, optimizer):
    # Update the model state given a batch of data
    with tf.GradientTape() as tape:
        y_pred = model(x_batch)
        batch_loss = loss(y_pred, y_batch)
    batch_acc = acc(y_pred, y_batch)
    grads = tape.gradient(batch_loss, model.variables)
    optimizer.apply_gradients(grads, model.variables)
    return batch_loss, batch_acc

def val_step(x_batch, y_batch, loss, acc, model):
    # Evaluate the model on a given batch of validation data
    y_pred = model(x_batch)
    batch_loss = loss(y_pred, y_batch)
    batch_acc = acc(y_pred, y_batch)
    return batch_loss, batch_acc

def train_model(mlp, train_data, val_data, loss, acc, optimizer, epochs):
    # Initialize data structures
    train_losses, train_accs = [], []
    val_losses, val_accs = [], []
    for epoch in range(epochs):
        batch_losses_train, batch_accs_train = [], []
        batch_losses_val, batch_accs_val = [], []

        # Iterate over the training data
        for x_batch, y_batch in train_data:
            # Compute gradients and update the model's parameters
            batch_loss, batch_acc = train_step(x_batch, y_batch, loss, acc, mlp, optimizer)
            # Keep track of batch-level training performance
            batch_losses_train.append(batch_loss)
            batch_accs_train.append(batch_acc)

        # Iterate over the validation data
        for x_batch, y_batch in val_data:
            batch_loss, batch_acc = val_step(x_batch, y_batch, loss, acc, mlp)
            batch_losses_val.append(batch_loss)
            batch_accs_val.append(batch_acc)

        train_loss, train_acc = tf.reduce_mean(batch_losses_train), tf.reduce_mean(batch_accs_train)
        val_loss, val_acc = tf.reduce_mean(batch_losses_val), tf.reduce_mean(batch_accs_val)
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        print(f"Epoch: {epoch}")
        print(f"Training loss: {train_loss:.3f}, Training accuracy: {train_acc:.3f}")
        print(f"Validation loss: {val_loss:.3f}, Validation accuracy: {val_acc:.3f}")
    return train_losses, train_accs, val_losses, val_accs

For a dataset, I used the sklearn diabetes dataset:

import sklearn.datasets as ds
DS=ds.load_diabetes()
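
To feed this to train_model, I turn it into batched tf.data datasets. The sketch below is roughly what I do, assuming the same batch size of 30 and 75/25 train/validation split that appear in the Keras fit further down (the exact numbers in my script may differ slightly):

x_all = DS.data.astype(np.float32)                     # (442, 10) feature matrix
y_all = DS.target.astype(np.float32).reshape(-1, 1)    # (442, 1) regression targets

# 75/25 train/validation split, then batches of 30 examples for the Core-API training loop
n_train = int(0.75 * x_all.shape[0])
train_data = tf.data.Dataset.from_tensor_slices((x_all[:n_train], y_all[:n_train])).batch(30)
val_data = tf.data.Dataset.from_tensor_slices((x_all[n_train:], y_all[n_train:])).batch(30)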

Anyways, I use this to build a model with two hidden layers of 300 and 150 units, both with ReLU activation, using mean squared error as the loss.

I train this for 10 epochs and the loss on the training data is on the order of 7000.
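
Concretely, the way I wire the pieces above together is roughly this (a sketch of my script, not a verbatim copy):

# Two ReLU hidden layers of 300 and 150 units, plus a linear output for regression
mlp_reg = MLP_REG(layers=[
    DenseLayer(out_dim=300, activation=tf.nn.relu),
    DenseLayer(out_dim=150, activation=tf.nn.relu),
    LinearLayer(out_dim=1)])

optimizer = Adam(learning_rate=1e-3)

# Train for 10 epochs with MSE as the loss and summed squared error as the "accuracy"
train_losses, train_accs, val_losses, val_accs = train_model(
    mlp_reg, train_data, val_data,
    loss=mse_loss, acc=accuracy, optimizer=optimizer, epochs=10)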

However, if I use Keras as follows:

from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential([
    keras.layers.Dense(300, activation='relu'),
    keras.layers.Dense(150, activation='relu'),
    keras.layers.Dense(1)])

model.compile(optimizer='Adam', loss='mse')
model.fit(train_data, batch_size=30, validation_split=0.25, epochs=10)

After this, the training loss on this model is about 5300. I feel like the two should be pretty much the same, and I am not sure why they are different. The parameters for the Adam optimizer are the same. I also know that, for some reason, the TensorFlow base version is running on the CPU; I am not sure why, or whether that would influence the results. Is Keras using a different, more complex optimization that would make it approach a local minimum faster?
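
For what it's worth, a quick way to check both of these points (which hyperparameters Keras actually uses when you pass optimizer='Adam' as a string, and whether a GPU is visible) is something like:

# Keras Adam defaults: learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, among other fields
print(keras.optimizers.Adam().get_config())

# Check whether TensorFlow can see a GPU at all
print(tf.config.list_physical_devices('GPU'))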