# Though Training accuracy is high performance on training data during inference in transformer translation is poor

Hi, I am trying to code transformer architecture from scratch. I tried it on a toy translation problem from english to german. I see there is a tendency to overfit on the training data as a the validation loss is about twice that of the training loss. But however, when i run the trained model on training data it performs abysmally. I have seen training accuracy upto 0.99. My understanding is even if there is overfitting it i test it on my training data it should perform well, which leads to me think something is wrong with my inference. The notebook is attached. Any help in identifying what i am missing much appreciated!

import tensorflow as tf
from tensorflow import convert_to_tensor, string
from tensorflow.keras.layers import TextVectorization, Embedding, Layer
from tensorflow.data import Dataset
import numpy as np
import matplotlib.pyplot as plt

class PositionalEmbeddingFixedWeights(Layer):
def init(self, seq_len, vocab_size, output_dim, **kwargs):
super().init(**kwargs)
word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
pos_embedding_matrix = self.get_position_encoding(seq_len, output_dim)

``````    self.word_embedding_layer = Embedding(
input_dim=vocab_size, output_dim=output_dim,
weights=[word_embedding_matrix],
trainable=False)

self.position_embedding_layer = Embedding(
input_dim=seq_len, output_dim=output_dim,
weights =[pos_embedding_matrix],
trainable=False)

def get_position_encoding(self, seq_len, d, n=10000):
p = np.zeros((seq_len, d))
for k in range(seq_len):
for i in range(int(d/2)):
denominator = n**(2*i/d)
p[k, 2*i] =  np.sin(k/denominator)
p[k, 2*i+1] = np.cos(k/denominator)
return p
def call(self, inputs):
position_indices = tf.range(tf.shape(inputs)[-1])
embedded_words = self.word_embedding_layer(inputs)
embedded_indices = self.position_embedding_layer(position_indices)
return embedded_words + embedded_indices
``````

from tensorflow import matmul, math, cast, float32
from tensorflow.keras.layers import Layer
from keras.backend import softmax
from tensorflow import math, matmul, reshape, shape, transpose, cast, float32
from tensorflow.keras.layers import Dense, Layer
from tensorflow.keras.backend import softmax

class DotProductAttention(Layer):
def init(self, **kwargs):
super().init(**kwargs)
def call(self, queries, keys, values, d_k, mask=None):
scores = matmul(queries, keys, transpose_b=True)/math.sqrt(cast(d_k, float32))
weights = softmax(scores)
return matmul(weights, values)

def init(self, h, d_k, d_v, d_model, **kwargs):
super().init(**kwargs)
self.attention = DotProductAttention()
self.d_k = d_k
self.d_v = d_v
self.d_model = d_model
self.W_q = Dense(d_k)
self.W_k = Dense(d_k)
self.W_v = Dense(d_v)
self.W_o = Dense(d_model)

``````def reshape_tensor(self, x, heads, flag):
if flag:
x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
x = transpose(x, perm=(0,2,1,3))
else:
#print("shape ", tf.shape(x))
x = transpose(x, perm=(0,2,1,3))
x = reshape(x, shape=(shape(x)[0], shape(x)[1], self.d_k ))
#print("shape1", tf.shape(x))
return x
def call(self, queries, keys, values, mask=None):
o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, self.d_k, mask)
#print("outputshape", tf.shape(output))
return self.W_o(output)
``````

from tensorflow.keras.layers import LayerNormalization, Layer, Dense, ReLU, Dropout

def init(self, **kwargs):
super().init(**kwargs)
self.layer_norm = LayerNormalization()

``````def call(self, x, sublayer_x):
``````

class FeedForward(Layer):
def init(self, d_ff, d_model, **kwargs):
super().init(**kwargs)
self.fully_connected1 = Dense(d_ff)
self.fully_connected2 = Dense(d_model)
self.activation = ReLU()
def call(self, x):
x_fc1 = self.fully_connected1(x)
return self.fully_connected2(self.activation(x_fc1))
class EncoderLayer(Layer):
def init(self, sequence_length, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
super().init(**kwargs)
self.sequence_length = sequence_length
self.d_model = d_model
self.build(input_shape=[None, sequence_length, d_model])
self.dropout1 = Dropout(rate)
self.feed_forward = FeedForward(d_ff, d_model)
self.dropout2 = Dropout(rate)
def build_graph(self):
input_layer = Input(shape=(self.sequence_length, self.d_model))
return Model(inputs=[input_layer], outputs=self.call(input_layer, None, True))

``````def call(self, x, padding_mask, training):
feedforward_output = self.dropout2(feedforward_output, training=training)
``````

class Encoder(Layer):
def init(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
super().init(**kwargs)
self.pos_encoding = PositionalEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
self.dropout = Dropout(rate)
self.encoder_layers = [EncoderLayer(sequence_length, h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]
pos_encoding_output = self.pos_encoding(input_sentence)
x = self.dropout(pos_encoding_output)
for layer in self.encoder_layers:
return x

class DecoderLayer(Layer):
def init(self, sequence_length, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
super().init(**kwargs)
self.d_model = d_model
self.build(input_shape=[None, sequence_length, d_model])
self.sequence_length = sequence_length
self.dropout1 = Dropout(rate)
self.dropout2 = Dropout(rate)
self.feed_forward = FeedForward(d_ff, d_model)
self.dropout3 = Dropout(rate)

``````def build_graph(self):
input_layer = Input(shape=(self.sequence_length, self.d_model))
return Model(inputs=[input_layer], outputs=self.call(input_layer, input_layer, None, None, True))

feedforward_output = self.dropout3(feedforward_output, training=training)
``````

class Decoder(Layer):
def init(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
super().init(**kwargs)
self.pos_encoding = PositionalEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
self.dropout = Dropout(rate)
self.decoder_layers = [DecoderLayer(sequence_length, h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]
pos_encoding_output = self.pos_encoding(output_target)
x = self.dropout(pos_encoding_output, training=training)
for layer in self.decoder_layers:
return x

from tensorflow import math, cast, float32
from tensorflow import linalg, ones
from tensorflow import math, cast, float32, linalg, ones, maximum, newaxis
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import Input
class TransformerModel(Model):
def init(self, enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length,
h, d_k, d_v, d_model, d_ff_inner, n, rate, **kwargs):
super().init(**kwargs)
self.encoder = Encoder(enc_vocab_size, enc_seq_length, h, d_k, d_v, d_model, d_ff_inner, n, rate)
self.decoder = Decoder(dec_vocab_size, dec_seq_length, h, d_k, d_v, d_model, d_ff_inner, n, rate)
self.model_last_layer = Dense(dec_vocab_size)
mask = linalg.band_part(ones((shape, shape)), 0, -1)
def call(self, encoder_input, decoder_input, training):

``````    encoder_output = self.encoder(encoder_input, enc_padding_mask, training)
training)
model_output = self.model_last_layer(decoder_output)
``````

# tf.print(“DecInputShape”, decoder_input.shape, “DecOutputShape”, decoder_output.shape)

``````    return model_output
``````

#Training
from numpy.random import shuffle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import convert_to_tensor, int64
from pickle import load, dump, HIGHEST_PROTOCOL
from numpy.random import shuffle
from numpy import savetxt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import convert_to_tensor, int64
#from prepare_dataset import PrepareDataset

class PrepareDataset:
def init(self, **kwargs):
super().init(**kwargs)
self.n_sentences = 10000
self.train_split = 0.8
self.val_split = 0.1
def create_tokenizer(self, dataset):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset)
def find_seq_length(self, dataset):
return max(len(seq.split()) for seq in dataset)
def find_vocab_size(self, tokenizer, dataset):
tokenizer.fit_on_texts(dataset)
return len(tokenizer.word_index) + 1
x = tokenizer.texts_to_sequences(dataset)
x = convert_to_tensor(x, dtype=int64)
return x
def save_tokenizer(self, tokenizer, name):
with open(name + “_tokenizer.pkl”, ‘wb’) as handle:
dump(tokenizer, handle, protocol=HIGHEST_PROTOCOL)
def call(self, filename, **kwargs):
dataset = clean_dataset[:self.n_sentences, :]
for i in range(len(dataset[:,0])):
if i < 10:
print(dataset[i,:])
dataset[i, 0] = “ " + dataset[i,0] + " ”
dataset[i, 1] = “ " + dataset[i,1] + " ”
shuffle(dataset)
train = dataset[:int(len(dataset)self.train_split)]
val = dataset[int(len(dataset)self.train_split):int(len(dataset)(1-self.val_split))]
test = dataset[int(len(dataset)
(1-self.val_split)):]

``````    enc_tokenizer = self.create_tokenizer(dataset[:, 0])
enc_seq_length = self.find_seq_length(dataset[:, 0])
enc_vocab_size = self.find_vocab_size(enc_tokenizer, train[:, 0])

dec_tokenizer = self.create_tokenizer(dataset[:, 1])
dec_seq_length = self.find_seq_length(dataset[:, 1])
dec_vocab_size = self.find_vocab_size(dec_tokenizer, train[:, 1])

trainX = self.encode_pad(train[:, 0], enc_tokenizer, enc_seq_length)
trainY = self.encode_pad(train[:, 1], dec_tokenizer, dec_seq_length)
valX = self.encode_pad(val[:, 0], enc_tokenizer, enc_seq_length)
valY = self.encode_pad(val[:, 1], dec_tokenizer, dec_seq_length)

self.save_tokenizer(enc_tokenizer, "enc")
self.save_tokenizer(dec_tokenizer, "dec")
savetxt("test_dataset.txt", test, fmt="%s")
return (trainX, trainY, valX, valY, train, val, enc_seq_length, dec_seq_length,
enc_vocab_size, dec_vocab_size)
``````

from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.metrics import Mean
from tensorflow import data, train, math, reduce_sum, cast, equal, argmax,
from tensorflow.keras.losses import sparse_categorical_crossentropy
from time import time
from pickle import dump

# Define the model parameters

h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_model = 512 # Dimensionality of model layers’ outputs
d_ff = 2048 # Dimensionality of the inner fully connected layer
n = 6 # Number of layers in the encoder stack

# n = 2 # Number of layers in the encoder stack

epochs = 160
batch_size = 64
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9
dropout_rate = 0.3

class LRScheduler(LearningRateSchedule):
def init(self, d_model, warmup_steps=4000, **kwargs):
super().init(**kwargs)
self.d_model = cast(d_model, float32)
self.warmup_steps = warmup_steps
def call(self, step_num):
# Linearly increasing the learning rate for the first warmup_steps, and
# decreasing it thereafter
arg1 = cast(step_num, float32) ** -0.5
arg2 = cast(step_num, float32) * (self.warmup_steps ** -1.5)
return (self.d_model ** -0.5) * math.minimum(arg1, arg2)
optimizer = Adam(LRScheduler(d_model), beta_1, beta_2, epsilon)

dataset = PrepareDataset()
trainX, trainY, valX, valY, train_orig, val_orig, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size
= dataset(‘data/Neural-Machine-Translation-System/english-german-both.pkl’)

print(“Sizes:”, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size)

# Prepare the training dataset batches

train_dataset = data.Dataset.from_tensor_slices((trainX, trainY))
train_dataset = train_dataset.batch(batch_size)

# Prepare the validation dataset batches

val_dataset = data.Dataset.from_tensor_slices((valX, valY))
val_dataset = val_dataset.batch(batch_size)

# Create model

training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length,
dec_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)

# Defining the loss function

def loss_fcn(target, prediction):
# Create mask so that the zero padding values are not included in the
# computation of loss
# Compute a sparse categorical cross-entropy loss on the unmasked values
loss = sparse_categorical_crossentropy(target, prediction, from_logits=True) * mask
# Compute the mean loss over the unmasked values
def accuracy_fcn(target, prediction):
# Create mask so that the zero padding values are not included in the
# computation of accuracy
# Find equal prediction and target values, and apply the padding mask
accuracy = equal(target, argmax(prediction, axis=2))
# Cast the True/False values to 32-bit-precision floating-point numbers
accuracy = cast(accuracy, float32)
# Compute the mean accuracy over the unmasked values

# Include metrics monitoring

train_loss = Mean(name=‘train_loss’)
train_accuracy = Mean(name=‘train_accuracy’)
val_loss = Mean(name=‘val_loss’)

# Create a checkpoint object and manager to manage multiple checkpoints

ckpt = train.Checkpoint(model=training_model, optimizer=optimizer)
ckpt_manager = train.CheckpointManager(ckpt, “./checkpoints”, max_to_keep=None)

# Initialise dictionaries to store the training and validation losses

train_loss_dict = {}
val_loss_dict = {}

@function
def train_step(encoder_input, decoder_input, decoder_output):
# Run the forward pass of the model to generate a prediction
prediction = training_model(encoder_input, decoder_input, training=True)
# Compute the training loss
loss = loss_fcn(decoder_output, prediction)
# Compute the training accuracy
accuracy = accuracy_fcn(decoder_output, prediction)
# Retrieve gradients of the trainable variables with respect to the training loss
# Update the values of the trainable variables by gradient descent
train_loss(loss)
train_accuracy(accuracy)

for epoch in range(epochs):
train_loss.reset_states()
train_accuracy.reset_states()
val_loss.reset_states()
print(“\nStart of epoch %d” % (epoch + 1))
start_time = time()
# Iterate over the dataset batches
for step, (train_batchX, train_batchY) in enumerate(train_dataset):
# Define the encoder and decoder inputs, and the decoder output
encoder_input = train_batchX[:, 1:]
decoder_input = train_batchY[:, :-1]
decoder_output = train_batchY[:, 1:]
#print(“step”, step, “calling train_step”)
train_step(encoder_input, decoder_input, decoder_output)
if step % 50 == 0:
print(f"Epoch {epoch + 1} Step {step} Loss {train_loss.result():.4f} "
+ f"Accuracy {train_accuracy.result():.4f}“)
# Run a validation step after every epoch of training
for val_batchX, val_batchY in val_dataset:
# Define the encoder and decoder inputs, and the decoder output
encoder_input = val_batchX[:, 1:]
decoder_input = val_batchY[:, :-1]
decoder_output = val_batchY[:, 1:]
# Generate a prediction
prediction = training_model(encoder_input, decoder_input, training=False)
# Compute the validation loss
#tf.print(“dcoder_outputshape”, decoder_output.shape, “prediction shape”, prediction.shape)
loss = loss_fcn(decoder_output, prediction)
val_loss(loss)
# Print epoch number and accuracy and loss values at the end of every epoch
print(f"Epoch {epoch+1}: Training Loss {train_loss.result():.4f}, "
+ f"Training Accuracy {train_accuracy.result():.4f}, "
+ f"Validation Loss {val_loss.result():.4f}”)
# Save a checkpoint after every epoch
if (epoch + 1) % 1 == 0:
save_path = ckpt_manager.save()
print(f"Saved checkpoint at epoch {epoch+1}")
# Save the trained model weights
training_model.save_weights(“weights/wghtstemp” + str(epoch + 1) + “.ckpt”)
train_loss_dict[epoch] = train_loss.result()
val_loss_dict[epoch] = val_loss.result()

# Save the training loss values

with open(‘./train_loss.pkl’, ‘wb’) as file:
dump(train_loss_dict, file)

# Save the validation loss values

with open(‘./val_loss.pkl’, ‘wb’) as file:
dump(val_loss_dict, file)
print(“Total time taken: %.2fs” % (time() - start_time))

### Inference

from tensorflow import Module
from tensorflow import convert_to_tensor, int64, TensorArray, argmax, newaxis, transpose
#from translate import Translate

Define the model parameters
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_model = 512 # Dimensionality of model layers’ outputs
d_ff = 2048 # Dimensionality of the inner fully connected layer
n = 6 # Number of layers in the encoder stack

# Define the dataset parameters

enc_seq_length = 7 # Encoder sequence length
dec_seq_length = 12 # Decoder sequence length
enc_vocab_size = 2404 # Encoder vocabulary size
dec_vocab_size = 3864 # Decoder vocabulary size

# Create model

inferencing_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, h, d_k, d_v, d_model, d_ff, n, 0)

class Translate_1(Module):
def init(self, inferencing_model, **kwargs):
super().init(**kwargs)
self.transformer = inferencing_model

``````def load_tokenizer(self, name):
with open(name, 'rb') as handle:

def __call__(self, sentence):
# Append start and end of string tokens to the input sentence
sentence[0] = "<START> " + sentence[0] + " <EOS>"

# Load encoder and decoder tokenizers

# Prepare the input sentence by tokenizing, padding and converting to tensor
encoder_input = enc_tokenizer.texts_to_sequences(sentence)
encoder_input = convert_to_tensor(encoder_input, dtype=int64)
print("encoder_input", encoder_input)
# Prepare the output <START> token by tokenizing, and converting to tensor
output_start = dec_tokenizer.texts_to_sequences(["<START>"])
output_start = convert_to_tensor(output_start[0], dtype=int64)

# Prepare the output <EOS> token by tokenizing, and converting to tensor
output_end = dec_tokenizer.texts_to_sequences(["<EOS>"])
output_end = convert_to_tensor(output_end[0], dtype=int64)

# Prepare the output array of dynamic size
decoder_output = TensorArray(dtype=int64, size=0, dynamic_size=True)
decoder_output = decoder_output.write(0, output_start)

for i in range(dec_seq_length):
# Predict an output token
predictionorig = self.transformer(encoder_input,transpose(decoder_output.stack()),
training=False)
#print("predshape", predictionorig.shape)
prediction = predictionorig[:, -1, :]
#print("prediction", prediction)
output_str = []
prediction_output = TensorArray(dtype=int64, size=0, dynamic_size=True)

prediction_output = prediction_output.write(0, output_start)

# Decode the predicted tokens into an output string
for k in range(predictionorig.shape[-2]):
key = argmax(predictionorig[:, k, :][0])[newaxis]
#print("key is", key)
prediction_output = prediction_output.write(k+1, key)

pred_output = transpose(prediction_output.stack())[0]
pred_output = pred_output.numpy()

output_str = []

# Decode the predicted tokens into an output string
for k in range(pred_output.shape[0]):
key = pred_output[k]
output_str.append(dec_tokenizer.index_word[key])
print("PredOutputString", output_str, predictionorig.shape[-2])

# Select the prediction with the highest score
predicted_id = argmax(prediction, axis=-1)
predicted_id = predicted_id[0][newaxis]
#print("predicted_idshape", predicted_id.shape)
# Write the selected prediction to the output array at the next
# available index
decoder_output = decoder_output.write(i + 1, predicted_id)

# Break if an <EOS> token is predicted
if predicted_id == output_end:
#print("breaking")
break

output = transpose(decoder_output.stack())[0]
output = output.numpy()

output_str = []

# Decode the predicted tokens into an output string
for i in range(output.shape[0]):
key = output[i]
output_str.append(dec_tokenizer.index_word[key])

return output_str
``````

sentences = [[‘i like both’, ‘ich mag beide’],
[‘she misses him’, ‘er fehlt ihr’],
[‘i followed him’, ‘ich folgte ihm’],
[‘its unusual’, ‘es ist ungewohnlich’],
[‘she sounded mad’, ‘sie klang wutend’],
[‘this is nothing’, ‘das ist nichts’],
[‘good evening’, ‘guten abend’],
[‘we cant escape’, ‘wir konnen nicht entkommen’],
[‘he is my type’, ‘er ist mein typ’],
[‘i want my mommy’, ‘ich will zu meiner mama’]]

for pair in sentences:
print(pair[0], pair[1])