Though Training accuracy is high performance on training data during inference in transformer translation is poor

Hi, I am trying to code transformer architecture from scratch. I tried it on a toy translation problem from english to german. I see there is a tendency to overfit on the training data as a the validation loss is about twice that of the training loss. But however, when i run the trained model on training data it performs abysmally. I have seen training accuracy upto 0.99. My understanding is even if there is overfitting it i test it on my training data it should perform well, which leads to me think something is wrong with my inference. The notebook is attached. Any help in identifying what i am missing much appreciated!

import tensorflow as tf
from tensorflow import convert_to_tensor, string
from tensorflow.keras.layers import TextVectorization, Embedding, Layer
from tensorflow.data import Dataset
import numpy as np
import matplotlib.pyplot as plt

class PositionalEmbeddingFixedWeights(Layer):
def init(self, seq_len, vocab_size, output_dim, **kwargs):
super().init(**kwargs)
word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
pos_embedding_matrix = self.get_position_encoding(seq_len, output_dim)

    self.word_embedding_layer = Embedding(
    input_dim=vocab_size, output_dim=output_dim, 
    weights=[word_embedding_matrix],
    trainable=False)
    
    self.position_embedding_layer = Embedding(
    input_dim=seq_len, output_dim=output_dim,
    weights =[pos_embedding_matrix],
    trainable=False)

def get_position_encoding(self, seq_len, d, n=10000):
    p = np.zeros((seq_len, d))
    for k in range(seq_len):
        for i in range(int(d/2)):
            denominator = n**(2*i/d)
            p[k, 2*i] =  np.sin(k/denominator)
            p[k, 2*i+1] = np.cos(k/denominator)
    return p
def call(self, inputs):
    position_indices = tf.range(tf.shape(inputs)[-1])
    embedded_words = self.word_embedding_layer(inputs)
    embedded_indices = self.position_embedding_layer(position_indices)
    return embedded_words + embedded_indices

from tensorflow import matmul, math, cast, float32
from tensorflow.keras.layers import Layer
from keras.backend import softmax
from tensorflow import math, matmul, reshape, shape, transpose, cast, float32
from tensorflow.keras.layers import Dense, Layer
from tensorflow.keras.backend import softmax

class DotProductAttention(Layer):
def init(self, **kwargs):
super().init(**kwargs)
def call(self, queries, keys, values, d_k, mask=None):
scores = matmul(queries, keys, transpose_b=True)/math.sqrt(cast(d_k, float32))
if mask is not None:
scores += 1e-9 * mask
weights = softmax(scores)
return matmul(weights, values)

class MultiHeadAttention(Layer):
def init(self, h, d_k, d_v, d_model, **kwargs):
super().init(**kwargs)
self.attention = DotProductAttention()
self.heads = h
self.d_k = d_k
self.d_v = d_v
self.d_model = d_model
self.W_q = Dense(d_k)
self.W_k = Dense(d_k)
self.W_v = Dense(d_v)
self.W_o = Dense(d_model)

def reshape_tensor(self, x, heads, flag):
    if flag:
        x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
        x = transpose(x, perm=(0,2,1,3))
    else:
        #print("shape ", tf.shape(x))
        x = transpose(x, perm=(0,2,1,3))
        x = reshape(x, shape=(shape(x)[0], shape(x)[1], self.d_k ))
        #print("shape1", tf.shape(x))
    return x
def call(self, queries, keys, values, mask=None):
    q_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
    k_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
    v_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)
    o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, self.d_k, mask)
    output = self.reshape_tensor(o_reshaped, self.heads, False)
    #print("outputshape", tf.shape(output))
    return self.W_o(output)

from tensorflow.keras.layers import LayerNormalization, Layer, Dense, ReLU, Dropout

class AddNormalization(Layer):
def init(self, **kwargs):
super().init(**kwargs)
self.layer_norm = LayerNormalization()

def call(self, x, sublayer_x):
    add = x+ sublayer_x
    return self.layer_norm(add)

class FeedForward(Layer):
def init(self, d_ff, d_model, **kwargs):
super().init(**kwargs)
self.fully_connected1 = Dense(d_ff)
self.fully_connected2 = Dense(d_model)
self.activation = ReLU()
def call(self, x):
x_fc1 = self.fully_connected1(x)
return self.fully_connected2(self.activation(x_fc1))
class EncoderLayer(Layer):
def init(self, sequence_length, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
super().init(**kwargs)
self.sequence_length = sequence_length
self.d_model = d_model
self.build(input_shape=[None, sequence_length, d_model])
self.multihead_attention = MultiHeadAttention(h, d_k, d_v, d_model)
self.dropout1 = Dropout(rate)
self.add_norm1 = AddNormalization()
self.feed_forward = FeedForward(d_ff, d_model)
self.dropout2 = Dropout(rate)
self.add_norm2 = AddNormalization()
def build_graph(self):
input_layer = Input(shape=(self.sequence_length, self.d_model))
return Model(inputs=[input_layer], outputs=self.call(input_layer, None, True))

def call(self, x, padding_mask, training):
    multihead_output = self.multihead_attention(x,x,x,padding_mask)
    multihead_output = self.dropout1(multihead_output, training= training)
    addnorm_output = self.add_norm1(x, multihead_output)
    feedforward_output = self.feed_forward(addnorm_output)
    feedforward_output = self.dropout2(feedforward_output, training=training)
    return self.add_norm2(addnorm_output, feedforward_output)

class Encoder(Layer):
def init(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
super().init(**kwargs)
self.pos_encoding = PositionalEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
self.dropout = Dropout(rate)
self.encoder_layers = [EncoderLayer(sequence_length, h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]
def call(self, input_sentence, padding_mask, training):
pos_encoding_output = self.pos_encoding(input_sentence)
x = self.dropout(pos_encoding_output)
for layer in self.encoder_layers:
x = layer(x, padding_mask, training)
return x

class DecoderLayer(Layer):
def init(self, sequence_length, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
super().init(**kwargs)
self.d_model = d_model
self.build(input_shape=[None, sequence_length, d_model])
self.sequence_length = sequence_length
self.multihead_attention1 = MultiHeadAttention(h, d_k, d_v, d_model)
self.dropout1 = Dropout(rate)
self.add_norm1 = AddNormalization()
self.multihead_attention2 = MultiHeadAttention(h, d_k, d_v, d_model)
self.dropout2 = Dropout(rate)
self.add_norm2 = AddNormalization()
self.feed_forward = FeedForward(d_ff, d_model)
self.dropout3 = Dropout(rate)
self.add_norm3 = AddNormalization()

def build_graph(self):
    input_layer = Input(shape=(self.sequence_length, self.d_model))
    return Model(inputs=[input_layer], outputs=self.call(input_layer, input_layer, None, None, True))
def call(self, x, encoder_output, lookahead_mask, padding_mask, training):
    multiheadoutput1 = self.multihead_attention1(x,x,x,lookahead_mask)
    multiheadoutput1 = self.dropout1(multiheadoutput1, training=training)
    addnorm_output1 = self.add_norm1(x, multiheadoutput1)
    
    multiheadoutput2 = self.multihead_attention2(addnorm_output1, encoder_output, encoder_output, padding_mask)
    multiheadoutput2 = self.dropout2(multiheadoutput2, training=training)
    addnorm_output2 = self.add_norm1(addnorm_output1, multiheadoutput2)
    feedforward_output = self.feed_forward(addnorm_output2)
    feedforward_output = self.dropout3(feedforward_output, training=training)
    return self.add_norm3(addnorm_output2, feedforward_output)

class Decoder(Layer):
def init(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
super().init(**kwargs)
self.pos_encoding = PositionalEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
self.dropout = Dropout(rate)
self.decoder_layers = [DecoderLayer(sequence_length, h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]
def call(self, output_target, encoder_output, lookahead_mask, padding_mask, training):
pos_encoding_output = self.pos_encoding(output_target)
x = self.dropout(pos_encoding_output, training=training)
for layer in self.decoder_layers:
x = layer(x, encoder_output, lookahead_mask, padding_mask, training)
return x

from tensorflow import math, cast, float32
from tensorflow import linalg, ones
from tensorflow import math, cast, float32, linalg, ones, maximum, newaxis
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import Input
class TransformerModel(Model):
def init(self, enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length,
h, d_k, d_v, d_model, d_ff_inner, n, rate, **kwargs):
super().init(**kwargs)
self.encoder = Encoder(enc_vocab_size, enc_seq_length, h, d_k, d_v, d_model, d_ff_inner, n, rate)
self.decoder = Decoder(dec_vocab_size, dec_seq_length, h, d_k, d_v, d_model, d_ff_inner, n, rate)
self.model_last_layer = Dense(dec_vocab_size)
def padding_mask(self, input):
mask = math.equal(input, 0)
mask = cast(mask, float32)
return mask[:, newaxis, newaxis, :]
def lookahead_mask(self, shape):
mask = linalg.band_part(ones((shape, shape)), 0, -1)
return mask
def call(self, encoder_input, decoder_input, training):
enc_padding_mask = self.padding_mask(encoder_input)
dec_in_padding_mask = self.padding_mask(decoder_input)
dec_in_lookahead_mask = self.lookahead_mask(decoder_input.shape[1])
dec_in_lookahead_mask = maximum(dec_in_padding_mask, dec_in_lookahead_mask)

    encoder_output = self.encoder(encoder_input, enc_padding_mask, training)
    decoder_output = self.decoder(decoder_input, encoder_output, dec_in_lookahead_mask, enc_padding_mask,
                                  training)
    model_output = self.model_last_layer(decoder_output)

tf.print(“outputshape”,model_output.shape)

tf.print(“EncInputShape”, encoder_input.shape, “EncOutputshape”, encoder_output.shape)

tf.print(“DecInputShape”, decoder_input.shape, “DecOutputShape”, decoder_output.shape)

    return model_output

training
from pickle import load
from numpy.random import shuffle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor, int64
from pickle import load, dump, HIGHEST_PROTOCOL
from numpy.random import shuffle
from numpy import savetxt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor, int64
#from prepare_dataset import PrepareDataset

class PrepareDataset:
def init(self, **kwargs):
super().init(**kwargs)
self.n_sentences = 10000
self.train_split = 0.8
self.val_split = 0.1
def create_tokenizer(self, dataset):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset)
return tokenizer
def find_seq_length(self, dataset):
return max(len(seq.split()) for seq in dataset)
def find_vocab_size(self, tokenizer, dataset):
tokenizer.fit_on_texts(dataset)
return len(tokenizer.word_index) + 1
def encode_pad(self, dataset, tokenizer, seq_length):
x = tokenizer.texts_to_sequences(dataset)
x = pad_sequences(x, maxlen=seq_length, padding=‘post’)
x = convert_to_tensor(x, dtype=int64)
return x
def save_tokenizer(self, tokenizer, name):
with open(name + “_tokenizer.pkl”, ‘wb’) as handle:
dump(tokenizer, handle, protocol=HIGHEST_PROTOCOL)
def call(self, filename, **kwargs):
clean_dataset = load(open(filename, ‘rb’))
dataset = clean_dataset[:self.n_sentences, :]
for i in range(len(dataset[:,0])):
if i < 10:
print(dataset[i,:])
dataset[i, 0] = “ " + dataset[i,0] + " ”
dataset[i, 1] = “ " + dataset[i,1] + " ”
shuffle(dataset)
train = dataset[:int(len(dataset)self.train_split)]
val = dataset[int(len(dataset)self.train_split):int(len(dataset)(1-self.val_split))]
test = dataset[int(len(dataset)
(1-self.val_split)):]

    enc_tokenizer = self.create_tokenizer(dataset[:, 0])
    enc_seq_length = self.find_seq_length(dataset[:, 0])
    enc_vocab_size = self.find_vocab_size(enc_tokenizer, train[:, 0])
    
    dec_tokenizer = self.create_tokenizer(dataset[:, 1])
    dec_seq_length = self.find_seq_length(dataset[:, 1])
    dec_vocab_size = self.find_vocab_size(dec_tokenizer, train[:, 1])
    
    trainX = self.encode_pad(train[:, 0], enc_tokenizer, enc_seq_length)
    trainY = self.encode_pad(train[:, 1], dec_tokenizer, dec_seq_length)
    valX = self.encode_pad(val[:, 0], enc_tokenizer, enc_seq_length)
    valY = self.encode_pad(val[:, 1], dec_tokenizer, dec_seq_length)
    
    self.save_tokenizer(enc_tokenizer, "enc")
    self.save_tokenizer(dec_tokenizer, "dec")
    savetxt("test_dataset.txt", test, fmt="%s")
    return (trainX, trainY, valX, valY, train, val, enc_seq_length, dec_seq_length, 
            enc_vocab_size, dec_vocab_size)

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.metrics import Mean
from tensorflow import data, train, math, reduce_sum, cast, equal, argmax,
float32, GradientTape, function
from tensorflow.keras.losses import sparse_categorical_crossentropy
from time import time
from pickle import dump

Define the model parameters

Define the model parameters

h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_model = 512 # Dimensionality of model layers’ outputs
d_ff = 2048 # Dimensionality of the inner fully connected layer
n = 6 # Number of layers in the encoder stack

epochs = 160

batch_size = 64

beta_1 = 0.9

beta_2 = 0.98

epsilon = 1e-9

dropout_rate = 0.1

h = 2 # Number of self-attention heads

d_k = 16 # Dimensionality of the linearly projected queries and keys

d_v = 16 # Dimensionality of the linearly projected values

d_model = 32 # Dimensionality of model layers’ outputs

d_ff = 512 # Dimensionality of the inner fully connected layer

n = 2 # Number of layers in the encoder stack

epochs = 160
batch_size = 64
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9
dropout_rate = 0.3

class LRScheduler(LearningRateSchedule):
def init(self, d_model, warmup_steps=4000, **kwargs):
super().init(**kwargs)
self.d_model = cast(d_model, float32)
self.warmup_steps = warmup_steps
def call(self, step_num):
# Linearly increasing the learning rate for the first warmup_steps, and
# decreasing it thereafter
arg1 = cast(step_num, float32) ** -0.5
arg2 = cast(step_num, float32) * (self.warmup_steps ** -1.5)
return (self.d_model ** -0.5) * math.minimum(arg1, arg2)
optimizer = Adam(LRScheduler(d_model), beta_1, beta_2, epsilon)

dataset = PrepareDataset()
trainX, trainY, valX, valY, train_orig, val_orig, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size
= dataset(‘data/Neural-Machine-Translation-System/english-german-both.pkl’)

print(“Sizes:”, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size)

Prepare the training dataset batches

train_dataset = data.Dataset.from_tensor_slices((trainX, trainY))
train_dataset = train_dataset.batch(batch_size)

Prepare the validation dataset batches

val_dataset = data.Dataset.from_tensor_slices((valX, valY))
val_dataset = val_dataset.batch(batch_size)

Create model

training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length,
dec_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)

Defining the loss function

def loss_fcn(target, prediction):
# Create mask so that the zero padding values are not included in the
# computation of loss
mask = math.logical_not(equal(target, 0))
mask = cast(mask, float32)
# Compute a sparse categorical cross-entropy loss on the unmasked values
loss = sparse_categorical_crossentropy(target, prediction, from_logits=True) * mask
# Compute the mean loss over the unmasked values
return reduce_sum(loss) / reduce_sum(mask)
def accuracy_fcn(target, prediction):
# Create mask so that the zero padding values are not included in the
# computation of accuracy
mask = math.logical_not(equal(target, 0))
# Find equal prediction and target values, and apply the padding mask
accuracy = equal(target, argmax(prediction, axis=2))
accuracy = math.logical_and(mask, accuracy)
# Cast the True/False values to 32-bit-precision floating-point numbers
mask = cast(mask, float32)
accuracy = cast(accuracy, float32)
# Compute the mean accuracy over the unmasked values
return reduce_sum(accuracy) / reduce_sum(mask)

Include metrics monitoring

train_loss = Mean(name=‘train_loss’)
train_accuracy = Mean(name=‘train_accuracy’)
val_loss = Mean(name=‘val_loss’)

Create a checkpoint object and manager to manage multiple checkpoints

ckpt = train.Checkpoint(model=training_model, optimizer=optimizer)
ckpt_manager = train.CheckpointManager(ckpt, “./checkpoints”, max_to_keep=None)

Initialise dictionaries to store the training and validation losses

train_loss_dict = {}
val_loss_dict = {}

@function
def train_step(encoder_input, decoder_input, decoder_output):
with GradientTape() as tape:
# Run the forward pass of the model to generate a prediction
prediction = training_model(encoder_input, decoder_input, training=True)
# Compute the training loss
loss = loss_fcn(decoder_output, prediction)
# Compute the training accuracy
accuracy = accuracy_fcn(decoder_output, prediction)
# Retrieve gradients of the trainable variables with respect to the training loss
gradients = tape.gradient(loss, training_model.trainable_weights)
# Update the values of the trainable variables by gradient descent
optimizer.apply_gradients(zip(gradients, training_model.trainable_weights))
train_loss(loss)
train_accuracy(accuracy)

for epoch in range(epochs):
train_loss.reset_states()
train_accuracy.reset_states()
val_loss.reset_states()
print(“\nStart of epoch %d” % (epoch + 1))
start_time = time()
# Iterate over the dataset batches
for step, (train_batchX, train_batchY) in enumerate(train_dataset):
# Define the encoder and decoder inputs, and the decoder output
encoder_input = train_batchX[:, 1:]
decoder_input = train_batchY[:, :-1]
decoder_output = train_batchY[:, 1:]
#print(“step”, step, “calling train_step”)
train_step(encoder_input, decoder_input, decoder_output)
if step % 50 == 0:
print(f"Epoch {epoch + 1} Step {step} Loss {train_loss.result():.4f} "
+ f"Accuracy {train_accuracy.result():.4f}“)
# Run a validation step after every epoch of training
for val_batchX, val_batchY in val_dataset:
# Define the encoder and decoder inputs, and the decoder output
encoder_input = val_batchX[:, 1:]
decoder_input = val_batchY[:, :-1]
decoder_output = val_batchY[:, 1:]
# Generate a prediction
prediction = training_model(encoder_input, decoder_input, training=False)
# Compute the validation loss
#tf.print(“dcoder_outputshape”, decoder_output.shape, “prediction shape”, prediction.shape)
loss = loss_fcn(decoder_output, prediction)
val_loss(loss)
# Print epoch number and accuracy and loss values at the end of every epoch
print(f"Epoch {epoch+1}: Training Loss {train_loss.result():.4f}, "
+ f"Training Accuracy {train_accuracy.result():.4f}, "
+ f"Validation Loss {val_loss.result():.4f}”)
# Save a checkpoint after every epoch
if (epoch + 1) % 1 == 0:
save_path = ckpt_manager.save()
print(f"Saved checkpoint at epoch {epoch+1}")
# Save the trained model weights
training_model.save_weights(“weights/wghtstemp” + str(epoch + 1) + “.ckpt”)
train_loss_dict[epoch] = train_loss.result()
val_loss_dict[epoch] = val_loss.result()

Save the training loss values

with open(‘./train_loss.pkl’, ‘wb’) as file:
dump(train_loss_dict, file)

Save the validation loss values

with open(‘./val_loss.pkl’, ‘wb’) as file:
dump(val_loss_dict, file)
print(“Total time taken: %.2fs” % (time() - start_time))

Inference

from pickle import load
from tensorflow import Module
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor, int64, TensorArray, argmax, newaxis, transpose
#from translate import Translate

Define the model parameters
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_model = 512 # Dimensionality of model layers’ outputs
d_ff = 2048 # Dimensionality of the inner fully connected layer
n = 6 # Number of layers in the encoder stack

h = 2 # Number of self-attention heads

d_k = 16 # Dimensionality of the linearly projected queries and keys

d_v = 16 # Dimensionality of the linearly projected values

d_model = 32 # Dimensionality of model layers’ outputs

d_ff = 512 # Dimensionality of the inner fully connected layer

n = 2 # Number of layers in the encoder stack

Define the dataset parameters

enc_seq_length = 7 # Encoder sequence length
dec_seq_length = 12 # Decoder sequence length
enc_vocab_size = 2404 # Encoder vocabulary size
dec_vocab_size = 3864 # Decoder vocabulary size

Create model

inferencing_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, h, d_k, d_v, d_model, d_ff, n, 0)

class Translate_1(Module):
def init(self, inferencing_model, **kwargs):
super().init(**kwargs)
self.transformer = inferencing_model

def load_tokenizer(self, name):
    with open(name, 'rb') as handle:
        return load(handle)

def __call__(self, sentence):
    # Append start and end of string tokens to the input sentence
    sentence[0] = "<START> " + sentence[0] + " <EOS>"

    # Load encoder and decoder tokenizers
    enc_tokenizer = self.load_tokenizer('enc_tokenizer.pkl')
    dec_tokenizer = self.load_tokenizer('dec_tokenizer.pkl')

    # Prepare the input sentence by tokenizing, padding and converting to tensor
    encoder_input = enc_tokenizer.texts_to_sequences(sentence)
    encoder_input = pad_sequences(encoder_input,
                                  maxlen=enc_seq_length, padding='post')
    encoder_input = convert_to_tensor(encoder_input, dtype=int64)
    print("encoder_input", encoder_input)
    # Prepare the output <START> token by tokenizing, and converting to tensor
    output_start = dec_tokenizer.texts_to_sequences(["<START>"])
    output_start = convert_to_tensor(output_start[0], dtype=int64)

    # Prepare the output <EOS> token by tokenizing, and converting to tensor
    output_end = dec_tokenizer.texts_to_sequences(["<EOS>"])
    output_end = convert_to_tensor(output_end[0], dtype=int64)

    # Prepare the output array of dynamic size
    decoder_output = TensorArray(dtype=int64, size=0, dynamic_size=True)
    decoder_output = decoder_output.write(0, output_start)
    
    

    for i in range(dec_seq_length):
        # Predict an output token
        predictionorig = self.transformer(encoder_input,transpose(decoder_output.stack()),
                                      training=False)
        #print("predshape", predictionorig.shape)
        prediction = predictionorig[:, -1, :]
        #print("prediction", prediction)
        output_str = []
        prediction_output = TensorArray(dtype=int64, size=0, dynamic_size=True)
        
        prediction_output = prediction_output.write(0, output_start)

    # Decode the predicted tokens into an output string
        for k in range(predictionorig.shape[-2]):
            key = argmax(predictionorig[:, k, :][0])[newaxis]
            #print("key is", key)
            prediction_output = prediction_output.write(k+1, key)
        
        pred_output = transpose(prediction_output.stack())[0]
        pred_output = pred_output.numpy()

        output_str = []

    # Decode the predicted tokens into an output string
        for k in range(pred_output.shape[0]):
            key = pred_output[k]
            output_str.append(dec_tokenizer.index_word[key])
        print("PredOutputString", output_str, predictionorig.shape[-2])

        # Select the prediction with the highest score
        predicted_id = argmax(prediction, axis=-1)
        predicted_id = predicted_id[0][newaxis]
        #print("predicted_idshape", predicted_id.shape)
        # Write the selected prediction to the output array at the next
        # available index
        decoder_output = decoder_output.write(i + 1, predicted_id)

        # Break if an <EOS> token is predicted
        if predicted_id == output_end:
            #print("breaking")
            break

    output = transpose(decoder_output.stack())[0]
    output = output.numpy()

    output_str = []

    # Decode the predicted tokens into an output string
    for i in range(output.shape[0]):
        key = output[i]
        output_str.append(dec_tokenizer.index_word[key])

    return output_str

sentences = [[‘i like both’, ‘ich mag beide’],
[‘she misses him’, ‘er fehlt ihr’],
[‘i followed him’, ‘ich folgte ihm’],
[‘its unusual’, ‘es ist ungewohnlich’],
[‘she sounded mad’, ‘sie klang wutend’],
[‘this is nothing’, ‘das ist nichts’],
[‘good evening’, ‘guten abend’],
[‘we cant escape’, ‘wir konnen nicht entkommen’],
[‘he is my type’, ‘er ist mein typ’],
[‘i want my mommy’, ‘ich will zu meiner mama’]]

for pair in sentences:
print(pair[0], pair[1])

Load the trained model’s weights at the specified epoch

inferencing_model.load_weights(‘weights/wghtstemp160.ckpt’)

Create a new instance of the ‘Translate’ class

translator = Translate_1(inferencing_model)

Translate the input sentence

print(translator(sentence))

for pair in sentences:
print(“---------------------------”,pair[0], pair[1])
print(translator([pair[0]]))