Transformer transalation

aperson_64 · September 18, 2023, 7:04am

from tensorflow.python.keras.layers import Layer, Embedding, Add, Dense, Dropout
from tensorflow.python.keras.models import Model, Sequential
from keras.layers import MultiHeadAttention, LayerNormalization, TextVectorization

import tensorflow as tf
import numpy as np

length = 2048
BATCH_SIZE = 2048

source_sentences = ["Hello"]
target_sentences = ["Hola"]

tokenizer = TextVectorization(output_sequence_length=64)
tokenizer.adapt(source_sentences + target_sentences)

input_sequences = tokenizer(source_sentences)
target_sequences = tokenizer(target_sentences)

dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
dataset = dataset.shuffle(buffer_size=len(input_sequences))
dataset = dataset.batch(batch_size=BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=length, depth=d_model)

    def compute_mask(self, inputs, mask=None):
        return self.embedding.compute_mask(inputs, mask)

    def call(self, x):
        length = np.shape(x)[1]
        x = self.embedding(x)
        x *= np.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

class SelfAttention(Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, x):
        attn_outputs, attn_scores = self.mha(key=x, value=x, query=x, return_attention_scores=True)
        x = self.add([x, attn_outputs])
        x = self.norm(x)
        return x

class MultiHeadedAttention(Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, x, context):
        attn_outputs, attn_scores = self.mha(key=context, value=context, query=x, return_attention_scores=True)
        x = self.add([x, attn_outputs])
        x = self.norm(x)
        return x

class MaskedMultiHeadedAttention(Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, x, context):
        attn_outputs, attn_scores = self.mha(key=context, value=context, query=x, return_attention_scores=True, use_casual_mask=True)
        x = self.add([x, attn_outputs])
        x = self.norm(x)
        return x

class FeedForward(Layer):
    def __init__(self, d_model, dff):
        super().__init__()
        self.sequential = Sequential([
            Dense(dff),
            Dense(d_model),
            Dropout(0.1)
        ])
        self.add = Add()
        self.norm = LayerNormalization()

    def call(self, x):
        x = self.add([x, self.sequential(x)])
        x = self.norm(x)


class EncoderLayer(Layer):
    def __init__(self, d_model, num_heads, dff):
        super().__init__()
        self.mha = SelfAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, dff)

    def call(self, inputs, training=False):
        x = self.mha(inputs)
        x = self.ffn(x)
        return x

class Encoder(Layer):
    def __init__(self, vocab_size, d_model, dff, num_heads, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.pe = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.enc_layers = [EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff) for _ in range(num_layers)]

    def call(self, inputs):
        x = self.pe(inputs)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        print(np.shape(x))
        return x

class DecoderLayer(Layer):
    def __init__(self, d_model, num_heads, dff):
        super().__init__()
        self.mha = MultiHeadedAttention(d_model=d_model, num_heads=num_heads)
        self.ffn = FeedForward(d_model=d_model, dff=dff)
        self.mmha = MaskedMultiHeadedAttention(d_model=d_model, num_heads=num_heads)

    def call(self, x, context, training=False):
        x = self.mmha(x, context)
        x = self.mha(x, context)
        x = self.ffn(x)

        return x

class Decoder(Layer):
    def __init__(self, vocab_size, d_model, num_heads, dff, num_layers):
        super().__init__()
        self.pe = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff) for _ in range(num_layers)]
        self.ffn = FeedForward(d_model=d_model, dff=dff)

    def call(self, x, context):
        x = self.pe(x)

        for i in range(self.num_layers):
            x = self.dec_layers[i](x)

        x = self.ffn(x)
        return x

class Transformer(Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()
        self.enc = Encoder(vocab_size=input_vocab_size, num_layers=num_layers, d_model=d_model, dff=dff, num_heads=num_heads)
        self.dec = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, vocab_size=target_vocab_size, dff=dff)

        self.final = Dense(target_vocab_size)

    @tf.function
    def call(self, inputs, training=False):
        context, x = inputs
        context = self.enc(context)
        x - self.dec(x, context)
        logits = self.final(x)
        return logits

num_layers = 4
d_model = 128
dff = 512
num_heads = 8

vocab_size = tokenizer.vocabulary_size()

def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

transformer = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, target_vocab_size=vocab_size, input_vocab_size=vocab_size)
transformer.compile(loss=masked_loss, optimizer='Adam', metrics=[masked_accuracy])
transformer.built = True

ex = np.array([[1, 2, 3], [1, 2, 3]])
ey = np.array([[1, 2, 3], [1, 2, 3]])

transformer.fit(dataset, epochs=10)

When i run this code it gives me this error
OperatorNotAllowedInGraphError: Iterating over a symbolictf.Tensoris not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.

I cannot find a solution anywhere online

Sohail_Mohammad · September 18, 2023, 10:14am

    @tf.function
    def call(self, inputs, training=False):
        context, x = inputs
        context = self.enc(context)
        x - self.dec(x, context)       # This might be the problem?
        logits = self.final(x)
        return logits

Are you using this as it is? because i find somethings wrong…
I am also working with vanilla Transfomers right now…you don’t need to decorate the call method of keras.Model class as its automatically traced. also you don’t need the training=False argument unless you are doing inference. also your batch size is very large and also you need to preprocess the dataset inside your data pipeline

Topic		Replies	Views
Though Training accuracy is high performance on training data during inference in transformer translation is poor General Discussion models , transformers	0	611	June 9, 2023
Model save and load but not working well Keras tfkeras , model_optimization	1	177	March 21, 2024
Nan loss occurring when training transformer model for machine translation General Discussion machine-learning , model , training	1	210	January 9, 2025
Save load and retrain machine translation model (help needed) General Discussion models , keras	2	623	June 6, 2023
Transformer from scratch, subclassing with keras. Gradients does not exist for these variables General Discussion models , keras , transformers	0	809	January 10, 2023

Transformer transalation

Related topics