Transformer translation

from tensorflow.keras.layers import (Layer, Embedding, Add, Dense, Dropout,
                                     MultiHeadAttention, LayerNormalization,
                                     TextVectorization)
from tensorflow.keras.models import Model, Sequential

import tensorflow as tf
import numpy as np

length = 2048       # maximum sequence length for the positional-encoding table
BATCH_SIZE = 2048   # training batch size

source_sentences = ["Hello"]
target_sentences = ["Hola"]

tokenizer = TextVectorization(output_sequence_length=64)
tokenizer.adapt(source_sentences + target_sentences)

input_sequences = tokenizer(source_sentences)
target_sequences = tokenizer(target_sentences)

dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
dataset = dataset.shuffle(buffer_size=len(input_sequences))
dataset = dataset.batch(batch_size=BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

def positional_encoding(length, depth):
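  """Build a (length, depth) sinusoidal positional-encoding table (sin half, then cos half)."""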
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(Layer):
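    """Token embedding scaled by sqrt(d_model), plus sinusoidal positional encoding (padding is masked)."""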
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=length, depth=d_model)

    def compute_mask(self, inputs, mask=None):
        return self.embedding.compute_mask(inputs, mask)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

class SelfAttention(Layer):
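    """Multi-head self-attention over x, with residual add and layer normalization."""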
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, x):
        attn_outputs, attn_scores = self.mha(key=x, value=x, query=x, return_attention_scores=True)
        x = self.add([x, attn_outputs])
        x = self.norm(x)
        return x

class MultiHeadedAttention(Layer):
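    """Cross-attention block: x attends over the encoder output (context), with residual add and layer normalization."""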
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, x, context):
        attn_outputs, attn_scores = self.mha(key=context, value=context, query=x, return_attention_scores=True)
        x = self.add([x, attn_outputs])
        x = self.norm(x)
        return x

class MaskedMultiHeadedAttention(Layer):
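    """Causally masked attention block (each position attends only to earlier positions), with residual add and layer normalization."""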
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, x, context):
        attn_outputs, attn_scores = self.mha(key=context, value=context, query=x, return_attention_scores=True, use_causal_mask=True)
        x = self.add([x, attn_outputs])
        x = self.norm(x)
        return x

class FeedForward(Layer):
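    """Position-wise feed-forward block (Dense -> Dense -> Dropout) with residual add and layer normalization."""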
    def __init__(self, d_model, dff):
        super().__init__()
        self.sequential = Sequential([
            Dense(dff, activation='relu'),
            Dense(d_model),
            Dropout(0.1)
        ])
        self.add = Add()
        self.norm = LayerNormalization()

    def call(self, x):
        x = self.add([x, self.sequential(x)])
        x = self.norm(x)
        return x


class EncoderLayer(Layer):
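    """A single encoder layer: self-attention followed by the feed-forward block."""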
    def __init__(self, d_model, num_heads, dff):
        super().__init__()
        self.mha = SelfAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, dff)

    def call(self, inputs, training=False):
        x = self.mha(inputs)
        x = self.ffn(x)
        return x

class Encoder(Layer):
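    """The encoder: positional embedding followed by a stack of num_layers encoder layers."""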
    def __init__(self, vocab_size, d_model, dff, num_heads, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.pe = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.enc_layers = [EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff) for _ in range(num_layers)]

    def call(self, inputs):
        x = self.pe(inputs)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        return x

class DecoderLayer(Layer):
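    """A single decoder layer: causal self-attention, cross-attention over the encoder output, then the feed-forward block."""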
    def __init__(self, d_model, num_heads, dff):
        super().__init__()
        self.mha = MultiHeadedAttention(d_model=d_model, num_heads=num_heads)
        self.ffn = FeedForward(d_model=d_model, dff=dff)
        self.mmha = MaskedMultiHeadedAttention(d_model=d_model, num_heads=num_heads)

    def call(self, x, context, training=False):
        x = self.mmha(x, x)        # causal self-attention over the decoder input
        x = self.mha(x, context)   # cross-attention over the encoder output
        x = self.ffn(x)

        return x

class Decoder(Layer):
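    """The decoder: positional embedding, a stack of num_layers decoder layers, and a final feed-forward block."""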
    def __init__(self, vocab_size, d_model, num_heads, dff, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.pe = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff) for _ in range(num_layers)]
        self.ffn = FeedForward(d_model=d_model, dff=dff)

    def call(self, x, context):
        x = self.pe(x)

        for i in range(self.num_layers):
            x = self.dec_layers[i](x, context)

        x = self.ffn(x)
        return x

class Transformer(Model):
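    """Encoder-decoder Transformer; the final Dense layer maps decoder outputs to target-vocabulary logits."""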
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()
        self.enc = Encoder(vocab_size=input_vocab_size, num_layers=num_layers, d_model=d_model, dff=dff, num_heads=num_heads)
        self.dec = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, vocab_size=target_vocab_size, dff=dff)

        self.final = Dense(target_vocab_size)

    @tf.function
    def call(self, inputs, training=False):
        context, x = inputs
        context = self.enc(context)
        x = self.dec(x, context)
        logits = self.final(x)
        return logits

num_layers = 4
d_model = 128
dff = 512
num_heads = 8

vocab_size = tokenizer.vocabulary_size()

def masked_loss(label, pred):
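  """Sparse categorical cross-entropy averaged over non-padding (non-zero) positions."""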
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
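  """Token-level accuracy computed only over non-padding (non-zero) positions."""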
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

transformer = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, target_vocab_size=vocab_size, input_vocab_size=vocab_size)
transformer.compile(loss=masked_loss, optimizer='Adam', metrics=[masked_accuracy])
transformer.built = True

ex = np.array([[1, 2, 3], [1, 2, 3]])
ey = np.array([[1, 2, 3], [1, 2, 3]])

transformer.fit(dataset, epochs=10)

When I run this code, it gives me this error:
OperatorNotAllowedInGraphError: Iterating over a symbolic tf.Tensor is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.

I cannot find a solution anywhere online.

    @tf.function
    def call(self, inputs, training=False):
        context, x = inputs
        context = self.enc(context)
        x - self.dec(x, context)       # This might be the problem?
        logits = self.final(x)
        return logits

Are you using this as it is? Because I find some things wrong…
I am also working with vanilla Transformers right now. You don't need to decorate the call method of a keras.Model subclass with @tf.function, as it is traced automatically. You also don't need the training=False argument unless you are doing inference. Your batch size is also very large, and you need to preprocess the dataset inside your data pipeline so that the model receives both the encoder input and the decoder input (see the sketch below).
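A minimal sketch of that last point, assuming a teacher-forcing setup (the prepare_batch helper, the batch size of 64, and the shift-by-one trick without start/end tokens are illustrative choices, not your original code): shape the dataset so each element is ((encoder_input, decoder_input), decoder_target). Then context, x = inputs in Transformer.call unpacks a tuple of two tensors instead of trying to iterate over a single symbolic tensor, which is what raises the OperatorNotAllowedInGraphError.

def prepare_batch(src, tgt):
    # Tokenize a batch of raw strings inside the tf.data pipeline.
    context = tokenizer(src)        # (batch, 64) encoder input
    tgt_tokens = tokenizer(tgt)     # (batch, 64) tokenized targets
    # Teacher forcing: the decoder reads the target shifted right and must
    # predict the next token. A real setup would add start/end tokens
    # before shifting.
    x = tgt_tokens[:, :-1]          # decoder input
    labels = tgt_tokens[:, 1:]      # tokens the decoder should predict
    return (context, x), labels

dataset = tf.data.Dataset.from_tensor_slices((source_sentences, target_sentences))
dataset = dataset.shuffle(buffer_size=len(source_sentences))
dataset = dataset.batch(64)         # much smaller than 2048
dataset = dataset.map(prepare_batch, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

With the pipeline shaped like this, transformer.fit(dataset, epochs=10) feeds (context, x) to call and labels to masked_loss and masked_accuracy.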
