import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (Layer, Embedding, Add, Dense, Dropout,
                                     MultiHeadAttention, LayerNormalization,
                                     TextVectorization)
from tensorflow.keras.models import Model, Sequential
length = 2048
BATCH_SIZE = 2048
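# Toy parallel corpus: a single source sentence and its target translation.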
source_sentences = ["Hello"]
target_sentences = ["Hola"]
tokenizer = TextVectorization(output_sequence_length=64)
tokenizer.adapt(source_sentences + target_sentences)
input_sequences = tokenizer(source_sentences)
target_sequences = tokenizer(target_sentences)
dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
dataset = dataset.shuffle(buffer_size=len(input_sequences))
dataset = dataset.batch(batch_size=BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
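# Each dataset element is a (source_batch, target_batch) pair of int64 tensors of shape (batch, 64).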
def positional_encoding(length, depth):
    depth = depth / 2
    positions = np.arange(length)[:, np.newaxis]      # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :] / depth  # (1, depth)
    angle_rates = 1 / (10000**depths)                 # (1, depth)
    angle_rads = positions * angle_rates              # (pos, depth)
    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1)
    return tf.cast(pos_encoding, dtype=tf.float32)
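# positional_encoding(length, depth) returns a (length, depth) float32 tensor whose
# first depth/2 channels are sine terms and last depth/2 channels are cosine terms.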
class PositionalEmbedding(Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=length, depth=d_model)

    def compute_mask(self, inputs, mask=None):
        return self.embedding.compute_mask(inputs, mask)

    def call(self, x):
        seq_len = tf.shape(x)[1]  # use tf ops, not numpy, on symbolic tensors
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # scale by sqrt(d_model)
        x = x + self.pos_encoding[tf.newaxis, :seq_len, :]
        return x
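# Residual multi-head self-attention followed by layer normalization.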
class SelfAttention(Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, x):
        attn_outputs, attn_scores = self.mha(
            query=x, key=x, value=x, return_attention_scores=True)
        x = self.add([x, attn_outputs])
        x = self.norm(x)
        return x
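# Cross-attention: queries come from x, keys and values from the encoder context.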
class MultiHeadedAttention(Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, x, context):
        attn_outputs, attn_scores = self.mha(
            query=x, key=context, value=context, return_attention_scores=True)
        x = self.add([x, attn_outputs])
        x = self.norm(x)
        return x
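# Multi-head attention with a causal mask, so each position only attends to earlier positions.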
class MaskedMultiHeadedAttention(Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, x, context):
        attn_outputs, attn_scores = self.mha(
            query=x, key=context, value=context,
            return_attention_scores=True, use_causal_mask=True)  # "causal", not "casual"
        x = self.add([x, attn_outputs])
        x = self.norm(x)
        return x
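# Position-wise feed-forward block with a residual connection and layer normalization.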
class FeedForward(Layer):
    def __init__(self, d_model, dff):
        super().__init__()
        self.sequential = Sequential([
            Dense(dff),
            Dense(d_model),
            Dropout(0.1)
        ])
        self.add = Add()
        self.norm = LayerNormalization()

    def call(self, x):
        x = self.add([x, self.sequential(x)])
        x = self.norm(x)
        return x  # this return was missing, so call() produced None
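# One encoder layer: self-attention followed by the feed-forward block.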
class EncoderLayer(Layer):
    def __init__(self, d_model, num_heads, dff):
        super().__init__()
        self.mha = SelfAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, dff)

    def call(self, inputs, training=False):
        x = self.mha(inputs)
        x = self.ffn(x)
        return x
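# Encoder: positional embedding followed by a stack of num_layers encoder layers.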
class Encoder(Layer):
    def __init__(self, vocab_size, d_model, dff, num_heads, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.pe = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.enc_layers = [EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff)
                           for _ in range(num_layers)]

    def call(self, inputs):
        x = self.pe(inputs)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        return x
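# One decoder layer: causal self-attention, cross-attention over the encoder output, then feed-forward.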
class DecoderLayer(Layer):
    def __init__(self, d_model, num_heads, dff):
        super().__init__()
        self.mmha = MaskedMultiHeadedAttention(d_model=d_model, num_heads=num_heads)
        self.mha = MultiHeadedAttention(d_model=d_model, num_heads=num_heads)
        self.ffn = FeedForward(d_model=d_model, dff=dff)

    def call(self, x, context, training=False):
        x = self.mmha(x, x)        # causal self-attention over the target sequence
        x = self.mha(x, context)   # cross-attention over the encoder output
        x = self.ffn(x)
        return x
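# Decoder: positional embedding, a stack of decoder layers, and a final feed-forward block.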
class Decoder(Layer):
    def __init__(self, vocab_size, d_model, num_heads, dff, num_layers):
        super().__init__()
        self.num_layers = num_layers  # was missing, causing an AttributeError in call()
        self.pe = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff) for _ in range(num_layers)]
        self.ffn = FeedForward(d_model=d_model, dff=dff)

    def call(self, x, context):
        x = self.pe(x)
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, context)  # each decoder layer also needs the encoder context
        x = self.ffn(x)
        return x
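# Full encoder-decoder Transformer; the final Dense layer produces target-vocabulary logits.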
class Transformer(Model):
    def __init__(self, num_layers, d_model, num_heads, dff,
                 input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()
        self.enc = Encoder(vocab_size=input_vocab_size, num_layers=num_layers,
                           d_model=d_model, dff=dff, num_heads=num_heads)
        self.dec = Decoder(vocab_size=target_vocab_size, num_layers=num_layers,
                           d_model=d_model, dff=dff, num_heads=num_heads)
        self.final = Dense(target_vocab_size)

    @tf.function
    def call(self, inputs, training=False):
        context, x = inputs
        context = self.enc(context)
        x = self.dec(x, context)  # was "x - self.dec(...)", which discarded the result
        logits = self.final(x)
        return logits
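# Note: Transformer.call unpacks `inputs` into a (context_tokens, target_tokens) pair,
# so the model expects two input tensors per batch.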
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
vocab_size = tokenizer.vocabulary_size()
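# Cross-entropy loss averaged over non-padding positions (padding token id is 0).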
def masked_loss(label, pred):
    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_object(label, pred)
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
    return loss
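# Token-level accuracy computed over non-padding positions only.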
def masked_accuracy(label, pred):
    pred = tf.argmax(pred, axis=2)
    label = tf.cast(label, pred.dtype)
    match = label == pred
    mask = label != 0
    match = match & mask
    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(match) / tf.reduce_sum(mask)
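# Build and compile the model with the padding-aware loss and metric.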
transformer = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, target_vocab_size=vocab_size, input_vocab_size=vocab_size)
transformer.compile(loss=masked_loss, optimizer='Adam', metrics=[masked_accuracy])
transformer.built = True
ex = np.array([[1, 2, 3], [1, 2, 3]])
ey = np.array([[1, 2, 3], [1, 2, 3]])
transformer.fit(dataset, epochs=10)
When I run this code it gives me this error:

OperatorNotAllowedInGraphError: Iterating over a symbolic tf.Tensor is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.

I cannot find a solution anywhere online.