Cannot convert 'russian' to EagerTensor of dtype int32

Hello, I have a problem when calling model.fit_model. I have been trying to fix it for a few hours, but with no result:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[6], line 2
      1 epochs = config_train['epoch_train']
----> 2 model.fit_model(train_english, 
      3                          train_russian,
      4                          valid_english,
      5                          valid_russian,
      6                          epochs=epochs,
      7                          save_model_each_epoch=True,
      8                          logs=True,
      9                          model_name='novelsdreamer-ru-t4m')

File ~/Documents/GitHub/novelsdreamer-ru-t4m/modules/transformer_custom.py:209, in Transformer.fit_model(self, train_english, train_russian, valid_english, valid_russian, epochs, model_name, save_model_each_epoch, logs, logs_path)
    203 """
    204 Fit the model to the data and save the model.
    205 """
    207 for epoch in tqdm(range(epochs)):
    208     # Ensure the data is in the correct format before creating masks
--> 209     enc_padding_mask, combined_mask, dec_padding_mask = self.create_masks(tf.convert_to_tensor(train_english, dtype=tf.int32), tf.convert_to_tensor(train_russian, dtype=tf.int32))
    210     predictions, _ = self.call(tf.convert_to_tensor(train_english, dtype=tf.int32), tf.convert_to_tensor(train_russian, dtype=tf.int32), True, enc_padding_mask, combined_mask, dec_padding_mask)
    211     loss = self.loss_function(tf.convert_to_tensor(train_russian, dtype=tf.int32), predictions)

File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/util/traceback_utils.py:153, in filter_traceback..error_handler(*args, **kwargs)
    151 except Exception as e:
    152   filtered_tb = _process_traceback_frames(e.__traceback__)
--> 153   raise e.with_traceback(filtered_tb) from None
    154 finally:
    155   del filtered_tb

File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/framework/constant_op.py:103, in convert_to_eager_tensor(value, ctx, dtype)
    101     dtype = dtypes.as_dtype(dtype).as_datatype_enum
    102 ctx.ensure_initialized()
--> 103 return ops.EagerTensor(value, ctx.device_name, dtype)

TypeError: Cannot convert 'russian' to EagerTensor of dtype int32
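
As far as I can tell from the traceback, the value handed to tf.convert_to_tensor is the literal string 'russian'. A quick standalone check (my own sketch, not part of the project) reproduces exactly the same error:

import tensorflow as tf

# Converting a plain Python string to an int32 tensor fails with the same message,
# so one of the arguments reaching tf.convert_to_tensor seems to be a string rather than token IDs.
tf.convert_to_tensor('russian', dtype=tf.int32)
# TypeError: Cannot convert 'russian' to EagerTensor of dtype int32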

Here is the transformer code:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from math import log
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
# Positional Encoding
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

# Scaled Dot Product Attention
def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights

# Multi-head Attention
class MultiHeadAttention(layers.Layer):
    def __init__(self, d_model, num_heads, embedding=None):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads
        self.wq = layers.Dense(d_model)
        self.wk = layers.Dense(d_model)
        self.wv = layers.Dense(d_model)
        self.dense = layers.Dense(d_model)
        self.embedding = embedding

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
        return output, attention_weights

def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        layers.Dense(dff, activation='relu'),
        layers.Dense(d_model)
    ])

class EncoderLayer(layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1, embedding=None):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads, embedding)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

# Decoder Layer
class DecoderLayer(layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1, embedding=None):
        super(DecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads, embedding)
        self.mha2 = MultiHeadAttention(d_model, num_heads, embedding)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        self.dropout3 = layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)
        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)
        return out3, attn_weights_block1, attn_weights_block2

# Encoder
class Encoder(layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1, embedding=None):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = layers.Embedding(input_vocab_size, d_model) if embedding is None else embedding
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate, embedding) for _ in range(num_layers)]
        self.dropout = layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
        return x

# Decoder
class Decoder(layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1, embedding=None):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = layers.Embedding(target_vocab_size, d_model) if embedding is None else embedding
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate, embedding) for _ in range(num_layers)]
        self.dropout = layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)
            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
        return x, attention_weights

# Transformer
class Transformer(tf.keras.Model):
    """
    Transformer model. Contains the Encoder, the Decoder and MultiHeadAttention.
    """
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1, embedding=None):
        super(Transformer, self).__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate, embedding)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate, embedding)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output, attention_weights

    def beam_search_decoder(self, data, k):
        sequences = [[list(), 1.0]]
        for row in data:
            all_candidates = list()
            for i in range(len(sequences)):
                seq, score = sequences[i]
                for j in range(len(row)):
                    candidate = [seq + [j], score * -log(row[j])]
                    all_candidates.append(candidate)
            ordered = sorted(all_candidates, key=lambda tup:tup[1])
            sequences = ordered[:k]
        return sequences

    @staticmethod
    def save_model(model, name):
        """
        Save the model in safetensors format.
        >>> transformer.save_model(model, name)
        """
        from safetensors.tensorflow import save_file
        save_file(model, name)
        return

    def fit_model(self, train_english, train_russian, valid_english, valid_russian, epochs, model_name, save_model_each_epoch=False, logs=True, logs_path= '/logs/plots'):
        """
        Fit the model to the data and save the model.
        """

        for epoch in tqdm(range(epochs)):
            # Ensure the data is in the correct format before creating masks
            enc_padding_mask, combined_mask, dec_padding_mask = self.create_masks(tf.convert_to_tensor(train_english, dtype=tf.int32), tf.convert_to_tensor(train_russian, dtype=tf.int32))
            predictions, _ = self.call(tf.convert_to_tensor(train_english, dtype=tf.int32), tf.convert_to_tensor(train_russian, dtype=tf.int32), True, enc_padding_mask, combined_mask, dec_padding_mask)
            loss = self.loss_function(tf.convert_to_tensor(train_russian, dtype=tf.int32), predictions)
            gradients = self.tape.gradient(loss, self.trainable_variables)
            self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
            
            # Validation
            enc_padding_mask_valid, combined_mask_valid, dec_padding_mask_valid = self.create_masks(tf.convert_to_tensor(valid_english, dtype=tf.int32), tf.convert_to_tensor(valid_russian, dtype=tf.int32))
            predictions_valid, _ = self.call(tf.convert_to_tensor(valid_english, dtype=tf.int32), tf.convert_to_tensor(valid_russian, dtype=tf.int32), False, enc_padding_mask_valid, combined_mask_valid, dec_padding_mask_valid)
            loss_valid = self.loss_function(tf.convert_to_tensor(valid_russian, dtype=tf.int32), predictions_valid)
            print('Epoch {} Loss {:.4f} Validation Loss {:.4f}'.format(epoch + 1, loss, loss_valid))
            print(f'Epoch {epoch + 1} finished.')
            if logs:
                # Plotting and saving dot diagram
                plt.figure(figsize=(10, 5))
                plt.plot(predictions_valid, 'ro')
                plt.title('Dot Diagram of Predictions')
                plt.xlabel('Index')
                plt.ylabel('Prediction')
                plt.grid(True)
                if not os.path.exists('plots'):
                    os.makedirs('plots')
                plt.savefig(os.path.join(logs_path, f'plots/dot_diagram_epoch_{epoch+1}.png'))
                plt.close()

            if save_model_each_epoch:
                self.save_model(self, f'{model_name}_epoch_{epoch+1}')


        
        # Save the model after training
        try:
            self.save_model(self, model_name)
            print('Final weights saved.')
        except Exception:
            print('An error occurred while saving the final weights.')
            
        return self

    def create_masks(self, inp, tar):
        """
        Create masks for training.
        """
        enc_padding_mask = self.create_padding_mask(inp)
        dec_padding_mask = self.create_padding_mask(inp)
        look_ahead_mask = self.create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = self.create_padding_mask(tar)
        combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
        return enc_padding_mask, combined_mask, dec_padding_mask

    @staticmethod
    def create_padding_mask(seq):
        """
        Create padding mask for sequence.
        """
        seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return seq[:, tf.newaxis, tf.newaxis, :]

    @staticmethod
    def create_look_ahead_mask(size):
        """
        Create look ahead mask for sequence.
        """
        mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return mask
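
For completeness, the model is built in the notebook before fit_model is called. The exact hyperparameters are read from my config, so the numbers below are only placeholders showing how the constructor is used:

# Placeholder hyperparameters -- the real values come from config_train
model = Transformer(num_layers=4,
                    d_model=128,
                    num_heads=8,
                    dff=512,
                    input_vocab_size=8500,
                    target_vocab_size=8000,
                    pe_input=1000,
                    pe_target=1000)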



Here is the relevant part of the notebook code:

epochs = config_train['epoch_train']
model.fit_model(train_english, 
                         train_russian,
                         valid_english,
                         valid_russian,
                         epochs=epochs,
                         save_model_each_epoch=True,
                         logs=True,
                         model_name='novelsdreamer-ru-t4m')
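
From the fit_model signature I understand the four data arguments should be batches of integer token IDs that convert cleanly to int32. A toy example of the shape and dtype I am assuming:

import numpy as np

# Toy token-ID batches (padded with 0), just to illustrate the assumed format:
train_english_example = np.array([[12,  5, 89,  0,  0],
                                  [ 7, 44,  3, 21,  0]], dtype=np.int32)
train_russian_example = np.array([[ 9, 17,  2,  0,  0],
                                  [31,  8, 60,  4,  0]], dtype=np.int32)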

And here is the data generator:

import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

class DataGenerator:
    """
    Generates tensors by converting input data.

    Usage:
    >>> datagen = DataGenerator(TRAIN_DATASET_DIR, VALID_DATASET_DIR)

    >>> (train_english, train_russian), (valid_english, valid_russian) = datagen.generate()

    
    """
    def __init__(self, train_dir, valid_dir, padding_type='post', trunc_type='post'):
        self.train_dir = train_dir
        self.valid_dir = valid_dir
        self.padding_type = padding_type
        self.trunc_type = trunc_type


    def load_data(self, dir_name):
        data = {}
        for class_name in os.listdir(dir_name):
            class_dir = os.path.join(dir_name, class_name)
            if not os.path.exists(class_dir):
                os.makedirs(class_dir)
            data[class_name] = []
            for filename in os.listdir(class_dir):
                if os.path.isfile(os.path.join(class_dir, filename)):
                    with open(os.path.join(class_dir, filename), 'r') as f:
                        data[class_name].append(f.read())
        return data

    def prepare_data(self, data):
        tokenizer = Tokenizer()
        for class_name in data.keys():
            tokenizer.fit_on_texts(data[class_name])
            sequences = tokenizer.texts_to_sequences(data[class_name])
            if sequences:
                padded = pad_sequences(sequences, padding=self.padding_type, truncating=self.trunc_type)
                data[class_name] = [tf.expand_dims(p, -1) for p in padded]  # Add an extra dimension at the end to avoid ValueError
        print(data)
        return data


    def generate(self):
        """
        Run the main process of converting the data to tensors.
        Usage:
        >>> (train_english, train_russian), (valid_english, valid_russian) = datagen.generate()
        """
        train_data = self.load_data(self.train_dir)
        valid_data = self.load_data(self.valid_dir)
        train_data = self.prepare_data(train_data)
        valid_data = self.prepare_data(valid_data)

        train_data = {k: v for k, v in train_data.items() if len(v) > 0}
        valid_data = {k: v for k, v in valid_data.items() if len(v) > 0}

        print(f"Train data info: {len(train_data.keys())} classes, {sum([len(v) for v in train_data.values()])} samples")
        print(f"Valid data info: {len(valid_data.keys())} classes, {sum([len(v) for v in valid_data.values()])} samples")

        return (train_data, valid_data)
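
For clarity, generate() returns two dicts keyed by the class folder names, so what I actually get back can be inspected like this (the directory paths are just placeholders):

datagen = DataGenerator('data/train', 'data/valid')  # placeholder paths
train_data, valid_data = datagen.generate()

# Each value is a dict with one entry per subfolder of the dataset directory
print(type(train_data), list(train_data.keys()))
print(type(valid_data), list(valid_data.keys()))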

I will be glad if you can help me!

Hi @oblivisheee. Apologies for my question, but what type of data is stored in your datasets?

Hi, for now just 4 txt files for testing. (I'll increase the dataset later.)
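
The layout is the one load_data expects: one subfolder per language class with the text files inside. Roughly (the folder names are mine):

train/
    english/
        sample_1.txt
        sample_2.txt
    russian/
        sample_1.txt
        sample_2.txt
valid/
    (same structure)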