Hello, i have a problem while model.fit, already a few hours trying to fix, but, i have no result:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[6], line 2
1 epochs = config_train['epoch_train']
----> 2 model.fit_model(train_english,
3 train_russian,
4 valid_english,
5 valid_russian,
6 epochs=epochs,
7 save_model_each_epoch=True,
8 logs=True,
9 model_name='novelsdreamer-ru-t4m')
File ~/Documents/GitHub/novelsdreamer-ru-t4m/modules/transformer_custom.py:209, in Transformer.fit_model(self, train_english, train_russian, valid_english, valid_russian, epochs, model_name, save_model_each_epoch, logs, logs_path)
203 """
204 Fit the model to the data and save the model.
205 """
207 for epoch in tqdm(range(epochs)):
208 # Ensure the data is in the correct format before creating masks
--> 209 enc_padding_mask, combined_mask, dec_padding_mask = self.create_masks(tf.convert_to_tensor(train_english, dtype=tf.int32), tf.convert_to_tensor(train_russian, dtype=tf.int32))
210 predictions, _ = self.call(tf.convert_to_tensor(train_english, dtype=tf.int32), tf.convert_to_tensor(train_russian, dtype=tf.int32), True, enc_padding_mask, combined_mask, dec_padding_mask)
211 loss = self.loss_function(tf.convert_to_tensor(train_russian, dtype=tf.int32), predictions)
File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/util/traceback_utils.py:153, in filter_traceback..error_handler(*args, **kwargs)
151 except Exception as e:
152 filtered_tb = _process_traceback_frames(e.__traceback__)
--> 153 raise e.with_traceback(filtered_tb) from None
154 finally:
155 del filtered_tb
File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/framework/constant_op.py:103, in convert_to_eager_tensor(value, ctx, dtype)
101 dtype = dtypes.as_dtype(dtype).as_datatype_enum
102 ctx.ensure_initialized()
--> 103 return ops.EagerTensor(value, ctx.device_name, dtype)
TypeError: Cannot convert 'russian' to EagerTensor of dtype int32
here is transformer code:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from math import log
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
# Positional Encoding
def get_angles(pos, i, d_model):
angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
return pos * angle_rates
def positional_encoding(position, d_model):
angle_rads = get_angles(np.arange(position)[:, np.newaxis],
np.arange(d_model)[np.newaxis, :],
d_model)
angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
pos_encoding = angle_rads[np.newaxis, ...]
return tf.cast(pos_encoding, dtype=tf.float32)
# Scaled Dot Product Attention
def scaled_dot_product_attention(q, k, v, mask):
matmul_qk = tf.matmul(q, k, transpose_b=True)
dk = tf.cast(tf.shape(k)[-1], tf.float32)
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
if mask is not None:
scaled_attention_logits += (mask * -1e9)
attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
output = tf.matmul(attention_weights, v)
return output, attention_weights
# Multi-head Attention
class MultiHeadAttention(layers.Layer):
def __init__(self, d_model, num_heads, embedding=None):
super(MultiHeadAttention, self).__init__()
self.num_heads = num_heads
self.d_model = d_model
assert d_model % self.num_heads == 0
self.depth = d_model // self.num_heads
self.wq = layers.Dense(d_model)
self.wk = layers.Dense(d_model)
self.wv = layers.Dense(d_model)
self.dense = layers.Dense(d_model)
self.embedding = embedding
def split_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, v, k, q, mask):
batch_size = tf.shape(q)[0]
q = self.wq(q)
k = self.wk(k)
v = self.wv(v)
q = self.split_heads(q, batch_size)
k = self.split_heads(k, batch_size)
v = self.split_heads(v, batch_size)
scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
output = self.dense(concat_attention)
return output, attention_weights
def point_wise_feed_forward_network(d_model, dff):
return tf.keras.Sequential([
layers.Dense(dff, activation='relu'),
layers.Dense(d_model)
])
class EncoderLayer(layers.Layer):
def __init__(self, d_model, num_heads, dff, rate=0.1, embedding=None):
super(EncoderLayer, self).__init__()
self.mha = MultiHeadAttention(d_model, num_heads, embedding)
self.ffn = point_wise_feed_forward_network(d_model, dff)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(rate)
self.dropout2 = layers.Dropout(rate)
def call(self, x, training, mask):
attn_output, _ = self.mha(x, x, x, mask)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(x + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
out2 = self.layernorm2(out1 + ffn_output)
return out2
# Decoder Layer
class DecoderLayer(layers.Layer):
def __init__(self, d_model, num_heads, dff, rate=0.1, embedding=None):
super(DecoderLayer, self).__init__()
self.mha1 = MultiHeadAttention(d_model, num_heads, embedding)
self.mha2 = MultiHeadAttention(d_model, num_heads, embedding)
self.ffn = point_wise_feed_forward_network(d_model, dff)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(rate)
self.dropout2 = layers.Dropout(rate)
self.dropout3 = layers.Dropout(rate)
def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
attn1 = self.dropout1(attn1, training=training)
out1 = self.layernorm1(attn1 + x)
attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
attn2 = self.dropout2(attn2, training=training)
out2 = self.layernorm2(attn2 + out1)
ffn_output = self.ffn(out2)
ffn_output = self.dropout3(ffn_output, training=training)
out3 = self.layernorm3(ffn_output + out2)
return out3, attn_weights_block1, attn_weights_block2
# Encoder
class Encoder(layers.Layer):
def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1, embedding=None):
super(Encoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
self.embedding = layers.Embedding(input_vocab_size, d_model) if embedding is None else embedding
self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate, embedding) for _ in range(num_layers)]
self.dropout = layers.Dropout(rate)
def call(self, x, training, mask):
seq_len = tf.shape(x)[1]
x = self.embedding(x)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.pos_encoding[:, :seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x = self.enc_layers[i](x, training, mask)
return x
# Decoder
class Decoder(layers.Layer):
def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1, embedding=None):
super(Decoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
self.embedding = layers.Embedding(target_vocab_size, d_model) if embedding is None else embedding
self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate, embedding) for _ in range(num_layers)]
self.dropout = layers.Dropout(rate)
def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
seq_len = tf.shape(x)[1]
attention_weights = {}
x = self.embedding(x)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.pos_encoding[:, :seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)
attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
return x, attention_weights
# Transformer
class Transformer(tf.keras.Model):
"""
Intialization transformer layer. Contain Encoder, Decoder and MultiHeadAttention.
"""
def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1, embedding=None):
super(Transformer, self).__init__()
self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate, embedding)
self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate, embedding)
self.final_layer = tf.keras.layers.Dense(target_vocab_size)
def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
enc_output = self.encoder(inp, training, enc_padding_mask)
dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
final_output = self.final_layer(dec_output)
return final_output, attention_weights
def beam_search_decoder(self, data, k):
sequences = [[list(), 1.0]]
for row in data:
all_candidates = list()
for i in range(len(sequences)):
seq, score = sequences[i]
for j in range(len(row)):
candidate = [seq + [j], score * -log(row[j])]
all_candidates.append(candidate)
ordered = sorted(all_candidates, key=lambda tup:tup[1])
sequences = ordered[:k]
return sequences
@staticmethod
def save_model(model, name):
"""
Saving model with safetensors extension.
>>> transformer.save_model(model, name)
"""
from safetensors.tensorflow import save_file
save_file(model, name)
return
def fit_model(self, train_english, train_russian, valid_english, valid_russian, epochs, model_name, save_model_each_epoch=False, logs=True, logs_path= '/logs/plots'):
"""
Fit the model to the data and save the model.
"""
for epoch in tqdm(range(epochs)):
# Ensure the data is in the correct format before creating masks
enc_padding_mask, combined_mask, dec_padding_mask = self.create_masks(tf.convert_to_tensor(train_english, dtype=tf.int32), tf.convert_to_tensor(train_russian, dtype=tf.int32))
predictions, _ = self.call(tf.convert_to_tensor(train_english, dtype=tf.int32), tf.convert_to_tensor(train_russian, dtype=tf.int32), True, enc_padding_mask, combined_mask, dec_padding_mask)
loss = self.loss_function(tf.convert_to_tensor(train_russian, dtype=tf.int32), predictions)
gradients = self.tape.gradient(loss, self.trainable_variables)
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
# Validation
enc_padding_mask_valid, combined_mask_valid, dec_padding_mask_valid = self.create_masks(tf.convert_to_tensor(valid_english, dtype=tf.int32), tf.convert_to_tensor(valid_russian, dtype=tf.int32))
predictions_valid, _ = self.call(tf.convert_to_tensor(valid_english, dtype=tf.int32), tf.convert_to_tensor(valid_russian, dtype=tf.int32), False, enc_padding_mask_valid, combined_mask_valid, dec_padding_mask_valid)
loss_valid = self.loss_function(tf.convert_to_tensor(valid_russian, dtype=tf.int32), predictions_valid)
print('Epoch {} Loss {:.4f} Validation Loss {:.4f}'.format(epoch + 1, loss, loss_valid))
print(f'Epoch {epoch} finished.')
if logs:
# Plotting and saving dot diagram
plt.figure(figsize=(10, 5))
plt.plot(predictions_valid, 'ro')
plt.title('Dot Diagram of Predictions')
plt.xlabel('Index')
plt.ylabel('Prediction')
plt.grid(True)
if not os.path.exists('plots'):
os.makedirs('plots')
plt.savefig(os.path.join(logs_path ,f'plots/dot_diagram_epoch_{epoch+1}.png'))
plt.close()
if save_model_each_epoch:
epoch += 1
self.save_model(self, f'{model_name}_epoch_{epoch+1}')
# Save the model after training
try:
self.save_model(self, model_name)
print('Final weights saved.')
except:
print('Happened an error during saving final weights.')
return self
def create_masks(self, inp, tar):
"""
Create masks for training.
"""
enc_padding_mask = self.create_padding_mask(inp)
dec_padding_mask = self.create_padding_mask(inp)
look_ahead_mask = self.create_look_ahead_mask(tf.shape(tar)[1])
dec_target_padding_mask = self.create_padding_mask(tar)
combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
return enc_padding_mask, combined_mask, dec_padding_mask
@staticmethod
def create_padding_mask(seq):
"""
Create padding mask for sequence.
"""
seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
return seq[:, tf.newaxis, tf.newaxis, :]
@staticmethod
def create_look_ahead_mask(size):
"""
Create look ahead mask for sequence.
"""
mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
return mask
here is a part of code in notebook:
epochs = config_train['epoch_train']
model.fit_model(train_english,
train_russian,
valid_english,
valid_russian,
epochs=epochs,
save_model_each_epoch=True,
logs=True,
model_name='novelsdreamer-ru-t4m')
and datagen:
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
class DataGenerator:
"""
Generates tensors by converting input data.
Usage:
>>> datagen = DataGenerator(TRAIN_DATASET_DIR, VALID_DATASET_DIR)
>>> (train_english, train_russian), (valid_english, valid_russian) = datagen.generate()
"""
def __init__(self, train_dir, valid_dir, padding_type='post', trunc_type='post'):
self.train_dir = train_dir
self.valid_dir = valid_dir
self.padding_type = padding_type
self.trunc_type = trunc_type
def load_data(self, dir_name):
data = {}
for class_name in os.listdir(dir_name):
class_dir = os.path.join(dir_name, class_name)
if not os.path.exists(class_dir):
os.makedirs(class_dir)
data[class_name] = []
for filename in os.listdir(class_dir):
if os.path.isfile(os.path.join(class_dir, filename)):
with open(os.path.join(class_dir, filename), 'r') as f:
data[class_name].append(f.read())
return data
def prepare_data(self, data):
tokenizer = Tokenizer()
for class_name in data.keys():
tokenizer.fit_on_texts(data[class_name])
sequences = tokenizer.texts_to_sequences(data[class_name])
if sequences:
padded = pad_sequences(sequences, padding=self.padding_type, truncating=self.trunc_type)
data[class_name] = [tf.expand_dims(p, -1) for p in padded] # Add an extra dimension at the end to avoid ValueError
print(data)
return data
def generate(self):
"""
Do main process of converting tensors.
Usage:
>>> (train_english, train_russian), (valid_english, valid_russian) = datagen.generate()
"""
train_data = self.load_data(self.train_dir)
valid_data = self.load_data(self.valid_dir)
train_data = self.prepare_data(train_data)
valid_data = self.prepare_data(valid_data)
train_data = {k: v for k, v in train_data.items() if len(v) > 0}
valid_data = {k: v for k, v in valid_data.items() if len(v) > 0}
print(f"Train data info: {len(train_data.keys())} classes, {sum([len(v) for v in train_data.values()])} samples")
print(f"Valid data info: {len(valid_data.keys())} classes, {sum([len(v) for v in valid_data.values()])} samples")
return (train_data, valid_data)
will be glad if you help me!