Loss: nan problem

I am using TensorFlow version 2.8.4 and want to develop a chatbot. However, I ran into a problem: the training loss is always nan. Changing batch_size, state_size and the optimizer did not help.

import tensorflow as tf
import numpy as np

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GRU, Embedding ,Dropout
from tensorflow.compat.v1.keras.layers import CuDNNGRU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Markers wrapped around every reply so the decoder input/output contains an
# explicit start and end word. The spaces keep them apart from the reply text.
mark_start = 'ssss '
mark_end = ' eeee'

data_src = []   # questions, fed to the encoder
data_dest = []  # replies wrapped in the start/end markers, fed to the decoder

# Every non-empty line of dialogs.txt must be "question<TAB>reply".
# The context manager closes the file handle, which a bare open() in the loop
# header never does.
with open("dialogs.txt", encoding="UTF-8") as dialog_file:
    for line in dialog_file:
        line = line.rstrip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no pair and would break the two-way unpacking below.
            continue
        soru, cevap = line.split("\t")
        cevap = mark_start + cevap + mark_end
        data_src.append(soru)
        data_dest.append(cevap)

# Number of question/reply pairs (displayed when run in a notebook cell).
len(data_src)

class TokenizerWrap(Tokenizer):
    """Keras ``Tokenizer`` that also stores the padded token matrix of its corpus.

    After construction the instance exposes:

    * ``index_to_word`` - inverse of ``word_index`` (integer token -> word)
    * ``tokens``        - token lists for every text (reversed if requested)
    * ``num_tokens``    - length of each token list
    * ``max_tokens``    - ``int(mean + 2 * std)`` of those lengths
    * ``tokens_padded`` - ``pad_sequences`` output with ``maxlen=max_tokens``
    """

    def __init__(self, texts, padding, reverse=False, num_words=None):
        """Fit the vocabulary on ``texts`` and tokenize/pad them.

        Args:
            texts: list of strings to fit on and convert.
            padding: ``padding`` argument forwarded to ``pad_sequences``.
            reverse: when true, each token list is reversed and sequences
                are truncated with ``'pre'`` instead of ``'post'``.
            num_words: forwarded to the ``Tokenizer`` constructor.
        """
        super().__init__(num_words=num_words)
        self.fit_on_texts(texts)

        # Inverse vocabulary lookup: integer token -> word.
        self.index_to_word = {index: word
                              for word, index in self.word_index.items()}

        sequences = self.texts_to_sequences(texts)
        if reverse:
            sequences = [list(reversed(seq)) for seq in sequences]
        self.tokens = sequences
        truncating = "pre" if reverse else "post"

        self.num_tokens = [len(seq) for seq in self.tokens]
        # Sequence length cap: mean length plus two standard deviations.
        mean_length = np.mean(self.num_tokens)
        length_spread = np.std(self.num_tokens)
        self.max_tokens = int(mean_length + 2 * length_spread)

        self.tokens_padded = pad_sequences(
            self.tokens,
            maxlen=self.max_tokens,
            padding=padding,
            truncating=truncating,
        )

    def token_to_word(self, token):
        """Return the word for ``token``; token 0 maps to a single space."""
        if token == 0:
            return ' '
        return self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Join the words of all non-zero ``tokens`` with single spaces."""
        return ' '.join(self.index_to_word[token]
                        for token in tokens
                        if token != 0)

    def text_to_tokens(self, text, padding, reverse=False):
        """Convert one string into a padded token array.

        Args:
            text: the string to tokenize.
            padding: ``padding`` argument forwarded to ``pad_sequences``.
            reverse: when true, the tokens are flipped along axis 1 and
                truncation switches from ``'post'`` to ``'pre'``.

        Returns:
            The ``pad_sequences`` result for the single text, padded or
            truncated to ``self.max_tokens``.
        """
        tokens = np.array(self.texts_to_sequences([text]))
        if reverse:
            tokens = np.flip(tokens, axis=1)
        truncating = 'pre' if reverse else 'post'
        return pad_sequences(tokens,
                             maxlen=self.max_tokens,
                             padding=padding,
                             truncating=truncating)

# Tokenizer for the questions: token lists are reversed, and both padding and
# truncation happen at the front of each sequence.
tokenizer_src = TokenizerWrap(texts=data_src,
                              padding='pre',
                              reverse=True,
                              num_words=None)

# Tokenizer for the replies: original word order, padded and truncated at
# the end of each sequence.
tokenizer_dest = TokenizerWrap(texts=data_dest,
                              padding='post',
                              reverse=False,
                              num_words=None)

# Padded integer matrices, one row per question / reply.
tokens_src = tokenizer_src.tokens_padded
tokens_dest = tokenizer_dest.tokens_padded
print(tokens_src.shape, tokens_dest.shape)

# Integer ids of the start and end marker words in the reply vocabulary.
# The bare names below only display the values in a notebook cell.
token_start = tokenizer_dest.word_index[mark_start.strip()]
token_start
token_end = tokenizer_dest.word_index[mark_end.strip()]
token_end

encoder_input_data=tokens_src

# Teacher forcing: the decoder input is the reply without its last token and
# the target is the same reply shifted one step to the left.
decoder_input_data=tokens_dest[:, :-1]
decoder_output_data=tokens_dest[:, 1:]

# Vocabulary sizes used as Embedding input_dim and as the width of the output
# Dense layer. Keras' Tokenizer numbers words from 1 and pad_sequences fills
# with 0, so the largest token id equals len(word_index) and the layers have
# to cover len(word_index) + 1 distinct ids. Without the "+ 1" the highest id
# is out of range for the embedding lookup and for the sparse cross-entropy
# labels; on GPU that op yields NaN for such labels instead of raising, which
# shows up as "loss: nan" from the first epoch.
num_encoder_words = len(tokenizer_src.word_index) + 1
num_decoder_words = len(tokenizer_dest.word_index) + 1

# Width of every word vector; matches the 100-dimensional GloVe file below.
embedding_size = 100

# Parse the GloVe text file: first field of each line is the word, the
# remaining fields are its float32 vector components.
word2vec = {}
with open('glove.6B.100d.txt', encoding='UTF-8') as glove_file:
    for glove_line in glove_file:
        fields = glove_line.split()
        word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Start from uniform noise in [-1, 1) and overwrite the rows of all words
# that have a pretrained vector; the other rows keep their random values.
embedding_matrix = np.random.uniform(
    -1, 1, (num_encoder_words, embedding_size))
for word, i in tokenizer_src.word_index.items():
    if i >= num_encoder_words:
        # Row index would fall outside the matrix.
        continue
    embedding_vector = word2vec.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# Displayed when run in a notebook cell.
embedding_matrix.shape

# Encoder input: integer token sequences of unspecified length.
encoder_input = Input(shape=(None,),name='encoder_input')

# Embedding initialised from embedding_matrix and fine-tuned during training
# (trainable=True). input_dim must be larger than every token id fed in.
encoder_embedding = Embedding(input_dim=num_encoder_words,
                              output_dim=embedding_size,
                              weights=[embedding_matrix],
                              trainable=True,
                              name='encoder_embedding')

# Number of units in every GRU layer, encoder and decoder alike.
state_size = 128

# Three stacked GRU layers; only the last one drops the time axis
# (return_sequences=False), so its output is a single vector per sample.
encoder_gru1=CuDNNGRU(state_size,name="encoder_gru1",return_sequences=True)
encoder_gru2=CuDNNGRU(state_size,name="encoder_gru2",return_sequences=True)
encoder_gru3=CuDNNGRU(state_size,name="encoder_gru3",return_sequences=False)

# Dropout rate applied right after the embedding layers.
dropout_rate=0.1
def connect_encoder():
    """Wire the encoder: input -> embedding -> dropout -> three GRU layers.

    Returns:
        The output tensor of ``encoder_gru3``.
    """
    embedded = encoder_embedding(encoder_input)
    hidden = Dropout(dropout_rate)(embedded)
    # Feed the tensor through the GRU stack in order.
    for gru_layer in (encoder_gru1, encoder_gru2, encoder_gru3):
        hidden = gru_layer(hidden)
    return hidden

encoder_output = connect_encoder()

# Separate input for the stand-alone decoder model: a state vector of
# state_size values that takes the place of the encoder output.
decoder_initial_state = Input(shape=(state_size,),name='decoder_initial_state')

# Decoder input: integer token sequences of unspecified length.
decoder_input = Input(shape=(None,),name='decoder_input')

# Decoder embedding, trained from scratch (no pretrained weights passed).
# input_dim must be larger than every token id fed in.
decoder_embedding = Embedding(input_dim=num_decoder_words,
                              output_dim=embedding_size,
                              name='decoder_embedding')

# Three stacked GRU layers, all returning the full sequence so that every
# time step gets a prediction.
decoder_gru1=CuDNNGRU(state_size,name="decoder_gru1",return_sequences=True)
decoder_gru2=CuDNNGRU(state_size,name="decoder_gru2",return_sequences=True)
decoder_gru3=CuDNNGRU(state_size,name="decoder_gru3",return_sequences=True)

# Linear projection to one logit per vocabulary id; the softmax is applied
# inside the loss function, not here.
decoder_dense = Dense(num_decoder_words,
                      activation='linear',
                      name='decoder_output')

def connect_decoder(initial_state):
    """Wire the decoder on top of ``decoder_input``.

    Args:
        initial_state: tensor passed as ``initial_state`` to each of the
            three decoder GRU layers.

    Returns:
        The output tensor of ``decoder_dense`` (linear activation).
    """
    embedded = decoder_embedding(decoder_input)
    hidden = Dropout(dropout_rate)(embedded)
    # Every GRU layer in the stack starts from the same initial state.
    for gru_layer in (decoder_gru1, decoder_gru2, decoder_gru3):
        hidden = gru_layer(hidden, initial_state=initial_state)
    return decoder_dense(hidden)

decoder_output = connect_decoder(initial_state=encoder_output)

# End-to-end training model: question tokens + shifted reply tokens in,
# per-step vocabulary logits out. decoder_output here is the decoder wired
# to the encoder output.
model_train = Model(inputs=[encoder_input,decoder_input],
                    outputs=[decoder_output])

# Encoder on its own, used at inference time to produce the state vector.
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])

# Re-wire the same decoder layers to the explicit state input. This rebinds
# decoder_output, so it must stay after model_train has been created.
decoder_output = connect_decoder(initial_state=decoder_initial_state)
model_decoder = Model(inputs=[decoder_input, decoder_initial_state], outputs=[decoder_output])

def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy between integer labels and logits.

    Args:
        y_true: integer class ids, passed as ``labels``.
        y_pred: unnormalised scores, passed as ``logits``.

    Returns:
        A scalar tensor: the mean of the per-element losses.
    """
    per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y_true, logits=y_pred)
    return tf.reduce_mean(per_token_loss)

optimizer = RMSprop(learning_rate=1e-4)

# target_tensors below is a graph-mode feature, hence eager execution is
# switched off.
# NOTE(review): this call comes after the layers and models were built; the
# TensorFlow docs ask for it at program start - confirm the ordering is safe.
tf.compat.v1.disable_eager_execution()
# Integer placeholder for the target token ids; both dimensions are left
# unspecified.
decoder_target = tf.compat.v1.placeholder(dtype="int32", shape=(None, None))

# The custom loss receives the integer placeholder as y_true.
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])

# Callback that writes the model weights (weights only) to path_checkpoint.
path_checkpoint = 'checkpoint.keras'
checkpoint = ModelCheckpoint(filepath=path_checkpoint,save_weights_only=True)

# Best-effort resume: if the weights cannot be loaded for any reason, report
# the error and carry on with the freshly initialised model. The message
# says the checkpoint failed to load and training starts from scratch.
try:
    model_train.load_weights(path_checkpoint)
except Exception as error:
    print('Checkpoint yüklenirken hata oluştu. Eğitime sıfırdan başlanıyor.')
    print(error)

# Inputs and targets are keyed by the names given to the Input layers and to
# the output Dense layer.
x_data = {'encoder_input':encoder_input_data,
          'decoder_input':decoder_input_data}

y_data={'decoder_output':decoder_output_data}

# Train for 10 epochs in batches of 128, running the checkpoint callback.
model_train.fit(x=x_data,
                y=y_data,
                batch_size=128,
                epochs=10,
                callbacks=[checkpoint])

def translate(input_text):
    """Greedily decode a reply for ``input_text`` and print question and reply.

    The question is tokenized reversed and pre-padded, run through
    ``model_encoder`` to get the decoder's initial state, and the reply is
    then built one token at a time by taking the arg-max of the decoder
    logits until the end marker is produced or ``max_tokens`` is reached.

    Args:
        input_text: the question as a plain string.

    Returns:
        None. The result is printed.
    """
    input_tokens = tokenizer_src.text_to_tokens(input_text,
                                                reverse=True,
                                                padding='pre')
    initial_state = model_encoder.predict(input_tokens)

    max_tokens = tokenizer_dest.max_tokens
    # Decoder input buffer; it is filled in one position per step.
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=np.int_)

    token_int = token_start
    sampled_words = []
    for count_tokens in range(max_tokens):
        if token_int == token_end:
            break
        decoder_input_data[0, count_tokens] = token_int
        x_data = {'decoder_initial_state': initial_state,
                  'decoder_input': decoder_input_data}
        decoder_output = model_decoder.predict(x_data)
        # Logits for the current step; the highest one is the next token.
        step_logits = decoder_output[0, count_tokens, :]
        token_int = np.argmax(step_logits)
        sampled_words.append(tokenizer_dest.token_to_word(token_int))

    # Each sampled word is preceded by a single space.
    output_text = ''.join(' ' + sampled_word for sampled_word in sampled_words)

    print("Input metni:")
    print('-', input_text.capitalize())
    print('-' * 44)
    print("Model cevabı:")
    # The end marker word is stripped from the printed reply.
    print('-' + output_text.replace(" eeee", "").capitalize())
    print('-' * 44)

translate(input_text='hi, how are you doing?')

Output :
Train on 3725 samples Epoch 1/10 3725/3725 [==============================] - 3s 780us/sample - loss: nan Epoch 2/10 3725/3725 [==============================] - 1s 306us/sample - loss: nan Epoch 3/10 3725/3725 [==============================] - 1s 306us/sample - loss: nan Epoch 4/10 3725/3725 [==============================] - 1s 302us/sample - loss: nan Epoch 5/10 3725/3725 [==============================] - 1s 307us/sample - loss: nan Epoch 6/10 3725/3725 [==============================] - 1s 315us/sample - loss: nan Epoch 7/10 3725/3725 [==============================] - 1s 305us/sample - loss: nan Epoch 8/10 3725/3725 [==============================] - 1s 309us/sample - loss: nan Epoch 9/10 3725/3725 [==============================] - 1s 300us/sample - loss: nan Epoch 10/10 3725/3725 [==============================] - 1s 307us/sample - loss: nan

<keras.callbacks.History at 0x295beed61d0>

I would be glad if you could help me. :blush:

Hi @William_Afton, the loss can become nan for several reasons, such as exploding or vanishing gradients, nan values present in the input data, etc.

Could you please try gradient clipping in RMSprop by passing a value to the clipvalue argument, and try the standard GRU layer instead of CuDNNGRU, as the compat.v1 API is deprecated? Thank you.