TensorFlow Keras loss: NaN problem

I wanted to build a chatbot that uses a simple seq2seq architecture, but during training the loss is NaN. I tried changing the optimizer and it didn't help.

import tensorflow as tf
import numpy as np

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding
from tensorflow.compat.v1.keras.layers import CuDNNGRU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

mark_start = "ssss "
mark_end = " eeee"
sorular=[]
cevaplar=[]

#TXT dataset
for line in open("dialogs.txt", encoding="UTF-8"):
    soru, cevap = line.rstrip().split("\t")
    cevap = mark_start + cevap + mark_end
    sorular.append(soru)
    cevaplar.append(cevap)
soru=''
cevap=''

ques=sorular
ans=cevaplar

class TokenizerWrap(Tokenizer):
    def __init__(self,texts, padding, reverse=False,num_words=None):
        Tokenizer.__init__(self, num_words=num_words)
        self.fit_on_texts(texts)
        self.index_to_word=dict(zip(self.word_index.values(), self.word_index.keys()))
        self.tokens=self.texts_to_sequences(texts)
        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating="pre"
        else:
            truncating="post"

        self.num_tokens=[len(x) for x in self.tokens]
        self.max_tokens=int(np.mean(self.num_tokens)+2*np.std(self.num_tokens))
        self.tokens_padded=pad_sequences(self.tokens,
                                  maxlen=self.max_tokens,
                                  padding=padding,
                                  truncating=truncating)
    def token_to_word(self,token):
        word = ' ' if token==0 else self.index_to_word[token]
        return word
    def tokens_to_string(self,tokens):
        words = [self.index_to_word[token] for token in tokens if token!=0]
        text = ' '.join(words)
        return text
    def text_to_tokens(self,text,padding,reverse=False):
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)
        if reverse:
            tokens = np.flip(tokens,axis=1)
            truncating='pre'
        else:
            truncating='post'
        tokens=pad_sequences(tokens,maxlen=self.max_tokens,padding=padding,truncating=truncating)
        return tokens

tokenizer_ques = TokenizerWrap(ques,padding='pre',reverse=True)
tokenizer_ans = TokenizerWrap(ans,padding='post',reverse=False)

tokens_ques=tokenizer_ques.tokens_padded
tokens_ans=tokenizer_ans.tokens_padded

token_start = tokenizer_ans.word_index[mark_start.strip()]
token_end = tokenizer_ans.word_index[mark_end.strip()]

encoder_input_data = tokens_ques

# Teacher forcing: the decoder targets are the decoder inputs shifted one step ahead.
decoder_input_data=tokens_ans[:, :-1]
decoder_output_data=tokens_ans[:, 1:]

num_encoder_words = len(tokenizer_ques.word_index)
num_decoder_words = len(tokenizer_ans.word_index)

embedding_size = 50

word2vec = {}
with open('glove.6B.50d.txt', encoding='UTF-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

# Random init for out-of-vocabulary words; rows are overwritten with GloVe vectors below.
embedding_matrix=np.random.uniform(-1,1,(num_encoder_words, embedding_size))
for word, i in tokenizer_ques.word_index.items():
    if i < num_encoder_words:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
embedding_matrix.shape

encoder_input = Input(shape=(None,),name='encoder_input')

encoder_embedding = Embedding(input_dim=num_encoder_words,
                              output_dim=embedding_size,
                              weights=[embedding_matrix],
                              trainable=True,
                              name='encoder_embedding')

state_size = 256

encoder_gru1=CuDNNGRU(state_size,name="encoder_gru1",return_sequences=True)
encoder_gru2=CuDNNGRU(state_size,name="encoder_gru2",return_sequences=True)
encoder_gru3=CuDNNGRU(state_size,name="encoder_gru3",return_sequences=False)

def connect_encoder():
    net = encoder_input
    net = encoder_embedding(net)
    net = encoder_gru1(net)
    net = encoder_gru2(net)
    net = encoder_gru3(net)
    encoder_output = net
    return encoder_output
encoder_output = connect_encoder()

decoder_initial_state = Input(shape=(state_size,),name='decoder_initial_state')

decoder_input = Input(shape=(None,),name='decoder_input')

decoder_embedding = Embedding(input_dim=num_decoder_words,output_dim=embedding_size,name='decoder_embedding')

decoder_gru1=CuDNNGRU(state_size,name="decoder_gru1",return_sequences=True)
decoder_gru2=CuDNNGRU(state_size,name="decoder_gru2",return_sequences=True)
decoder_gru3=CuDNNGRU(state_size,name="decoder_gru3",return_sequences=True)

decoder_dense = Dense(num_decoder_words,activation='linear',name='decoder_output')

def connect_decoder(initial_state):
    net = decoder_input
    net = decoder_embedding(net)
    net = decoder_gru1(net,initial_state=initial_state)
    net = decoder_gru2(net,initial_state=initial_state)
    net = decoder_gru3(net,initial_state=initial_state)
    decoder_output = decoder_dense(net)
    return decoder_output
decoder_output = connect_decoder(initial_state=encoder_output)

model_train = Model(inputs=[encoder_input,decoder_input],outputs=[decoder_output])

model_encoder = Model(inputs=[encoder_input],outputs=[encoder_output])

decoder_output = connect_decoder(initial_state=decoder_initial_state)
model_decoder = Model(inputs=[decoder_input, decoder_initial_state], outputs=[decoder_output])

def sparse_cross_entropy(y_true,y_pred):
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels = y_true,
        logits = y_pred)
    loss_mean = tf.reduce_mean(loss)
    return loss_mean

optimizer = RMSprop(learning_rate=1e-3)

tf.compat.v1.disable_eager_execution()
decoder_target = tf.compat.v1.placeholder(dtype="int32", shape=(None, None))

model_train.compile(optimizer=optimizer,loss=sparse_cross_entropy,target_tensors=[decoder_target])

x_data = {'encoder_input':encoder_input_data,
          'decoder_input':decoder_input_data}
y_data={'decoder_output':decoder_output_data}

model_train.fit(x_data,y_data,batch_size=256,epochs=10)

Any help would be appreciated.

Hi @William_Afton,

This may be due to various factors, such as NaN values in the input data, exploding gradients, vanishing gradients, or a learning rate that is too high. Could you please review and adjust these factors and let us know?
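For example, here is a minimal sketch of how you could verify those points, reusing embedding_matrix, model_train, sparse_cross_entropy and decoder_target from your code (the 1e-4 learning rate and the clipnorm value are only illustrative):

import numpy as np
from tensorflow.keras.optimizers import RMSprop

# The token arrays are integers, so the usual NaN suspect on the input side
# is the embedding matrix built from the GloVe file.
print("NaN in embeddings:", np.isnan(embedding_matrix).any())
print("Inf in embeddings:", np.isinf(embedding_matrix).any())

# Clip gradients and lower the learning rate to rule out exploding gradients
# or a learning rate that is too high.
optimizer = RMSprop(learning_rate=1e-4, clipnorm=1.0)
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])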

In addition, CuDNNGRU from tensorflow.compat.v1.keras.layers has been deprecated. Please use GRU from tensorflow.keras.layers for compatibility.
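A minimal sketch of that replacement, reusing state_size and the layer names from your code (with its default arguments, GRU uses the cuDNN kernel automatically when a GPU is available):

from tensorflow.keras.layers import GRU

# Drop-in replacements for the deprecated CuDNNGRU layers.
encoder_gru1 = GRU(state_size, name="encoder_gru1", return_sequences=True)
encoder_gru2 = GRU(state_size, name="encoder_gru2", return_sequences=True)
encoder_gru3 = GRU(state_size, name="encoder_gru3", return_sequences=False)

decoder_gru1 = GRU(state_size, name="decoder_gru1", return_sequences=True)
decoder_gru2 = GRU(state_size, name="decoder_gru2", return_sequences=True)
decoder_gru3 = GRU(state_size, name="decoder_gru3", return_sequences=True)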

Thank You.