I wanted to build a chatbot using a simple seq2seq architecture, but during training the loss is NaN. I tried changing the optimizer and it didn't help.
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding
from tensorflow.compat.v1.keras.layers import CuDNNGRU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
mark_start = "ssss "
mark_end = " eeee"
sorular=[]
cevaplar=[]
# TXT dataset: each line is a tab-separated question/answer pair
for line in open("dialogs.txt", encoding="UTF-8"):
    soru, cevap = line.rstrip().split("\t")
    cevap = mark_start + cevap + mark_end
    sorular.append(soru)
    cevaplar.append(cevap)

soru = ''
cevap = ''

ques = sorular
ans = cevaplar
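# For context, each line of dialogs.txt is expected to be a tab-separated
# question/answer pair. The example below is made up purely to illustrate
# the format, it is not from the real dataset:
#
#   hi, how are you doing?<TAB>i'm fine. how about yourself?
#
# After the loop above, the answers carry the markers, e.g.
# "ssss i'm fine. how about yourself? eeee".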
class TokenizerWrap(Tokenizer):
    """Tokenizer that also pads/truncates the tokenized sequences."""
    def __init__(self, texts, padding, reverse=False, num_words=None):
        Tokenizer.__init__(self, num_words=num_words)
        self.fit_on_texts(texts)
        # Map token id -> word, for converting model output back to text.
        self.index_to_word = dict(zip(self.word_index.values(), self.word_index.keys()))
        self.tokens = self.texts_to_sequences(texts)
        if reverse:
            # Reverse the source sequences (common seq2seq trick).
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating = "pre"
        else:
            truncating = "post"
        self.num_tokens = [len(x) for x in self.tokens]
        # Pad to mean + 2 std instead of the absolute maximum length.
        self.max_tokens = int(np.mean(self.num_tokens) + 2 * np.std(self.num_tokens))
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        word = ' ' if token == 0 else self.index_to_word[token]
        return word

    def tokens_to_string(self, tokens):
        words = [self.index_to_word[token] for token in tokens if token != 0]
        text = ' '.join(words)
        return text

    def text_to_tokens(self, text, padding, reverse=False):
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)
        if reverse:
            tokens = np.flip(tokens, axis=1)
            truncating = 'pre'
        else:
            truncating = 'post'
        tokens = pad_sequences(tokens, maxlen=self.max_tokens,
                               padding=padding, truncating=truncating)
        return tokens
tokenizer_ques = TokenizerWrap(ques, padding='pre', reverse=True)
tokenizer_ans = TokenizerWrap(ans, padding='post', reverse=False)

tokens_ques = tokenizer_ques.tokens_padded
tokens_ans = tokenizer_ans.tokens_padded

token_start = tokenizer_ans.word_index[mark_start.strip()]
token_end = tokenizer_ans.word_index[mark_end.strip()]

encoder_input_data = tokens_ques
# Decoder input is the answer; the target is the same sequence shifted one step ahead.
decoder_input_data = tokens_ans[:, :-1]
decoder_output_data = tokens_ans[:, 1:]

num_encoder_words = len(tokenizer_ques.word_index)
num_decoder_words = len(tokenizer_ans.word_index)

embedding_size = 50
# Load the pre-trained GloVe vectors into a word -> vector dictionary.
word2vec = {}
with open('glove.6B.50d.txt', encoding='UTF-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

# Random init, then overwrite the rows for which a GloVe vector exists.
embedding_matrix = np.random.uniform(-1, 1, (num_encoder_words, embedding_size))
for word, i in tokenizer_ques.word_index.items():
    if i < num_encoder_words:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

embedding_matrix.shape
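# Sanity check I would add while debugging the NaN (not part of the original script):
# a single NaN/inf row in the embedding matrix is enough to poison the loss later on.
assert np.isfinite(embedding_matrix).all(), "embedding matrix contains NaN/inf"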
encoder_input = Input(shape=(None,), name='encoder_input')
encoder_embedding = Embedding(input_dim=num_encoder_words,
                              output_dim=embedding_size,
                              weights=[embedding_matrix],
                              trainable=True,
                              name='encoder_embedding')

state_size = 256

encoder_gru1 = CuDNNGRU(state_size, name="encoder_gru1", return_sequences=True)
encoder_gru2 = CuDNNGRU(state_size, name="encoder_gru2", return_sequences=True)
encoder_gru3 = CuDNNGRU(state_size, name="encoder_gru3", return_sequences=False)
def connect_encoder():
    net = encoder_input
    net = encoder_embedding(net)
    net = encoder_gru1(net)
    net = encoder_gru2(net)
    net = encoder_gru3(net)
    encoder_output = net
    return encoder_output

encoder_output = connect_encoder()
decoder_initial_state = Input(shape=(state_size,), name='decoder_initial_state')
decoder_input = Input(shape=(None,), name='decoder_input')
decoder_embedding = Embedding(input_dim=num_decoder_words,
                              output_dim=embedding_size,
                              name='decoder_embedding')

decoder_gru1 = CuDNNGRU(state_size, name="decoder_gru1", return_sequences=True)
decoder_gru2 = CuDNNGRU(state_size, name="decoder_gru2", return_sequences=True)
decoder_gru3 = CuDNNGRU(state_size, name="decoder_gru3", return_sequences=True)

# 'linear' activation: the Dense layer outputs logits; softmax happens inside the loss.
decoder_dense = Dense(num_decoder_words, activation='linear', name='decoder_output')
def connect_decoder(initial_state):
    net = decoder_input
    net = decoder_embedding(net)
    net = decoder_gru1(net, initial_state=initial_state)
    net = decoder_gru2(net, initial_state=initial_state)
    net = decoder_gru3(net, initial_state=initial_state)
    decoder_output = decoder_dense(net)
    return decoder_output
decoder_output = connect_decoder(initial_state=encoder_output)
model_train = Model(inputs=[encoder_input,decoder_input],outputs=[decoder_output])
model_encoder = Model(inputs=[encoder_input],outputs=[encoder_output])
decoder_output = connect_decoder(initial_state=decoder_initial_state)
model_decoder = Model(inputs=[decoder_input, decoder_initial_state], outputs=[decoder_output])
def sparse_cross_entropy(y_true, y_pred):
    # y_true holds integer token ids, y_pred holds the logits from the 'linear' Dense layer.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,
                                                          logits=y_pred)
    loss_mean = tf.reduce_mean(loss)
    return loss_mean
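# For reference, my understanding is that the custom loss above is roughly the same as
# Keras's built-in sparse cross-entropy on logits; this is only a sketch of that
# equivalence, not what the script below actually compiles with:
#
#   builtin_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#   model_train.compile(optimizer=optimizer, loss=builtin_loss)
#
# Note that neither version masks the padded zeros in decoder_output_data, so the
# padding positions are averaged into the loss.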
optimizer = RMSprop(learning_rate=1e-3)

tf.compat.v1.disable_eager_execution()
decoder_target = tf.compat.v1.placeholder(dtype="int32", shape=(None, None))

model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])

x_data = {'encoder_input': encoder_input_data,
          'decoder_input': decoder_input_data}
y_data = {'decoder_output': decoder_output_data}

model_train.fit(x_data, y_data, batch_size=256, epochs=10)
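# A sketch of how the first bad batch could be caught, using the standard
# TerminateOnNaN callback (this is not in my original training call):
#
#   model_train.fit(x_data, y_data, batch_size=256, epochs=10,
#                   callbacks=[tf.keras.callbacks.TerminateOnNaN()])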
Any help would be appreciated.