How to convert string to indices in a rnn model while exporting it using saved_model.save

n0obcoder · October 18, 2021, 9:31am

I have a text classification model. I have made a dataset class where I make item2idx and idx2item dictionaries using the text corpus. Then I trained the model, and exported it.
Everything is fine as long as I feed int64 tensors to the model after loading the exported model. But how do I feed in the strings and get the model predictions? Or how do I handle the text preprocessing of the post-processing step after exporting the model?

import tensorflow as tf
print("tf.__version__: ", tf.__version__)
import os, sys, random, pdb
from pprint import pprint
import numpy as np

START_EPOCH = 0 
END_EPOCH = 3
MAX_LENGTH = 5
BATCH_SIZE = 256
WORD_EMB_DIM = 32
LSTM_DIM=32
SAVE_MODEL_PATH = "saved_models/1"

def q(exit_msg=""):
    print(f"\n>{exit_msg}<")
    sys.exit()

text_data = [
        "i like this movie",
        "i feel happy watch movie",
        "great taste",
        "like the look of it",
        "great news",
        "hate this movie",
        "very bad news",
        "horrible movie",
        "very bad news",
        "i do not like it"
        ]

label_data = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

# BUILD DATASET
class text_dataset():
    def __init__(self):
        self._build_vocab()
        
    def _build_vocab(self):
        words = []
        for words_list in [t.split(" ") for t in text_data]:
            words.extend(words_list)
        words = sorted(list(set(words)))

        self.item2idx = {}
        self.item2idx["<pad>"] = 0
        for w_idx, w in enumerate(words):
            self.item2idx[w] = w_idx + 1
        
        self.idx2item = {w_idx: w for w, w_idx in self.item2idx.items()}
        self.vocab_size = len(self.idx2item)
        print("self.vocab_size: ", self.vocab_size)

    def data_generator(self):
        batch_idx = 0
        while batch_idx < 8:
            sample_indices = [random.randint(0, len(text_data)-1) for _ in range(BATCH_SIZE)]

            x_raw = [text_data[i] for i in sample_indices]
            y = [label_data[i] for i in sample_indices]
            x_raw = [i.split(" ") for i in x_raw]                       
            x_raw = [[self.item2idx[j] for j in i] for i in x_raw]        
            zero_array = np.zeros((BATCH_SIZE,  MAX_LENGTH))        
            for i in range(len(x_raw)):
                zero_array[i, :len(x_raw[i])] = x_raw[i]   
            x_train = np.array(zero_array) # (BATCH_SIZE, MAX_LENGTH)
            y_train = np.array(y)          # (BATCH_SIZE, )
            yield tuple((x_train, y_train))
            batch_idx += 1


# BUILD MODEL
class classification_model(tf.keras.Model):
    def __init__(self, vocab_size):        
        super(classification_model, self).__init__()   
        self.word_emb = tf.keras.layers.Embedding(vocab_size, 
                                                  WORD_EMB_DIM, 
                                                  mask_zero=True, 
                                                  name="word_embedding_layer")               
            
        self.lstm   = tf.keras.layers.LSTM(LSTM_DIM, return_state=True, name="rnn_layer")    
        self.dense = tf.keras.layers.Dense(2)           
    
    # @tf.function(input_signature=[tf.TensorSpec(shape=[None, MAX_LENGTH], dtype=tf.int64)])
    @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int64)])
    def call(self, word_emb_inp, initial_state=None):   
        word_emb = self.word_emb(word_emb_inp)                           # (bs, MAX_LEN, WORD_EMB_DIM)
        word_emb_mask = self.word_emb.compute_mask(word_emb_inp)         # (bs, MAX_LEN)

        lstm_inp = word_emb                                              # (bs, MAX_LEN, WORD_EMB_DIM)
        lstm_inp_mask = word_emb_mask                                    # (bs, MAX_LEN)
                            
        lstm, state_h, state_c = self.lstm(lstm_inp, mask=word_emb_mask, initial_state=initial_state)
        dense_out = self.dense(lstm)
        return dense_out


# INITIALIZING DATASET AND MODEL
dataset = text_dataset()
model = classification_model(dataset.vocab_size)

# print(help(model.build))
# model.build(input_shape=(None, MAX_LENGTH))
# model.summary()

optimizer = tf.keras.optimizers.Adam() 

loss_func = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    name='sparse_categorical_crossentropy'
)

# TRAINING
print("\nTRAINING\n")
for e in range(START_EPOCH, END_EPOCH):
    print(f"EPOCH: {str(e+1).zfill(len(str(END_EPOCH)))}/{END_EPOCH}")

    train_gen  = dataset.data_generator
    train_gen = tf.data.Dataset.from_generator(
                            train_gen,
                            output_types=(tf.dtypes.int64, tf.dtypes.int64),
                            output_shapes=((None, MAX_LENGTH), (None,)) 
                            )   

    for batch_idx, batch in enumerate(train_gen): 
        x, y = batch
        with tf.GradientTape() as tape:
            logits = model(x) # model is supposed to output the logits (BATCH_SIZE, 2)

            loss_value = loss_func(y, logits)
            print(loss_value.numpy(), end="\r")
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
    print(f"loss@epoch#{e}: {loss_value.numpy()}")


print("\nEXPORTING THE MODEL\n")
class MyModule(tf.Module):
  def __init__(self, model, item2idx):
    self.model = model
    self._item2idx = item2idx

  @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int64)])
  def predict(self, inp_seq):
    result = self.model(inp_seq)
    return { "scores": result }

  @tf.function(input_signature=[])
  def metadata(self):
    return { "item2idx": self._item2idx }

import json

item2idx = json.dumps(dataset.item2idx)
module = MyModule(model, item2idx)
tf.saved_model.save(module, 
                    SAVE_MODEL_PATH, 
                    signatures={ "score": module.predict, 
                                 "metadata": module.metadata})



print("\nIMPORTING...")
imported = tf.saved_model.load(SAVE_MODEL_PATH)
inp = tf.constant([[0, 0, 1, 2, 3]], dtype=tf.int64)
out = imported.signatures["score"](inp)["scores"].numpy()
pprint(out)

### HOW CAN I FEED STRING INPUTS LIKE THE ONE MENTIONED BELOW ???
# inp = tf.constant([["happy", "watch", "movie"]], dtype=tf.string)

Renu_Patel · October 25, 2023, 7:24am

Hi @n0obcoder

Welcome to the TensorFlow Forum!

You can use TextVectorization layer to do the text preprocessing. Please refer to this Text Classification model using RNN which might be helpful for you.

You can also have a look at the StringLookup layer to Vectorize the text into numerical value by referring the mentioned link to preprocess the text. Thank you.

Topic		Replies	Views
Why I'm not able to pass a string to my custom layer? General Discussion	1	564	January 27, 2024
Problem with a custom pre-processing layer for serving a NER model General Discussion models , help_request	1	292	January 2, 2023
Trouble using tfds load to vectorize text Keras datasets , epoc , tfkeras	2	255	February 6, 2024
Text-based Tensorflow unexpected result of train_function (empty logs) General Discussion models , nlp , keras , tfdata , help_request	5	6815	July 27, 2022
LSTM model for sentiment analysis General Discussion models , nlp , keras , help_request	14	2137	September 27, 2021

How to convert string to indices in a rnn model while exporting it using saved_model.save

Related topics