I have a text classification model. I built a dataset class that constructs item2idx and idx2item dictionaries from the text corpus, then trained the model and exported it with tf.saved_model.save.
Everything works as long as I feed int64 tensors to the loaded model. But how do I feed in strings and get predictions back? In other words, how do I handle the text preprocessing (and any post-processing) once the model has been exported? Here is a minimal script that reproduces my setup:
import tensorflow as tf
print("tf.__version__: ", tf.__version__)
import os, sys, random, pdb
from pprint import pprint
import numpy as np
START_EPOCH = 0
END_EPOCH = 3
MAX_LENGTH = 5
BATCH_SIZE = 256
WORD_EMB_DIM = 32
LSTM_DIM = 32
SAVE_MODEL_PATH = "saved_models/1"
def q(exit_msg=""):  # debug helper: print a message and quit
    print(f"\n>{exit_msg}<")
    sys.exit()
text_data = [
"i like this movie",
"i feel happy watch movie",
"great taste",
"like the look of it",
"great news",
"hate this movie",
"very bad news",
"horrible movie",
"very bad news",
"i do not like it"
]
label_data = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
# BUILD DATASET
class text_dataset():
def __init__(self):
self._build_vocab()
def _build_vocab(self):
words = []
for words_list in [t.split(" ") for t in text_data]:
words.extend(words_list)
        words = sorted(set(words))
        self.item2idx = {"<pad>": 0}  # reserve index 0 for padding
        for w_idx, w in enumerate(words):
            self.item2idx[w] = w_idx + 1
self.idx2item = {w_idx: w for w, w_idx in self.item2idx.items()}
self.vocab_size = len(self.idx2item)
print("self.vocab_size: ", self.vocab_size)
def data_generator(self):
        batch_idx = 0
        while batch_idx < 8:  # 8 random batches per epoch
            sample_indices = [random.randint(0, len(text_data) - 1) for _ in range(BATCH_SIZE)]
            x_raw = [text_data[i] for i in sample_indices]
            y = [label_data[i] for i in sample_indices]
            x_raw = [t.split(" ") for t in x_raw]                   # tokenize
            x_raw = [[self.item2idx[w] for w in s] for s in x_raw]  # words -> indices
            zero_array = np.zeros((BATCH_SIZE, MAX_LENGTH), dtype=np.int64)
            for i in range(len(x_raw)):
                zero_array[i, :len(x_raw[i])] = x_raw[i]  # post-pad with 0 (<pad>)
            x_train = zero_array   # (BATCH_SIZE, MAX_LENGTH)
            y_train = np.array(y)  # (BATCH_SIZE,)
            yield (x_train, y_train)
batch_idx += 1
# BUILD MODEL
class classification_model(tf.keras.Model):
def __init__(self, vocab_size):
        super().__init__()
self.word_emb = tf.keras.layers.Embedding(vocab_size,
WORD_EMB_DIM,
mask_zero=True,
name="word_embedding_layer")
self.lstm = tf.keras.layers.LSTM(LSTM_DIM, return_state=True, name="rnn_layer")
self.dense = tf.keras.layers.Dense(2)
# @tf.function(input_signature=[tf.TensorSpec(shape=[None, MAX_LENGTH], dtype=tf.int64)])
@tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int64)])
def call(self, word_emb_inp, initial_state=None):
        word_emb = self.word_emb(word_emb_inp)                    # (bs, MAX_LEN, WORD_EMB_DIM)
        word_emb_mask = self.word_emb.compute_mask(word_emb_inp)  # (bs, MAX_LEN)
        lstm_out, state_h, state_c = self.lstm(word_emb, mask=word_emb_mask, initial_state=initial_state)
        dense_out = self.dense(lstm_out)                          # logits, (bs, 2)
        return dense_out
# INITIALIZING DATASET AND MODEL
dataset = text_dataset()
model = classification_model(dataset.vocab_size)
# print(help(model.build))
# model.build(input_shape=(None, MAX_LENGTH))
# model.summary()
optimizer = tf.keras.optimizers.Adam()
loss_func = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True,
name='sparse_categorical_crossentropy'
)
# TRAINING
print("\nTRAINING\n")
for e in range(START_EPOCH, END_EPOCH):
print(f"EPOCH: {str(e+1).zfill(len(str(END_EPOCH)))}/{END_EPOCH}")
    train_ds = tf.data.Dataset.from_generator(
        dataset.data_generator,
        output_types=(tf.int64, tf.int64),
        output_shapes=((None, MAX_LENGTH), (None,))
    )
    for batch_idx, (x, y) in enumerate(train_ds):
with tf.GradientTape() as tape:
logits = model(x) # model is supposed to output the logits (BATCH_SIZE, 2)
loss_value = loss_func(y, logits)
print(loss_value.numpy(), end="\r")
grads = tape.gradient(loss_value, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
print(f"loss@epoch#{e}: {loss_value.numpy()}")
print("\nEXPORTING THE MODEL\n")
class MyModule(tf.Module):
    def __init__(self, model, item2idx):
        super().__init__()
        self.model = model
        self._item2idx = item2idx  # JSON string, so it can be returned from a signature
@tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int64)])
def predict(self, inp_seq):
result = self.model(inp_seq)
return { "scores": result }
    @tf.function(input_signature=[])
    def metadata(self):
        # expose the serialized vocab so a client can rebuild item2idx
        return {"item2idx": self._item2idx}
import json
item2idx = json.dumps(dataset.item2idx)
module = MyModule(model, item2idx)
tf.saved_model.save(module,
SAVE_MODEL_PATH,
signatures={ "score": module.predict,
"metadata": module.metadata})
print("\nIMPORTING...")
imported = tf.saved_model.load(SAVE_MODEL_PATH)
inp = tf.constant([[0, 0, 1, 2, 3]], dtype=tf.int64)  # already-indexed input works fine
out = imported.signatures["score"](inp)["scores"].numpy()
pprint(out)
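One workaround I can see is to read item2idx back out of my metadata signature and do the preprocessing in plain Python on the client. A quick sketch (the encode helper here is my own, just for illustration):

import json

meta = imported.signatures["metadata"]()
item2idx = json.loads(meta["item2idx"].numpy().decode("utf-8"))

def encode(sentence, max_length=MAX_LENGTH):
    ids = [item2idx.get(w, 0) for w in sentence.split(" ")]  # unknown words -> 0 (<pad>)
    ids = ids + [0] * (max_length - len(ids))                # post-pad to fixed length
    return tf.constant([ids], dtype=tf.int64)

out = imported.signatures["score"](encode("happy watch movie"))["scores"].numpy()

But that keeps the preprocessing outside the exported model, which is exactly what I want to avoid.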
How can I feed string inputs like the one below directly to the exported model and get predictions back?

inp = tf.constant([["happy", "watch", "movie"]], dtype=tf.string)
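For reference, this is the kind of module I imagine, with the vocabulary baked into the graph as a tf.lookup.StaticHashTable so the lookup travels inside the SavedModel. It is only a sketch of what I have in mind (MyTextModule and the score_text signature name are my own, not from any API), and I have not managed to wire it up yet:

class MyTextModule(tf.Module):
    def __init__(self, model, item2idx):
        super().__init__()
        self.model = model
        # string -> int64 lookup table built from the training vocab
        keys = tf.constant(list(item2idx.keys()))
        values = tf.constant(list(item2idx.values()), dtype=tf.int64)
        self.table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(keys, values),
            default_value=tf.constant(0, dtype=tf.int64))  # unknown words -> 0 (<pad>)

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.string)])
    def predict_from_tokens(self, tokens):
        ids = self.table.lookup(tokens)  # (bs, seq_len) int64
        return {"scores": self.model(ids)}

text_module = MyTextModule(model, dataset.item2idx)
tf.saved_model.save(text_module,
                    SAVE_MODEL_PATH,
                    signatures={"score_text": text_module.predict_from_tokens})

imported = tf.saved_model.load(SAVE_MODEL_PATH)
inp = tf.constant([["happy", "watch", "movie"]], dtype=tf.string)
out = imported.signatures["score_text"](inp)["scores"].numpy()

Is this the right direction, and does a StaticHashTable like this get serialized into the SavedModel automatically?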