How to convert strings to indices in an RNN model when exporting it with saved_model.save

I have a text classification model. I have made a dataset class where I make item2idx and idx2item dictionaries using the text corpus. Then I trained the model, and exported it.
Everything is fine as long as I feed int64 tensors to the model after loading the exported model. But how do I feed in strings and get the model predictions? In other words, how do I handle the text preprocessing (or any post-processing) step after exporting the model?

import tensorflow as tf
print("tf.__version__: ", tf.__version__)
import os, sys, random, pdb
from pprint import pprint
import numpy as np

# Training schedule: the epoch loop runs over range(START_EPOCH, END_EPOCH).
START_EPOCH = 0
END_EPOCH = 3

# Batch geometry: each batch is a (BATCH_SIZE, MAX_LENGTH) matrix of token ids.
MAX_LENGTH = 5
BATCH_SIZE = 256

# Layer widths of the embedding and of the LSTM.
WORD_EMB_DIM = 32
LSTM_DIM = 32

# Directory the SavedModel is written to and later loaded from.
SAVE_MODEL_PATH = "saved_models/1"

def q(exit_msg=""):
    """Debug helper: print ``exit_msg`` between ``>`` and ``<``, then exit.

    The message is preceded by a blank line. The interpreter is stopped via
    ``sys.exit()`` with no exit status argument.
    """
    banner = "\n>{}<".format(exit_msg)
    print(banner)
    sys.exit()

# Toy corpus kept as (sentence, label) pairs so each sentence sits next to its
# class id (presumably 1 = positive sentiment, 0 = negative).
_LABELED_SENTENCES = [
    ("i like this movie", 1),
    ("i feel happy watch movie", 1),
    ("great taste", 1),
    ("like the look of it", 1),
    ("great news", 1),
    ("hate this movie", 0),
    ("very bad news", 0),
    ("horrible movie", 0),
    ("very bad news", 0),
    ("i do not like it", 0),
]

# Parallel lists consumed by the rest of the script: text_data[i] is labelled
# by label_data[i].
text_data = [sentence for sentence, _label in _LABELED_SENTENCES]
label_data = [label for _sentence, label in _LABELED_SENTENCES]

# BUILD DATASET
class text_dataset():
    """Vocabulary builder and random mini-batch generator for a small corpus.

    Attributes:
        texts: list of whitespace-separated sentences.
        labels: list of integer class ids, parallel to ``texts``.
        item2idx: token -> id mapping; id 0 is reserved for ``"<pad>"``.
        idx2item: inverse of ``item2idx``.
        vocab_size: number of ids, padding included.
    """

    def __init__(self, texts=None, labels=None):
        """Store the corpus and build the vocabulary.

        Args:
            texts: sentences to use; defaults to the module-level ``text_data``.
            labels: class ids parallel to ``texts``; defaults to the
                module-level ``label_data``.

        Raises:
            ValueError: if ``texts`` and ``labels`` differ in length.
        """
        self.texts = list(text_data if texts is None else texts)
        self.labels = list(label_data if labels is None else labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"got {len(self.texts)} texts but {len(self.labels)} labels"
            )
        self._build_vocab()

    def _build_vocab(self):
        """Assign ids 1..N to the sorted unique tokens; id 0 is ``"<pad>"``."""
        # str.split() with no argument collapses runs of whitespace, so a
        # doubled space cannot introduce an empty-string token.
        words = sorted({w for sentence in self.texts for w in sentence.split()})

        self.item2idx = {"<pad>": 0}
        for w_idx, w in enumerate(words, start=1):
            self.item2idx[w] = w_idx

        self.idx2item = {w_idx: w for w, w_idx in self.item2idx.items()}
        self.vocab_size = len(self.idx2item)
        print("self.vocab_size: ", self.vocab_size)

    def data_generator(self, batch_size=None, max_length=None, num_batches=8):
        """Yield ``num_batches`` randomly sampled ``(x, y)`` batches.

        Sentences are sampled uniformly with replacement, converted to token
        ids, truncated to ``max_length`` and right-padded with 0.

        Args:
            batch_size: rows per batch; defaults to the module-level
                ``BATCH_SIZE``.
            max_length: columns per batch; defaults to the module-level
                ``MAX_LENGTH``.
            num_batches: how many batches to yield before stopping.

        Yields:
            ``(x_train, y_train)`` where ``x_train`` is an int64 array of shape
            ``(batch_size, max_length)`` and ``y_train`` is an int64 array of
            shape ``(batch_size,)``.

        Raises:
            ValueError: if the corpus is empty.
            KeyError: if a sentence contains a token missing from ``item2idx``.
        """
        batch_size = BATCH_SIZE if batch_size is None else batch_size
        max_length = MAX_LENGTH if max_length is None else max_length
        if not self.texts:
            raise ValueError("cannot sample batches from an empty corpus")

        corpus_size = len(self.texts)
        for _ in range(num_batches):
            sample_indices = [random.randrange(corpus_size) for _ in range(batch_size)]

            # Integer buffer: these are ids fed to an Embedding layer, and the
            # tf.data pipeline declares int64 for both outputs.
            x_train = np.zeros((batch_size, max_length), dtype=np.int64)
            for row, sample_idx in enumerate(sample_indices):
                ids = [self.item2idx[w] for w in self.texts[sample_idx].split()]
                ids = ids[:max_length]  # over-long sentences are truncated
                x_train[row, :len(ids)] = ids

            y_train = np.array(
                [self.labels[i] for i in sample_indices], dtype=np.int64
            )
            yield x_train, y_train


# BUILD MODEL
class classification_model(tf.keras.Model):
    """Embedding -> LSTM -> Dense(2) classifier over padded token-id sequences.

    Id 0 is treated as padding: the embedding layer is built with
    ``mask_zero=True`` and its mask is forwarded to the LSTM.
    """

    def __init__(self, vocab_size):
        """Create the three layers.

        Args:
            vocab_size: number of distinct token ids (embedding input size).
        """
        super().__init__()
        self.word_emb = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=WORD_EMB_DIM,
            mask_zero=True,
            name="word_embedding_layer",
        )
        # return_state=True makes the layer return (output, state_h, state_c).
        self.lstm = tf.keras.layers.LSTM(
            units=LSTM_DIM,
            return_state=True,
            name="rnn_layer",
        )
        self.dense = tf.keras.layers.Dense(units=2)

    # Both the batch and the sequence dimension are left dynamic in the
    # signature (a fixed [None, MAX_LENGTH] variant was tried and dropped).
    @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int64)])
    def call(self, word_emb_inp, initial_state=None):
        """Map int64 token ids of shape (batch, seq_len) to (batch, 2) scores.

        Args:
            word_emb_inp: int64 tensor of token ids.
            initial_state: optional initial LSTM state, passed through as-is.

        Returns:
            The Dense layer's output for the LSTM's final output; the final
            hidden and cell states are discarded.
        """
        embedded = self.word_emb(word_emb_inp)              # (bs, seq_len, WORD_EMB_DIM)
        pad_mask = self.word_emb.compute_mask(word_emb_inp)  # (bs, seq_len)

        lstm_out, _state_h, _state_c = self.lstm(
            embedded, mask=pad_mask, initial_state=initial_state
        )
        return self.dense(lstm_out)


# INITIALIZING DATASET AND MODEL
dataset = text_dataset()
model = classification_model(vocab_size=dataset.vocab_size)

# Optional inspection, left disabled:
# model.build(input_shape=(None, MAX_LENGTH))
# model.summary()

# Adam with its default hyper-parameters.
optimizer = tf.keras.optimizers.Adam()

# Labels are integer class ids and the model's outputs are used as raw
# logits, hence the sparse loss with from_logits=True.
loss_func = tf.keras.losses.SparseCategoricalCrossentropy(
    name='sparse_categorical_crossentropy',
    from_logits=True,
)

# TRAINING
print("\nTRAINING\n")

# The pipeline is loop-invariant, so it is built once. Every `for ... in
# train_gen` below starts a new iterator, which calls
# dataset.data_generator again and therefore draws fresh random batches.
train_gen = tf.data.Dataset.from_generator(
    dataset.data_generator,
    output_types=(tf.dtypes.int64, tf.dtypes.int64),
    output_shapes=((None, MAX_LENGTH), (None,)),
)

for e in range(START_EPOCH, END_EPOCH):
    print(f"EPOCH: {str(e+1).zfill(len(str(END_EPOCH)))}/{END_EPOCH}")

    for x, y in train_gen:
        # Only the forward pass and the loss need to be recorded by the tape.
        with tf.GradientTape() as tape:
            logits = model(x)  # model is supposed to output the logits (BATCH_SIZE, 2)
            loss_value = loss_func(y, logits)

        # Gradients and the optimizer step are computed after leaving the
        # tape context so the tape does not also record those operations.
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print(loss_value.numpy(), end="\r")

    # Loss of the last batch of the epoch.
    print(f"loss@epoch#{e}: {loss_value.numpy()}")


print("\nEXPORTING THE MODEL\n")
import json


class MyModule(tf.Module):
    """Export wrapper that bundles the classifier with its vocabulary.

    The token -> id mapping is stored in a ``tf.lookup.StaticHashTable`` owned
    by the module, so the exported SavedModel can take strings directly.
    Tokens missing from the vocabulary are mapped to the padding id, which the
    model masks out.
    """

    def __init__(self, model, item2idx):
        """Wrap ``model`` and build the string -> id lookup table.

        Args:
            model: the trained classifier; called with int64 ids of shape
                (batch, seq_len).
            item2idx: the vocabulary, either as a ``{token: id}`` mapping or
                as the JSON string of such a mapping.
        """
        super().__init__()
        self.model = model

        if isinstance(item2idx, str):
            vocab = json.loads(item2idx)
            self._item2idx = item2idx
        else:
            vocab = dict(item2idx)
            self._item2idx = json.dumps(vocab)

        pad_id = vocab.get("<pad>", 0)
        self._table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(
                keys=tf.constant(list(vocab.keys()), dtype=tf.string),
                values=tf.constant(list(vocab.values()), dtype=tf.int64),
            ),
            default_value=pad_id,
        )
        self._pad_id = pad_id

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int64)])
    def predict(self, inp_seq):
        """Score already-indexed sequences: int64 ids of shape (batch, seq_len)."""
        result = self.model(inp_seq)
        return {"scores": result}

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.string)])
    def predict_tokens(self, tokens):
        """Score pre-tokenized input: strings of shape (batch, seq_len).

        Shorter rows can be padded with ``"<pad>"`` (or any unknown token such
        as ``""``); both resolve to the padding id.
        """
        ids = self._table.lookup(tf.strings.lower(tokens))
        return {"scores": self.model(ids)}

    @tf.function(input_signature=[tf.TensorSpec(shape=(None,), dtype=tf.string)])
    def predict_text(self, sentences):
        """Score raw sentences: one whitespace-separated string per example."""
        # Splitting a batch of strings yields a RaggedTensor (rows of
        # different lengths); look the tokens up on its flat values, then pad
        # every row to the longest sentence in the batch.
        words = tf.strings.split(tf.strings.lower(sentences))
        ragged_ids = tf.ragged.map_flat_values(self._table.lookup, words)
        ids = ragged_ids.to_tensor(default_value=self._pad_id)
        return {"scores": self.model(ids)}

    @tf.function(input_signature=[])
    def metadata(self):
        """Return the vocabulary as a JSON string tensor."""
        return {"item2idx": tf.constant(self._item2idx)}


item2idx = json.dumps(dataset.item2idx)
module = MyModule(model, item2idx)
tf.saved_model.save(
    module,
    SAVE_MODEL_PATH,
    signatures={
        "score": module.predict,
        "score_tokens": module.predict_tokens,
        "score_text": module.predict_text,
        "metadata": module.metadata,
    },
)



print("\nIMPORTING...")
imported = tf.saved_model.load(SAVE_MODEL_PATH)

# 1) Already-indexed input (unchanged).
inp = tf.constant([[0, 0, 1, 2, 3]], dtype=tf.int64)
out = imported.signatures["score"](inp)["scores"].numpy()
pprint(out)

# 2) Pre-tokenized string input; the lookup runs inside the SavedModel.
inp = tf.constant([["happy", "watch", "movie"]], dtype=tf.string)
out = imported.signatures["score_tokens"](tokens=inp)["scores"].numpy()
pprint(out)

# 3) Raw sentences; splitting, lookup and padding all run inside the SavedModel.
inp = tf.constant(["happy watch movie", "very bad news"], dtype=tf.string)
out = imported.signatures["score_text"](sentences=inp)["scores"].numpy()
pprint(out)

Hi @n0obcoder

Welcome to the TensorFlow Forum!

You can use the TextVectorization layer to do the text preprocessing. Please refer to this tutorial on text classification with an RNN, which might be helpful for you.

You can also have a look at the StringLookup layer, which maps strings to integer indices; the linked documentation shows how to use it to preprocess the text. Thank you.