How to add array feature in tensorflow recommendation

Himanshu_Bansal · June 24, 2021, 3:30pm

I studied the example given by the tensor flow documentation for the movielens dataset but then never explained how to handle boolean and array data types and how to create embedding for them.

So I have written some code but not able to understand where I am wrong

import numpy as np
from math import ceil

import tensorflow as tf
from tensorflow_datasets.core import download

import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

tf.autograph.set_verbosity(10)
tf.get_logger().setLevel('CRITICAL')


class DCN(tfrs.Model):

    def __init__(self, use_cross_layer, deep_layer_sizes, datainfo, projection_dim=None):
        super().__init__()

        self.embedding_dimension = 32

        self._all_features = datainfo['all_features']
        self._embeddings = {}

        # Compute embeddings for string features.
        for feature_name in datainfo['str_features']:
            vocabulary = datainfo['vocabularies'][feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.experimental.preprocessing.StringLookup(
                    vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(
                    len(vocabulary) + 1, self.embedding_dimension)
            ], name=feature_name)

        for feature_name in datainfo['int_lookup_feature']+datainfo['list_features']:
            vocabulary = datainfo['vocabularies'][feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.experimental.preprocessing.IntegerLookup(
                    vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(
                    len(vocabulary) + 1, self.embedding_dimension)
            ], name=feature_name)

        for feature_name in datainfo['bool_features']:
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.Activation('relu'),
                tf.keras.layers.Dense(units=1)
            ], name=feature_name)

        if use_cross_layer:
            self._cross_layer = tfrs.layers.dcn.Cross(
                projection_dim=projection_dim,
                kernel_initializer="glorot_uniform")
        else:
            self._cross_layer = None

        self._deep_layers = [tf.keras.layers.Dense(layer_size, activation="relu")
                             for layer_size in deep_layer_sizes]

        self._logit_layer = tf.keras.layers.Dense(1)

        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError("RMSE")]
        )

    def call(self, features):
        # Concatenate embeddings
        embeddings = []
        for feature_name in self._all_features:
            embedding_fn = self._embeddings.get(feature_name, None)
            if embedding_fn is not None:
                embeddings.append(embedding_fn(features[feature_name]))

        x = tf.concat(embeddings, axis=1)

        # Build Cross Network
        if self._cross_layer is not None:
            x = self._cross_layer(x)

        # Build Deep Network
        for deep_layer in self._deep_layers:
            x = deep_layer(x)

        return self._logit_layer(x)

    def compute_loss(self, features, training=False):
        labels = features.pop("user_rating")
        scores = self(features)
        return self.task(
            labels=labels,
            predictions=scores,
        )


def main():
    tf.random.set_seed(42)
    ds = tfds.load("movie_lens/100k-ratings", split="train")
    ds = ds.map(lambda x: {
        "movie_id": x["movie_id"],
        "user_id": x["user_id"],
        "user_rating": x["user_rating"],
        "user_gender": int(x["user_gender"]),
        "user_zip_code": x["user_zip_code"],
        "user_occupation_text": x["user_occupation_text"],
        "bucketized_user_age": int(x["bucketized_user_age"]),
        "movie_genres": x["movie_genres"],
    })
    dataLen = len(ds)
    trainLen = ceil(dataLen*0.8)
    testLen = dataLen - trainLen
    shuffled = ds.shuffle(100, reshuffle_each_iteration=False)

    str_features = ["movie_id", "user_id",
                    "user_zip_code", "user_occupation_text"]
    int_lookup_feature = ["bucketized_user_age"]
    list_features = ["movie_genres"]
    bool_features = ["user_gender"]

    all_features = str_features + \
        bool_features + list_features + int_lookup_feature
    vocabularies = {}
    dataValues = {}

    for feature_name in str_features+int_lookup_feature:
        vocab = shuffled.map(lambda x: x[feature_name])
        vocabularies[feature_name] = np.unique(
            [i.numpy() for i in list(vocab)]).tolist()

    for feature_name in list_features:
        vocab = shuffled.map(lambda x: x[feature_name])
        vocabularies[feature_name] = np.unique(
            np.concatenate(list(vocab))).tolist()

    datainfo = {
        'all_features': all_features,
        'str_features': str_features,
        'list_features': list_features,
        'int_lookup_feature': int_lookup_feature,
        'bool_features': bool_features,
        'vocabularies': vocabularies,
        'dataValues': dataValues
    }

    train = shuffled.take(trainLen)
    test = shuffled.skip(trainLen).take(testLen)

    cached_train = train.shuffle(100_000).batch(8192).cache()
    cached_test = test.batch(4096).cache()

    epochs = 8
    learning_rate = 0.01
    use_cross_layer = True
    deep_layer_sizes = [192, 192]
    projection_dim = None

    model = DCN(use_cross_layer=use_cross_layer,
                deep_layer_sizes=deep_layer_sizes,
                projection_dim=projection_dim,
                datainfo=datainfo)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
    model.fit(cached_train, epochs=epochs, verbose=True)
    metrics = model.evaluate(cached_test, return_dict=True)

    print(metrics)


main()

It is giving me the wrong dimensions error in my layers but I could not figure out which layer.

Jetti_Bharat · October 11, 2024, 7:20am

Hello @Himanshu_Bansal

Thank you for using TensorFlow
In the code provided, one work around would be to print all the layers shape and see if the shape is expected embedding_output = embedding_fn(features[feature_name]) print(f" {feature_name}: {embedding_output.shape}") in the call function.
In the definition of model for computation of embeddings, we can add GlobalAveragepooling layer to handle 1D lists in the dataset.

Thank you

Topic		Replies	Views
IndexError: tuple index out of range while doing prediction in tensorflow model General Discussion models , recommenders , keras , education , help_request	3	3930	July 26, 2022
AttributeError: Layer retrieval_1 has no inbound nodes General Discussion models , datasets , keras	2	1696	October 21, 2022
When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: General Discussion models , help_request , tensorflow	2	4141	April 10, 2023
Getting Nan Loss when training Deep neural Recommender model using tensorflow General Discussion models , recommenders , help_request	2	5992	June 6, 2022
'StringLookup' object has no attribute 'vocab_size' TensorFlow recommenders , tfkeras , tensorflow	6	294	September 10, 2024

How to add array feature in tensorflow recommendation

Related topics