How to add array feature in tensorflow recommendation

I studied the example given by the tensor flow documentation for the movielens dataset but then never explained how to handle boolean and array data types and how to create embedding for them.

So I have written some code but not able to understand where I am wrong

import numpy as np
from math import ceil

import tensorflow as tf
from tensorflow_datasets.core import download

import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

tf.autograph.set_verbosity(10)
tf.get_logger().setLevel('CRITICAL')


class DCN(tfrs.Model):

    def __init__(self, use_cross_layer, deep_layer_sizes, datainfo, projection_dim=None):
        super().__init__()

        self.embedding_dimension = 32

        self._all_features = datainfo['all_features']
        self._embeddings = {}

        # Compute embeddings for string features.
        for feature_name in datainfo['str_features']:
            vocabulary = datainfo['vocabularies'][feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.experimental.preprocessing.StringLookup(
                    vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(
                    len(vocabulary) + 1, self.embedding_dimension)
            ], name=feature_name)

        for feature_name in datainfo['int_lookup_feature']+datainfo['list_features']:
            vocabulary = datainfo['vocabularies'][feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.experimental.preprocessing.IntegerLookup(
                    vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(
                    len(vocabulary) + 1, self.embedding_dimension)
            ], name=feature_name)

        for feature_name in datainfo['bool_features']:
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.Activation('relu'),
                tf.keras.layers.Dense(units=1)
            ], name=feature_name)

        if use_cross_layer:
            self._cross_layer = tfrs.layers.dcn.Cross(
                projection_dim=projection_dim,
                kernel_initializer="glorot_uniform")
        else:
            self._cross_layer = None

        self._deep_layers = [tf.keras.layers.Dense(layer_size, activation="relu")
                             for layer_size in deep_layer_sizes]

        self._logit_layer = tf.keras.layers.Dense(1)

        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError("RMSE")]
        )

    def call(self, features):
        # Concatenate embeddings
        embeddings = []
        for feature_name in self._all_features:
            embedding_fn = self._embeddings.get(feature_name, None)
            if embedding_fn is not None:
                embeddings.append(embedding_fn(features[feature_name]))

        x = tf.concat(embeddings, axis=1)

        # Build Cross Network
        if self._cross_layer is not None:
            x = self._cross_layer(x)

        # Build Deep Network
        for deep_layer in self._deep_layers:
            x = deep_layer(x)

        return self._logit_layer(x)

    def compute_loss(self, features, training=False):
        labels = features.pop("user_rating")
        scores = self(features)
        return self.task(
            labels=labels,
            predictions=scores,
        )


def main():
    tf.random.set_seed(42)
    ds = tfds.load("movie_lens/100k-ratings", split="train")
    ds = ds.map(lambda x: {
        "movie_id": x["movie_id"],
        "user_id": x["user_id"],
        "user_rating": x["user_rating"],
        "user_gender": int(x["user_gender"]),
        "user_zip_code": x["user_zip_code"],
        "user_occupation_text": x["user_occupation_text"],
        "bucketized_user_age": int(x["bucketized_user_age"]),
        "movie_genres": x["movie_genres"],
    })
    dataLen = len(ds)
    trainLen = ceil(dataLen*0.8)
    testLen = dataLen - trainLen
    shuffled = ds.shuffle(100, reshuffle_each_iteration=False)

    str_features = ["movie_id", "user_id",
                    "user_zip_code", "user_occupation_text"]
    int_lookup_feature = ["bucketized_user_age"]
    list_features = ["movie_genres"]
    bool_features = ["user_gender"]

    all_features = str_features + \
        bool_features + list_features + int_lookup_feature
    vocabularies = {}
    dataValues = {}

    for feature_name in str_features+int_lookup_feature:
        vocab = shuffled.map(lambda x: x[feature_name])
        vocabularies[feature_name] = np.unique(
            [i.numpy() for i in list(vocab)]).tolist()

    for feature_name in list_features:
        vocab = shuffled.map(lambda x: x[feature_name])
        vocabularies[feature_name] = np.unique(
            np.concatenate(list(vocab))).tolist()

    datainfo = {
        'all_features': all_features,
        'str_features': str_features,
        'list_features': list_features,
        'int_lookup_feature': int_lookup_feature,
        'bool_features': bool_features,
        'vocabularies': vocabularies,
        'dataValues': dataValues
    }

    train = shuffled.take(trainLen)
    test = shuffled.skip(trainLen).take(testLen)

    cached_train = train.shuffle(100_000).batch(8192).cache()
    cached_test = test.batch(4096).cache()

    epochs = 8
    learning_rate = 0.01
    use_cross_layer = True
    deep_layer_sizes = [192, 192]
    projection_dim = None

    model = DCN(use_cross_layer=use_cross_layer,
                deep_layer_sizes=deep_layer_sizes,
                projection_dim=projection_dim,
                datainfo=datainfo)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
    model.fit(cached_train, epochs=epochs, verbose=True)
    metrics = model.evaluate(cached_test, return_dict=True)

    print(metrics)


main()

It is giving me the wrong dimensions error in my layers but I could not figure out which layer.

1 Like

Hello @Himanshu_Bansal

Thank you for using TensorFlow
In the code provided, one work around would be to print all the layers shape and see if the shape is expected embedding_output = embedding_fn(features[feature_name]) print(f" {feature_name}: {embedding_output.shape}") in the call function.
In the definition of model for computation of embeddings, we can add GlobalAveragepooling layer to handle 1D lists in the dataset.

Thank you