I studied the example given by the tensor flow documentation for the movielens dataset but then never explained how to handle boolean and array data types and how to create embedding for them.
So I have written some code but not able to understand where I am wrong
import numpy as np
from math import ceil
import tensorflow as tf
from tensorflow_datasets.core import download
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
tf.autograph.set_verbosity(10)
tf.get_logger().setLevel('CRITICAL')
class DCN(tfrs.Model):
def __init__(self, use_cross_layer, deep_layer_sizes, datainfo, projection_dim=None):
super().__init__()
self.embedding_dimension = 32
self._all_features = datainfo['all_features']
self._embeddings = {}
# Compute embeddings for string features.
for feature_name in datainfo['str_features']:
vocabulary = datainfo['vocabularies'][feature_name]
self._embeddings[feature_name] = tf.keras.Sequential([
tf.keras.layers.experimental.preprocessing.StringLookup(
vocabulary=vocabulary, mask_token=None),
tf.keras.layers.Embedding(
len(vocabulary) + 1, self.embedding_dimension)
], name=feature_name)
for feature_name in datainfo['int_lookup_feature']+datainfo['list_features']:
vocabulary = datainfo['vocabularies'][feature_name]
self._embeddings[feature_name] = tf.keras.Sequential([
tf.keras.layers.experimental.preprocessing.IntegerLookup(
vocabulary=vocabulary, mask_token=None),
tf.keras.layers.Embedding(
len(vocabulary) + 1, self.embedding_dimension)
], name=feature_name)
for feature_name in datainfo['bool_features']:
self._embeddings[feature_name] = tf.keras.Sequential([
tf.keras.layers.Activation('relu'),
tf.keras.layers.Dense(units=1)
], name=feature_name)
if use_cross_layer:
self._cross_layer = tfrs.layers.dcn.Cross(
projection_dim=projection_dim,
kernel_initializer="glorot_uniform")
else:
self._cross_layer = None
self._deep_layers = [tf.keras.layers.Dense(layer_size, activation="relu")
for layer_size in deep_layer_sizes]
self._logit_layer = tf.keras.layers.Dense(1)
self.task = tfrs.tasks.Ranking(
loss=tf.keras.losses.MeanSquaredError(),
metrics=[tf.keras.metrics.RootMeanSquaredError("RMSE")]
)
def call(self, features):
# Concatenate embeddings
embeddings = []
for feature_name in self._all_features:
embedding_fn = self._embeddings.get(feature_name, None)
if embedding_fn is not None:
embeddings.append(embedding_fn(features[feature_name]))
x = tf.concat(embeddings, axis=1)
# Build Cross Network
if self._cross_layer is not None:
x = self._cross_layer(x)
# Build Deep Network
for deep_layer in self._deep_layers:
x = deep_layer(x)
return self._logit_layer(x)
def compute_loss(self, features, training=False):
labels = features.pop("user_rating")
scores = self(features)
return self.task(
labels=labels,
predictions=scores,
)
def main():
tf.random.set_seed(42)
ds = tfds.load("movie_lens/100k-ratings", split="train")
ds = ds.map(lambda x: {
"movie_id": x["movie_id"],
"user_id": x["user_id"],
"user_rating": x["user_rating"],
"user_gender": int(x["user_gender"]),
"user_zip_code": x["user_zip_code"],
"user_occupation_text": x["user_occupation_text"],
"bucketized_user_age": int(x["bucketized_user_age"]),
"movie_genres": x["movie_genres"],
})
dataLen = len(ds)
trainLen = ceil(dataLen*0.8)
testLen = dataLen - trainLen
shuffled = ds.shuffle(100, reshuffle_each_iteration=False)
str_features = ["movie_id", "user_id",
"user_zip_code", "user_occupation_text"]
int_lookup_feature = ["bucketized_user_age"]
list_features = ["movie_genres"]
bool_features = ["user_gender"]
all_features = str_features + \
bool_features + list_features + int_lookup_feature
vocabularies = {}
dataValues = {}
for feature_name in str_features+int_lookup_feature:
vocab = shuffled.map(lambda x: x[feature_name])
vocabularies[feature_name] = np.unique(
[i.numpy() for i in list(vocab)]).tolist()
for feature_name in list_features:
vocab = shuffled.map(lambda x: x[feature_name])
vocabularies[feature_name] = np.unique(
np.concatenate(list(vocab))).tolist()
datainfo = {
'all_features': all_features,
'str_features': str_features,
'list_features': list_features,
'int_lookup_feature': int_lookup_feature,
'bool_features': bool_features,
'vocabularies': vocabularies,
'dataValues': dataValues
}
train = shuffled.take(trainLen)
test = shuffled.skip(trainLen).take(testLen)
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()
epochs = 8
learning_rate = 0.01
use_cross_layer = True
deep_layer_sizes = [192, 192]
projection_dim = None
model = DCN(use_cross_layer=use_cross_layer,
deep_layer_sizes=deep_layer_sizes,
projection_dim=projection_dim,
datainfo=datainfo)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
model.fit(cached_train, epochs=epochs, verbose=True)
metrics = model.evaluate(cached_test, return_dict=True)
print(metrics)
main()
It is giving me the wrong dimensions error in my layers but I could not figure out which layer.