I’m trying to follow the Classify text with BERT tutorial from TensorFlow, but with a different dataset, for multilabel text classification.
This is how the model is created in the tutorial:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text  # registers the custom ops the preprocessing model needs

# bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

def build_classifier_model(input_shape, output_dim):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(output_dim, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

classifier_model = build_classifier_model(None, lookup.vocabulary_size())
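As a sanity check, the model can be called directly on a hand-made batch of strings (example text made up; this assumes lookup has already been adapted, as shown in the next snippet):

# smoke test: the preprocessing model accepts a 1-D batch of strings
sample = tf.constant(["first example text", "second example text"])
preds = classifier_model(sample)
print(preds.shape)  # (2, lookup.vocabulary_size())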
This is how I create my datasets:
padding_token = "<pad>"
AUTOTUNE = tf.data.AUTOTUNE

def make_dataset(dataframe, lookup, is_train=True):
    # each row has an uneven number of labels, hence the ragged tensor
    labels = tf.ragged.constant(dataframe["ATTRIBUTE_VALUE"].values)
    label_binarized = lookup(labels).numpy()  # multi-hot encoding
    # combine text and multi-hot labels into one tf.data dataset
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["TEXT"].values, label_binarized)
    )
    return dataset

terms = tf.ragged.constant(train_df["ATTRIBUTE_VALUE"].values)
# maps attribute values to a multi-hot encoding
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
lookup.adapt(terms)

train_dataset = make_dataset(train_df, lookup, is_train=True)
validation_dataset = make_dataset(val_df, lookup, is_train=False)
test_dataset = make_dataset(test_df, lookup, is_train=False)

train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
validation_dataset = validation_dataset.cache().prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)
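For context, this is roughly what the lookup layer produces on a toy example (label names made up; by default StringLookup reserves an out-of-vocabulary slot, so the multi-hot width is vocabulary_size() including that slot):

toy_labels = tf.ragged.constant([["red", "blue"], ["blue"]])
toy_lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
toy_lookup.adapt(toy_labels)
print(toy_lookup.get_vocabulary())     # e.g. ['[UNK]', 'blue', 'red']
print(toy_lookup(toy_labels).numpy())  # e.g. [[0. 1. 1.], [0. 1. 0.]]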
After creating the datasets, the text tensor of each element has shape=(). When I try to train the model:
from official.nlp import optimization  # provides the AdamW optimizer used in the tutorial

epochs = 5      # value assumed, not shown in my snippet
init_lr = 3e-5  # value assumed, not shown in my snippet

steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)

optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

classifier_model.compile(optimizer=optimizer,
                         # the classifier head has activation=None, so the loss
                         # must be computed from logits
                         loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                         metrics=[tf.keras.metrics.Recall()])

history = classifier_model.fit(x=train_dataset,
                               validation_data=validation_dataset,
                               epochs=epochs)
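One thing I noticed while debugging: tf.data.experimental.cardinality counts dataset elements, so on an unbatched dataset steps_per_epoch equals the number of rows rather than the number of batches:

ds = tf.data.Dataset.from_tensor_slices(tf.range(10))
print(tf.data.experimental.cardinality(ds).numpy())           # 10 (one element per row)
print(tf.data.experimental.cardinality(ds.batch(4)).numpy())  # 3 (ceil(10 / 4) batches)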
I get the following error, which I believe is saying my dataset needs to be of shape=(None,) instead of shape=():
ValueError: Exception encountered when calling layer "preprocessing" (type KerasLayer).

in user code:

    File "/opt/anaconda3/envs/mv_prodmatch/lib/python3.10/site-packages/tensorflow_hub/keras_layer.py", line 237, in call  *
        result = smart_cond.smart_cond(training,

    ValueError: Could not find matching concrete function to call loaded from the SavedModel.
    Got:
      Positional arguments (3 total):
        * Tensor("inputs:0", shape=(), dtype=string)
        * False
        * None
      Keyword arguments: {}

    Expected these arguments to match one of the following 4 option(s):

    Option 1:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='sentences')
        * False
        * None
      Keyword arguments: {}

    Option 2:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='sentences')
        * True
        * None
      Keyword arguments: {}

    Option 3:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='inputs')
        * False
        * None
      Keyword arguments: {}

    Option 4:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='inputs')
        * True
        * None
      Keyword arguments: {}

Call arguments received:
  • inputs=tf.Tensor(shape=(), dtype=string)
  • training=True
How can I make my dataset have shape=(None,) instead of shape=()?
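From the tf.data docs it looks like calling .batch() would turn the scalar string elements into batches of shape (None,), which matches the signature the preprocessing layer expects. A minimal sketch with made-up data:

texts = tf.constant(["a", "b", "c"])
labels = tf.constant([[0., 1.], [1., 0.], [1., 1.]])
ds = tf.data.Dataset.from_tensor_slices((texts, labels))

print(ds.element_spec[0])            # TensorSpec(shape=(), dtype=tf.string, ...)
print(ds.batch(32).element_spec[0])  # TensorSpec(shape=(None,), dtype=tf.string, ...)

Is adding .batch() before .cache().prefetch(...) the right way to get shape=(None,) here?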