I’m trying to train three models together against four training objectives. To do so, I define a trainer model
import keras
import qarac.models.QaracEncoderModel
import qarac.models.QaracDecoderModel


class QaracTrainerModel(keras.Model):

    def __init__(self, base_encoder_model, base_decoder_model, tokenizer):
        """
        Sets up the Trainer model

        Parameters
        ----------
        base_encoder_model : transformers.TFRobertaModel
            Base model for encoders.
        base_decoder_model : transformers.TFRobertaModel
            Base model for decoder
        tokenizer : transformers.RobertaTokenizer
            Tokenizer for decoder

        Returns
        -------
        None.

        """
        super(QaracTrainerModel, self).__init__()
        self.question_encoder = qarac.models.QaracEncoderModel.QaracEncoderModel(base_encoder_model)
        self.answer_encoder = qarac.models.QaracEncoderModel.QaracEncoderModel(base_encoder_model)
        self.decoder = qarac.models.QaracDecoderModel.QaracDecoderModel(base_decoder_model, tokenizer)
        self.consistency = keras.layers.Dot(axes=1, normalize=True)

    def call(self, inputs, training=None):
        """
        Generates training objective outputs from training data

        Parameters
        ----------
        inputs : dict[str, tensorflow.Tensor]
            Fields are
            'all_text': tokenized text to train the answer encoder to produce vectors
                        and the decoder to convert them back to text
            'offset_text': same text as in 'all_text', but preceded by <s>
            'question': tokenized text of questions for the question-answering objective
            'answer': tokenized text of answers for the question-answering objective
            'proposition0': tokenized proposition for the reasoning objective
            'proposition1': tokenized proposition for the reasoning objective
            'conclusion_offset': tokenized text of conclusions for the reasoning
                                 objective, prefixed by <s>
            'statement0': tokenized statement for the consistency objective
            'statement1': tokenized statement for the consistency objective
        training : bool, optional
            Not used. The default is None.

        Returns
        -------
        results : dict[str, tensorflow.Tensor]
            Fields are
            'encode_decode': tokenized text from decoding the vectors produced by the
                             answer encoder from 'all_text'
            'question_answering': difference between the vector produced by the question
                                  encoder for 'question' and the answer encoder for 'answer'
            'reasoning': tokenized text produced by the decoder from the sum of the vectors
                         produced by the answer encoder for 'proposition0' and 'proposition1'
            'consistency': cosine similarity of the vectors produced by the answer encoder
                           from 'statement0' and 'statement1'

        """
        results = {}
        results['encode_decode'] = self.decoder((self.answer_encoder(inputs['all_text']),
                                                 inputs['offset_text']))
        results['question_answering'] = (self.question_encoder(inputs['question'])
                                         - self.answer_encoder(inputs['answer']))
        results['reasoning'] = self.decoder((self.answer_encoder(inputs['proposition0'])
                                             + self.answer_encoder(inputs['proposition1']),
                                             inputs['conclusion_offset']))
        results['consistency'] = self.consistency((self.answer_encoder(inputs['statement0']),
                                                   self.answer_encoder(inputs['statement1'])))
        return results
and compile and fit it with the following code:
import keras
import tokenizers
import transformers
import qarac.corpora.CombinedCorpus
import qarac.models.QaracTrainerModel


def train_models(path):
    encoder_base = transformers.TFRobertaModel.from_pretrained('roberta-base')
    config = encoder_base.config
    config.is_decoder = True
    decoder_base = transformers.TFRobertaModel.from_pretrained('roberta-base',
                                                               config=config)
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    trainer = qarac.models.QaracTrainerModel.QaracTrainerModel(encoder_base,
                                                               decoder_base,
                                                               tokenizer)
    losses = {'encode_decode': keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              'question_answering': keras.losses.mean_squared_error,
              'reasoning': keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              'consistency': keras.losses.mean_squared_error}
    optimizer = keras.optimizers.Nadam(learning_rate=keras.optimizers.schedules.ExponentialDecay(1.0e-5, 100, 0.99))
    trainer.compile(optimizer=optimizer,
                    loss=losses)
    training_data = qarac.corpora.CombinedCorpus.CombinedCorpus(tokenizer,
                                                                all_text='corpora/all_text.csv',
                                                                question_answering='corpora/question_answering.csv',
                                                                reasoning='corpora/reasoning_train.csv',
                                                                consistency='corpora/consistency.csv')
    history = trainer.fit(training_data,
                          epochs=10)
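For context, each batch yielded by CombinedCorpus is a pair of dicts whose keys line up with the inputs expected by call() and with the loss names passed to compile() (values elided here, this is just to show the structure):

# Illustrative structure of one batch from CombinedCorpus (values elided)
inputs = {'all_text': ...,            # tokenized text
          'offset_text': ...,         # same text, preceded by <s>
          'question': ...,
          'answer': ...,
          'proposition0': ...,
          'proposition1': ...,
          'conclusion_offset': ...,
          'statement0': ...,
          'statement1': ...}
targets = {'encode_decode': ...,
           'question_answering': ...,
           'reasoning': ...,
           'consistency': ...}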
However, I’m getting the following warning
WARNING:tensorflow:Gradients do not exist for variables ['tf_roberta_model/roberta/pooler/dense/kernel:0', 'tf_roberta_model/roberta/pooler/dense/bias:0', 'qarac_trainer_model/qarac_encoder_model/global_attention_pooling_head/local projection:0', 'qarac_trainer_model/qarac_encoder_model_1/global_attention_pooling_head_1/local projection:0', 'tf_roberta_model_1/roberta/pooler/dense/kernel:0', 'tf_roberta_model_1/roberta/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?
which suggests to me that these weights won't be updated during training.
Can anyone suggest what I need to do to fix this?
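One way I can think of to inspect this is a manual gradient pass over a single batch with tf.GradientTape (a rough sketch, not part of the training code; inputs and targets stand for one batch drawn from training_data, and losses is the dict passed to compile() above):

import tensorflow as tf

# Rough diagnostic sketch: report which trainable variables receive no gradient.
# `inputs` and `targets` stand for one batch drawn from training_data.
inputs, targets = next(iter(training_data))
with tf.GradientTape() as tape:
    outputs = trainer(inputs, training=True)
    loss = tf.add_n([tf.reduce_mean(losses[name](targets[name], outputs[name]))
                     for name in losses])
grads = tape.gradient(loss, trainer.trainable_variables)
for variable, grad in zip(trainer.trainable_variables, grads):
    if grad is None:
        print('No gradient for', variable.name)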
For further reference, the local_projection and global_projection variables are weights of the following layer, which is the output stage of each of the encoder models.
import keras
import tensorflow


@tensorflow.function
def dot_prod(vectors):
    (x, y) = vectors
    return tensorflow.tensordot(x, y, axes=1)


class GlobalAttentionPoolingHead(keras.layers.Layer):

    def __init__(self):
        """
        Creates the layer

        Returns
        -------
        None.

        """
        super(GlobalAttentionPoolingHead, self).__init__()
        self.global_projection = None
        self.local_projection = None

    def build(self, input_shape):
        """
        Initialises layer weights

        Parameters
        ----------
        input_shape : tuple
            Shape of the input layer

        Returns
        -------
        None.

        """
        width = input_shape[-1]
        self.global_projection = self.add_weight('global projection',
                                                 shape=(width, width),
                                                 trainable=True)
        self.local_projection = self.add_weight('local projection',
                                                shape=(width, width),
                                                trainable=True)
        self.built = True

    @tensorflow.function
    def project_local(self, X):
        return tensorflow.tensordot(X,
                                    self.local_projection,
                                    axes=1)

    def call(self, X, attention_mask=None, training=None):
        """
        Parameters
        ----------
        X : tensorflow.Tensor
            Base model vectors to apply pooling to.
        attention_mask : tensorflow.Tensor, optional
            Mask for pad values
        training : bool, optional
            Not used. The default is None.

        Returns
        -------
        tensorflow.Tensor
            The pooled value.

        """
        gp = tensorflow.linalg.l2_normalize(tensorflow.tensordot(tensorflow.reduce_sum(X,
                                                                                       axis=1),
                                                                 self.global_projection,
                                                                 axes=1),
                                            axis=1)
        lp = tensorflow.linalg.l2_normalize(tensorflow.vectorized_map(self.project_local,
                                                                      X),
                                            axis=2)
        attention = tensorflow.vectorized_map(dot_prod, (lp, gp))
        if attention_mask is None:
            attention_mask = tensorflow.ones_like(attention)
        return tensorflow.vectorized_map(dot_prod,
                                         (attention * attention_mask, X))
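In case the wiring matters, the encoder applies this head to the base model's sequence output roughly as follows. This is a simplified paraphrase of QaracEncoderModel, not the actual class; in particular, the use of last_hidden_state and the omission of the attention mask here are assumptions on my part:

import keras


class EncoderSketch(keras.Model):
    """Simplified stand-in for QaracEncoderModel, for illustration only."""

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model          # transformers.TFRobertaModel
        self.head = GlobalAttentionPoolingHead()

    def call(self, inputs, training=None):
        # Assumption: the head pools the per-token hidden states,
        # so the base model's pooler output is never consumed.
        hidden_states = self.base_model(inputs).last_hidden_state
        return self.head(hidden_states, training=training)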