"Gradients do not exist" warning when using a dictionary of losses

Peter_Bleackley · October 3, 2023, 9:47am

I’m trying to train three models together against four training objectives. To do so, I define a trainer model

import keras
import qarac.models.QaracEncoderModel
import qarac.models.QaracDecoderModel

class QaracTrainerModel(keras.Model):
    """
    Composite model that runs two encoders and a decoder in one forward pass,
    producing one output per training objective.
    """

    def __init__(self,base_encoder_model,base_decoder_model,tokenizer):
        """
        Builds the sub-models used by the trainer

        Parameters
        ----------
        base_encoder_model : transformers.TFRobertaModel
            Base model handed to both the question and the answer encoder.
        base_decoder_model : transformers.TFRobertaModel
            Base model handed to the decoder.
        tokenizer : transformers.RobertaTokenizer
            Tokenizer handed to the decoder.

        Returns
        -------
        None.

        """
        super().__init__()
        encoder_class = qarac.models.QaracEncoderModel.QaracEncoderModel
        decoder_class = qarac.models.QaracDecoderModel.QaracDecoderModel
        # Both encoders are constructed from the same base model object.
        self.question_encoder = encoder_class(base_encoder_model)
        self.answer_encoder = encoder_class(base_encoder_model)
        self.decoder = decoder_class(base_decoder_model,tokenizer)
        self.consistency = keras.layers.Dot(axes=1,normalize=True)

    def call(self,inputs,training=None):
        """
        Computes the output of every training objective for one batch

        Parameters
        ----------
        inputs : dict[str,tensorflow.Tensor]
            Fields are
            'all_text': tokenized text the answer encoder turns into vectors
                        and the decoder turns back into text
            'offset_text': same text as 'all_text', but preceded by <s>
            'question': tokenized questions for the question answering
                        objective
            'answer': tokenized answers for the question answering objective
            'proposition0': tokenized proposition for the reasoning objective
            'proposition1': tokenized proposition for the reasoning objective
            'conclusion_offset': tokenized conclusions for the reasoning
                                 objective, prefixed by '<s>'
            'statement0': tokenized statement for the consistency objective
            'statement1': tokenized statement for the consistency objective
        training : bool, optional
            Not used. The default is None.

        Returns
        -------
        results : dict[str,tensorflow.Tensor]
            Fields are
            'encode_decode': decoder output for the answer-encoder vectors of
                             'all_text'
            'question_answering': question-encoder vector of 'question' minus
                                  answer-encoder vector of 'answer'
            'reasoning': decoder output for the sum of the answer-encoder
                         vectors of 'proposition0' and 'proposition1'
            'consistency': cosine similarity of the answer-encoder vectors of
                           'statement0' and 'statement1'

        """
        encode = self.answer_encoder

        # Encode/decode objective.
        text_vectors = encode(inputs['all_text'])
        decoded_text = self.decoder((text_vectors,inputs['offset_text']))

        # Question answering objective.
        question_vectors = self.question_encoder(inputs['question'])
        answer_vectors = encode(inputs['answer'])
        qa_difference = question_vectors - answer_vectors

        # Reasoning objective.
        first_premise = encode(inputs['proposition0'])
        second_premise = encode(inputs['proposition1'])
        conclusion = self.decoder((first_premise + second_premise,
                                   inputs['conclusion_offset']))

        # Consistency objective.
        first_statement = encode(inputs['statement0'])
        second_statement = encode(inputs['statement1'])
        similarity = self.consistency((first_statement,second_statement))

        return {'encode_decode': decoded_text,
                'question_answering': qa_difference,
                'reasoning': conclusion,
                'consistency': similarity}

and compile and fit it with the following code

def train_models(path):
    """
    Builds the trainer model from pretrained 'roberta-base' weights and fits it

    Parameters
    ----------
    path : presumably a filesystem path (TODO confirm)
        Not used by the code shown here.

    Returns
    -------
    None.

    """
    import copy

    encoder_base = transformers.TFRobertaModel.from_pretrained('roberta-base')
    # Work on a copy of the config: the encoder holds a reference to this
    # same object, so setting is_decoder in place would also change the
    # configuration of the already-loaded encoder.
    config = copy.deepcopy(encoder_base.config)
    config.is_decoder = True
    decoder_base = transformers.TFRobertaModel.from_pretrained('roberta-base',
                                                               config=config)
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    trainer = qarac.models.QaracTrainerModel.QaracTrainerModel(encoder_base, 
                                                               decoder_base, 
                                                               tokenizer)
    # One loss per key of the dictionary returned by the trainer's call().
    losses={'encode_decode':keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            'question_answering':keras.losses.mean_squared_error,
            'reasoning':keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            'consistency':keras.losses.mean_squared_error}
    optimizer = keras.optimizers.Nadam(learning_rate=keras.optimizers.schedules.ExponentialDecay(1.0e-5, 100, 0.99))
    trainer.compile(optimizer=optimizer,
                    loss=losses)
    training_data = qarac.corpora.CombinedCorpus.CombinedCorpus(tokenizer,
                                                                all_text='corpora/all_text.csv',
                                                                question_answering='corpora/question_answering.csv',
                                                                reasoning='corpora/reasoning_train.csv',
                                                                consistency='corpora/consistency.csv')
    # The fit history is captured but not used by the code shown here.
    history = trainer.fit(training_data,
                          epochs=10)

However, I’m getting the following warning

WARNING:tensorflow:Gradients do not exist for variables ['tf_roberta_model/roberta/pooler/dense/kernel:0', 'tf_roberta_model/roberta/pooler/dense/bias:0', 'qarac_trainer_model/qarac_encoder_model/global_attention_pooling_head/local projection:0', 'qarac_trainer_model/qarac_encoder_model_1/global_attention_pooling_head_1/local projection:0', 'tf_roberta_model_1/roberta/pooler/dense/kernel:0', 'tf_roberta_model_1/roberta/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?
WARNING:tensorflow:Gradients do not exist for variables ['tf_roberta_model/roberta/pooler/dense/kernel:0', 'tf_roberta_model/roberta/pooler/dense/bias:0', 'qarac_trainer_model/qarac_encoder_model/global_attention_pooling_head/local projection:0', 'qarac_trainer_model/qarac_encoder_model_1/global_attention_pooling_head_1/local projection:0', 'tf_roberta_model_1/roberta/pooler/dense/kernel:0', 'tf_roberta_model_1/roberta/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss` argument?

which suggests to me that the model won’t be updated.

Can anyone suggest what I need to do to fix this?

For further reference, the local_projection and global_projection variables are weights of the following layer, which is the output stage of each of the encoder models.

@tensorflow.function
def dot_prod(vectors):
    """
    Contracts the last axis of one tensor with the first axis of another

    Parameters
    ----------
    vectors : tuple
        Pair (x, y) of tensors to contract.

    Returns
    -------
    tensorflow.Tensor
        Result of tensorflow.tensordot(x, y, axes=1).

    """
    left, right = vectors
    product = tensorflow.tensordot(left,right,axes=1)
    return product
    

class GlobalAttentionPoolingHead(keras.layers.Layer):
    """
    Pools a (batch, sequence, width) tensor into one (batch, width) vector
    per example.

    Each position is weighted by the cosine similarity between its local
    projection and a global projection of the summed sequence; the output is
    the weighted sum of the input vectors.
    """

    def __init__(self):
        """
        Creates the layer

        Returns
        -------
        None.

        """
        super().__init__()
        # Both weights are created in build(), once the input width is known.
        self.global_projection = None
        self.local_projection = None

    def build(self,input_shape):
        """
        Initialises layer weights

        Parameters
        ----------
        input_shape : tuple
            Shape of the input; only its last entry (the width) is used.

        Returns
        -------
        None.

        """
        width = input_shape[-1]
        # The name is passed by keyword so it cannot be mistaken for another
        # positional parameter of add_weight.
        self.global_projection = self.add_weight(name='global projection',
                                                 shape=(width,width),
                                                 trainable=True)
        self.local_projection = self.add_weight(name='local projection',
                                                shape=(width,width),
                                                trainable=True)
        self.built=True

    def project_local(self,X):
        """
        Applies the local projection to the last axis of X

        Parameters
        ----------
        X : tensorflow.Tensor
            Tensor whose last axis has the layer's width. Because
            tensordot with axes=1 only contracts the last axis of X, this
            works for a single (sequence, width) example as well as for a
            whole (batch, sequence, width) batch.

        Returns
        -------
        tensorflow.Tensor
            Tensor of the same shape as X.

        """
        return tensorflow.tensordot(X,
                                    self.local_projection,
                                    axes=1)

    def call(self,X,attention_mask=None,training=None):
        """
        Pools the sequence of vectors of each example into a single vector

        Parameters
        ----------
        X : tensorflow.Tensor
            Base model vectors to apply pooling to, (batch, sequence, width).
        attention_mask: tensorflow.Tensor, optional
            mask for pad values, multiplied elementwise with the
            (batch, sequence) attention weights. Defaults to all ones.
            NOTE(review): assumed to have the same dtype as X - confirm
            against the caller.
        training : bool, optional
            Not used. The default is None.

        Returns
        -------
        tensorflow.Tensor
            The pooled value, (batch, width).

        """
        # Batched tensordot/einsum replace the per-example
        # tensorflow.vectorized_map calls: the results are the same, and
        # local_projection is now read directly in this layer's own graph.
        # NOTE(review): expected to remove the "Gradients do not exist"
        # warning for 'local projection' - confirm by re-running training.
        summed = tensorflow.reduce_sum(X,axis=1)
        gp = tensorflow.linalg.l2_normalize(tensorflow.tensordot(summed,
                                                                 self.global_projection,
                                                                 axes=1),
                                            axis=1)
        lp = tensorflow.linalg.l2_normalize(self.project_local(X),
                                            axis=2)
        # Dot product of every unit-length local vector with its example's
        # unit-length global vector, i.e. a cosine similarity per position.
        attention = tensorflow.einsum('bsw,bw->bs',lp,gp)
        if attention_mask is None:
            attention_mask = tensorflow.ones_like(attention)
        # Attention-weighted sum of the input vectors over the sequence axis.
        return tensorflow.einsum('bs,bsw->bw',
                                 attention * attention_mask,
                                 X)

Laxma_Reddy_Patlolla · October 10, 2023, 6:25pm

Hi @Peter_Bleackley ,

As I understand it, the variables mentioned in the warning ('global_projection' and 'local_projection') are weights in the GlobalAttentionPoolingHead layer. Since these weights are not directly involved in the calculation of the loss, TensorFlow is warning that the gradients for these weights are not being computed.

This can happen if the variables are not connected to the forward pass in a way that affects the computation of the loss.

Please let me know if this helps you.

Thanks.

Peter_Bleackley · October 10, 2023, 9:44pm

It seems that Tensorflow/Keras isn’t really designed for this kind of composite model training, so I’ve decided to port the project to PyTorch - fortunately the HuggingFace base models I’m using are available in both Tensorflow and PyTorch versions, so it’s not a big problem to switch.