PyTorch code conversion into Keras

I used the Keras distillation tutorial. During training I get the following warning. These layers are related to the residual block and dense layers whose outputs I don't use in my model output:

WARNING:tensorflow:Gradients do not exist for variables ['ResBlock/conv2d_137/kernel:0', 'ResBlock/conv2d_137/bias:0', 'ResBlock/batch_normalization_137/gamma:0', 'ResBlock/batch_normalization_137/beta:0', 'ResBlock/conv2d_138/kernel:0', 'ResBlock/conv2d_138/bias:0', 'ResBlock/batch_normalization_138/gamma:0', 'ResBlock/batch_normalization_138/beta:0', 'ResBlock/batch_normalization_139/gamma:0', 'ResBlock/batch_normalization_139/beta:0', 'ResBlock/conv2d_139/kernel:0', 'ResBlock/conv2d_139/bias:0', 'ResBlock/conv2d_140/kernel:0', 'ResBlock/conv2d_140/bias:0', 'ResBlock/batch_normalization_140/gamma:0', 'ResBlock/batch_normalization_140/beta:0', 'ResBlock/conv2d_141/kernel:0', 'ResBlock/conv2d_141/bias:0', 'ResBlock/batch_normalization_141/gamma:0', 'ResBlock/batch_normalization_141/beta:0', 'ResBlock/conv2d_142/kernel:0', 'ResBlock/conv2d_142/bias:0', 'ResBlock/batch_normalization_142/gamma:0', 'ResBlock/batch_normalization_142/beta:0', 'ResBlock/conv2d_143/kernel:0', 'ResBlock/conv2d_143/bias:0', 'ResBlock/batch_normalization_143/gamma:0', 'ResBlock/batch_normalization_143/beta:0', 'dense_9/kernel:0', 'dense_9/bias:0', 'dense_10/kernel:0', 'dense_10/bias:0', 'dense_11/kernel:0', 'dense_11/bias:0'] when minimizing the loss.

import glob
import os

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import Model, layers
from tensorflow.keras.layers import (Conv2D, BatchNormalization, ReLU,
                                     MaxPooling2D, GlobalAveragePooling2D,
                                     Dense, Dropout)
from tensorflow.keras.optimizers import SGD

class ResBlock(Model):
    def __init__(self, channels, stride=1):
        super(ResBlock, self).__init__(name='ResBlock')
        self.flag = (stride != 1)
        self.conv1 = Conv2D(channels, 3, stride, padding='same')
        self.bn1 = BatchNormalization()
        self.conv2 = Conv2D(channels, 3, padding='same')
        self.bn2 = BatchNormalization()
        self.relu = ReLU()

        if self.flag:
            # 1x1 projection on the skip path when the spatial size changes
            self.bn3 = BatchNormalization()
            self.conv3 = Conv2D(channels, 1, stride)

    def call(self, x):
        x1 = self.conv1(x)
        x1 = self.bn1(x1)
        x1 = self.relu(x1)
        x1 = self.conv2(x1)
        x1 = self.bn2(x1)

        if self.flag:
            x = self.conv3(x)
            x = self.bn3(x)

        x1 = layers.add([x, x1])
        x1 = self.relu(x1)

        return x1

class ResNet34(Model):
    def __init__(self):
        super(ResNet34, self).__init__(name='ResNet34')
        self.conv1 = Conv2D(64, 7, 2, padding='same')
        self.bn = BatchNormalization()
        self.relu = ReLU()
        self.mp1 = MaxPooling2D(3, 2)

        self.conv2_1 = ResBlock(64)
        self.conv2_2 = ResBlock(64)
        self.conv2_3 = ResBlock(64)

        self.conv3_1 = ResBlock(128, 2)
        self.conv3_2 = ResBlock(128)
        self.conv3_3 = ResBlock(128)
        self.conv3_4 = ResBlock(128)

        self.conv4_1 = ResBlock(256, 2)
        self.conv4_2 = ResBlock(256)
        self.conv4_3 = ResBlock(256)
        self.conv4_4 = ResBlock(256)
        self.conv4_5 = ResBlock(256)
        self.conv4_6 = ResBlock(256)

        self.conv5_1 = ResBlock(512, 2)
        self.conv5_2 = ResBlock(512)
        self.conv5_3 = ResBlock(512)

        self.pool = GlobalAveragePooling2D()
        self.fc1 = Dense(512, activation='relu')
        self.dp1 = Dropout(0.5)
        self.fc2 = Dense(512, activation='relu')
        self.dp2 = Dropout(0.5)
        self.fc3 = Dense(64)

    def call(self, x):
        x = self.conv1(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.mp1(x)

        x = self.conv2_1(x)
        x = self.conv2_2(x)
        output_1 = self.conv2_3(x)

        x = self.conv3_1(output_1)
        x = self.conv3_2(x)
        x = self.conv3_3(x)
        output_2 = self.conv3_4(x)

        x = self.conv4_1(output_2)
        x = self.conv4_2(x)
        x = self.conv4_3(x)
        x = self.conv4_4(x)
        x = self.conv4_5(x)
        output_3 = self.conv4_6(x)

        x = self.conv5_1(output_3)
        x = self.conv5_2(x)
        x = self.conv5_3(x)

        x = self.pool(x)
        x = self.fc1(x)
        x = self.dp1(x)
        x = self.fc2(x)
        x = self.dp2(x)
        x = self.fc3(x)

        # x flows through the conv5_x blocks and the fc head, but only the
        # three intermediate feature maps are returned, so those layers never
        # contribute to any loss.
        return output_1, output_2, output_3

class Distiller(Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(self, optimizer):
        super(Distiller, self).compile(optimizer=optimizer)

    def Feature_Loss(self, ft_list, fs_list):
        tot_loss = 0

        for i in range(len(ft_list)):
            fs = fs_list[i]
            ft = ft_list[i]
            # Note: this unpacking assumes channels-first (NCHW) as in PyTorch;
            # Keras tensors are channels-last (NHWC) by default, so h and w
            # here actually pick up the width and channel dimensions.
            _, _, h, w = fs.shape
            fs_norm = K.l2_normalize(fs, axis=1)
            ft_norm = K.l2_normalize(ft, axis=1)
            f_loss = (0.5/(w*h))*K.sum(K.square(fs_norm - ft_norm))
            tot_loss += f_loss

        return tot_loss

    def train_step(self, x):
        # Forward pass of teacher (frozen, no gradients recorded)
        Feature_t = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            Feature_s = self.student(x, training=True)

            # Compute losses
            loss = self.Feature_Loss(Feature_t, Feature_s)

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": loss})

        return results

#--------------------------------- Data ---------------------------------------
img_shape = 128
batch_size = 8
num_channel = 3
path = r'C:\Users\accdan\Desktop\Dataset_Wafer\10x\L1'
momentum = 0.9
l_rate = 0.4
decay = 0.0001
epochs = 1
SEED = 3222
AUTO = tf.data.experimental.AUTOTUNE
#------------------------------------------------------------------------------

#---------------------------- Dataset from folder -----------------------------
file_list = glob.glob(os.path.join(path, '*.jpg'))  # join so the pattern matches inside the folder
dataset = tf.data.Dataset.from_tensor_slices(file_list)
dataset = dataset.shuffle(10000, seed = SEED)
# parse_image: user-defined decode/preprocess helper (not shown here)
dataset = dataset.map(lambda x: parse_image(x, img_shape)).batch(batch_size).prefetch(10)
#dataset = dataset.map(lambda x : (x, x))
#------------------------------------------------------------------------------

#------------------------------ Model definition ------------------------------
model_t = ResNet34()
model_t.build(input_shape = (batch_size, img_shape, img_shape, num_channel))

model_s = ResNet34()
model_s.build(input_shape = (batch_size, img_shape, img_shape, num_channel))
#------------------------------------------------------------------------------

#------------------------------ Optimizer -------------------------------------
optimizer = SGD(learning_rate = l_rate, momentum = momentum, decay = decay, nesterov = True)
#------------------------------------------------------------------------------

#------------------------------ Distiller -------------------------------------
distiller = Distiller(model_s, model_t)
distiller.compile(optimizer = optimizer)
#------------------------------------------------------------------------------

#------------------------------ Fit Model -------------------------------------
distiller.fit(dataset,
epochs = epochs)
#------------------------------------------------------------------------------
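For context, the warning is raised because tape.gradient returns None for every variable that does not lie on the path from the input to the loss; here those are exactly the student's conv5_x blocks and the dense head, whose outputs are computed in call() but never returned. A minimal sketch (my addition, not from the tutorial) of silencing it without touching the model, by dropping the None gradients before the update:

    # In train_step, after computing the gradients:
    gradients = tape.gradient(loss, trainable_vars)
    # Keep only variables that actually received a gradient; this avoids the
    # "Gradients do not exist" warning for layers unused by the loss.
    grads_and_vars = [(g, v) for g, v in zip(gradients, trainable_vars)
                      if g is not None]
    self.optimizer.apply_gradients(grads_and_vars)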

I suppose the warning is OK if it is coming from the teacher network, which is frozen (no gradient computation there) but still involved in your loss computation.

I don't think the warning is coming from the teacher network, because if I make a small modification to the model, adding the output of the last fc layer and modifying the loss, I do not have the warning:

    return x, [output_1, output_2, output_3]

class Distiller(Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    # compile() and Feature_Loss() are unchanged from the version above.

    def train_step(self, x):
        # Forward pass of teacher
        out_t, Feature_t = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            out_s, Feature_s = self.student(x, training=True)

            # Compute losses. The (out_t - out_s)*0 term adds nothing
            # numerically, but it puts out_s (and therefore every student
            # layer) on the tape, so their gradients become zeros instead
            # of None and the warning disappears.
            loss = self.Feature_Loss(Feature_t, Feature_s) + (out_t - out_s)*0

Is it working with this last version?

No relevant improvement. I followed the Keras distillation tutorial (ResNet18_trained.py - Google Drive), but the results between PyTorch and Keras are still very different.




Do you have a Google Colab to share?

No, I work on a workstation. I can't share the raw images, but I have the same problem on the MVTec dataset, a standard dataset for anomaly detection (MVTec Anomaly Detection Dataset - MVTec AD: MVTec Software).

If you can share a ready-to-run Colab with your model and that dataset, it could help.

Colab file: Google Colab
ImageNet weights: ResNet18_PreTrained.h5 - Google Drive

Can you simplify the code a little? I've not verified your ResNet18 definition, but to minimize your code surface you could start from a pretrained network, e.g. ResNet50V2.

Then you can compose your student and teacher models with the intermediate features/outputs, for example as in the sketch below.
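A minimal sketch of that composition; the endpoint layer names are illustrative assumptions, so check model.summary() for the real names in your Keras version:

    import tensorflow as tf

    # Pretrained backbone; 128x128 RGB inputs as in the script above.
    base = tf.keras.applications.ResNet50V2(include_top=False,
                                            weights='imagenet',
                                            input_shape=(128, 128, 3))

    # Hypothetical intermediate endpoints, one per residual stage.
    feature_names = ['conv2_block3_out', 'conv3_block4_out', 'conv4_block6_out']
    teacher = tf.keras.Model(
        inputs=base.input,
        outputs=[base.get_layer(name).output for name in feature_names])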

Let me know when you have minimized your standalone Colab.

I'll try. The ResNet18 should be OK: I checked the weights against the PyTorch counterpart and the predictions are equal. I reproduced the PyTorch ResNet18 structure and copied its weights into the TF net. I'd rather use ResNet18, to reproduce the paper and the PyTorch implementation exactly.

I modified the Colab code following your example, but I used this ResNet18 implementation: GitHub - qubvel/classification_models: Classification models trained on ImageNet. Keras. Its usage and syntax are the same as the Keras pre-trained models.

If the pretrained PyTorch and TF ResNet18 have exactly the same weights and outputs, and your hyperparameters and preprocessing are the same, do you get a very similar loss progression when you train the student in PyTorch and TF?

Good point. I already checked this and the loss progression isn't the same, but the image normalization isn't the same either. I just tried the last code version implemented with your suggestion; I still have poor results.

If you have verified the ResNet18 between TF and PyTorch, then with the same hyperparameters and the same data you should see a similar loss progression when you start to train the student network.

Isn't the loss value influenced by image normalization? The PyTorch pre-processing is the following:

mean_train = [0.485, 0.456, 0.406]
std_train = [0.229, 0.224, 0.225]
self.data_transforms = transforms.Compose([
    transforms.Resize((args.load_size, args.load_size), Image.ANTIALIAS),
    transforms.ToTensor(),
    transforms.CenterCrop(args.input_size),
    transforms.Normalize(mean=mean_train, std=std_train)])
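For reference, a possible TF counterpart of these transforms (assumptions: load_size/input_size mirror the PyTorch args, and this could play the role of the parse_image helper used in the pipeline above):

    def parse_image(path, load_size=256, input_size=224):
        img = tf.io.read_file(path)
        img = tf.image.decode_jpeg(img, channels=3)
        # Like transforms.Resize(..., Image.ANTIALIAS)
        img = tf.image.resize(img, (load_size, load_size), antialias=True)
        img = tf.cast(img, tf.float32) / 255.0  # like ToTensor(): scale to [0, 1]
        # Like transforms.CenterCrop(input_size)
        off = (load_size - input_size) // 2
        img = tf.image.crop_to_bounding_box(img, off, off, input_size, input_size)
        # Like transforms.Normalize(mean, std), per channel
        mean = tf.constant([0.485, 0.456, 0.406])
        std = tf.constant([0.229, 0.224, 0.225])
        return (img - mean) / std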

Yes, you need to reproduce the same preprocessing steps and the same preprocessing- and network-related hyperparameters. There could still be some differences related to randomness/seeds, but the loss at every train step, with exactly the same input data, needs to be quite similar.
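For the randomness/seed part, a minimal sketch (assuming TF 2.7+, where this utility exists) that seeds Python, NumPy and TensorFlow in one call:

    import tensorflow as tf

    tf.keras.utils.set_random_seed(SEED)  # SEED = 3222 in the script above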
