PyTorch code conversion into Keras

I used the Keras distillation tutorial. During training I get the following warning. These layers are related to the residual blocks that I don't use in my output:

WARNING:tensorflow:Gradients do not exist for variables ['ResBlock/conv2d_137/kernel:0', 'ResBlock/conv2d_137/bias:0', 'ResBlock/batch_normalization_137/gamma:0', 'ResBlock/batch_normalization_137/beta:0', 'ResBlock/conv2d_138/kernel:0', 'ResBlock/conv2d_138/bias:0', 'ResBlock/batch_normalization_138/gamma:0', 'ResBlock/batch_normalization_138/beta:0', 'ResBlock/batch_normalization_139/gamma:0', 'ResBlock/batch_normalization_139/beta:0', 'ResBlock/conv2d_139/kernel:0', 'ResBlock/conv2d_139/bias:0', 'ResBlock/conv2d_140/kernel:0', 'ResBlock/conv2d_140/bias:0', 'ResBlock/batch_normalization_140/gamma:0', 'ResBlock/batch_normalization_140/beta:0', 'ResBlock/conv2d_141/kernel:0', 'ResBlock/conv2d_141/bias:0', 'ResBlock/batch_normalization_141/gamma:0', 'ResBlock/batch_normalization_141/beta:0', 'ResBlock/conv2d_142/kernel:0', 'ResBlock/conv2d_142/bias:0', 'ResBlock/batch_normalization_142/gamma:0', 'ResBlock/batch_normalization_142/beta:0', 'ResBlock/conv2d_143/kernel:0', 'ResBlock/conv2d_143/bias:0', 'ResBlock/batch_normalization_143/gamma:0', 'ResBlock/batch_normalization_143/beta:0', 'dense_9/kernel:0', 'dense_9/bias:0', 'dense_10/kernel:0', 'dense_10/bias:0', 'dense_11/kernel:0', 'dense_11/bias:0'] when minimizing the loss.

import glob
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.layers import (BatchNormalization, Conv2D, Dense, Dropout,
                                     GlobalAveragePooling2D, MaxPooling2D, ReLU)
from tensorflow.keras.optimizers import SGD

class ResBlock(Model):
    def __init__(self, channels, stride = 1):
        super(ResBlock, self).__init__(name='ResBlock')
        self.flag = (stride != 1)
        self.conv1 = Conv2D(channels, 3, stride, padding='same')
        self.bn1 = BatchNormalization()
        self.conv2 = Conv2D(channels, 3, padding='same')
        self.bn2 = BatchNormalization()
        self.relu = ReLU()

        # 1x1 projection on the shortcut when the spatial size changes
        if self.flag:
            self.bn3 = BatchNormalization()
            self.conv3 = Conv2D(channels, 1, stride)

    def call(self, x):
        x1 = self.conv1(x)
        x1 = self.bn1(x1)
        x1 = self.relu(x1)
        x1 = self.conv2(x1)
        x1 = self.bn2(x1)

        if self.flag:
            x = self.conv3(x)
            x = self.bn3(x)

        x1 = layers.add([x, x1])
        x1 = self.relu(x1)

        return x1

class ResNet34(Model):
    def __init__(self):
        super(ResNet34, self).__init__(name = 'ResNet34')
        self.conv1 = Conv2D(64, 7, 2, padding = 'same')
        self.bn = BatchNormalization()
        self.relu = ReLU()
        self.mp1 = MaxPooling2D(3, 2)

        self.conv2_1 = ResBlock(64)
        self.conv2_2 = ResBlock(64)
        self.conv2_3 = ResBlock(64)

        self.conv3_1 = ResBlock(128, 2)
        self.conv3_2 = ResBlock(128)
        self.conv3_3 = ResBlock(128)
        self.conv3_4 = ResBlock(128)

        self.conv4_1 = ResBlock(256, 2)
        self.conv4_2 = ResBlock(256)
        self.conv4_3 = ResBlock(256)
        self.conv4_4 = ResBlock(256)
        self.conv4_5 = ResBlock(256)
        self.conv4_6 = ResBlock(256)

        self.conv5_1 = ResBlock(512, 2)
        self.conv5_2 = ResBlock(512)
        self.conv5_3 = ResBlock(512)

        self.pool = GlobalAveragePooling2D()
        self.fc1 = Dense(512, activation = 'relu')
        self.dp1 = Dropout(0.5)
        self.fc2 = Dense(512, activation = 'relu')
        self.dp2 = Dropout(0.5)
        self.fc3 = Dense(64)

    def call(self, x):
        x = self.conv1(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.mp1(x)

        x = self.conv2_1(x)
        x = self.conv2_2(x)
        output_1 = self.conv2_3(x)

        x = self.conv3_1(output_1)
        x = self.conv3_2(x)
        x = self.conv3_3(x)
        output_2 = self.conv3_4(x)

        x = self.conv4_1(output_2)
        x = self.conv4_2(x)
        x = self.conv4_3(x)
        x = self.conv4_4(x)
        x = self.conv4_5(x)
        output_3 = self.conv4_6(x)

        x = self.conv5_1(output_3)
        x = self.conv5_2(x)
        x = self.conv5_3(x)

        x = self.pool(x)
        x = self.fc1(x)
        x = self.dp1(x)
        x = self.fc2(x)
        x = self.dp2(x)
        x = self.fc3(x)

        # Note: x (the conv5_* blocks and fc1-fc3) is computed but not
        # returned, so those layers never reach the loss; these are exactly
        # the variables listed in the gradient warning.
        return output_1, output_2, output_3

class Distiller(Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(self, optimizer):
        super(Distiller, self).compile(optimizer = optimizer)

    def Feature_Loss(self, ft_list, fs_list):
        tot_loss = 0

        for i in range(len(ft_list)):
            fs = fs_list[i]
            ft = ft_list[i]
            # Careful: this unpacking assumes channels-first (N, C, H, W)
            # tensors as in PyTorch; Keras feature maps are channels-last
            # (N, H, W, C) by default, so h and w here actually pick up the
            # width and channel dimensions.
            _, _, h, w = fs.shape
            fs_norm = K.l2_normalize(fs, axis = 1)
            ft_norm = K.l2_normalize(ft, axis = 1)
            f_loss = (0.5/(w*h))*K.sum(K.square(fs_norm - ft_norm))
            tot_loss += f_loss

        return tot_loss
    

    def train_step(self, x):
        # Forward pass of teacher (frozen, so no gradients are recorded)
        Feature_t = self.teacher(x, training = False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            Feature_s = self.student(x, training = True)

            # Compute losses
            loss = self.Feature_Loss(Feature_t, Feature_s)

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": loss})

        return results
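For reference, the quantity Feature_Loss computes, transcribed into math form, is

$$\mathcal{L} \;=\; \sum_i \frac{1}{2\,h_i w_i}\,\bigl\lVert \hat f^{(i)}_s - \hat f^{(i)}_t \bigr\rVert_F^2,$$

where $\hat f$ is the feature map L2-normalized along axis 1, the norm runs over all tensor entries (batch included), and $h_i$, $w_i$ are whatever the shape unpacking in the code yields.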

#--------------------------------- Data ---------------------------------------
img_shape = 128
batch_size = 8
num_channel = 3
path = r'C:\Users\accdan\Desktop\Dataset_Wafer\10x\L1'
momentum = 0.9
l_rate = 0.4
decay = 0.0001
epochs = 1
SEED = 3222
AUTO = tf.data.experimental.AUTOTUNE
#------------------------------------------------------------------------------
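
# parse_image is not defined in this post; a hypothetical version for this
# pipeline (decode, resize and scaling details are assumptions) could be:
def parse_image(filename, img_shape):
    img = tf.io.read_file(filename)                  # read the JPEG bytes
    img = tf.image.decode_jpeg(img, channels=3)      # decode to uint8, HWC
    img = tf.image.resize(img, [img_shape, img_shape])
    img = img / 255.0                                # assumed [0, 1] scaling
    return img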

#---------------------------- Dataset from folder ------------------------------
file_list = glob.glob(path + '\\*.jpg')
dataset = tf.data.Dataset.from_tensor_slices(file_list)
dataset = dataset.shuffle(10000, seed = SEED)
dataset = dataset.map(lambda x: parse_image(x, img_shape)).batch(batch_size).prefetch(10)
#dataset = dataset.map(lambda x : (x, x))
#------------------------------------------------------------------------------

#------------------------------ Model definition ------------------------------
model_t = ResNet34()
model_t.build(input_shape = (batch_size, img_shape, img_shape, num_channel))

model_s = ResNet34()
model_s.build(input_shape = (batch_size, img_shape, img_shape, num_channel))
#------------------------------------------------------------------------------

#------------------------------ Optimizer -------------------------------------
optimizer = SGD(lr = l_rate, momentum = momentum, decay = decay, nesterov = True)
#------------------------------------------------------------------------------

#------------------------------ Distiller -------------------------------------
distiller = Distiller(model_s, model_t)
distiller.compile(optimizer = optimizer)
#------------------------------------------------------------------------------

#------------------------------ Fit Model -------------------------------------
distiller.fit(dataset, epochs = epochs)
#------------------------------------------------------------------------------

I suppose the warning is OK if it is coming from the teacher network, which is frozen (no gradient computation there) but still involved in your loss computation.
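A quick way to verify where the warning comes from is to list, inside train_step, the variables whose gradient is None. A minimal sketch, using the gradients and trainable_vars names from the code above:

# tape.gradient returns None for variables not connected to the loss;
# those are exactly the variables the warning reports.
unconnected = [v.name for g, v in zip(gradients, trainable_vars) if g is None]
print('Variables with no gradient:', unconnected)

Since trainable_vars is self.student.trainable_variables, any names printed here belong to the student, not the frozen teacher. Applying only the non-None pairs, i.e. apply_gradients([(g, v) for g, v in zip(gradients, trainable_vars) if g is not None]), would silence the warning without changing the update.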

I don't think the warning is coming from the teacher network, because if I make a small modification to the model, adding the output of the last fc layer and modifying the loss accordingly, I do not get the warning:

    return x, [output_1, output_2, output_3]

class Distiller(Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(self, optimizer):
        super(Distiller, self).compile(optimizer = optimizer)

    def Feature_Loss(self, ft_list, fs_list):
        tot_loss = 0

        for i in range(len(ft_list)):
            fs = fs_list[i]
            ft = ft_list[i]
            _, _, h, w = fs.shape
            fs_norm = K.l2_normalize(fs, axis = 1)
            ft_norm = K.l2_normalize(ft, axis = 1)
            f_loss = (0.5/(w*h))*K.sum(K.square(fs_norm - ft_norm))
            tot_loss += f_loss

        return tot_loss

    def train_step(self, x):
        # Forward pass of teacher
        out_t, Feature_t = self.teacher(x, training = False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            out_s, Feature_s = self.student(x, training = True)

            # Compute losses; the zero-multiplied term only serves to
            # connect the fc outputs to the loss graph so that their
            # gradients exist
            loss = self.Feature_Loss(Feature_t, Feature_s) + (out_t - out_s)*0

Is it working with this last version?

No relevant improvement. I followed the Keras distillation tutorial (ResNet18_trained.py - Google Drive), but the results between PyTorch and Keras are still very different.




Do you have a Google Colab to share?

No, I work on a workstation. I can't share the raw images, but I have the same problem on the MVTec dataset (a standard dataset for anomaly detection: MVTec Anomaly Detection Dataset, MVTec Software).

If you can share a ready-to-run Colab with your model and that dataset, it could help.

Colab file: Google Colab
Weight ImageNet: ResNet18_PreTrained.h5 - Google Drive

Can you simplify the code a little bit? I haven't verified your ResNet18 definition, but to minimize your code surface you could start from a pretrained network, e.g. ResNet50V2.

Then you can compose your student and teacher models with the intermediate features/outputs, for example as sketched below:
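A minimal sketch of that composition, assuming a Keras application as the base (the layer names below are placeholders; pick real ones from base.summary()):

import tensorflow as tf

base = tf.keras.applications.ResNet50V2(include_top=False, weights='imagenet')

# Expose intermediate feature maps as extra outputs; the layer names are
# illustrative and should be taken from base.summary().
feature_layers = ['conv2_block3_out', 'conv3_block4_out', 'conv4_block6_out']
teacher = tf.keras.Model(
    inputs=base.input,
    outputs=[base.get_layer(name).output for name in feature_layers])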

Let me know when you have minimized your standalone Colab.

I'll try. The ResNet18 should be OK: I checked the weights against the PyTorch counterpart and the predictions are equal. I reproduced the PyTorch ResNet18 structure and copied its weights into the TF net. I'd rather use ResNet18 to reproduce the paper and the PyTorch implementation exactly.

I modified the Colab code following your example, but I used this ResNet18 implementation: GitHub - qubvel/classification_models: Classification models trained on ImageNet. Keras. Its usage and syntax are the same as the Keras pre-trained models.

If the pretrained PyTorch and TF ResNet18 have exactly the same weights and outputs, and your hyperparameters and preprocessing are the same, do you get a very similar loss progression when you train the student in PyTorch and TF?

Good point. I already checked this and the loss progression isn't the same, but the image normalization is not the same either. I just tried the latest code version implemented with your suggestion, and I still have poor results.

If you have verified the ResNet18 between TF and PyTorch, then with the same hyperparameters and on the same data you should see a similar loss progression when you start to train the student network.

Isn't the loss value influenced by image normalization? The PyTorch pre-processing is the following:

mean_train = [0.485, 0.456, 0.406]
std_train = [0.229, 0.224, 0.225]
self.data_transforms = transforms.Compose([
    transforms.Resize((args.load_size, args.load_size), Image.ANTIALIAS),
    transforms.ToTensor(),
    transforms.CenterCrop(args.input_size),
    transforms.Normalize(mean=mean_train, std=std_train)])

Yes, you need to reproduce the same preprocessing steps and the same preprocessing- and network-related hyperparameters. There could still be some differences related to randomness/seeds, but the loss at every train step, with exactly the same input data, should be quite similar.
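For instance, a sketch of the matching preprocessing on the TF side (same load/crop sizes and ImageNet statistics as the torchvision snippet above; note that PIL and tf.image.resize interpolate slightly differently, so tiny numerical differences remain):

import tensorflow as tf

mean_train = tf.constant([0.485, 0.456, 0.406])
std_train = tf.constant([0.229, 0.224, 0.225])

def preprocess(img, load_size, input_size):
    # Mirror the torchvision pipeline: Resize -> ToTensor -> CenterCrop -> Normalize
    img = tf.image.resize(img, [load_size, load_size], antialias=True)
    img = img / 255.0
    offset = (load_size - input_size) // 2
    img = tf.image.crop_to_bounding_box(img, offset, offset,
                                        input_size, input_size)
    img = (img - mean_train) / std_train
    return img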
