Hey there, I am trying to implement a WGAN model with gradient penalty based on this paper . I have managed to convert the python code to tfjs and got the training to work.
The issue I am facing is that both the discriminator and generator loss increases exponentially and are absurd.
Also I have tested the same architecture in python with every hyperparameter being the same and everything is fine.
Here is my code
Generator Model :
const generator = ({ input_dimension, output_dimension, feature_size, weight_initializers }) => {
const model = tf.sequential();
model.add(tf.layers.conv1d({
filters: 32,
kernelSize: 2,
strides: 1,
padding: 'same',
kernelInitializer: weight_initializers,
batchInputShape: [null, input_dimension, feature_size]
}))
model.add(tf.layers.leakyReLU({ alpha: 0.1 }))
model.add(tf.layers.bidirectional({
layer: tf.layers.lstm({
units: 64,
activation: 'relu',
kernelInitializer: weight_initializers,
returnSequences: false,
dropout: 0.3,
recurrentDropout: 0.0
})
}))
model.add(tf.layers.dense({ units: 64, activation: 'linear' }))
model.add(tf.layers.leakyReLU({ alpha: 0.1 }))
model.add(tf.layers.dropout({ rate: 0.2 }))
model.add(tf.layers.dense({ units: 32, activation: 'linear' }))
model.add(tf.layers.leakyReLU({ alpha: 0.1 }))
model.add(tf.layers.dropout({ rate: 0.2 }))
model.add(tf.layers.dense({ units: output_dimension }))
return model
}
Discriminator Model :
const discriminator = ({ timeStep, lookAhead, weight_initializers }) => {
const model = tf.sequential();
model.add(tf.layers.conv1d({
filters: 32,
kernelSize: 2,
strides: 1,
padding: 'same',
kernelInitializer: weight_initializers,
inputShape: [timeStep + lookAhead, 1]
}))
model.add(tf.layers.leakyReLU({ alpha: 0.1 }))
model.add(tf.layers.conv1d({
filters: 64,
kernelSize: 2,
strides: 1,
padding: 'same',
kernelInitializer: weight_initializers,
}))
model.add(tf.layers.leakyReLU({ alpha: 0.1 }))
model.add(tf.layers.flatten())
model.add(tf.layers.dense({ units: 64, activation: 'linear', useBias: true }))
model.add(tf.layers.leakyReLU({ alpha: 0.1 }))
model.add(tf.layers.dropout({ rate: 0.2 }))
model.add(tf.layers.dense({ units: 32, activation: 'linear', useBias: true }))
model.add(tf.layers.leakyReLU({ alpha: 0.1 }))
model.add(tf.layers.dropout({ rate: 0.2 }))
model.add(tf.layers.dense({ units: 1, activation: 'linear' }))
return model
}
Main train function :
// time_step = 14
// look_ahead = 5
// feature_size = 5
async function train(XTrain, yTrain, pastY, epochs, time_step, look_ahead, feature_size, batchSize) {
// Define the optimizer for both discriminator and generator
const dOptimizer = tf.train.adam(0.0004, 0.5, 0.9)
const gOptimizer = tf.train.adam(0.0001, 0.5, 0.9);
const weight_initializers = tf.initializers.randomNormal({ mean: 0.0, stddev: 0.02 });
const generator_ = generator({
input_dimension: time_step,
output_dimension: look_ahead,
feature_size,
weight_initializers
})
const discriminator_ = discriminator({
timeStep: time_step,
lookAhead: look_ahead,
weight_initializers
})
const trainHist = {
losses: [],
D_losses: [],
G_losses: [],
per_epoch_times: [],
total_ptime: []
};
let Real_price
let Generated_price
let preds = []
const data = [tf.tensor(XTrain), tf.tensor(yTrain), tf.tensor(pastY)];
for (let epoch = 0; epoch < epochs; epoch++) {
log.error(`Epoch ${epoch + 1} of ${epochs}`);
const {
yTrainTensor,
generatorData,
discriminatorLoss,
generatorLoss
} = await trainStep(data, time_step, look_ahead, dOptimizer, gOptimizer, generator_, discriminator_);
}
}
Here is my trainStep function
const generateNoise = (generator_, pastYTrainTensor, xTrain_data) => {
const generator_data = generator_.apply(xTrain_data, { training: true });
const generator_data_reshape = generator_data.reshape([generator_data.shape[0], generator_data.shape[1], 1]);
return tf.cast(pastYTrainTensor.concat(generator_data_reshape, 1), 'float32');
}
// Calculate gradient penalty
const gradientPenalty = (batchSize, time_step, look_ahead, discriminator_, fakeData, realData) => tf.tidy(() => {
const alpha = tf.randomNormal([batchSize, time_step + look_ahead, 1], 0.0, 1.0, 'float32');
const diff = fakeData.sub(realData);
const interpolated = realData.add(alpha.mul(diff));
const gradientsFn = tf.grad(x => discriminator_.apply(x, { training: true }));
const grad_fo_calc = tf.tensor(gradientsFn(interpolated).arraySync())
const gradientsNorm = grad_fo_calc.square().sum([1, 2]).sqrt();
const gp = gradientsNorm.sub(tf.scalar(1)).square().mean();
return tf.cast(gp, 'float32')
})
async function trainStep(data, time_step, look_ahead, dOptimizer, gOptimizer, generator_, discriminator_) {
const [xTrainTensor, yTrainTensor, pastYTrainTensor] = data;
// xTrainTensorshape (82, 14, 1)
// yTrainTensors shape (82, 5)
// pastYTrainTensor shape (82, 14, 1)
const batchSize = xTrainTensor.shape[0];
let dLossValue = 0;
let gLossValue = 0;
let generatorData;
const LAMBDA = tf.tensor(10.0, [1], 'float32') // Gradient penalty lambda hyperparameter
const lambda1 = 0.5; // Extra loss term for speeding up training
const lambda2 = 0.5; // Extra loss term for speeding up training
try {
// Process real data
const realYReshape = yTrainTensor.reshape([yTrainTensor.shape[0], yTrainTensor.shape[1], 1]);
const realOutput = tf.cast(pastYTrainTensor, 'float32').concat(tf.cast(realYReshape, 'float32'), 1);
// Train the discriminator
for (let i = 0; i < 5; i++) {
// Calculate discriminator loss, compute gradients of the loss with respect to discriminator's inputs
const { value: d_value, grads: d_grads } = dOptimizer.computeGradients(() => tf.tidy(() => {
// Generate fake data
const generatorData = generateNoise(generator_, pastYTrainTensor, xTrainTensor)
// Get predictions from discriminator
const DReal = discriminator_.apply(realOutput, { training: true }) // shape [batchSize, 1]
const DFake = discriminator_.apply(generatorData, { training: true }) // shape [batchSize, 1]
// console.log('D : ', DReal.arraySync()[0][0], DFake.arraySync()[0][0])
// Wasserstein Loss - If this value is 0 that means
// both the distributions are same and discriminator is
// guessing 50% of the time
const dCost = (tf.cast(DReal, 'float32').mean().sub(tf.cast(DFake, 'float32').mean())).mul(-1);
// Calculate gradient penalty
const gp = gradientPenalty(batchSize, time_step, look_ahead, discriminator_, generatorData, realOutput)
return DFake.mean().sub(DReal.mean()).add(gp.mul(LAMBDA)).asScalar();
}), discriminator_.getWeights());
dLossValue = d_value.dataSync();
dOptimizer.applyGradients(d_grads)
}
console.log('<----------------------------------------->')
// Train the generator only once
// Compute gradients of the loss with respect to generator's inputs
const { value: g_value, grads: g_grads } = gOptimizer.computeGradients(() => tf.tidy(() => {
// Generate fake output
generatorData = generateNoise(generator_, pastYTrainTensor, xTrainTensor)
tf.keep(generatorData)
const realYReshape = yTrainTensor.reshape([yTrainTensor.shape[0], yTrainTensor.shape[1], 1]);
const realOutput = tf.cast(pastYTrainTensor, 'float32').concat(tf.cast(realYReshape, 'float32'), 1);
// Get the discriminator logits for fake data
const GGenerated = discriminator_.apply(generatorData, { training: true });
// Calculate the generator loss
const g_mean = GGenerated.mean().mul(-1)
const gMse = tf.losses.meanSquaredError(realOutput, generatorData)
const gSign = tf.abs(tf.sign(realOutput).sub(tf.sign(generatorData))).mean();
const gLoss = g_mean.add(gMse.mul(lambda1)).add(gSign.mul(lambda2));
return gLoss;
}), generator_.getWeights());
gLossValue = g_value.dataSync();
gOptimizer.applyGradients(g_grads);
}
catch (e) {
tf.dispose()
console.log('Error in training discriminator')
console.log(e.stack)
}
return { yTrainTensor, generatorData, discriminatorLoss: dLossValue, generatorLoss: gLossValue };
}
Maybe something might have been lost in translation. I have been debugging this for days and I am at a full stop.
Any and all help and insight is rellay appreciated.