I am trying to generate independent random variables on different GPUs with TensorFlow distributed code.
I use the split method of tf.random.Generator to create n subgenerators for my n GPUs; then I'd like to run the distributed code with each GPU using its own subgenerator.
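As far as I can tell, the splitting itself works fine outside of any distribution strategy; here is a minimal sketch (the variable names are just illustrative):

import tensorflow as tf

gen = tf.random.Generator.from_seed(1)
sub_gens = gen.split(2)           # two independent subgenerators
print(sub_gens[0].normal([3]))    # each subgenerator has its own state
print(sub_gens[1].normal([3]))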
It is a task commonly achieved in MPI-based software, but I could not find any way to achieve it with TF.
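For example, with MPI (an mpi4py sketch here, just to illustrate the idea) each process would derive its own independent stream from its rank:

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
# every rank derives an independent stream from the same base seed
seeds = np.random.SeedSequence(1).spawn(comm.Get_size())
rng = np.random.default_rng(seeds[comm.Get_rank()])
x = rng.normal(size=10)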
Here is an example of non-working code that tries to distribute the work on 2 GPUs.
import numpy as np
import tensorflow as tf
import time
import sys, os
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--n_gpus', type=int, default=1)
args = parser.parse_args()
n_gpus = args.n_gpus
device_type = "GPU"
devices = tf.config.experimental.list_physical_devices(device_type)
devices_names = [d.name.split('e:')[1] for d in devices]
strategy = tf.distribute.MirroredStrategy(devices=devices_names[:n_gpus])
with strategy.scope():
    optimizerControl = tf.keras.optimizers.Adam(learning_rate=1e-3)
    modelControl = tf.keras.Sequential([tf.keras.layers.Dense(8, activation=tf.nn.relu),
                                        tf.keras.layers.Dense(1)])

@tf.function
def cal(locGen, nbSimul, modelControl):
    # simulate with the replica's own subgenerator and compute the loss
    x = locGen.normal([nbSimul])
    return tf.reduce_sum(tf.square(modelControl(tf.expand_dims(x, axis=-1))[:, 0] - tf.square(x)))

def train_step(newGen, nbSimul, modelControl, optimizerControl):
    # intended: each replica picks its own subgenerator from the list by its replica id
    i = tf.distribute.get_replica_context().replica_id_in_sync_group
    print("Device run", i)
    with tf.GradientTape() as tape:
        loss = cal(newGen[i], nbSimul, modelControl)
    gradients = tape.gradient(loss, modelControl.trainable_variables)
    optimizerControl.apply_gradients(zip(gradients, modelControl.trainable_variables))
    return loss

def distributed_train_step(newGen, nbSimul, modelControl, optimizerControl):
    per_replica_losses = strategy.run(
        train_step, args=(newGen, int(nbSimul / n_gpus), modelControl, optimizerControl,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                           axis=None) / nbSimul

gen = tf.random.Generator.from_seed(1)
newGen = gen.split(n_gpus)    # one subgenerator per GPU

batchSize = 10
for epoch in range(10):
    valTest = distributed_train_step(newGen, batchSize, modelControl, optimizerControl)
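One direction I considered, without being able to verify it, is to build a per-replica value from the subgenerators with strategy.experimental_distribute_values_from_function and pass it to strategy.run, so that each replica receives its own generator directly instead of indexing the list inside train_step. I do not know whether a tf.random.Generator can be passed around this way, so the sketch below is only an assumption:

def value_fn(ctx):
    # ctx.replica_id_in_sync_group should be a plain int in this context
    return newGen[ctx.replica_id_in_sync_group]

perReplicaGen = strategy.experimental_distribute_values_from_function(value_fn)

def train_step_v2(locGen, nbSimul, modelControl, optimizerControl):
    # same as train_step, but the subgenerator is handed in per replica
    with tf.GradientTape() as tape:
        loss = cal(locGen, nbSimul, modelControl)
    gradients = tape.gradient(loss, modelControl.trainable_variables)
    optimizerControl.apply_gradients(zip(gradients, modelControl.trainable_variables))
    return loss

per_replica_losses = strategy.run(
    train_step_v2,
    args=(perReplicaGen, int(batchSize / n_gpus), modelControl, optimizerControl))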
Does anyone know whether this is possible?
Thank you.