My TensorFlow version is 2.6.0-gpu. I implemented Adam by hand using the formula given on tf.raw_ops.ApplyAdam | TensorFlow v2.12.0, but the result is different from what I get with tf.keras.optimizers.Adam(). Why? How is tf.keras.optimizers.Adam implemented?
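For reference, the update rule given on that page is roughly the following (my transcription, subscripts written as plain indices):

lr_t = learning_rate * sqrt(1 - beta_2^t) / (1 - beta_1^t)
m_t = beta_1 * m_{t-1} + (1 - beta_1) * g
v_t = beta_2 * v_{t-1} + (1 - beta_2) * g * g
variable = variable - lr_t * m_t / (sqrt(v_t) + epsilon)

This is what I tried to reproduce below.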
import numpy as np
import tensorflow as tf
from tensorflow import keras


def my_adam(params, lr, grads, globals_step, eps=1e-7):
    beta1 = 0.9
    beta2 = 0.999
    # first/second moment estimates for w and b
    m_w = 0
    m_b = 0
    v_w = 0
    v_b = 0
    # bias-corrected step size
    alpha = lr * tf.sqrt(1 - tf.pow(beta2, int(globals_step))) / (1 - tf.pow(beta1, int(globals_step)))
    m_w = m_w * beta1 + (1 - beta1) * grads[0]
    v_w = v_w * beta2 + (1 - beta2) * tf.square(grads[0])
    m_b = m_b * beta1 + (1 - beta1) * grads[1]
    v_b = v_b * beta2 + (1 - beta2) * tf.square(grads[1])
    params[0].assign_sub((m_w * alpha) / (tf.sqrt(v_w) + eps))
    params[1].assign_sub((m_b * alpha) / (tf.sqrt(v_b) + eps))
# set seed
tf.random.set_seed(1)
np.random.seed(1)
# data
true_w = tf.Variable(tf.constant([2., 1.], shape=[2, 1], dtype=tf.float32))
true_b = tf.Variable(tf.constant([2.], dtype=tf.float32))
x = np.arange(0, 10, 0.1).reshape(50, 2)
np.random.shuffle(x)
x = tf.constant(x, dtype=tf.float32)
y = x @ true_w + true_b
# init w, b randomly
w = tf.Variable(tf.random.normal([2, 1], stddev=0.1, dtype=tf.float32))
b = tf.Variable(tf.zeros([1], dtype=tf.float32))
Call my_adam:
global_step = 0
for i in range(3):
    with tf.GradientTape() as tape:
        y_hat = x @ w + b
        loss = tf.reduce_mean(tf.square(y_hat - y))
    grads = tape.gradient(loss, [w, b])
    global_step += 1
    my_adam([w, b], 0.01, grads, global_step)
    print(f"w: {w.numpy()}")
    print(f"b: {b.numpy()}")
    print()
This is the result:
'''
w: [[-0.1001218]
[ 0.1645754]]
b: [0.01000023]
w: [[-0.09268039]
[ 0.1720168 ]]
b: [0.01744163]
w: [[-0.08629228]
[ 0.17840491]]
b: [0.02382975]
'''
Call keras.optimizers.Adam():
for i in range(3):
    with tf.GradientTape() as tape:
        y_hat = x @ w + b
        loss = tf.reduce_mean(tf.square(y_hat - y))
    grads = tape.gradient(loss, [w, b])
    op = keras.optimizers.Adam(learning_rate=0.01, epsilon=1e-7)
    op.apply_gradients(zip(grads, [w, b]))
    print(f"w: {w.numpy()}")
    print(f"b: {b.numpy()}")
    print()
This is the result:
'''
w: [[-0.10012173]
[ 0.16457547]]
b: [0.0100003]
w: [[-0.09012143]
[ 0.17457578]]
b: [0.02000059]
w: [[-0.08012114]
[ 0.18457608]]
b: [0.03000089]
'''
Why are the results different? How does tf.keras.optimizers.Adam() work internally?
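To make my expectation explicit: I assumed the Keras optimizer does, per parameter and per step, something like the sketch below. This is only my own sketch of the formula above, not the actual Keras source, and m, v, t are names I chose for state I assume the optimizer keeps between calls to apply_gradients.

def adam_step(var, grad, m, v, t, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-7):
    # running first and second moment estimates
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    # bias-corrected step size for step t (t starts at 1)
    lr_t = lr * (1 - beta2 ** t) ** 0.5 / (1 - beta1 ** t)
    var.assign_sub(lr_t * m / (v ** 0.5 + eps))
    return m, v

Is this roughly what apply_gradients does, and if so, where does my my_adam deviate from it?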