How does the optimizer `tf.keras.optimizers.Adam()` work?

The version of my tensorflow is 2.6.0-gpu.
I implemented Adam using the formula given in the `tf.raw_ops.ApplyAdam` documentation (TensorFlow v2.12.0). Why is the result different from using `tf.keras.optimizers.Adam()`? How is `tf.keras.optimizers.Adam` implemented?

def my_adam(params, lr, grads, globals_step, eps=1e-7, state=None):
    """Apply one Adam update to a [weights, bias] parameter pair in place.

    Follows the tf.raw_ops.ApplyAdam formulation:
        alpha  = lr * sqrt(1 - beta2**t) / (1 - beta1**t)
        param -= m * alpha / (sqrt(v) + eps)

    Args:
        params: list of two tf.Variable, [weights, bias]; updated in place.
        lr: base learning rate.
        grads: list of two gradient tensors aligned with ``params``.
        globals_step: 1-based step count t used for bias correction.
        eps: numerical-stability constant added to sqrt(v).
        state: optional dict of moment accumulators. Pass the returned dict
            back in on the next call so the moments persist across steps.

    Returns:
        The moment-state dict (created fresh when ``state`` is None).
    """
    beta1 = 0.9
    beta2 = 0.999

    # BUG (generalized fix): the original re-created m and v as 0 on every
    # call, so the moments never accumulated and each call behaved like
    # Adam's very first step — the source of the mismatch with Keras.
    # state=None preserves the original (stateless) behavior; passing a
    # persistent dict yields true Adam updates.
    if state is None:
        state = {"m_w": 0, "v_w": 0, "m_b": 0, "v_b": 0}

    # Bias-corrected step size for step t = globals_step.
    alpha = lr * tf.sqrt(1 - tf.pow(beta2, int(globals_step))) / (1 - tf.pow(beta1, int(globals_step)))

    # Exponential moving averages of the gradient (m) and squared gradient (v).
    state["m_w"] = state["m_w"] * beta1 + (1 - beta1) * grads[0]
    state["v_w"] = state["v_w"] * beta2 + (1 - beta2) * tf.square(grads[0])

    state["m_b"] = state["m_b"] * beta1 + (1 - beta1) * grads[1]
    state["v_b"] = state["v_b"] * beta2 + (1 - beta2) * tf.square(grads[1])

    # In-place parameter updates.
    params[0].assign_sub((state["m_w"] * alpha) / (tf.sqrt(state["v_w"]) + eps))
    params[1].assign_sub((state["m_b"] * alpha) / (tf.sqrt(state["v_b"]) + eps))

    return state


# set seed
# Fix both RNG streams so the run is reproducible: TF's generator feeds
# tf.random.normal below, NumPy's feeds np.random.shuffle.
tf.random.set_seed(1)
np.random.seed(1)

# data
# Ground-truth linear model y = x @ true_w + true_b used to synthesize targets.
true_w = tf.Variable(tf.constant([2., 1.], shape=[2, 1], dtype=tf.float32))
true_b = tf.Variable(tf.constant([2.], dtype=tf.float32))
# 100 values in [0, 10) reshaped into 50 samples of 2 features each.
x = np.arange(0, 10, 0.1).reshape(50, 2)
np.random.shuffle(x)  # shuffles rows in place (consumes NumPy RNG state)
x = tf.constant(x, dtype=tf.float32)
y = x @ true_w + true_b

# init w, b randomly
# Trainable parameters that the optimizer loops below fit toward true_w/true_b.
w = tf.Variable(tf.random.normal([2, 1], stddev=0.1, dtype=tf.float32))
b = tf.Variable(tf.zeros([1], dtype=tf.float32))

Calling `my_adam`:

# Training loop driving the hand-rolled Adam step.
# NOTE(review): my_adam re-initializes its moment accumulators (m_w, v_w,
# m_b, v_b) to zero on every call, so each iteration behaves like Adam's
# very first step — this is the source of the divergence from Keras, not
# the update formula itself.
global_step = 0
for i in range(3):
    with tf.GradientTape() as tape:
        y_hat = x @ w + b
        loss = tf.reduce_mean(tf.square(y_hat - y))  # MSE loss
    # Gradients of the loss w.r.t. the two trainable parameters.
    grads = tape.gradient(loss, [w, b])
    global_step += 1  # 1-based step index for Adam's bias correction
    my_adam([w, b], 0.01, grads, global_step)

    print(f"w: {w.numpy()}")
    print(f"b: {b.numpy()}")
    print()

This is the result:

'''
w: [[-0.1001218]
 [ 0.1645754]]
b: [0.01000023]

w: [[-0.09268039]
 [ 0.1720168 ]]
b: [0.01744163]

w: [[-0.08629228]
 [ 0.17840491]]
b: [0.02382975]
'''

Calling `keras.optimizers.Adam()`:

# FIX: the original constructed a fresh keras.optimizers.Adam INSIDE the
# loop, which discards the optimizer's slot variables (m, v) and its
# iteration counter on every step — each update then equals Adam's first
# step (magnitude ~lr, hence the constant 0.01 moves in the output).
# Build the optimizer once, outside the loop.
# FIX: the keyword argument is `epsilon`, not `eps` — Keras Adam rejects
# unknown kwargs with a TypeError.
op = keras.optimizers.Adam(learning_rate=0.01, epsilon=1e-7)
for i in range(3):
    with tf.GradientTape() as tape:
        y_hat = x @ w + b
        loss = tf.reduce_mean(tf.square(y_hat - y))  # MSE loss
    grads = tape.gradient(loss, [w, b])
    # apply_gradients updates w and b in place and advances op.iterations.
    op.apply_gradients(zip(grads, [w, b]))
    print(f"w: {w.numpy()}")
    print(f"b: {b.numpy()}")
    print()

This is the result:

'''
w: [[-0.10012173]
 [ 0.16457547]]
b: [0.0100003]

w: [[-0.09012143]
 [ 0.17457578]]
b: [0.02000059]

w: [[-0.08012114]
 [ 0.18457608]]
b: [0.03000089]
'''

Why are they different? How does the optimizer tf.keras.optimizers.Adam() work?

Hi @ouyangfeng036 ,

I am thinking the major factor is the way you calculate the effective learning rate in your custom implementation versus the Keras Adam optimizer. Two concrete issues in the posted code compound this: `my_adam` re-initializes the first and second moments (`m_w`, `v_w`, `m_b`, `v_b`) to zero on every call, so the moments never accumulate across steps; and the Keras loop constructs a new `Adam` instance on every iteration, which likewise resets the optimizer's slot variables and iteration counter. Neither run is therefore performing true Adam updates across steps, so their trajectories differ.

Thanks.