I have been trying to learn how to build an ANN with TensorFlow's low-level Core APIs, as opposed to just using the Keras API. TensorFlow has a nice tutorial on getting started with this using the MNIST digits dataset:
"Multilayer perceptrons for digit recognition with Core APIs" (TensorFlow Core). I wanted to change this up a little and modify it to do regression instead of classification.
Essentially all of the code is as follows:
import tensorflow as tf
import numpy as np
from tensorflow import keras
import pdb
def xavier_init(shape):
    """Sample a weight matrix from the Xavier (Glorot) uniform distribution.

    Args:
        shape: ``(in_dim, out_dim)`` pair giving the weight-matrix dimensions.

    Returns:
        A tensor of shape ``(in_dim, out_dim)`` drawn uniformly between
        ``-lim`` and ``lim``, where ``lim = sqrt(6) / sqrt(in_dim + out_dim)``.
    """
    in_dim, out_dim = shape
    xavier_lim = tf.sqrt(6.) / tf.sqrt(tf.cast(in_dim + out_dim, tf.float32))
    # The op-level seed is fixed at 22, exactly as in the original code.
    weight_vals = tf.random.uniform(
        shape=(in_dim, out_dim),
        minval=-xavier_lim,
        maxval=xavier_lim,
        seed=22,
    )
    return weight_vals


class DenseLayer(tf.Module):
    """Fully connected layer computing ``activation(x @ w + b)``.

    The weight and bias variables are created lazily on the first call, once
    the input width can be read from the incoming tensor.
    """

    def __init__(self, out_dim, weight_init=xavier_init, activation=tf.identity):
        """Store the layer configuration; no variables are created yet.

        Args:
            out_dim: Number of output units.
            weight_init: Callable taking ``shape=(in_dim, out_dim)`` and
                returning the initial weight values.
            activation: Callable applied to the affine output.
        """
        super().__init__()
        self.out_dim = out_dim
        self.activation = activation
        self.built = False
        self.weight_init = weight_init

    def __call__(self, x):
        """Apply the layer to ``x``.

        Args:
            x: Input tensor; its second dimension (``x.shape[1]``) is taken
                as the input width when the layer is first built.

        Returns:
            ``activation(x @ w + b)``.
        """
        if not self.built:
            # Infer the input width from the first batch seen.
            self.in_dim = x.shape[1]
            # Use the configured initializer rather than a hard-coded one, so
            # the ``weight_init`` constructor argument actually takes effect.
            self.w = tf.Variable(
                self.weight_init(shape=(self.in_dim, self.out_dim))
            )
            self.b = tf.Variable(tf.zeros(shape=(self.out_dim,)))
            self.built = True
        z = tf.add(tf.matmul(x, self.w), self.b)
        return self.activation(z)
class LinearLayer(tf.Module):
    """Fully connected layer with no activation: computes ``x @ w + b``.

    The weight and bias variables are created lazily on the first call, once
    the input width can be read from the incoming tensor.
    """

    def __init__(self, out_dim, weight_init=xavier_init):
        """Store the layer configuration; no variables are created yet.

        Args:
            out_dim: Number of output units.
            weight_init: Callable taking ``shape=(in_dim, out_dim)`` and
                returning the initial weight values.
        """
        super().__init__()
        self.out_dim = out_dim
        self.built = False
        self.weight_init = weight_init

    def __call__(self, x):
        """Apply the affine map to ``x``.

        Args:
            x: Input tensor; its second dimension (``x.shape[1]``) is taken
                as the input width when the layer is first built.

        Returns:
            ``x @ w + b``.
        """
        if not self.built:
            # Infer the input width from the first batch seen.
            self.in_dim = x.shape[1]
            # Honour the configured initializer instead of hard-coding one.
            self.w = tf.Variable(
                self.weight_init(shape=(self.in_dim, self.out_dim))
            )
            self.b = tf.Variable(tf.zeros(shape=(self.out_dim,)))
            self.built = True
        return tf.add(tf.matmul(x, self.w), self.b)
class MLP_REG(tf.Module):
    """Multilayer perceptron for regression: applies its layers in sequence."""

    def __init__(self, layers):
        """Store the layers to apply.

        Args:
            layers: Iterable of callables, applied in order to the input.
        """
        super().__init__()
        self.layers = layers

    @tf.function
    def __call__(self, x, preds=False):
        """Run ``x`` through every layer and return the final output.

        Args:
            x: Input passed to the first layer.
            preds: Unused; kept so existing call sites keep working.

        Returns:
            The output of the last layer.
        """
        for layer in self.layers:
            x = layer(x)
        return x
def mse_loss(ypred, y):
    """Return the mean of the element-wise squared differences.

    Args:
        ypred: Predicted values.
        y: Target values.

    Returns:
        Scalar tensor: the mean of ``(ypred - y) ** 2`` over all elements.
    """
    # NOTE(review): the subtraction broadcasts. If ypred is (batch, 1) and y
    # is (batch,), the result is (batch, batch) and the loss is silently
    # wrong -- confirm both arguments have the same shape at the call site.
    squared_error = tf.square(ypred - y)
    return tf.reduce_mean(squared_error)
def accuracy(ypred, y):
    """Return the sum of squared errors between ``ypred`` and ``y``.

    Despite the name, this is not a classification accuracy. It is the
    un-normalised sum of ``(ypred - y) ** 2`` over all elements, so lower is
    better and the value grows with the number of elements.

    Args:
        ypred: Predicted values.
        y: Target values.

    Returns:
        Scalar tensor holding the summed squared error.
    """
    return tf.math.reduce_sum(tf.square(ypred - y))
class Adam:
    """Minimal Adam optimizer working on parallel lists of gradients/variables.

    The first- and second-moment slots are created lazily as ``tf.Variable``
    objects on the first ``apply_gradients`` call, one pair per variable.
    """

    def __init__(self, learning_rate=1e-3, beta_1=0.9, beta_2=0.999, ep=1e-7):
        """Store the hyperparameters and initialise the empty optimizer state.

        Args:
            learning_rate: Step size.
            beta_1: Decay rate of the first-moment (mean) estimate.
            beta_2: Decay rate of the second-moment (squared-gradient) estimate.
            ep: Small constant added to the denominator of the update.
        """
        super().__init__()
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.learning_rate = learning_rate
        self.ep = ep
        # Step counter used for bias correction; starts at 1.
        self.t = 1.
        self.v_dvar, self.s_dvar = [], []
        self.built = False

    def apply_gradients(self, grads, vars):
        """Apply one Adam update to every variable, in place.

        Args:
            grads: Gradients, in the same order as ``vars``. A ``None`` entry
                leaves the matching variable untouched.
            vars: Variables to update. (The name shadows the builtin but is
                part of the public signature, so it is kept.)
        """
        # Create the moment slots on the first call. They take each variable's
        # own dtype so non-float32 variables can be assigned without a
        # dtype mismatch.
        if not self.built:
            for var in vars:
                self.v_dvar.append(
                    tf.Variable(tf.zeros(shape=var.shape, dtype=var.dtype))
                )
                self.s_dvar.append(
                    tf.Variable(tf.zeros(shape=var.shape, dtype=var.dtype))
                )
            self.built = True

        for i, (d_var, var) in enumerate(zip(grads, vars)):
            if d_var is None:
                # No gradient was produced for this variable; skip it rather
                # than fail on arithmetic with None.
                continue
            # Exponential moving averages of the gradient and its square.
            self.v_dvar[i].assign(
                self.beta_1 * self.v_dvar[i] + (1 - self.beta_1) * d_var
            )
            self.s_dvar[i].assign(
                self.beta_2 * self.s_dvar[i]
                + (1 - self.beta_2) * tf.square(d_var)
            )
            # Bias-corrected moment estimates.
            v_dvar_bc = self.v_dvar[i] / (1 - (self.beta_1 ** self.t))
            s_dvar_bc = self.s_dvar[i] / (1 - (self.beta_2 ** self.t))
            var.assign_sub(
                self.learning_rate * (v_dvar_bc / (tf.sqrt(s_dvar_bc) + self.ep))
            )
        # Advance the step counter once per call, not once per variable.
        self.t += 1.
def train_step(x_batch, y_batch, loss, acc, model, optimizer):
    """Run one optimisation step on a single batch.

    Args:
        x_batch: Batch of inputs passed to ``model``.
        y_batch: Matching targets.
        loss: Callable ``loss(y_pred, y_batch)``; its result is differentiated.
        acc: Callable ``acc(y_pred, y_batch)`` reporting a metric.
        model: Callable model exposing a ``variables`` attribute.
        optimizer: Object exposing ``apply_gradients(grads, vars)``.

    Returns:
        Tuple ``(batch_loss, batch_acc)`` for this batch.
    """
    with tf.GradientTape() as tape:
        predictions = model(x_batch)
        loss_value = loss(predictions, y_batch)
    # The metric is not differentiated, so it need not be recorded on the tape.
    metric_value = acc(predictions, y_batch)
    # Read the variables only after the forward pass has run.
    variables = model.variables
    gradients = tape.gradient(loss_value, variables)
    optimizer.apply_gradients(gradients, variables)
    return loss_value, metric_value
def val_step(x_batch, y_batch, loss, acc, model):
    """Evaluate the model on one validation batch without updating it.

    Args:
        x_batch: Batch of inputs passed to ``model``.
        y_batch: Matching targets.
        loss: Callable ``loss(y_pred, y_batch)``.
        acc: Callable ``acc(y_pred, y_batch)``.
        model: Callable producing predictions from ``x_batch``.

    Returns:
        Tuple ``(batch_loss, batch_acc)`` for this batch.
    """
    predictions = model(x_batch)
    return loss(predictions, y_batch), acc(predictions, y_batch)
def train_model(mlp, train_data, val_data, loss, acc, optimizer, epochs):
    """Train ``mlp`` for a number of epochs and record per-epoch metrics.

    Each epoch runs ``train_step`` over every batch of ``train_data`` and then
    ``val_step`` over every batch of ``val_data``. The batch-level values are
    averaged and the averages are printed.

    Args:
        mlp: Model passed to ``train_step`` / ``val_step``.
        train_data: Iterable of ``(x_batch, y_batch)`` training pairs.
        val_data: Iterable of ``(x_batch, y_batch)`` validation pairs.
        loss: Loss callable forwarded to the step functions.
        acc: Metric callable forwarded to the step functions.
        optimizer: Optimizer forwarded to ``train_step``.
        epochs: Number of passes over the data.

    Returns:
        Tuple ``(train_losses, train_accs, val_losses, val_accs)``, each a
        list with one entry per epoch.
    """
    # Epoch-level history. These empty-list initialisers were lost in the
    # pasted source, which left invalid ``= ,`` assignments.
    train_losses, train_accs = [], []
    val_losses, val_accs = [], []

    for epoch in range(epochs):
        batch_losses_train, batch_accs_train = [], []
        batch_losses_val, batch_accs_val = [], []

        # Iterate over the training data.
        for x_batch, y_batch in train_data:
            # Compute gradients and update the model's parameters.
            batch_loss, batch_acc = train_step(
                x_batch, y_batch, loss, acc, mlp, optimizer
            )
            # Keep track of batch-level training performance.
            batch_losses_train.append(batch_loss)
            batch_accs_train.append(batch_acc)

        # Iterate over the validation data.
        for x_batch, y_batch in val_data:
            batch_loss, batch_acc = val_step(x_batch, y_batch, loss, acc, mlp)
            batch_losses_val.append(batch_loss)
            batch_accs_val.append(batch_acc)

        # Average the batch-level values into epoch-level values.
        train_loss = tf.reduce_mean(batch_losses_train)
        train_acc = tf.reduce_mean(batch_accs_train)
        val_loss = tf.reduce_mean(batch_losses_val)
        val_acc = tf.reduce_mean(batch_accs_val)
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        print(f"Epoch: {epoch}")
        print(f"Training loss: {train_loss:.3f}, Training accuracy: {train_acc:.3f}")
        print(f"Validation loss: {val_loss:.3f}, Validation accuracy: {val_acc:.3f}")

    return train_losses, train_accs, val_losses, val_accs
For a dataset, I used the sklearn diabetes dataset:
# Load the diabetes dataset through scikit-learn; DS holds whatever
# load_diabetes() returns.
import sklearn.datasets as ds
DS=ds.load_diabetes()
Anyway, I use this to build a model with two hidden layers of 300 and 150 units, both with ReLU activation, using mean squared error as the loss.
I train this for 10 epochs, and the loss on the training data is on the order of 7000.
However, if I use Keras as follows:
from tensorflow import keras
from tensorflow.keras import layers

# Keras counterpart of the hand-written network: two ReLU hidden layers
# (300 and 150 units) followed by a single linear output unit.
model = keras.Sequential([
    keras.layers.Dense(300, activation='relu'),
    keras.layers.Dense(150, activation='relu'),
    keras.layers.Dense(1),
])
model.compile(optimizer='Adam', loss='mse')
# NOTE(review): no targets are passed separately, so train_data presumably
# yields (inputs, targets) pairs. The Keras docs say batch_size and
# validation_split are not meant for dataset-style inputs -- confirm what
# train_data is before comparing these results with the custom loop.
model.fit(train_data, batch_size=30, validation_split=0.25, epochs=10)
After this, the training loss of this model is about 5300. I feel like the two should be pretty much the same, and I am not sure why they are different. The parameters for the Adam optimizer are the same. I also know that, for some reason, the first (TensorFlow Core) version is running on the CPU; I am not sure why, or whether that would influence the results. Is Keras using a different, more complex optimization that would make it approach a local minimum faster?