Hi,
After doing the coursera MLOPS course I’m experimenting with TFX and I am using the MNIST dataset.
The dataset is downloaded as numpy arrays and I have encoded as TF.record. To do this the array is serialized and encoded as bytes. (see below). It all works fine and I get a lovely schema and some statistics all working great.
But then I get to the pre_processing function and cannot for the life of me figure how how to process the images. I need to use tft or tf.io functions but there is nothing that turns them from bytes back to arrays (to, for example normalise the values by dividing by 255).
I have found an example of converting a single record back in the tf.train.example section but this doesn’t seem to work on the input the pre_procesing function gets despite me trying to map it.
Any clues!?
Setting up the record files
def _bytes_feature(value):
"""Returns a bytes_list from a string / byte."""
if isinstance(value, type(tf.constant(0))): # if value ist tensor
value = value.numpy() # get value of tensor
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _int64_feature(value):
"""Returns an int64_list from a bool / enum / int / uint."""
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def serialize_array(array):
array = tf.io.serialize_tensor(array)
return array
def image_label_to_tf_train(image, label):
image_shape = np.shape(image)
#define the dictionary -- the structure -- of our single example
data = {
'height': _int64_feature(image_shape[0]),
'width': _int64_feature(image_shape[1]),
'raw_image' : _bytes_feature(serialize_array(image)),
'label' : _int64_feature(label)
}
#create an Example, wrapping the single features
return tf.train.Example(features=tf.train.Features(feature=data))
def write_images_to_tfr_short(images, labels, filename:str="images", folder = ""):
if not os.path.isdir(folder):
!mkdir {folder}
filename= folder + "/" + filename+".tfrecords"
writer = tf.io.TFRecordWriter(filename) #create a writer that'll store our data to disk
count = 0
for index in range(len(images)):
#get the data we want to write
current_image = images[index]
current_label = labels[index]
out = image_label_to_tf_train(image=current_image, label=current_label)
writer.write(out.SerializeToString())
count += 1
writer.close()
print(f"Wrote {count} elements to TFRecord")
return count
write_images_to_tfr_short(train_x, train_y, filename= "training_image_record", folder = train_folder)
My curren pre_processing function that doesnt work (note the labels output fine!)
%%writefile {_mnist_transform_module}
import numpy as np
import tensorflow as tf
import os
from tfx import v1 as tfx
from tfx import proto
from tfx.proto import example_gen_pb2
from tfx.components import example_gen
from tfrecord_lite import decode_example
import mnist_constants
from tfrecord_lite import tf_record_iterator
_LABEL_KEY = mnist_constants.LABEL_KEY
_IMAGE_KEY = mnist_constants.IMAGE_KEY
# Define the transformations
def preprocessing_fn(inputs):
"""tf.transform's callback function for preprocessing inputs.
Args:
inputs: map from feature keys to raw not-yet-transformed features.
Returns:
Map from string feature key to transformed feature operations.
"""
image_feature_description = {
'image_raw': tf.io.FixedLenFeature([], tf.string),
}
# Initialize outputs dictionary
outputs = {}
raw_image_dataset = inputs[_IMAGE_KEY]
def _parse_image_function(example_proto):
# Parse the input tf.train.Example proto using the dictionary above.
return tf.io.parse_single_example(example_proto, image_feature_description)
parsed_image_dataset = tf.map_fn(_parse_image_function, raw_image_dataset)
outputs[_IMAGE_KEY] = parsed_image_dataset
outputs[_LABEL_KEY] = tf.cast(inputs[_LABEL_KEY], tf.int64)
return outputs