Get file name passed from tf.data.Datasets.map

My dataset consist of a group of files (1.mat, 2.mat …), and I need for each single file to concatenate the adjacent files. For example, for file 5.mat, I need to read files 0.mat to 10.mat and concatenate them in the last dimension. Moreover, each file has 5 subdatasets with dimensions 1000x120x1, 1000x200x1, 2000x200x1, 200x200x1. I need to concatenate 11 files to output 1000x120x11, , 1000x200x11, 2000x200x11, 200x200x11. As this dataset will be the input for the NN the dimension should be (None,1000,120,11).

I created a function that works well when passing a list of files, but crashes when generating a tf.dataset:

def read_inp2D(folder,datatype,batchsize,phase=None,shuffle=True,
               outlabel='vp','vs','q'],depth_train=700,ncmps=11):
    label_indx ={'vp':0,'vs':1,'q':2,'1/q':3}
    indx = [label_indx[i] for i in outlabel]
    d_indx = int(depth_train//2.5)
    
    def read_inputs(file):
        file_s = list(file.as_numpy_iterator())[0]
        print(file_s)
        file_num = int(file_s[:-4].split('_')[-1])
        filename_core = file_s[:-len(file_s.split('_')[-1])]

        h5file = h5.File(file,'r')
        labels = h5file.get('labels')[()][:d_indx, indx]
        h5file.close()

        data = {l: [] for l in datatype}

        for i,ii in enumerate(np.arange(file_num - ncmps//2, file_num + ncmps//2 + 1)):
            if ii < 0 : filei = '%s%i.mat'%(filename_core,0)
            elif ii > nshots_model: filei = '%s%i.mat' % (filename_core, nshots_model)
            else: filei = '%s%i.mat' % (filename_core, ii)
            h5filei = h5.File(filei,'r')
            for l in datatype:
                data[l].append(tf.convert_to_tensor(h5filei.get('inputs/' + l)[()],dtype=tf.float32))
            h5filei.close()
        data = {l: tf.convert_to_tensor(np.concatenate(data[l],axis=-1),dtype=tf.float32) for l in datatype}
        

        return tf.data.Dataset.from_tensor_slices(data), tf.data.Dataset.from_tensor_slices(labels)

    if phase == 'test': shuffle = False
    for i in range(len(folder)):
        # files = folder[i] + '*.mat'
        files = os.listdir(folder[i])

        'Removing files at the edges to fix cmps size (to avoid repetition)'
        nshots_model = np.max([int(file[:-4].split('_')[-1]) for file in files])
        omit_files = tuple(['_%i.mat' % i for i in np.arange(0, ncmps//2)] +
                           ['_%i.mat' % i for i in np.arange(nshots_model - ncmps//2 + 1, nshots_model + 1)])
        mask = np.array([file.endswith(omit_files) for file in files])
        temp = tf.data.Dataset.list_files(['%s/'%folder[i] + file for file in np.array(files)[~mask]],shuffle=shuffle)
        

        if i ==0 :
            dataset = copy.copy(temp)
        else:
            dataset = dataset.concatenate(temp)

    dataset = dataset.map(read_inputs, num_parallel_calls=tf.data.AUTOTUNE).batch(batch_size=batchsize)

    return dataset

The problem that I’m facing is that I cannot read the passed file name as it is a ‘Symbolic Tensor’:

AttributeError: 'SymbolicTensor' object has no attribute 'as_numpy_iterator'

I will appreciate if anyone can help me to figure out how to reach the adjacent files, and create the concatenated tensors.

Thanks

Hi @Jefferson_Bustamante, Could you please let us know what is the data type you are passing to the file argument.

And also could you please elaborate more on this. like you mention you need to concatenate the adjacent files. but you mentioned for file 5.mat, you need to read files 0.mat to 10.mat and concatenate them but the adjacent will be 4, 6.

Thank You.