My dataset consists of a group of files (1.mat, 2.mat, …), and for each file I need to concatenate it with its adjacent files along the last dimension. For example, for file 5.mat I need to read files 0.mat to 10.mat and concatenate them. Moreover, each file holds subdatasets with dimensions 1000x120x1, 1000x200x1, 2000x200x1, and 200x200x1, so concatenating 11 files should output 1000x120x11, 1000x200x11, 2000x200x11, and 200x200x11. As this dataset will be the input to a neural network, the batched shape should be (None, 1000, 120, 11).
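To make the windowing concrete, here is a minimal sketch of the index logic I have in mind (the function name and the maximum index are placeholders):

import numpy as np

def window_indices(file_num, ncmps=11, max_index=100):
    # 11 adjacent file indices centred on file_num, clamped at the edges
    idx = np.arange(file_num - ncmps // 2, file_num + ncmps // 2 + 1)
    return np.clip(idx, 0, max_index)

print(window_indices(5))  # [ 0  1  2  3  4  5  6  7  8  9 10]
print(window_indices(2))  # edge case: 0 is repeated for indices below 0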
I created a function that works well when I pass it a list of files directly, but it crashes when building a tf.data.Dataset:
import copy
import os

import h5py as h5
import numpy as np
import tensorflow as tf

def read_inp2D(folder, datatype, batchsize, phase=None, shuffle=True,
               outlabel=['vp', 'vs', 'q'], depth_train=700, ncmps=11):
    label_indx = {'vp': 0, 'vs': 1, 'q': 2, '1/q': 3}
    indx = [label_indx[i] for i in outlabel]
    d_indx = int(depth_train // 2.5)

    def read_inputs(file):
        # This is the line that fails: inside .map(), 'file' is a symbolic
        # string tensor, not a dataset or an eager tensor.
        file_s = list(file.as_numpy_iterator())[0]
        print(file_s)
        file_num = int(file_s[:-4].split('_')[-1])
        filename_core = file_s[:-len(file_s.split('_')[-1])]
        h5file = h5.File(file_s, 'r')
        labels = h5file.get('labels')[()][:d_indx, indx]
        h5file.close()
        data = {l: [] for l in datatype}
        for i, ii in enumerate(np.arange(file_num - ncmps // 2, file_num + ncmps // 2 + 1)):
            if ii < 0:
                filei = '%s%i.mat' % (filename_core, 0)
            elif ii > nshots_model:
                filei = '%s%i.mat' % (filename_core, nshots_model)
            else:
                filei = '%s%i.mat' % (filename_core, ii)
            h5filei = h5.File(filei, 'r')
            for l in datatype:
                data[l].append(tf.convert_to_tensor(h5filei.get('inputs/' + l)[()], dtype=tf.float32))
            h5filei.close()
        data = {l: tf.convert_to_tensor(np.concatenate(data[l], axis=-1), dtype=tf.float32)
                for l in datatype}
        return tf.data.Dataset.from_tensor_slices(data), tf.data.Dataset.from_tensor_slices(labels)

    if phase == 'test':
        shuffle = False
    for i in range(len(folder)):
        # files = folder[i] + '*.mat'
        files = os.listdir(folder[i])
        # Remove files at the edges to keep the CMP window size fixed (avoid repetition)
        nshots_model = np.max([int(file[:-4].split('_')[-1]) for file in files])
        omit_files = tuple(['_%i.mat' % i for i in np.arange(0, ncmps // 2)] +
                           ['_%i.mat' % i for i in np.arange(nshots_model - ncmps // 2 + 1,
                                                             nshots_model + 1)])
        mask = np.array([file.endswith(omit_files) for file in files])
        temp = tf.data.Dataset.list_files(['%s/' % folder[i] + file for file in np.array(files)[~mask]],
                                          shuffle=shuffle)
        if i == 0:
            dataset = copy.copy(temp)
        else:
            dataset = dataset.concatenate(temp)
    dataset = dataset.map(read_inputs, num_parallel_calls=tf.data.AUTOTUNE).batch(batch_size=batchsize)
    return dataset
The problem I’m facing is that I cannot read the passed file name inside read_inputs, because during dataset.map() the argument is a symbolic tensor:
AttributeError: 'SymbolicTensor' object has no attribute 'as_numpy_iterator'
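Here is a minimal reproduction of the behaviour, independent of my files (the file name below is a placeholder):

import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(['model_5.mat'])

def show(file):
    # While .map() traces this function, 'file' is a symbolic tf.string
    # tensor, so dataset/eager methods like as_numpy_iterator() don't exist on it.
    return list(file.as_numpy_iterator())[0]

ds = ds.map(show)  # raises the AttributeError above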
I would appreciate it if anyone could help me figure out how to reach the adjacent files and create the concatenated tensors.
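One direction I have been considering (a sketch only; the reader body is a placeholder, and the shapes follow from depth_train=700, i.e. 700 // 2.5 = 280 rows and 3 labels) is wrapping the reader in tf.py_function so the file name arrives as an eager tensor:

import tensorflow as tf

def read_one(file):
    file_s = file.numpy().decode()  # eager tensor here, so .numpy() works
    # ... open file_s and its neighbours with h5py, concatenate along
    #     the last axis, slice the labels ...
    return tf.zeros([1000, 120, 11]), tf.zeros([280, 3])

def wrapped(file):
    data, labels = tf.py_function(read_one, [file], [tf.float32, tf.float32])
    data.set_shape([1000, 120, 11])   # py_function drops static shapes
    labels.set_shape([280, 3])
    return data, labels

# dataset = dataset.map(wrapped, num_parallel_calls=tf.data.AUTOTUNE)

I’m not sure this is the right approach for my case, though, especially with several subdatasets of different shapes.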
Thanks