Hello, I have a problem with dataset processing and the from_tensor_slices function. Here is the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[2], line 23
20 if len(valid_russian.shape) == 1:
21 valid_russian = np.expand_dims(valid_russian, axis=-1)
---> 23 train_dataset = tf.data.Dataset.from_tensor_slices((train_english, train_russian))
24 validation_dataset = tf.data.Dataset.from_tensor_slices((valid_english, valid_russian))
File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/data/ops/dataset_ops.py:830, in DatasetV2.from_tensor_slices(tensors, name)
826 # Loaded lazily due to a circular dependency (dataset_ops ->
827 # from_tensor_slices_op -> dataset_ops).
828 # pylint: disable=g-import-not-at-top,protected-access
829 from tensorflow.python.data.ops import from_tensor_slices_op
--> 830 return from_tensor_slices_op._from_tensor_slices(tensors, name)
File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/data/ops/from_tensor_slices_op.py:25, in _from_tensor_slices(tensors, name)
24 def _from_tensor_slices(tensors, name=None):
---> 25 return _TensorSliceDataset(tensors, name=name)
File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/data/ops/from_tensor_slices_op.py:38, in _TensorSliceDataset.__init__(self, element, is_files, name)
36 if not self._tensors:
37 raise ValueError("Invalid `element`. `element` should not be empty.")
---> 38 self._structure = nest.map_structure(
39 lambda component_spec: component_spec._unbatch(), batched_spec) # pylint: disable=protected-access
40 self._name = name
...
365 if self._shape.ndims == 0:
--> 366 raise ValueError("Unbatching a tensor is only supported for rank >= 1")
367 return TensorSpec(self._shape[1:], self._dtype)
ValueError: Unbatching a tensor is only supported for rank >= 1
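As far as I understand, the message means that one of the tensors reaching from_tensor_slices has rank 0 (a scalar). This is a minimal snippet (not from my project, just to reproduce the same message):

import numpy as np
import tensorflow as tf

# A rank-0 (scalar) array reproduces the same ValueError, because
# from_tensor_slices slices along the first axis and a scalar has no axes.
scalar = np.array("some text")                   # shape (), rank 0
try:
    tf.data.Dataset.from_tensor_slices(scalar)
except ValueError as e:
    print(e)                                     # Unbatching a tensor is only supported for rank >= 1

# A rank-1 array is fine: each element becomes one dataset item.
vector = np.array(["a", "b", "c"])               # shape (3,)
ds = tf.data.Dataset.from_tensor_slices(vector)
print(ds.element_spec)                           # TensorSpec(shape=(), dtype=tf.string, name=None)

So apparently somewhere my arrays end up with an empty shape, even though I try to expand them in the notebook below.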
Here is the notebook code:
import os
import numpy as np
import tensorflow as tf
# DataGenerator is imported from the datagen script shown further below

# Configure the dataset
MAIN_DATASET_DIR = 'dataset'
TRAIN_DATASET_DIR = os.path.join(MAIN_DATASET_DIR, 'train')
VALID_DATASET_DIR = os.path.join(MAIN_DATASET_DIR, 'valid')

datagen = DataGenerator(TRAIN_DATASET_DIR, VALID_DATASET_DIR)
(train_english, train_russian), (valid_english, valid_russian) = datagen.generate()

train_english = np.array(train_english)
train_russian = np.array(train_russian)
valid_english = np.array(valid_english)
valid_russian = np.array(valid_russian)

# Add a trailing dimension to any 1-D array so every input has rank >= 2
if len(train_english.shape) == 1:
    train_english = np.expand_dims(train_english, axis=-1)
if len(train_russian.shape) == 1:
    train_russian = np.expand_dims(train_russian, axis=-1)
if len(valid_english.shape) == 1:
    valid_english = np.expand_dims(valid_english, axis=-1)
if len(valid_russian.shape) == 1:
    valid_russian = np.expand_dims(valid_russian, axis=-1)

train_dataset = tf.data.Dataset.from_tensor_slices((train_english, train_russian))
validation_dataset = tf.data.Dataset.from_tensor_slices((valid_english, valid_russian))
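To see what actually reaches from_tensor_slices, I can print the type, dtype and shape of every array right before building the datasets (just a diagnostic, the names match the cell above):

# Diagnostic only: every array should report a rank >= 1 shape here,
# e.g. (num_samples, seq_len); an empty shape () means a scalar slipped in.
for name, arr in [("train_english", train_english),
                  ("train_russian", train_russian),
                  ("valid_english", valid_english),
                  ("valid_russian", valid_russian)]:
    print(name, type(arr), getattr(arr, "dtype", None), getattr(arr, "shape", None))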
And here is the datagen script with the DataGenerator class:
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
"""
ΠΠΎΠ»Π½ΠΎΡΡΡΡ ΠΏΠ΅ΡΠ΅ΠΏΠΈΡΠ°ΡΡ!(ΡΠ°ΠΊΠΆΠ΅ ΠΊΠ°ΠΊ ΠΈ ΠΈΠ½ΠΈΡΠΈΠ°Π»ΠΈΠ·Π°ΡΠΎΡ Π² Π±Π»ΠΎΠΊΠ½ΠΎΡΠ΅)
"""
class DataGenerator:
    def __init__(self, train_dir, valid_dir, padding_type='post', trunc_type='post'):
        self.train_dir = train_dir
        self.valid_dir = valid_dir
        self.padding_type = padding_type
        self.trunc_type = trunc_type

    def load_data(self, dir_name):
        data = {}
        for class_name in os.listdir(dir_name):
            class_dir = os.path.join(dir_name, class_name)
            if not os.path.exists(class_dir):
                os.makedirs(class_dir)
            data[class_name] = []
            for filename in os.listdir(class_dir):
                if os.path.isfile(os.path.join(class_dir, filename)):
                    with open(os.path.join(class_dir, filename), 'r') as f:
                        data[class_name].append(f.read())
        return data

    def prepare_data(self, data):
        tokenizer = Tokenizer()
        for class_name in data.keys():
            tokenizer.fit_on_texts(data[class_name])
            sequences = tokenizer.texts_to_sequences(data[class_name])
            if sequences:
                padded = pad_sequences(sequences, padding=self.padding_type, truncating=self.trunc_type)
                data[class_name] = [tf.expand_dims(p, -1) for p in padded]  # Add an extra dimension at the end to avoid ValueError

        return data

    def generate(self):
        train_data = self.load_data(self.train_dir)
        valid_data = self.load_data(self.valid_dir)
        train_data = self.prepare_data(train_data)
        valid_data = self.prepare_data(valid_data)
        train_data = {k: v for k, v in train_data.items() if len(v) > 0}
        valid_data = {k: v for k, v in valid_data.items() if len(v) > 0}
        print(f"Train data info: {len(train_data.keys())} classes, {sum([len(v) for v in train_data.values()])} samples")
        print(f"Valid data info: {len(valid_data.keys())} classes, {sum([len(v) for v in valid_data.values()])} samples")
        return (train_data, valid_data)
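For completeness, generate() returns the two prepared dicts directly, so this is a small check (not part of the script, paths follow the notebook's 'dataset' layout) of what the notebook is actually unpacking:

# Illustrative check of what generate() hands back to the notebook.
datagen = DataGenerator('dataset/train', 'dataset/valid')
train_data, valid_data = datagen.generate()
print(type(train_data), list(train_data.keys()))
for class_name, samples in train_data.items():
    print(class_name, len(samples), getattr(samples[0], "shape", None))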
Here is my dataset path:
I have been trying to fix this problem for more than 3 hours, but I haven't managed to solve it.