Unbatching a tensor is only supported for rank >= 1

Hello, I have a problem with dataset processing and the `tf.data.Dataset.from_tensor_slices` function. Here is the error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[2], line 23
     20 if len(valid_russian.shape) == 1:
     21     valid_russian = np.expand_dims(valid_russian, axis=-1)
---> 23 train_dataset = tf.data.Dataset.from_tensor_slices((train_english, train_russian))
     24 validation_dataset = tf.data.Dataset.from_tensor_slices((valid_english, valid_russian))

File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/data/ops/dataset_ops.py:830, in DatasetV2.from_tensor_slices(tensors, name)
    826 # Loaded lazily due to a circular dependency (dataset_ops ->
    827 # from_tensor_slices_op -> dataset_ops).
    828 # pylint: disable=g-import-not-at-top,protected-access
    829 from tensorflow.python.data.ops import from_tensor_slices_op
--> 830 return from_tensor_slices_op._from_tensor_slices(tensors, name)

File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/data/ops/from_tensor_slices_op.py:25, in _from_tensor_slices(tensors, name)
     24 def _from_tensor_slices(tensors, name=None):
---> 25   return _TensorSliceDataset(tensors, name=name)

File ~/miniconda3/lib/python3.11/site-packages/tensorflow/python/data/ops/from_tensor_slices_op.py:38, in _TensorSliceDataset.__init__(self, element, is_files, name)
     36 if not self._tensors:
     37   raise ValueError("Invalid `element`. `element` should not be empty.")
---> 38 self._structure = nest.map_structure(
     39     lambda component_spec: component_spec._unbatch(), batched_spec)  # pylint: disable=protected-access
     40 self._name = name
...
    365   if self._shape.ndims == 0:
--> 366     raise ValueError("Unbatching a tensor is only supported for rank >= 1")
    367   return TensorSpec(self._shape[1:], self._dtype)

ValueError: Unbatching a tensor is only supported for rank >= 1

Here is the notebook code:

# Configure the dataset
# Notebook cell that builds the train/validation tf.data datasets from the
# output of DataGenerator. This is the cell that raises
# "ValueError: Unbatching a tensor is only supported for rank >= 1".
MAIN_DATASET_DIR = 'dataset'
TRAIN_DATASET_DIR = os.path.join(MAIN_DATASET_DIR, 'train')
VALID_DATASET_DIR = os.path.join(MAIN_DATASET_DIR, 'valid')

datagen = DataGenerator(TRAIN_DATASET_DIR, VALID_DATASET_DIR)
# NOTE(review): generate() returns a tuple of two dicts (train_data, valid_data).
# Tuple-unpacking a dict iterates over its keys, so the four names below are
# bound to the key strings (the class directory names), not to the lists of
# samples stored under those keys.
(train_english, train_russian), (valid_english, valid_russian) = datagen.generate()

# Because each name holds a single string, np.array() produces a 0-d string
# array (shape ()), not a batch of sequences.
train_english = np.array(train_english)
train_russian = np.array(train_russian)
valid_english = np.array(valid_english)
valid_russian = np.array(valid_russian)

# These guards only add an axis to rank-1 arrays; a 0-d array has
# len(shape) == 0, so none of them fire and the arrays stay rank 0.
if len(train_english.shape) == 1:
    train_english = np.expand_dims(train_english, axis=-1)
if len(train_russian.shape) == 1:
    train_russian = np.expand_dims(train_russian, axis=-1)
if len(valid_english.shape) == 1:
    valid_english = np.expand_dims(valid_english, axis=-1)
if len(valid_russian.shape) == 1:
    valid_russian = np.expand_dims(valid_russian, axis=-1)

# from_tensor_slices slices along the first axis, which a rank-0 input does not
# have; this is where the ValueError is raised.
train_dataset = tf.data.Dataset.from_tensor_slices((train_english, train_russian))
validation_dataset = tf.data.Dataset.from_tensor_slices((valid_english, valid_russian))

And here is the datagen script with the class:

import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
"""

TODO: rewrite this completely (as well as the initializer in the notebook)!
"""
class DataGenerator:
    """Load per-class text files from disk and convert them to padded sequences.

    ``train_dir`` and ``valid_dir`` are each expected to contain one
    sub-directory per class; every regular file inside a class directory is
    read as one text sample.
    """

    def __init__(self, train_dir, valid_dir, padding_type='post', trunc_type='post',
                 encoding='utf-8'):
        """Store the dataset locations and the preprocessing options.

        Args:
            train_dir: Directory holding the training class sub-directories.
            valid_dir: Directory holding the validation class sub-directories.
            padding_type: Passed to ``pad_sequences`` as ``padding``.
            trunc_type: Passed to ``pad_sequences`` as ``truncating``.
            encoding: Text encoding used when reading sample files. Explicit so
                that reading does not depend on the platform's locale default.
        """
        self.train_dir = train_dir
        self.valid_dir = valid_dir
        self.padding_type = padding_type
        self.trunc_type = trunc_type
        self.encoding = encoding

    def load_data(self, dir_name):
        """Read every sample file below ``dir_name``, grouped by class directory.

        Args:
            dir_name: Directory whose immediate sub-directories are the classes.

        Returns:
            A dict mapping each class directory name to the list of file
            contents (one ``str`` per regular file) found directly inside it.
            A class directory without files maps to an empty list.

        Raises:
            OSError: If ``dir_name`` cannot be listed or a file cannot be read.
        """
        data = {}
        # os.listdir returns entries in arbitrary order; sorting keeps the class
        # order and the per-class sample order reproducible between runs.
        for class_name in sorted(os.listdir(dir_name)):
            class_dir = os.path.join(dir_name, class_name)
            # Stray top-level files (e.g. ".DS_Store") are not classes; listing
            # them would raise NotADirectoryError, so skip them.
            if not os.path.isdir(class_dir):
                continue
            samples = []
            for filename in sorted(os.listdir(class_dir)):
                file_path = os.path.join(class_dir, filename)
                # Nested directories are ignored; only regular files are samples.
                if os.path.isfile(file_path):
                    with open(file_path, 'r', encoding=self.encoding) as f:
                        samples.append(f.read())
            data[class_name] = samples
        return data

    def prepare_data(self, data):
        """Tokenize and pad the texts of every class, in place.

        Args:
            data: Dict mapping class name to a list of text samples, as
                returned by ``load_data``.

        Returns:
            The same dict object. For every class that produced at least one
            sequence, the value is replaced by a list with one tensor per
            sample, each being a padded sequence with an extra trailing axis
            added by ``tf.expand_dims``. Classes without samples are left
            unchanged.
        """
        # NOTE(review): a single Tokenizer is fitted on each class in turn and is
        # discarded when this method returns, so all classes share one vocabulary
        # and it is not available to callers afterwards - confirm this is intended.
        tokenizer = Tokenizer()
        # Iterate over a snapshot because the values are reassigned in the loop.
        for class_name, texts in list(data.items()):
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            if not sequences:
                continue
            padded = pad_sequences(sequences, padding=self.padding_type, truncating=self.trunc_type)
            # Add a trailing axis to every padded sequence.
            data[class_name] = [tf.expand_dims(row, -1) for row in padded]
        return data

    def generate(self):
        """Load and preprocess both splits and print a short summary.

        Returns:
            A tuple ``(train_data, valid_data)`` of two dicts, each mapping a
            class name to its list of prepared samples. Classes with no samples
            are dropped. Index the dicts by class name to get the samples;
            unpacking a dict directly yields only its keys.
        """
        train_data = self.prepare_data(self.load_data(self.train_dir))
        valid_data = self.prepare_data(self.load_data(self.valid_dir))

        # Drop classes that ended up without any samples.
        train_data = {k: v for k, v in train_data.items() if len(v) > 0}
        valid_data = {k: v for k, v in valid_data.items() if len(v) > 0}

        print(f"Train data info: {len(train_data)} classes, {sum(len(v) for v in train_data.values())} samples")
        print(f"Valid data info: {len(valid_data)} classes, {sum(len(v) for v in valid_data.values())} samples")

        return (train_data, valid_data)


Here is my dataset path:
Screenshot 2023-10-15 at 14.52.05

I have been trying to fix this problem for more than 3 hours already, but I could not get anywhere.

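Hi @oblivisheee, if you run the code and look at the value of `train_english`, you will see that it is just the string `train_english`, which is a key of the data dictionary rather than its contents. This happens because `datagen.generate()` returns two dictionaries, and unpacking a dictionary yields its keys. Passing that string to `np.array(train_english)` gives a string array with shape `()`, and you are then creating a dataset from an array of that shape, which is what raises the error. You have to get the values for the corresponding keys instead. To get the values from the keys, do the following: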

# Locate the train and validation splits under the dataset root.
MAIN_DATASET_DIR = '/content/dataset'
TRAIN_DATASET_DIR, VALID_DATASET_DIR = (
    os.path.join(MAIN_DATASET_DIR, split) for split in ('train', 'valid')
)

# Keep the two returned dictionaries whole instead of unpacking them.
datagen = DataGenerator(TRAIN_DATASET_DIR, VALID_DATASET_DIR)
train_data, valid_data = datagen.generate()

# Look the samples up by key.
train_english, train_russian = train_data['train_english'], train_data['train_russian']
valid_english, valid_russian = valid_data['valid_english'], valid_data['valid_russian']

Now you can pass these values to create a dataset using tf.data.Dataset.from_tensor_slices.

Please refer to this gist for working code example. Thank You.

1 Like

Huge thanks! What would I do without you!