Need help with custom data loading in Keras_CV

Hello, I am new to Keras_CV and Computer vision as a whole, I am trying to load in a image dataset from Kaggle - Underwater Object Detection Dataset | Kaggle - And I am using this tutorial from the keras website for a guide - Efficient Object Detection with YOLOV8 and KerasCV - I am trying to preform object detection from the YOLOV8 model while loading in a custom dataset

Code:

import keras_cv
import keras
import tensorflow as tf
import os
import numpy as np
import tensorflow_datasets as tfds
import PIL

classes = ['fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish', 'stingray']
class_mapping = dict(zip(range(len(classes)), classes))

train_dir = '/home/arch_dan/Projects/Tensorflow-Keras/KerasCV_KerasNLP/CV_datasets/Fish_dataset/aquarium_pretrain/train/'

img_path = train_dir + 'images/'
label_path = train_dir + 'labels/'

def get_bboxes_classes(label_dir):
    classes_ = list()
    bboxes = list()

    for labels in os.listdir(label_dir):
        temp_c = []
        temp_b = []
        num_boxes = 0
        with open(os.path.join(label_dir, labels)) as file:
            temp = file.readlines()
            for cords in temp:
                temp_c.append(int(cords.split(' ')[0]))
                num_boxes += 1
                temp_b.append(num_boxes)
        classes_.append(temp_c)
        bboxes.append(len(temp_b))

    bbox = tf.ragged.constant(bboxes)
    classes = tf.ragged.constant(classes_)

    return bbox, classes

def create_dataset(image_paths, bboxes, classes):
    return tf.data.Dataset.from_tensor_slices((image_paths, bboxes, classes))

def load_image_paths(img_dir):
    image_paths = list()

    for images in os.listdir(img_dir):
        image_paths.append(os.path.join(img_dir, images))
    
    return tf.ragged.constant(image_paths)

image_paths = load_image_paths(img_path)
bboxes, classes = get_bboxes_classes(label_path)
loaded_dataset = create_dataset(image_paths, bboxes, classes)

train_data = loaded_dataset.take(int(len(os.listdir(label_path)) * 0.8))
val_data = loaded_dataset.skip(int(len(os.listdir(label_path)) * 0.8))

augmentations = keras.Sequential(layers=[
    keras_cv.layers.RandomFlip(mode="horizontal", bounding_box_format="xyxy"),
    keras_cv.layers.RandomShear(x_factor=0.2, y_factor=0.2, bounding_box_format="xyxy"),
    keras_cv.layers.JitteredResize(target_size=(640, 640), scale_factor=(0.75, 1.3), bounding_box_format="xyxy")])

resizing = keras_cv.layers.JitteredResize(target_size=(640, 640), scale_factor=(0.75, 1.3), bounding_box_format="xyxy")

def load_image(image_path):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    return image

def load_dataset(image_path, classes, bbox):
    image = load_image(image_path)
    boxes = keras_cv.bounding_box.convert_format(bbox, images=image, source="xyxy", target="xywh")
    
    bounding_boxes = {
        "classes": tf.cast(classes, dtype=tf.float64),
        "boxes": boxes,
    }
    return {"images": tf.cast(image, tf.float64), "bounding_boxes": bounding_boxes}

train_set = train_data.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
train_set = train_set.shuffle(4 * 4)
train_set = train_set.ragged_batch(4, drop_remainder=True)

val_set = val_data.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
val_set = val_set.shuffle(4 * 4)
val_set = val_set.ragged_batch(4, drop_remainder=True)

val_set = val_set.map(resizing, num_parallel_calls=tf.data.AUTOTUNE)
train_set = train_set.map(augmentations, num_parallel_calls=tf.data.AUTOTUNE)

def dict_to_tuple(inputs):
    return inputs["images"], inputs["bounding_boxes"]

train_set = train_set.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
train_set = train_set.prefetch(tf.data.AUTOTUNE)

val_set = val_set.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
val_set = val_set.prefetch(tf.data.AUTOTUNE)

backbone = keras_cv.models.YOLOV8Backbone.from_preset("yolo_v8_s_backbone_coco")

yolo = keras_cv.models.YOLOV8Detector(
    num_classes=len(class_mapping),
    bounding_box_format="xywh",
    backbone=backbone,
    fpn_depth=1)

yolo.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001, global_clipnorm=10.0),
             classification_loss="binary_crossentropy", 
             box_loss="ciou")

yolo.fit(train_set, validation_data=val_set, epochs=3)

I keep getting these errors when I run this code

  1. When I try and preform data augmentation
convert_format() expects both boxes and images to be batched, or both boxes and images to be unbatched. Received len(boxes.shape)=2, len(images.shape)=4. Expected either len(boxes.shape)=2 AND len(images.shape)=3, or len(boxes.shape)=3 AND len(images.shape)=4.

Arguments received by JitteredResize.call():
  • inputs={'images': 'tf.Tensor(shape=(4, None, None, 3), dtype=float64)', 'bounding_boxes': {'classes': 'tf.Tensor(shape=(4,), dtype=float32)', 'boxes': 'tf.Tensor(shape=(4, None), dtype=float32)'}}
  1. I ignore the data augmentation and proceed to try and train the model
Failed to convert elements of tf.RaggedTensor(values=tf.RaggedTensor(values=Tensor("yolov8_detector_1/functional_1_1/rescaling_1/Add:0", shape=(None, 3), dtype=float32), row_splits=Tensor("data_2:0", shape=(None,), dtype=int64)), row_splits=Tensor("data_1:0", shape=(5,), dtype=int64)) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.

Arguments received by Pad.call():
  • x=tf.Tensor(shape=(4, None, None, 3), dtype=float32)
  • constant_values=None

I do need assistance with this, I have tried numerous things to fix this but it does not help - Thank you