Loading .npy files using a TensorFlow dataset pipeline

Hi,
I’m training a CNN with TensorFlow and Python. My dataset is larger than the PC’s memory and is organized as follows:

The dataset contains 754,977 images cropped from 3D CT scans. Each image has shape (31, 31, 31) and is stored in .npy format in a folder named cropped_nodules; this directory contains all the .npy images (1.npy, 2.npy, etc.).

The truth table for the cropped images is a CSV file named cropped_nodules.csv with two columns (SN, state): SN is the image number (1, 2, 3, …) and state is 0 if the image has no nodule and 1 if it does.
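So the first rows of the CSV look like this (the state values shown are just illustrative):

    SN,state
    1,0
    2,1
    3,0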

I’m trying to load the data and create X_train, y_train, X_val, and y_val.
Because the dataset is imbalanced, I’m splitting it proportionally by class: the training set gets 80% of the normal nodules and 80% of the abnormal nodules, and the validation set gets the remaining 20% of each class.
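That proportional split is exactly what train_test_split(..., stratify=labels) gives you; here is a tiny self-contained check with toy labels (not the real data):

    import numpy as np
    from sklearn.model_selection import train_test_split

    labels = np.array([0] * 80 + [1] * 20)    # toy imbalanced labels: 80% class 0
    idx = np.arange(len(labels))
    idx_tr, idx_va, y_tr, y_va = train_test_split(
        idx, labels, test_size=0.2, stratify=labels
    )
    print(np.bincount(y_tr) / len(y_tr))      # [0.8 0.2] - class ratio preserved in train
    print(np.bincount(y_va) / len(y_va))      # [0.8 0.2] - and in validation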

The image shapes are not consistent: some images are smaller than (31, 31, 31) and need to be padded, and every volume also needs a trailing grayscale channel so the final shape is (31, 31, 31, 1).
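Per volume, that reshaping looks roughly like this (dummy array for illustration, zero-padding at the high end of each axis):

    import numpy as np

    vol = np.random.rand(28, 31, 30)                 # a smaller-than-target crop
    vol = np.expand_dims(vol, axis=-1)               # (28, 31, 30, 1): grayscale channel
    target = (31, 31, 31, 1)
    pad_width = [(0, max(0, t - s)) for s, t in zip(vol.shape, target)]
    vol = np.pad(vol, pad_width, mode='constant')    # zero-pad up to the target
    print(vol.shape)                                 # (31, 31, 31, 1)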

I’m stuck on a tensor-vs-Python-string problem: I can’t join the directory path with each image’s filename, so I can’t load the files:

    file_path = tf.strings.join(data_dir, image_path_str)
TypeError: Expected list for 'inputs' argument to 'string_join' Op, not <tf.Tensor 'Const:0' shape=() dtype=string>.
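The immediate cause is that tf.strings.join takes its inputs as a single Python list of string tensors, not as separate positional arguments, so that line would have to be:

    file_path = tf.strings.join([data_dir, image_path_str])

(Even with that one-line fix, np.load further down still can’t consume a symbolic tensor inside Dataset.map; see the solution below.)

Here is the full script: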
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split

csv_path = "/home/mustafa/project/LUNA16/cropped_nodules.csv"

# Read CSV file using pandas
df = pd.read_csv(csv_path)

image_names = df["SN"]
labels = df["state"].to_numpy()

X_train, X_val, y_train, y_val = train_test_split(
    image_names, labels, test_size=0.2, stratify=labels
)

def preprocess_image(image_path, label):
    data_dir = tf.convert_to_tensor('/home/mustafa/project/LUNA16/cropped_nodules/')
    image_path_str = tf.strings.as_string(image_path)
    file_path = tf.strings.join(data_dir, image_path_str)  # <-- this line raises the TypeError above
    try:
        image = np.load(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    # Add grayscale dimension
    image = np.expand_dims(image, axis=-1)
    if image.shape != (31, 31, 31, 1):
        # Padding
        pad_width = [(0, max_sz - sz) for sz, max_sz in zip(image.shape, (31, 31, 31, 1))]
        image = np.pad(image, pad_width, mode='constant')
        # Cropping
        crop = tuple(slice(sz) for sz in (31, 31, 31, 1))
        image = image[crop]

    return image, label
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

# Apply preprocessing as a map function
train_dataset = train_dataset.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)

# Optional caching and batching
train_dataset = train_dataset.cache().shuffle(buffer_size=1024).batch(32)
val_dataset = val_dataset.batch(32)


# Define the model
model = Sequential([
    Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(31, 31, 31, 1)),
    MaxPooling3D(pool_size=(2, 2, 2)),
    Conv3D(64, kernel_size=(3, 3, 3), activation='relu'),
    MaxPooling3D(pool_size=(2, 2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Model summary
model.summary()

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,  # Adjust the number of epochs based on your requirement
    verbose=1
)

Solved with a little help from ChatGPT. Besides passing a list to tf.strings.join, the key change is wrapping np.load in tf.py_function: Dataset.map traces the mapping function with symbolic tensors that NumPy can’t read, while tf.py_function runs eagerly and hands load_npy_file a concrete tensor whose value is accessible via .numpy().


def load_npy_file(file_path):
    # tf.py_function passes an eager tensor; decode it to a Python string for np.load
    image = np.load(file_path.numpy().decode('utf-8'))
    # Add grayscale channel dimension
    image = np.expand_dims(image, axis=-1)
    if image.shape != (31, 31, 31, 1):
        # Pad smaller volumes up to the target shape (max(0, ...) guards against
        # negative widths, which np.pad rejects)
        pad_width = [(0, max(0, max_sz - sz)) for sz, max_sz in zip(image.shape, (31, 31, 31, 1))]
        image = np.pad(image, pad_width, mode='constant')
        # Crop anything larger down to the target shape
        crop = tuple(slice(sz) for sz in (31, 31, 31, 1))
        image = image[crop]
    return image.astype(np.float32)  # dtype must match the Tout given to tf.py_function

def preprocess_image(image_path, label):
    image_path_str = tf.strings.as_string(image_path)   # int SN -> "SN"
    data_dir = '/home/mustafa/project/LUNA16/cropped_nodules/'
    filename = tf.strings.join([image_path_str, '.npy'])  # "SN" -> "SN.npy"
    file_path = tf.strings.join([data_dir, filename])     # full path
    # np.load can't read symbolic tensors, so wrap the loader in tf.py_function,
    # which calls load_npy_file eagerly with a concrete string tensor
    image = tf.py_function(load_npy_file, [file_path], tf.float32)
    # py_function drops static shape info; restore it for downstream layers
    image.set_shape((31, 31, 31, 1))

    return image, label
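For completeness, here is how the fixed preprocess_image slots back into the original pipeline. Note that the .cache() call from the first script is dropped: caching 754,977 float32 volumes of shape (31, 31, 31, 1) would need roughly 90 GB, which defeats the point of streaming a larger-than-memory dataset.

    train_dataset = (
        tf.data.Dataset.from_tensor_slices((X_train.to_numpy(), y_train))
        .map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
        .shuffle(buffer_size=1024)
        .batch(32)
        .prefetch(tf.data.AUTOTUNE)
    )
    val_dataset = (
        tf.data.Dataset.from_tensor_slices((X_val.to_numpy(), y_val))
        .map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(32)
    )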