Thanks for your help @Laxma_Reddy_Patlolla !
While awaiting an answer, I implemented functions that do something similar to your suggestion 1. It wasn’t as bad as I thought it would be.
I’ll check into SMOTE, as it sounds like something I need. Thanks!
def shuffle_pairs(X, y):
    """
    Reorder two arrays with one shared random permutation of their first
    dimension, so X[i] and y[i] still describe the same example afterwards.

    The permutation is drawn from NumPy's global random state.

    PARAMETERS:
        X: ndarray
            A three-dimensional array of examples of the form (examples, times, channels)
        y: ndarray
            A one-dimensional array of labels of the form (examples,)

    RETURNS:
        X, y
    """
    n_examples = X.shape[0]
    assert n_examples == y.shape[0]

    # A single index array is applied to both inputs to keep pairs aligned.
    order = np.random.permutation(n_examples)
    shuffled_X = X[order]
    shuffled_y = y[order]
    return shuffled_X, shuffled_y
def train_validate_test_split(X, y, validate_size=0.1, test_size=0.1, random_state=None):
    """
    Divide examples and labels into train, validate, and test datasets, while maintaining
    the proportion of classes in labels.

    For each class, the first examples (in input order) go to train, the next to
    validate, and the last to test. Each resulting dataset is then shuffled.
    Per-class validate/test counts are rounded down, so any remainder goes to train.
    The outputs keep the dtypes of X and y. A summary of the split is printed.

    PARAMETERS:
        X: ndarray
            An array of examples whose first dimension indexes examples,
            e.g. of the form (examples, times, channels)
        y: ndarray
            A one-dimensional array of labels of the form (examples,)
        validate_size: float
            Proportion of examples to use for validation (default 0.1)
        test_size: float
            Proportion of examples to use for test (default 0.1)
        random_state: int or None
            Seed for the final shuffles. When None (default), NumPy's global
            random state is used.

    RETURNS:
        X_train, X_validate, X_test, y_train, y_validate, y_test

    RAISES:
        ValueError: if X and y differ in length, y is not one-dimensional, or
            the requested proportions are negative or sum to more than 1.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if y.ndim != 1:
        raise ValueError(f"y must be one-dimensional, got shape {y.shape}")
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of examples, got {len(X)} and {len(y)}"
        )
    if validate_size < 0 or test_size < 0 or validate_size + test_size > 1:
        raise ValueError(
            "validate_size and test_size must be non-negative and sum to at most 1, "
            f"got {validate_size} and {test_size}"
        )

    # A private RandomState keeps seeded calls from touching the global RNG.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    labels, counts = np.unique(y, return_counts=True)

    # Collect row indices per split instead of repeatedly concatenating data;
    # X and y are then each indexed exactly once, which also preserves dtypes.
    train_parts, validate_parts, test_parts = [], [], []
    # Pair each label with its own count: labels need not be 0..k-1, so the
    # label value must not be used as a position in `counts`.
    for label, count in zip(labels, counts):
        num_validate = int(count * validate_size)
        num_test = int(count * test_size)
        num_train = count - num_validate - num_test
        class_idx = np.flatnonzero(y == label)
        tr, va, te = np.split(class_idx, [num_train, num_train + num_validate])
        train_parts.append(tr)
        validate_parts.append(va)
        test_parts.append(te)

    splits = []
    for parts in (train_parts, validate_parts, test_parts):
        idx = np.concatenate(parts) if parts else np.empty(0, dtype=np.intp)
        # One permutation per split, applied to X and y alike, keeps pairs aligned.
        idx = idx[rng.permutation(len(idx))]
        splits.append((X[idx], y[idx]))
    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = splits

    print(
        f"X_train.shape = {X_train.shape}, X_validate.shape = {X_validate.shape}, X_test.shape = {X_test.shape}"
    )
    print(f"There are {len(labels)} unique classes")
    print(list(zip(labels, counts)))
    print(f"y_train {np.unique(y_train, return_counts=True)}")
    print(f"y_validate {np.unique(y_validate, return_counts=True)}")
    print(f"y_test {np.unique(y_test, return_counts=True)}")
    return X_train, X_validate, X_test, y_train, y_validate, y_test