Fails when the dataframe has over 60 000 rows

I am trying to predict "site" (the RelatedSite column) from emails.

The dataset has 100 000 rows in total; each row corresponds to an incoming email.

The problem: The script fails when the dataframe has over 60 000 rows.

If I limit it to 50 000 rows:

print(sys.getsizeof(df[df['Workflow'] == "Sales"])/1000000) # Output: 36 (I believe this is in megabytes)

Thanks in advance!

Code:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
import datetime
import tensorflow as tf
import time

def elapsed_time():
    elapsed_time = time.time() - start_time
    hours, remainder = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours)}h {int(minutes)}m {int(seconds):02d}s elapsed - "

start_time = time.time()
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

print(elapsed_time(), "Starting up...")
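# NOTE: df is assumed to be loaded earlier; that part of the script is not shown in the post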
# Select the columns you want to use as input and output
X = df[df['Workflow'] == "Sales"][['emailBody', 'emailSubject', 'emailFrom', 'emailTo', 'emailCc', 'Attachments']]
y = df[df['Workflow'] == "Sales"]['RelatedSite']

# Concatenate the input columns into a single string
X = X.astype(str).apply(lambda x: '|'.join(x), axis=1)


print(elapsed_time(), "Vectorizing...")
# Use CountVectorizer or TfidfVectorizer to vectorize the input data
vectorizer = CountVectorizer()
# vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

print(elapsed_time(), "Todense...")
# Convert the sparse matrix to a dense matrix
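# (note: this materializes all n_rows * n_features float64 values in RAM, so memory grows quickly with row count)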
X = X.todense()

print(elapsed_time(), "Encoder...")
# Use LabelEncoder and OneHotEncoder to one-hot encode the output data
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = to_categorical(y)

print(elapsed_time(), "Splitting dataset...")
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(elapsed_time(), "Defining model...")
# Define the model
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))


print(elapsed_time(), "Starting to compile")
# Compile the model with a loss function and an optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

tf.compat.v1.Session()
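# (note: this v1 Session object is never used and has no effect on the eager-mode Keras calls below)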

print(elapsed_time(), "Starting to fit")
# Train the model on the training data
model.fit(X_train, y_train, epochs=10, batch_size=50)


print(elapsed_time(), "Starting to evaluate")
# Evaluate the model on the test data
scores = model.evaluate(X_test, y_test)
print("Accuracy: %.2f%%" % (scores[1]*100))
print(elapsed_time(), "Done!")

Result when the dataset is 50 000 rows:

0h 0m 00s elapsed - Starting up...
0h 0m 00s elapsed - Vectorizing...
0h 0m 00s elapsed - Todense...
0h 0m 01s elapsed - Encoder...
0h 0m 01s elapsed - Splitting dataset...
0h 0m 08s elapsed - Defining model...
0h 0m 12s elapsed - Starting to compile
0h 0m 12s elapsed - Starting to fit
Epoch 1/10
345/345 [==============================] - 8s 12ms/step - loss: 6.0031 - accuracy: 0.0872
Epoch 2/10
345/345 [==============================] - 4s 11ms/step - loss: 3.8441 - accuracy: 0.2882
Epoch 3/10
345/345 [==============================] - 4s 12ms/step - loss: 2.6642 - accuracy: 0.4973
Epoch 4/10
345/345 [==============================] - 5s 14ms/step - loss: 1.8110 - accuracy: 0.6610
Epoch 5/10
345/345 [==============================] - 5s 13ms/step - loss: 1.2229 - accuracy: 0.7679
Epoch 6/10
345/345 [==============================] - 5s 13ms/step - loss: 0.8320 - accuracy: 0.8392
Epoch 7/10
345/345 [==============================] - 5s 13ms/step - loss: 0.5779 - accuracy: 0.8899
Epoch 8/10
345/345 [==============================] - 5s 13ms/step - loss: 0.4216 - accuracy: 0.9254

0h 1m 37s elapsed - Starting to evaluate
135/135 [==============================] - 2s 11ms/step - loss: 2.9308 - accuracy: 0.5846
Accuracy: 58.46%
0h 1m 41s elapsed - Done!

Result when the dataset is 70 000 rows:

0h 0m 00s elapsed - Starting up...
0h 0m 00s elapsed - Vectorizing...
0h 0m 01s elapsed - Todense...
0h 0m 02s elapsed - Encoder...
0h 0m 02s elapsed - Splitting dataset...
0h 0m 55s elapsed - Defining model...
0h 1m 01s elapsed - Starting to compile
0h 1m 01s elapsed - Starting to fit
---------------------------------------------------------------------------
InternalError                             Traceback (most recent call last)
in <module>
     64 print(elapsed_time(), "Starting to fit")
     65 # Train the model on the training data
---> 66 model.fit(X_train, y_train, epochs=10, batch_size=50)
     67 
     68 

c:\Python310\lib\site-packages\keras\utils\traceback_utils.py in error_handler(*args, **kwargs)
     68             # To get the full stack trace, call:
     69             # `tf.debugging.disable_traceback_filtering()`
---> 70             raise e.with_traceback(filtered_tb) from None
     71         finally:
     72             del filtered_tb

c:\Python310\lib\site-packages\tensorflow\python\framework\constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
    100       dtype = dtypes.as_dtype(dtype).as_datatype_enum
    101   ctx.ensure_initialized()
--> 102   return ops.EagerTensor(value, ctx.device_name, dtype)
    103 
    104 

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

Some troubleshooting information:

DriverVersion Name
31.0.15.2727 NVIDIA RTX A5500 Laptop GPU
31.0.101.3512 Intel(R) UHD Graphics

platform.python_version(): 3.10.9
WARNING:tensorflow:From :25: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.config.list_physical_devices('GPU') instead.

tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None): True
tf.test.is_gpu_available(): True
tf.test.gpu_device_name(): /device:GPU:0
tf.config.list_physical_devices('GPU'): [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
tf.test.is_built_with_gpu_support(): True
tf.test.is_built_with_cuda(): True
tf.config.experimental.list_physical_devices(): [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

tensorflow version: 2.10.1
Cuda Version: 64_112
Cudnn version: 64_8

incarnation: 14028481689049539009
physical_device_desc: "device: 0, name: NVIDIA RTX A5500 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]

@patchie,

Welcome to the Tensorflow Forum!

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

You are running out of memory on the GPU device, so TensorFlow is unable to copy the input tensor to it. The dense matrix produced by todense() is the main culprit here.
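For scale, a rough estimate using the shapes printed later in this thread:

rows, vocab = 42_909, 112_209   # len(X) and X.shape[1] from the updated run below
print(rows * vocab * 8 / 1e9)   # ~38.5 GB for a dense float64 matrix

That is far more memory than any laptop GPU has.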

To resolve the OOM error, there are a couple of things you can try, as shown below:

i) Limit GPU memory usage, in one of two ways (see the sketch after this list):

a) Turn on memory growth by calling tf.config.experimental.set_memory_growth, which starts with a small allocation and grows it as the process demands more memory.
b) Set a hard limit on total GPU memory with tf.config.set_logical_device_configuration, passing a LogicalDeviceConfiguration(memory_limit=1024).

ii) Reduce the batch size for training.
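Here is a minimal sketch of both options (assuming a single GPU at index 0; this must run at the very start of the program, before TensorFlow initializes the GPU):

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Option a) grow GPU memory on demand instead of reserving it all up front
    tf.config.experimental.set_memory_growth(gpus[0], True)
    # Option b) alternatively, cap total GPU memory (here at 1024 MB); note the
    # call takes the device plus a list of LogicalDeviceConfiguration objects:
    # tf.config.set_logical_device_configuration(
    #     gpus[0], [tf.config.LogicalDeviceConfiguration(memory_limit=1024)])

The two options are alternatives; pick one, and set it before any other TensorFlow call touches the GPU.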

Can you try and let us know if the issue continues?

Thank you!

If you are using an IDE, restarting it can also resolve the problem. Thank you!

Hi, I have now added all three of your suggestions, but it still fails.

Check the updated script here:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
import datetime
import tensorflow as tf
import time
from sklearn.feature_extraction.text import HashingVectorizer

tf.config.LogicalDeviceConfiguration(memory_limit=1024)
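# (note: the line above only constructs a configuration object; it is never applied to a device, so it has no effect)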

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

def elapsed_time():
    elapsed_time = time.time() - start_time
    hours, remainder = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours)}h {int(minutes)}m {int(seconds):02d}s elapsed - "

start_time = time.time()
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

print(elapsed_time(), "Starting up...")
# Select the columns you want to use as input and output
X = df[df['Workflow'] == "Sales"][['emailBody', 'emailSubject', 'emailFrom', 'emailTo', 'emailCc', 'Attachments']]
y = df[df['Workflow'] == "Sales"]['RelatedSite']

print("len(X):", len(X))
print("len(y):", len(y))

# Concatenate the input columns into a single string
X = X.astype(str).apply(lambda x: '|'.join(x), axis=1)


print(elapsed_time(), "Vectorizing...")
# Use CountVectorizer or TfidfVectorizer to vectorize the input data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)
#vectorizer = HashingVectorizer(ngram_range=(1,1))
#X = vectorizer.fit_transform(X)

print(elapsed_time(), "Todense...")
# Convert the sparse matrix to a dense matrix
X = X.todense()

print(elapsed_time(), "Encoder...")
# Use LabelEncoder and OneHotEncoder to one-hot encode the output data
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = to_categorical(y)

print(elapsed_time(), "Splitting dataset...")
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("X.shape[1]:", X.shape[1])
print("y.shape[1]:", y.shape[1])

print(elapsed_time(), "Defining model...")
# Define the model
model = Sequential()
model.add(Dense(500, input_dim=X.shape[1], activation='relu',))
model.add(Dense(500))
model.add(Dense(y.shape[1], activation='softmax'))


print(elapsed_time(), "Starting to compile")
# Compile the model with a loss function and an optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

tf.compat.v1.Session()

print(elapsed_time(), "Starting to fit")
# Train the model on the training data
model.fit(X_train, y_train, epochs=10, batch_size=5)


print(elapsed_time(), "Starting to evaluate")
# Evaluate the model on the test data
scores = model.evaluate(X_test, y_test)
print("Accuracy: %.2f%%" % (scores[1]*100))
print(elapsed_time(), "Done!")

And the updated output:

2023-01-10 15:54:14
0h 0m 00s elapsed -  Starting up...
len(X): 42909
len(y): 42909
0h 0m 00s elapsed -  Vectorizing...
0h 0m 01s elapsed -  Todense...
0h 0m 03s elapsed -  Encoder...
0h 0m 03s elapsed -  Splitting dataset...
X.shape[1]: 112209
y.shape[1]: 2996
0h 3m 25s elapsed -  Defining model...
0h 3m 27s elapsed -  Starting to compile
0h 3m 27s elapsed -  Starting to fit
---------------------------------------------------------------------------
InternalError                             Traceback (most recent call last)
<ipython-input-3-d55e30511bf4> in <module>
     78 print(elapsed_time(), "Starting to fit")
     79 # Train the model on the training data
---> 80 model.fit(X_train, y_train, epochs=10, batch_size=5)
     81 
     82 

c:\Python310\lib\site-packages\keras\utils\traceback_utils.py in error_handler(*args, **kwargs)
     68             # To get the full stack trace, call:
     69             # `tf.debugging.disable_traceback_filtering()`
---> 70             raise e.with_traceback(filtered_tb) from None
     71         finally:
     72             del filtered_tb

c:\Python310\lib\site-packages\tensorflow\python\framework\constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
    100       dtype = dtypes.as_dtype(dtype).as_datatype_enum
    101   ctx.ensure_initialized()
--> 102   return ops.EagerTensor(value, ctx.device_name, dtype)
    103 
    104 

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

@patchie,

Can you try with batch size 1?

If the issue continues even then, it is likely because model.fit first converts the whole X_train matrix into a single tensor before batching starts; with X.shape[1] = 112209 features, that dense float64 tensor alone runs to tens of gigabytes. Try loading the dataframe into a TensorFlow dataset instead, following Load a pandas DataFrame | TensorFlow Core, so that only one batch at a time reaches the GPU, as in the sketch below.
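Here is a minimal sketch of that approach (the float32 casts are an assumption to halve memory; X_train and y_train come from your script above):

import numpy as np
import tensorflow as tf

# Wrap the arrays in a tf.data pipeline: the full array stays in host RAM as
# one constant, and only one batch at a time is copied to the GPU per step.
train_ds = (
    tf.data.Dataset.from_tensor_slices(
        (np.asarray(X_train, dtype=np.float32), y_train.astype(np.float32)))
    .shuffle(1024)
    .batch(50)
)
model.fit(train_ds, epochs=10)

Keeping X sparse (skipping the todense() call) and densifying one batch at a time, for example with a generator, would reduce host memory further, but the pipeline above follows the linked tutorial's approach.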

Thank you!