I am trying to predict “site” from emails.
The dataset has 100 000 rows in total, each row is related to an incoming email.
The problem: The script fails when the dataframe has over 60 000 rows.
If i limit to 50 000 rows:
print(sys.getsizeof(df[df[‘Workflow’] == “Sales”])/1000000) # Output 36 (i believe this is in megabytes)
Thanks in advance!
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
import datetime
import tensorflow as tf
import time
def elapsed_time():
elapsed_time = time.time() - start_time
hours, remainder = divmod(elapsed_time, 3600)
minutes, seconds = divmod(remainder, 60)
return f"{int(hours)}h {int(minutes)}m {int(seconds):02d}s elapsed - "
start_time = time.time()
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print(elapsed_time(), "Starting up...")
# Select the columns you want to use as input and output
X = df[df['Workflow'] == "Sales"][['emailBody', 'emailSubject', 'emailFrom', 'emailTo', 'emailCc', 'Attachments']]
y = df[df['Workflow'] == "Sales"]['RelatedSite']
# Concatenate the input columns into a single string
X = X.astype(str).apply(lambda x: '|'.join(x), axis=1)
print(elapsed_time(), "Vectorizing...")
# Use CountVectorizer or TfidfVectorizer to vectorize the input data
vectorizer = CountVectorizer()
# vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
print(elapsed_time(), "Todense...")
# Convert the sparse matrix to a dense matrix
X = X.todense()
print(elapsed_time(), "Encoder...")
# Use LabelEncoder and OneHotEncoder to one-hot encode the output data
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = to_categorical(y)
print(elapsed_time(), "Splitting dataset...")
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(elapsed_time(), "Defining model...")
# Define the model
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))
print(elapsed_time(), "Starting to compile")
# Compile the model with a loss function and an optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(elapsed_time(), "Starting to fit")
# Train the model on the training data
model.fit(X_train, y_train, epochs=10, batch_size=50)
print(elapsed_time(), "Starting to evaluate")
# Evaluate the model on the test data
scores = model.evaluate(X_test, y_test)
print("Accuracy: %.2f%%" % (scores[1]*100))
print(elapsed_time(), "Done!")
Result when the dataset is 50 000 rows:
0h 0m 00s elapsed - Starting up…
0h 0m 00s elapsed - Vectorizing…
0h 0m 00s elapsed - todense…
0h 0m 01s elapsed - encoder…
0h 0m 01s elapsed - splitting dataset…
0h 0m 08s elapsed - defining model…
0h 0m 12s elapsed - starting to compile
0h 0m 12s elapsed - starting to fit
Epoch 1/10
345/345 [==============================] - 8s 12ms/step - loss: 6.0031 - accuracy: 0.0872
Epoch 2/10
345/345 [==============================] - 4s 11ms/step - loss: 3.8441 - accuracy: 0.2882
Epoch 3/10
345/345 [==============================] - 4s 12ms/step - loss: 2.6642 - accuracy: 0.4973
Epoch 4/10
345/345 [==============================] - 5s 14ms/step - loss: 1.8110 - accuracy: 0.6610
Epoch 5/10
345/345 [==============================] - 5s 13ms/step - loss: 1.2229 - accuracy: 0.7679
Epoch 6/10
345/345 [==============================] - 5s 13ms/step - loss: 0.8320 - accuracy: 0.8392
Epoch 7/10
345/345 [==============================] - 5s 13ms/step - loss: 0.5779 - accuracy: 0.8899
Epoch 8/10
345/345 [==============================] - 5s 13ms/step - loss: 0.4216 - accuracy: 0.9254
0h 1m 37s elapsed - starting to evaluate
135/135 [==============================] - 2s 11ms/step - loss: 2.9308 - accuracy: 0.5846
Accuracy: 58.46%
0h 1m 41s elapsed - Done!
Result when the dataset is 70 000 rows:
0h 0m 00s elapsed - Starting up…
0h 0m 00s elapsed - Vectorizing…
0h 0m 01s elapsed - Todense…
0h 0m 02s elapsed - Encoder…
0h 0m 02s elapsed - Splitting dataset…
0h 0m 55s elapsed - Defining model…
0h 1m 01s elapsed - Starting to compile
0h 1m 01s elapsed - Starting to fit
InternalError Traceback (most recent call last)
64 print(elapsed_time(), “Starting to fit”)
65 # Train the model on the training data
—> 66 model.fit(X_train, y_train, epochs=10, batch_size=50)
c:\Python310\lib\site-packages\keras\utils\traceback_utils.py in error_handler(*args, **kwargs)
68 # To get the full stack trace, call:
69 #tf.debugging.disable_traceback_filtering()
—> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
c:\Python310\lib\site-packages\tensorflow\python\framework\constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
100 dtype = dtypes.as_dtype(dtype).as_datatype_enum
101 ctx.ensure_initialized()
→ 102 return ops.EagerTensor(value, ctx.device_name, dtype)
InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.
Some troubleshooting information:
DriverVersion Name NVIDIA RTX A5500 Laptop GPU Intel(R) UHD Graphicsplatform.python_version(): 3.10.9
WARNING:tensorflow:From :25: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.
Instructions for updating:
instead.tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None): True
tf.test.is_gpu_available(): True
tf.test.gpu_device_name(): /device:GPU:0
tf.config.list_physical_devices(‘GPU’): [PhysicalDevice(name=‘/physical_device:GPU:0’, device_type=‘GPU’)]
tf.test.is_built_with_gpu_support(): True
tf.test.is_built_with_cuda(): True
tf.config.experimental.list_physical_devices(): [PhysicalDevice(name=‘/physical_device:CPU:0’, device_type=‘CPU’), PhysicalDevice(name=‘/physical_device:GPU:0’, device_type=‘GPU’)]tensorflow version: 2.10.1
Cuda Version: 64_112
Cudnn version: 64_8
incarnation: 14028481689049539009
physical_device_desc: “device: 0, name: NVIDIA RTX A5500 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6”
xla_global_id: 416903419