Hi,
I’ve been working on a project that uses TensorFlow, and I have a lot of data. So much, in fact, that if I train on the full data file, it spits out this error:
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 135. GiB for an array with shape (2243467, 16155) and data type float32
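If I’m reading the shape right, that 135 GiB array is the one-hot target matrix built by the to_categorical call in my train method (shown further down): one row per training window, one column per vocabulary word. At least the numbers from the traceback multiply out to exactly that size:

# rough size check, using the numbers from the error message
num_sequences = 2_243_467   # rows: one training window per character position
vocab_size = 16_155         # columns: one per word in the vocabulary
print(num_sequences * vocab_size * 4 / 2**30)   # float32 is 4 bytes -> ~135 GiB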
So I split the training data into 600 files (each is roughly 65 lines of data), and plan to train it like this:
Model = TextGenerator()
Model.load()
for i in range(600):
    Model.train(f"training/data{i + 1}.txt")
Model.save()
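In case the splitting part matters: it’s nothing clever, just naive chunking, roughly along these lines (the full_data.txt name is a placeholder for my real file):

# rough sketch of the split: ~65 lines per chunk, written to training/data1.txt, data2.txt, ...
lines_per_file = 65
with open("full_data.txt", encoding="utf-8") as f:
    lines = f.readlines()
for i in range(0, len(lines), lines_per_file):
    with open(f"training/data{i // lines_per_file + 1}.txt", "w", encoding="utf-8") as out:
        out.writelines(lines[i:i + lines_per_file])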
I haven’t run the training task yet because I’m concerned about my current save/load implementation. I’m most worried it will fail to save or load the model once it exceeds the 16 GB of memory the machine is allocated (no, I can’t increase that). This is my current implementation of the model:
import numpy as np
import tensorflow as tf


class TextGenerator:
    def __init__(self, sequence_length=100, batch_size=128, embedding_dim=256, rnn_units=1024):
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.embedding_dim = embedding_dim
        self.rnn_units = rnn_units
        self.model = None
        self.tokenizer = None
    def train(self, file_path):
        # Load and preprocess text data
        text = open(file_path, 'rb').read().decode(encoding='utf-8', errors="ignore")
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=False)
        self.tokenizer.fit_on_texts([text])
        total_words = len(self.tokenizer.word_index) + 1

        # Create training sequences: one window per character position in the text
        sequences = []
        for i in range(self.sequence_length, len(text)):
            seq = text[i - self.sequence_length:i]
            sequences.append(seq)

        input_sequences = self.tokenizer.texts_to_sequences(sequences)
        input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=self.sequence_length, padding='pre')
        input_sequences = np.array(input_sequences)
        inputs, targets = input_sequences[:, :-1], input_sequences[:, -1]
        # One-hot encode the targets: a (num_sequences, total_words) float32 array
        targets = tf.keras.utils.to_categorical(targets, num_classes=total_words)

        self.model = tf.keras.Sequential([
            tf.keras.layers.Embedding(total_words, self.embedding_dim, input_length=self.sequence_length - 1),
            tf.keras.layers.LSTM(self.rnn_units),
            tf.keras.layers.Dense(total_words, activation='softmax')
        ])
        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        self.model.fit(inputs, targets, epochs=10, batch_size=self.batch_size)
        print(self.generate_text("hello"))  # test it
    def generate_text(self, seed_text, num_words=50):
        for _ in range(num_words):
            token_list = self.tokenizer.texts_to_sequences([seed_text])[0]
            token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=self.sequence_length - 1, padding='pre')
            # Sample the next word index from the predicted probability distribution
            probabilities = self.model.predict(token_list, verbose=0)[0]
            predicted_index = np.random.choice(len(probabilities), p=probabilities)
            output_word = self.tokenizer.index_word[predicted_index]
            seed_text += " " + output_word
        return seed_text
    def save(self):
        self.model.save_weights("trained_text_generator_model.h5", overwrite=False)

    def load(self):
        self.model.load_weights("trained_text_generator_model.h5")
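For what it’s worth, I also tried a back-of-envelope count of how big the saved weights file should be, assuming the vocabulary stays around the 16,155 words implied by the error message (I’m not sure this is the right way to reason about it, so please correct me if not):

# rough parameter count for the model above, assuming vocab stays ~16155
vocab = 16155
embedding_dim = 256
rnn_units = 1024
embedding_params = vocab * embedding_dim                                  # ~4.1M
lstm_params = 4 * ((embedding_dim + rnn_units) * rnn_units + rnn_units)   # ~5.2M
dense_params = rnn_units * vocab + vocab                                  # ~16.6M
total = embedding_params + lstm_params + dense_params
print(total, total * 4 / 2**20)   # ~26M parameters, ~99 MiB as float32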
Please forgive me if it’s messy or unoptimized; this is my first TF project.
With all that being said, what’s the best way to run this given that I don’t have 150 GB of memory to spare? I’d like to keep the full training set (and add to it later), but save_weights and load_weights seem like they might make the error crop up again. Thank you!