I am using librosa together with TensorFlow. Specifically, I use a Keras Sequence generator in which I convert audio .wav files into log-mel spectrograms plus their delta and delta-delta features.
I have a model consisting of CNN layers.
I run the script in kaggle.
However, my model does not use the GPU. It runs on CPU and is extremely slow.
Here is the code, if anyone can help.
class BerGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that loads .wav files and yields batches of
    224x224x3 feature maps (log-mel spectrogram, delta, delta-delta
    stacked as channels) together with their labels.

    Args:
        X_train: array-like of audio file names (relative to the Kaggle
            input directory below).
        y_train: array-like of labels aligned with X_train.
        batch_size: number of audio files per batch.
    """

    def __init__(self, X_train, y_train, batch_size=16):
        self.batch_size = batch_size
        self.X_train = X_train
        self.y_train = y_train
        self.indexes = np.arange(len(self.X_train))
        self.on_epoch_end()

    def __len__(self):
        # Number of full batches per epoch; a trailing partial batch is dropped.
        return len(self.X_train) // self.batch_size

    def __getitem__(self, idx):
        """Return one batch: (X of shape [batch_size, 224, 224, 3], labels)."""
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_audios = self.X_train[indexes]
        batch_labels = self.y_train[indexes]
        X = np.empty([self.batch_size, 224, 224, 3])
        # BUG FIX: the counter was reset to 0 inside the loop, so every
        # sample overwrote X[0] and the rest of the batch was left as
        # uninitialized np.empty garbage. enumerate() fills each slot once.
        for i, audio in enumerate(batch_audios):
            path = '../input/audio-files/audio_files/' + str(audio)
            x, sr = librosa.load(path)
            ps = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=224, hop_length=512)
            ps_db = librosa.power_to_db(ps, ref=np.max)  # log-mel spectrogram, 2d [n_mels, t]
            delta = librosa.feature.delta(ps_db)          # delta (2d)
            delta2 = librosa.feature.delta(ps_db, order=2)  # delta-delta (2d)
            ps_db = tf.expand_dims(ps_db, axis=-1)   # 3d [n_mels, t, 1]
            delta = tf.expand_dims(delta, axis=-1)   # 3d
            delta2 = tf.expand_dims(delta2, axis=-1)  # 3d
            final_map = tf.concat([ps_db, delta, delta2], axis=-1)
            resized_map = tf.image.resize(final_map, [224, 224]).numpy()  # [224, 224, 3]
            X[i, :, :, :] = resized_map
        return np.array(X), np.array(batch_labels)

    def on_epoch_end(self):
        # Shuffle indexes after each epoch.
        # NOTE(review): a fresh RandomState(42) is created every epoch, so the
        # SAME permutation is applied each time — the order does cycle
        # deterministically rather than being freshly random. Consider keeping
        # one RandomState instance on self if truly new shuffles are wanted.
        np.random.RandomState(42).shuffle(self.indexes)
# Extract feature (file names) and label columns as NumPy arrays,
# then configure 10-fold stratified cross-validation.
X = np.array(dataset['audio'].tolist())
y = np.array(dataset['label'].tolist())
# Alternative considered: RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
skf = StratifiedKFold(n_splits=10)
def create_model():
    """Build a small CNN binary classifier over 224x224x3 feature maps.

    Two Conv2D stacks with max-pooling and dropout, a global max-pool,
    two batch-normalized dense layers, and a single sigmoid output.
    """
    inputs = Input(shape=(224, 224, 3,), dtype="float32")

    # First convolutional stage: wide (3x7) kernels, then pool/dropout.
    x = Conv2D(16, kernel_size=(3, 7), activation=activations.relu)(inputs)
    x = Conv2D(16, kernel_size=(3, 7), activation=activations.relu)(x)
    x = MaxPool2D(pool_size=(3, 7))(x)
    x = Dropout(rate=0.1)(x)

    # Second convolutional stage: square 3x3 kernels.
    x = Conv2D(32, kernel_size=3, activation=activations.relu)(x)
    x = Conv2D(32, kernel_size=3, activation=activations.relu)(x)
    x = MaxPool2D(pool_size=(3, 3))(x)
    x = Dropout(rate=0.1)(x)

    # Final feature extraction, collapsed to a vector by global max-pooling.
    x = Conv2D(128, kernel_size=3, activation=activations.relu)(x)
    x = GlobalMaxPool2D()(x)
    x = Dropout(rate=0.1)(x)

    # Classifier head.
    head = BatchNormalization()(Dense(128, activation=activations.relu)(x))
    head = BatchNormalization()(Dense(128, activation=activations.relu)(head))
    outputs = Dense(1, activation='sigmoid')(head)

    return Model(inputs=inputs, outputs=outputs)
# Per-fold metric accumulators (populated elsewhere, presumably — they are
# declared here but not appended to in this loop; verify against later code).
accuracy = []
precision = []
recall = []
auroc = []
specificity = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Split off this fold's test set, then carve a 20% eval set from train.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    X_train, X_eval, y_train, y_eval = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42)
    print("Currently on fold: {}".format(fold))

    model = create_model()
    # Metric order matters below: index 1=acc, 2=recall, 3=precision,
    # 4=auc, 5=true negatives, 6=false positives.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss="binary_crossentropy",
        metrics=["acc", tf.keras.metrics.Recall(), tf.keras.metrics.Precision(),
                 tf.keras.metrics.AUC(), tf.keras.metrics.TrueNegatives(),
                 tf.keras.metrics.FalsePositives()],
    )

    train_data = BerGenerator(X_train, y_train, batch_size=2)
    eval_data = BerGenerator(X_eval, y_eval, batch_size=2)
    stoppers = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', mode='min',
                                             factor=0.1, patience=3),
    ]
    history = model.fit(train_data, epochs=1000, verbose=1,
                        class_weight=class_weight_function(y_train),
                        validation_data=eval_data, callbacks=stoppers)

    # Evaluate on the held-out fold; last term is specificity = TN / (TN + FP).
    test_data = BerGenerator(X_test, y_test, batch_size=4)
    result = model.evaluate(test_data, verbose=0)
    print(result[1], result[2], result[3], result[4], result[5] / (result[5] + result[6]))
I get the following log output (these are informational messages, not warnings — they show the CUDA libraries being loaded, yet training still runs on the CPU):
2021-10-04 20:32:47.111506: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-10-04 20:32:47.115770: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2000144999 Hz
2021-10-04 20:32:48.883555: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-10-04 20:32:49.779140: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2021-10-04 20:32:53.569330: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
Thank you.