Hi, I’m trying to get a custom spectrogram layer going and I can’t get it to work. This is my layer:
import tensorflow as tf
import tensorflow_io as tfio

# MODEL_SR is the sample-rate constant I define elsewhere in my code

class MelLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=MODEL_SR,
        num_mel_channels=80,
        freq_min=1,
        freq_max=7600,
        as_3D_tensor=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max
        # Mel filterbank; it will be multiplied with the STFT output
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=self.frame_length // 2 + 1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )
        self.as_3D_tensor = as_3D_tensor

    def call(self, audio, training=True):
        stft = tf.signal.stft(
            tf.squeeze(audio),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )
        # Take the magnitude of the STFT output
        magnitude = tf.abs(stft)
        # Apply the mel filterbank to the power spectrogram, then convert to dB scale
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)
        print(type(log_mel_spec))
        return tf.expand_dims(log_mel_spec, axis=-1) if self.as_3D_tensor else tf.squeeze(log_mel_spec)

    def get_config(self):
        config = super(MelLayer, self).get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config
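Here is a quick eager check on dummy data to see what shape the layer produces on its own (sketch only; the dummy batch and parameter values are made up):

# Eager sanity check of MelLayer on a made-up batch
dummy_audio = tf.random.normal((4, 32000, 1))  # (batch, samples, channels)
mel_out = MelLayer(frame_length=512,
                   frame_step=512,
                   sampling_rate=16000,
                   num_mel_channels=80,
                   freq_min=1,
                   freq_max=2000)(dummy_audio)
print(mel_out.shape)  # concrete shape when the layer runs eagerly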
For comparison, here is another implementation of a log-mel spectrogram layer that I have been looking at:

class LogMelSpectrogram(tf.keras.layers.Layer):
"""Compute log-magnitude mel-scaled spectrograms."""
def __init__(self, sample_rate, fft_size, hop_size, n_mels,
f_min=0.0, f_max=None, **kwargs):
super(LogMelSpectrogram, self).__init__(**kwargs)
self.sample_rate = sample_rate
self.fft_size = fft_size
self.hop_size = hop_size
self.n_mels = n_mels
self.f_min = f_min
self.f_max = f_max if f_max else sample_rate / 2
self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
num_mel_bins=self.n_mels,
num_spectrogram_bins=fft_size // 2 + 1,
sample_rate=self.sample_rate,
lower_edge_hertz=self.f_min,
upper_edge_hertz=self.f_max)
def build(self, input_shape):
self.non_trainable_weights.append(self.mel_filterbank)
super(LogMelSpectrogram, self).build(input_shape)
def call(self, waveforms):
"""Forward pass.
Parameters
----------
waveforms : tf.Tensor, shape = (None, n_samples)
A Batch of mono waveforms.
Returns
-------
log_mel_spectrograms : (tf.Tensor), shape = (None, time, freq, ch)
The corresponding batch of log-mel-spectrograms
"""
def _tf_log10(x):
numerator = tf.math.log(x)
denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
return numerator / denominator
def power_to_db(magnitude, amin=1e-16, top_db=80.0):
"""
https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
"""
ref_value = tf.reduce_max(magnitude)
log_spec = 10.0 * _tf_log10(tf.maximum(amin, magnitude))
log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref_value))
log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)
return log_spec
spectrograms = tf.signal.stft(waveforms,
frame_length=self.fft_size,
frame_step=self.hop_size,
pad_end=False)
magnitude_spectrograms = tf.abs(spectrograms)
mel_spectrograms = tf.matmul(tf.square(magnitude_spectrograms),
self.mel_filterbank)
log_mel_spectrograms = power_to_db(mel_spectrograms)
# add channel dimension
log_mel_spectrograms = tf.expand_dims(log_mel_spectrograms, 3)
return log_mel_spectrograms
def get_config(self):
config = {
'fft_size': self.fft_size,
'hop_size': self.hop_size,
'n_mels': self.n_mels,
'sample_rate': self.sample_rate,
'f_min': self.f_min,
'f_max': self.f_max,
}
config.update(super(LogMelSpectrogram, self).get_config())
return config
When I try to build the model:

input = tf.keras.layers.Input(shape=(32000, 1))  # audio will be 32000 samples, mono channel
mel_l = MelLayer(frame_length=512,
                 frame_step=512,
                 fft_length=None,
                 sampling_rate=16000,
                 num_mel_channels=80,
                 freq_min=1,
                 freq_max=2000,
                 name="spec")(input)
output = tf.keras.layers.Conv2D(16, 3)(mel_l)
it throws the following error:
/usr/local/lib/python3.7/dist-packages/keras/layers/convolutional.py in _get_input_channel(self, input_shape)
370 def _get_input_channel(self, input_shape):
371 channel_axis = self._get_channel_axis()
--> 372 if input_shape.dims[channel_axis].value is None:
373 raise ValueError('The channel dimension of the inputs should be defined. '
374 f'The input_shape received is {input_shape}, '
TypeError: 'NoneType' object is not subscriptable
I think it has to do with my custom class definition; it’s as if the layer weren’t detecting the input shape, so the Conv2D never sees a defined channel dimension.
The source for the custom layer is here: MelGAN-based spectrogram inversion using feature matching. I tried to copy the steps from that example, but it does not seem to work.
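One way to confirm the shape suspicion (a quick diagnostic sketch, assuming the imports and the MelLayer definition above; not part of my training code) would be to build just the spectrogram part and print the static shape that the Conv2D would receive:

inp = tf.keras.layers.Input(shape=(32000, 1))
spec = MelLayer(frame_length=512,
                frame_step=512,
                sampling_rate=16000,
                num_mel_channels=80,
                freq_min=1,
                freq_max=2000,
                name="spec")(inp)
# If this prints an unknown / all-None shape, Conv2D has no channel axis to read,
# which would match the error above.
print(spec.shape)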