Custom keras layer doesn't output shape properly

Alejandro_BG · December 1, 2021, 12:39pm

Hi i’m trying to get a custom spectrogram layer going and I can’t

class MelLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=MODEL_SR,
        num_mel_channels=80,
        freq_min=1,
        freq_max=7600,
        as_3D_tensor=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max
        # Defining mel filter. This filter will be multiplied with the STFT output
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=self.frame_length // 2 + 1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )
        self.as_3D_tensor = as_3D_tensor

    def call(self, audio, training=True):

        stft = tf.signal.stft(
            tf.squeeze(audio),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )

        # Taking the magnitude of the STFT output
        magnitude = tf.abs(stft)

        # Multiplying the Mel-filterbank with the magnitude and scaling it using the db scale
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)

        print(type(log_mel_spec))
 
        return tf.expand_dims(log_mel_spec,axis=-1) if self.as_3D_tensor else tf.squeeze(log_mel_spec)


    def get_config(self):
        config = super(MelLayer, self).get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config

class LogMelSpectrogram(tf.keras.layers.Layer):
    """Compute log-magnitude mel-scaled spectrograms."""

    def __init__(self, sample_rate, fft_size, hop_size, n_mels,
                 f_min=0.0, f_max=None, **kwargs):
        super(LogMelSpectrogram, self).__init__(**kwargs)
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max if f_max else sample_rate / 2
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.n_mels,
            num_spectrogram_bins=fft_size // 2 + 1,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max)

    def build(self, input_shape):
        self.non_trainable_weights.append(self.mel_filterbank)
        super(LogMelSpectrogram, self).build(input_shape)

    def call(self, waveforms):
        """Forward pass.
        Parameters
        ----------
        waveforms : tf.Tensor, shape = (None, n_samples)
            A Batch of mono waveforms.
        Returns
        -------
        log_mel_spectrograms : (tf.Tensor), shape = (None, time, freq, ch)
            The corresponding batch of log-mel-spectrograms
        """
        def _tf_log10(x):
            numerator = tf.math.log(x)
            denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
            return numerator / denominator

        def power_to_db(magnitude, amin=1e-16, top_db=80.0):
            """
            https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
            """
            ref_value = tf.reduce_max(magnitude)
            log_spec = 10.0 * _tf_log10(tf.maximum(amin, magnitude))
            log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref_value))
            log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)

            return log_spec

        spectrograms = tf.signal.stft(waveforms,
                                      frame_length=self.fft_size,
                                      frame_step=self.hop_size,
                                      pad_end=False)

        magnitude_spectrograms = tf.abs(spectrograms)

        mel_spectrograms = tf.matmul(tf.square(magnitude_spectrograms),
                                     self.mel_filterbank)

        log_mel_spectrograms = power_to_db(mel_spectrograms)

        # add channel dimension
        log_mel_spectrograms = tf.expand_dims(log_mel_spectrograms, 3)

        return log_mel_spectrograms

    def get_config(self):
        config = {
            'fft_size': self.fft_size,
            'hop_size': self.hop_size,
            'n_mels': self.n_mels,
            'sample_rate': self.sample_rate,
            'f_min': self.f_min,
            'f_max': self.f_max,
        }
        config.update(super(LogMelSpectrogram, self).get_config())

        return config

When I try to build the model


input = tf.keras.layers.Input(shape=(32000,1)) #audios will be 32000 samples, mono channel

mel_l = MelLayer(frame_length=512,
                 frame_step=512,
                 fft_length=None,
                 sampling_rate=16000,
                 num_mel_channels=80,
                 freq_min=1,
                 freq_max=2000,
                 name="spec")(input)

output = tf.keras.layers.Conv2D(16,3)(mel_l)

This throws the following error

/usr/local/lib/python3.7/dist-packages/keras/layers/convolutional.py in _get_input_channel(self, input_shape)
    370   def _get_input_channel(self, input_shape):
    371     channel_axis = self._get_channel_axis()
--> 372     if input_shape.dims[channel_axis].value is None:
    373       raise ValueError('The channel dimension of the inputs should be defined. '
    374                        f'The input_shape received is {input_shape}, '

TypeError: 'NoneType' object is not subscriptable

I think it has to do with my custom class definition, as if it wasn’t detecting the input shape,
The source for the custom layer is here:
https://keras.io/examples/audio/melgan_spectrogram_inversion/, I tried to copy the steps but it does not seem to work

chunduriv · September 14, 2022, 10:21am

github.com/keras-team/keras

Custom keras layer doesn't output input shape correctly

opened 12:36PM - 01 Dec 21 UTC

closed 06:04PM - 23 Dec 21 UTC

AlejandroBaron

type:support stat:awaiting response from contributor stalled

Hi i'm trying to get a custom spectrogram layer going and I can't ``` class …MelLayer(tf.keras.layers.Layer): def __init__( self, frame_length=1024, frame_step=256, fft_length=None, sampling_rate=MODEL_SR, num_mel_channels=80, freq_min=1, freq_max=7600, as_3D_tensor=True, **kwargs, ): super().__init__(**kwargs) self.frame_length = frame_length self.frame_step = frame_step self.fft_length = fft_length self.sampling_rate = sampling_rate self.num_mel_channels = num_mel_channels self.freq_min = freq_min self.freq_max = freq_max # Defining mel filter. This filter will be multiplied with the STFT output self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix( num_mel_bins=self.num_mel_channels, num_spectrogram_bins=self.frame_length // 2 + 1, sample_rate=self.sampling_rate, lower_edge_hertz=self.freq_min, upper_edge_hertz=self.freq_max, ) self.as_3D_tensor = as_3D_tensor def call(self, audio, training=True): stft = tf.signal.stft( tf.squeeze(audio), self.frame_length, self.frame_step, self.fft_length, pad_end=True, ) # Taking the magnitude of the STFT output magnitude = tf.abs(stft) # Multiplying the Mel-filterbank with the magnitude and scaling it using the db scale mel = tf.matmul(tf.square(magnitude), self.mel_filterbank) log_mel_spec = tfio.audio.dbscale(mel, top_db=80) print(type(log_mel_spec)) return tf.expand_dims(log_mel_spec,axis=-1) if self.as_3D_tensor else tf.squeeze(log_mel_spec) def get_config(self): config = super(MelLayer, self).get_config() config.update( { "frame_length": self.frame_length, "frame_step": self.frame_step, "fft_length": self.fft_length, "sampling_rate": self.sampling_rate, "num_mel_channels": self.num_mel_channels, "freq_min": self.freq_min, "freq_max": self.freq_max, } ) return config class LogMelSpectrogram(tf.keras.layers.Layer): """Compute log-magnitude mel-scaled spectrograms.""" def __init__(self, sample_rate, fft_size, hop_size, n_mels, f_min=0.0, f_max=None, **kwargs): super(LogMelSpectrogram, self).__init__(**kwargs) self.sample_rate = sample_rate self.fft_size = fft_size self.hop_size = hop_size self.n_mels = n_mels self.f_min = f_min self.f_max = f_max if f_max else sample_rate / 2 self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix( num_mel_bins=self.n_mels, num_spectrogram_bins=fft_size // 2 + 1, sample_rate=self.sample_rate, lower_edge_hertz=self.f_min, upper_edge_hertz=self.f_max) def build(self, input_shape): self.non_trainable_weights.append(self.mel_filterbank) super(LogMelSpectrogram, self).build(input_shape) def call(self, waveforms): """Forward pass. Parameters ---------- waveforms : tf.Tensor, shape = (None, n_samples) A Batch of mono waveforms. Returns ------- log_mel_spectrograms : (tf.Tensor), shape = (None, time, freq, ch) The corresponding batch of log-mel-spectrograms """ def _tf_log10(x): numerator = tf.math.log(x) denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype)) return numerator / denominator def power_to_db(magnitude, amin=1e-16, top_db=80.0): """ https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html """ ref_value = tf.reduce_max(magnitude) log_spec = 10.0 * _tf_log10(tf.maximum(amin, magnitude)) log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref_value)) log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db) return log_spec spectrograms = tf.signal.stft(waveforms, frame_length=self.fft_size, frame_step=self.hop_size, pad_end=False) magnitude_spectrograms = tf.abs(spectrograms) mel_spectrograms = tf.matmul(tf.square(magnitude_spectrograms), self.mel_filterbank) log_mel_spectrograms = power_to_db(mel_spectrograms) # add channel dimension log_mel_spectrograms = tf.expand_dims(log_mel_spectrograms, 3) return log_mel_spectrograms def get_config(self): config = { 'fft_size': self.fft_size, 'hop_size': self.hop_size, 'n_mels': self.n_mels, 'sample_rate': self.sample_rate, 'f_min': self.f_min, 'f_max': self.f_max, } config.update(super(LogMelSpectrogram, self).get_config()) return config ``` When I try to build the model ``` input = tf.keras.layers.Input(shape=(32000,1)) #audios will be 32000 samples, mono channel mel_l = MelLayer(frame_length=512, frame_step=512, fft_length=None, sampling_rate=16000, num_mel_channels=80, freq_min=1, freq_max=2000, name="spec")(input) output = tf.keras.layers.Conv2D(16,3)(mel_l) ``` This throws the following error ``` /usr/local/lib/python3.7/dist-packages/keras/layers/convolutional.py in _get_input_channel(self, input_shape) 370 def _get_input_channel(self, input_shape): 371 channel_axis = self._get_channel_axis() --> 372 if input_shape.dims[channel_axis].value is None: 373 raise ValueError('The channel dimension of the inputs should be defined. ' 374 f'The input_shape received is {input_shape}, ' TypeError: 'NoneType' object is not subscriptable ``` I think it might have to do with my custom class definition, as if it wasn't detecting the input shape, The source for the custom layer is here: https://keras.io/examples/audio/melgan_spectrogram_inversion/, I tried to copy the steps but it does not seem to work