# Source code for maad.sound.transform

#!/usr/bin/env python
""" 
Collection of functions to transform audio signals (envelope, spectrum,
resampling, trimming, normalization and gain)
"""
#
# Authors:  Juan Sebastian ULLOA <lisofomia@gmail.com>
#           Sylvain HAUPERT <sylvain.haupert@mnhn.fr>
#
# License: New BSD License

# =============================================================================
# Load the modules
# =============================================================================
# Import external modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import hilbert
from scipy.signal import periodogram, welch
import scipy
import resampy
from warnings import warn

# import internal modules
from maad.sound import wave2frames
from maad.util import plot_spectrum

#%%
# =============================================================================
# public functions
# =============================================================================
def envelope (s, mode='fast', Nt=32):
    """
    Calculate the envelope of a sound waveform (1d).

    Parameters
    ----------
    s : 1d ndarray of floats
        Vector containing the sound waveform.
    mode : str, optional, default is `fast`
        - `fast` : the sound is first divided into frames (2d) using
          wave2frames, then the max of each frame gives a good approximation
          of the envelope.
        - `hilbert` : estimation of the envelope from the Hilbert transform.
          This method is slow.
    Nt : integer, optional, default is `32`
        Size of each frame for the `fast` mode. The larger the frame, the
        coarser the approximation.

    Returns
    -------
    env : 1d ndarray of floats
        Envelope of the sound.

    References
    ----------
    .. [1] Towsey, Michael (2013), Noise Removal from Waveforms and
       Spectrograms Derived from Natural Recordings of the Environment.
       Queensland University of Technology, Brisbane.
    .. [2] Towsey, Michael (2017), The calculation of acoustic indices
       derived from long-duration recordings of the natural environment.
       Queensland University of Technology, Brisbane.

    Examples
    --------
    >>> s,fs = maad.sound.load("../data/rock_savanna.wav")
    >>> env_fast = maad.sound.envelope(s, mode='fast', Nt=32)
    >>> env_hilbert = maad.sound.envelope(s, mode='hilbert')
    """
    if mode == 'fast':
        # Towsey-style approximation: frame the signal, then keep the
        # absolute maximum of each frame.
        framed = wave2frames(s, Nt)
        return np.max(np.abs(framed), 0)

    if mode == 'hilbert':
        # Magnitude of the analytic signal gives the exact (but slower)
        # envelope.
        return np.abs(hilbert(s))

    raise Exception("Invalid mode. Mode should be 'fast' or 'hilbert' ")
#%%
def spectrum(s, fs, nperseg=256, noverlap=None, nfft=None, window='hann',
             method='welch', tlims=None, flims=None, scaling='spectrum',
             as_pandas_series=False, display=False):
    """
    Estimate the power spectral density or power spectrum of a 1D signal.

    The estimates can be computed using two methods: Welch or periodogram.
    Welch's method divides the signal into segments, computes the power
    spectral density for each segment and then takes the average between
    segments. The periodogram method computes the power spectral density of
    the input signal using a defined window (Hann window by default).

    Parameters
    ----------
    s : 1D array
        Input signal to process.
    fs : float
        Sampling frequency of the audio signal.
    nperseg : int, optional
        Length of segment for the 'welch' method, default is 256.
    noverlap : int, optional
        Overlap between segments for Welch's method. If None,
        noverlap = nperseg/2.
    nfft : int, optional
        Length of FFT for the periodogram method (defaults to the signal
        length). For the welch method, FFT length if zero padding is desired
        (defaults to nperseg).
    window : string, default is 'hann'
        Name of the window used for the short Fourier transform. The
        deprecated 'hanning' alias is mapped to 'hann'.
    method : {'welch', 'periodogram'}
        Method used to estimate the power spectral density of the signal.
    tlims : tuple of ints or floats, optional
        Temporal limits in seconds (s). If None, the complete signal is used.
    flims : tuple of ints or floats, optional
        Spectral limits in Hertz (Hz). If None, estimates from 0 to fs/2 are
        returned.
    scaling : {'spectrum', 'density'}, optional
        Power spectrum (units V**2) or power spectral density (units V**2/Hz)
        scaling. Defaults to 'spectrum'.
    as_pandas_series : bool, optional
        Return data as pandas.Series objects (useful when computing multiple
        features over a signal). Default is False.
    display : bool, optional
        If True, plot the spectrum. Default is False.

    Returns
    -------
    pxx : ndarray or pandas Series
        Power spectral density estimate.
    f_idx : ndarray or pandas Series
        Sample frequencies.

    Raises
    ------
    Exception
        If `tlims` does not have exactly two elements, or if `method` is not
        'welch' or 'periodogram'.

    Notes
    -----
    This is a wrapper over functions of the scipy.signal module.

    Examples
    --------
    >>> from maad import sound
    >>> s, fs = sound.load('../data/spinetail.wav')
    >>> spec, f_idx = sound.spectrum(s, fs, nperseg=512, display=True)
    """
    if tlims is not None:
        # Trim the audio signal to the requested time window. Unpacking
        # validates the tuple shape without a bare `except:` that would
        # swallow unrelated errors (e.g. KeyboardInterrupt).
        try:
            t_min, t_max = tlims
        except (TypeError, ValueError):
            raise Exception('length of tlims tuple should be 2')
        s = s[int(t_min * fs): int(t_max * fs)]

    # Small fix to keep compatibility with old scipy versions.
    if window == 'hanning':
        window = 'hann'
        warn("Window type 'hanning' will be deprecated. Changing to 'hann' window.",
             FutureWarning)

    if method == 'welch':
        f_idx, pxx = welch(s, fs, window, nperseg, noverlap, nfft,
                           scaling=scaling)
    elif method == 'periodogram':
        f_idx, pxx = periodogram(s, fs, window, nfft, scaling=scaling)
    else:
        raise Exception("Invalid method. Method should be 'welch' or 'periodogram' ")

    if flims is not None:
        # Build the frequency-band mask once and apply it to both outputs.
        in_band = (f_idx >= flims[0]) & (f_idx <= flims[1])
        pxx = pxx[in_band]
        f_idx = f_idx[in_band]

    if as_pandas_series:
        index_names = ['psd_' + str(idx).zfill(3)
                       for idx in range(1, len(pxx) + 1)]
        pxx = pd.Series(pxx, index=index_names)
        f_idx = pd.Series(f_idx, index=index_names)

    if display:
        plot_spectrum(pxx, f_idx)

    return pxx, f_idx
#%%
def resample(s, fs, target_fs, res_type = 'kaiser_best', **kwargs):
    """
    Changes the sample rate of an audio file or any time series.

    Parameters
    ----------
    s : np.ndarray
        Mono or stereo signal as NumPy array.
    fs : int
        Time series sampling rate.
    target_fs : int
        Target sampling rate.
    res_type : str, optional
        Resample method. By default 'kaiser_best'.
        'kaiser_best' applies a Kaiser windowed sinc interpolation.
        'kaiser_fast' is similar to kaiser_best, but faster.
        'scipy' is a Fourier transform method implemented in scipy.
        'scipy_poly' resamples using polyphase filtering and an FIR filter.
    kwargs : additional keyword arguments
        Passed to the underlying resampler (`resampy.resample`,
        `scipy.signal.resample` or `scipy.signal.resample_poly`).

    Returns
    -------
    res_data : np.ndarray
        Resampled time series, cast back to the input dtype.

    Raises
    ------
    ValueError
        If `res_type` is not one of the supported methods.

    See Also
    --------
    resampy.resample
    scipy.signal.resample

    Examples
    --------
    Resample an audio file from a sample rate of 44100 Hz to 22050 Hz.

    >>> s, fs = sound.load('../data/spinetail.wav')
    >>> s_resamp = sound.resample(s, fs, target_fs=fs/2)
    >>> print('Number of samples - original audio:', s.shape[0],
    ...       'Number of samples - resampled audio:', s_resamp.shape[0])
    """
    # Nothing to do when the rate is unchanged.
    if fs == target_fs:
        return s

    if res_type == 'scipy':
        # Calculate the resampling ratio and get the number of output samples.
        ratio = float(target_fs) / fs
        n_samples = int(np.round(s.shape[-1] * ratio))
        res_data = scipy.signal.resample(s, n_samples, axis=-1, **kwargs)
    elif res_type == 'scipy_poly':
        # Polyphase filtering needs an integer up/down ratio; reduce by the
        # greatest common divisor. Cast to int so float rates (e.g. the
        # docstring example target_fs=fs/2) do not produce float up/down
        # factors, which scipy.signal.resample_poly rejects.
        gcd = np.gcd(int(fs), int(target_fs))
        num = int(target_fs) // gcd
        den = int(fs) // gcd
        # axis=-1 keeps this branch consistent with the others so stereo
        # (channels, samples) arrays are resampled along time, not channels.
        res_data = scipy.signal.resample_poly(s, num, den, axis=-1, **kwargs)
    elif res_type in ('kaiser_best', 'kaiser_fast'):
        warn("Methods 'kaiser_fast' and 'kaiser_best' are deprecated and will be removed from future versions.",
             DeprecationWarning)
        res_data = resampy.resample(s, fs, target_fs, filter=res_type,
                                    axis=-1, **kwargs)
    else:
        raise ValueError("Valid resample methods are 'scipy', 'scipy_poly', 'kaiser_best', 'kaiser_fast', not res_type={}".format(res_type))

    return np.ascontiguousarray(res_data, dtype=s.dtype)
#%%
def trim(s, fs, min_t, max_t, pad=False, pad_constant=0):
    """
    Slices a time series, from an initial time `min_t` to an ending time
    `max_t`. If the requested span `max_t - min_t` is larger than the
    original duration and `pad = True`, the time series is padded with a
    constant value `pad_constant` (0 by default).

    Parameters
    ----------
    s : np.ndarray
        Mono or stereo signal as NumPy array.
    fs : int
        Time series sampling rate.
    min_t : float
        Initial time. If initial time `min_t < 0` and `pad=True`, this time
        is added to the beginning of the audio slice.
    max_t : float
        Ending time of the audio slice.
    pad : bool, optional
        If true, the time series is padded with a constant value
        `pad_constant`. Default is False.
    pad_constant :
        Constant with which the time series is padded.

    Returns
    -------
    s_slice : np.ndarray
        Time series with duration `duration = max_t - min_t`.

    See Also
    --------
    numpy.pad

    Examples
    --------
    Slice an audio file from 5 to 8 seconds.

    >>> from maad import sound
    >>> s, fs = sound.load('../data/spinetail.wav')
    >>> s_slice = sound.trim(s, fs, min_t = 5, max_t = 8)
    >>> s_slice.shape[0]/fs
    3.0
    """
    # Slice boundaries in samples. Kept as floats here; cast with int() at
    # each use below.
    min_lim = min_t * fs
    max_lim = max_t * fs
    duration = max_t - min_t
    if pad:
        if s.ndim == 1: # mono file
            if (duration * fs) > s.shape[-1]: # length of trimmed section larger than signal duration, pad both sides
                up_lim = int(max_t * fs - s.shape[-1])
                if min_t <=0:
                    # Negative/zero min_t: pad |min_t|*fs samples on the left.
                    low_lim = int(abs(min_t) * fs)
                    s_slice = np.pad(s, (low_lim, up_lim), 'constant', constant_values=(pad_constant))
                else:
                    low_lim = int(0.0)
                    s_slice = np.pad(s, (low_lim, up_lim), 'constant', constant_values=(pad_constant))
                # NOTE(review): when min_t < 0, min_lim is negative and this
                # slice keeps only the LAST |min_lim| samples instead of the
                # whole padded signal — looks like a bug; confirm intent.
                s_slice = s_slice[int(min_lim):]
            elif min_t < 0: # min_t with negative values, pad left side
                pad_width = int(abs(min_t) * fs)
                up_lim = int(max_t * fs)
                s_slice = s[0:up_lim]
                s_slice = np.pad(s_slice, (pad_width, 0), 'constant', constant_values=(pad_constant))
            elif max_t * fs > s.shape[-1]: # max_t with values outside duration of signal, pad right side
                low_lim = int(min_t * fs)
                pad_width = int((max_t * fs) - s.shape[-1])
                s_slice = s[low_lim:]
                s_slice = np.pad(s_slice, (0, pad_width), 'constant', constant_values=(pad_constant))
            else: # no padding needed
                s_slice = s[int(min_lim): int(max_lim)]
        else: # stereo file
            # Preallocate one row per channel for the no-padding case below.
            s_slice = np.empty((s.shape[0], int(duration * fs)))
            for i, chanel in enumerate(s):
                if (duration * fs) > chanel.shape[-1]:
                    up_lim = int(max_t * fs - chanel.shape[-1])
                    if min_t <=0:
                        low_lim = int(abs(min_t) * fs)
                        # NOTE(review): this rebinds s_slice to a 1-D padded
                        # channel, discarding the preallocated 2-D array and
                        # any previously processed channels — only the last
                        # channel survives. Looks like it should be
                        # s_slice[i] = ...; confirm against callers.
                        s_slice = np.pad(chanel, (low_lim, up_lim), 'constant', constant_values=(pad_constant))
                    else:
                        low_lim = int(0.0)
                        s_slice = np.pad(chanel, (low_lim, up_lim), 'constant', constant_values=(pad_constant))
                    # NOTE(review): same negative-index concern as the mono
                    # pad-both-sides branch above when min_t < 0.
                    s_slice = s_slice[int(min_lim):]
                else:
                    s_slice[i] = s[i][int(min_lim): int(max_lim)]
        return s_slice
    else:
        # Without padding, the slice must lie entirely inside the signal.
        if min_t < 0:
            raise ValueError("min_t must be >= 0.")
        if s.ndim == 1:
            if (duration * fs) > s.shape[-1]:
                raise ValueError("Target duration is longer than original duration.")
            else:
                s_slice = s[int(min_lim): int(max_lim)]
        else:
            s_slice = np.empty((s.shape[0], int(duration * fs)))
            for i, chanel in enumerate(s):
                if (duration * fs) > chanel.shape[-1]:
                    raise ValueError("Target duration is longer than original duration.")
                else:
                    s_slice[i] = s[i][int(min_lim): int(max_lim)]
        return s_slice
def normalize(s, max_amp=0.7, max_db=None):
    """
    Normalize an audio signal to a desired peak amplitude or decibel
    full-scale value (dBFS).

    Parameters
    ----------
    s : np.ndarray
        Mono audio signal as NumPy array.
    max_amp : float, optional
        Maximum amplitude. Must be between 0 and 1. The default is 0.7.
    max_db : float, optional
        Maximum amplitude in dBFS. Must be a negative value. When given, it
        overrides `max_amp`.

    Returns
    -------
    s_out : np.ndarray
        Normalized mono signal.

    Raises
    ------
    ValueError
        If `max_amp` is outside [0, 1] or `max_db` is positive.

    See Also
    --------
    sound.trim

    Examples
    --------
    >>> s, fs = sound.load('../data/spinetail.wav')
    >>> s_out = sound.normalize(s, max_amp=0.8)
    >>> print('s_out maximum amplitude:', np.abs(s_out).max())
    s_out maximum amplitude: 0.8
    """
    # Validate the target amplitude, then translate a dBFS target into a
    # linear amplitude if one was provided.
    if max_amp < 0 or max_amp > 1:
        raise ValueError('max_amp should be a value between 0 and 1')
    if max_db is not None:
        if max_db > 0:
            raise ValueError('max_db should be a negative value')
        # dBFS -> linear peak amplitude.
        max_amp = 10 ** (max_db / 20)

    # Scale the signal so its absolute peak equals max_amp.
    peak = np.abs(s).max()
    s_out = s * (max_amp / peak)
    return s_out
def gain(s, gain_db=0):
    """
    Apply amplification or attenuation to an audio signal.

    Parameters
    ----------
    s : np.ndarray
        Mono audio signal as NumPy array.
    gain_db : float, optional
        Gain in dB: positive values amplify, negative values attenuate.
        Default is 0 (no change).

    Returns
    -------
    s_out : np.ndarray
        Amplified or attenuated audio signal.

    See Also
    --------
    sound.normalize, sound.trim

    Examples
    --------
    >>> s, fs = sound.load('../data/spinetail.wav')
    >>> s_out = sound.gain(s, gain_db=3)
    >>> print('Signal amplified by:', 20*np.log10((s_out.max()/s.max())), 'dB')
    Signal amplified by: 3 dB
    """
    # Convert the dB gain to a linear factor and scale the signal.
    factor = 10 ** (gain_db / 20)
    return s * factor