Source code for maad.sound.spectro_func

#!/usr/bin/env python
""" 
Collection of functions transform an audio signal into spectrogram and 
manipulate spectrograms.
"""
#
# Authors:  Juan Sebastian ULLOA <lisofomia@gmail.com>
#           Sylvain HAUPERT <sylvain.haupert@mnhn.fr>
#
# License: New BSD License

# =============================================================================
# Load the modules
# =============================================================================
# Import external modules
import numpy as np
import scipy as sp

# Import internal modules
from maad.util import (plot1d, plot2d, crop_image, power2dB, amplitude2dB)

# %%
# =============================================================================
# public functions
# =============================================================================


[docs] def spectrogram(x, fs, window='hann', nperseg=1024, noverlap=None, flims=None, tlims=None, mode='psd', verbose=False, display=False, savefig=None, **kwargs): """ Compute a spectrogram using the short-time Fourier transform from an audio signal. The function can compute diferent outputs according to the parameter 'mode': - power (mode='psd') - amplitude (mode = 'amplitude') => sqrt(power) - complex with real and imaginary parts (mode = 'complex') Parameters ---------- x : 1d ndarray Vector containing the sound waveform fs : int The sampling frequency in Hz window : str or tuple or array_like, optional, default to 'hann' Desired window to use. If window is a string or tuple, it is passed to get_window to generate the window values, which are DFT-even by default. See get_window for a list of windows and required parameters. If window is array_like it will be used directly as the window and its length must be nperseg. nperseg : int, optional. Defaults to 1024. Length of the segment used to compute the FFT. No zero padding. For fast calculation, it's better to use a number that is a power 2. This parameter sets the resolution in frequency as the spectrogram will contains nperseg/2 frequency bins between 0Hz-(fs/2)Hz, with a resolution df = fs/nperseg It sets also the time slot (dt) of each frequency frames : dt = nperseg/fs The higher is the number, the lower is the resolution in time (dt) but better is the resolution in frequency (df). noverlap : int, optional. Defaults to None. Number of points to overlap between segments. If None, noverlap = nperseg // 2. mode : str, optional. Default is 'psd' Choose the output between - 'psd' : Power Spectral Density - 'amplitude' : module of the stft (sqrt(psd)) - 'complex' : real and imaginary part of the stft flims, tlims : list of 2 scalars [min, max], optional, default is None flims corresponds to the min and max boundary frequency values tlims corresponds to the min and max boundary time values verbose : boolean, optional, default is False print messages into the consol or terminal if verbose is True display : boolean, optional, default is False Display the signal if True savefig : string, optional, default is None Root filename (with full path) is required to save the figures. Postfix is added to the root filename. **kwargs, optional. This parameter is used by plt.plot and savefig functions - savefilename : str, optional, default :'_filt_audiogram.png' Postfix of the figure filename - db_range : scalar, optional, default : 100 if db_range is a number, anything lower than -db_range is set to -db_range and anything larger than 0 is set to 0 - figsize : tuple of integers, optional, default: (4,10) width, height in inches. - title : string, optional, default : 'Spectrogram' title of the figure - xlabel : string, optional, default : 'Time [s]' label of the horizontal axis - ylabel : string, optional, default : 'Amplitude [AU]' - cmap : string or Colormap object, optional, default is 'gray' See https://matplotlib.org/examples/color/colormaps_reference.html in order to get all the existing colormaps examples: 'hsv', 'hot', 'bone', 'tab20c', 'jet', 'seismic', 'viridis'... - vmin, vmax : scalar, optional, default: None `vmin` and `vmax` are used in conjunction with norm to normalize luminance data. Note if you pass a `norm` instance, your settings for `vmin` and `vmax` will be ignored. - extent : list of scalars [left, right, bottom, top], optional, default: None The location, in data-coordinates, of the lower-left and upper-right corners. If `None`, the image is positioned such that the pixel centers fall on zero-based (row, column) indices. - dpi : integer, optional, default is 96 Dot per inch. For printed version, choose high dpi (i.e. dpi=300) => slow For screen version, choose low dpi (i.e. dpi=96) => fast - format : string, optional, default is 'png' Format to save the figure ... and more, see matplotlib Returns ------- Sxx : 2d ndarray of floats Spectrogram : Matrix containing K frames with N/2 frequency bins, K*N <= length (wave) Sxx unit is power => Sxx_power if mode is 'psd' Sxx unit is amplitude => Sxx_ampli if mode is 'amplitude' or 'complex' tn : 1d ndarray of floats time vector (horizontal x-axis) fn : 1d ndarray of floats Frequency vector (vertical y-axis) extent : list of scalars [left, right, bottom, top] The location, in data-coordinates, of the lower-left and upper-right corners. Notes ----- This function takes care of the energy conservation which is crucial when working with sound pressure level (dB SPL) Examples -------- >>> s,fs = maad.sound.load("../data/rock_savanna.wav") Compute energy of signal s >>> E1 = sum(s**2) >>> maad.util.power2dB(E1) 44.861029507805256 Compute the spectrogram with 'psd' output (if N<4096, the energy is lost) >>> N = 4096 >>> Sxx_power,tn,fn,ext = maad.sound.spectrogram (s, fs, nperseg=N, noverlap=N//2, mode = 'psd') Display Power Spectrogram >>> Sxx_dB = maad.util.power2dB(Sxx_power) # convert into dB >>> fig_kwargs = {'vmax': Sxx_dB.max(), 'vmin':-70, 'extent':ext, 'figsize':(4,13), 'title':'Power spectrogram density (PSD)', 'xlabel':'Time [sec]', 'ylabel':'Frequency [Hz]', } >>> fig, ax = maad.util.plot2d(Sxx_dB,**fig_kwargs) Compute mean power spectrogram >>> S_power_mean = maad.sound.avg_power_spectro(Sxx_power) energy => power x time >>> E2 = sum(S_power_mean*len(s)) >>> maad.util.power2dB(E2) 44.93083283875093 Compute the spectrogram with 'amplitude' output >>> Sxx_ampli,tn,fn,_ = maad.sound.spectrogram (s, fs, nperseg=N, noverlap=N//2, mode='amplitude') For energy conservation => convert Sxx_ampli (amplitude) into power before doing the average. >>> S_ampli_mean = maad.sound.avg_amplitude_spectro(Sxx_ampli) >>> S_power_mean = S_ampli_mean**2 energy => power x time >>> E3 = sum(S_power_mean*len(s)) >>> maad.util.power2dB(E3) 44.93083283875093 """ # Get the argument detrend. By default is "constant" but some reasons (ADI # and AEI index from soundecology), detrend should be None detrend = kwargs.pop("detrend", "constant") # Test if noverlap is None. By default, noverlap is half the length of the fft if noverlap is None: noverlap = nperseg // 2 # compute the number of frames K = len(x)//(nperseg-noverlap)-1 # compute spectrogram fn, tn, Sxx_complex = sp.signal.spectrogram(x, fs, window=window, nperseg=nperseg, noverlap=noverlap, nfft=nperseg, mode='complex', detrend=detrend, scaling='density', axis=-1) # Mutliply by the frequency resolution step (fs/nperseg) to get the power # so multiply by the sqrt((fs/nperseg)) to get the amplitude # Also multiply by sqrt(2) in order to compensate that only the postive frequencies are kept # complex Sxx_complex = Sxx_complex*np.sqrt(2*(fs/nperseg)) # magnitude Sxx = abs(Sxx_complex) # power PSDxx = Sxx**2 if mode == 'complex': Sxx_out = Sxx_complex if mode == 'amplitude': Sxx_out = Sxx if mode == 'psd': Sxx_out = PSDxx # test if the last frames are computed on a whole time frame. # if note => remove these frames if PSDxx.shape[1] > K: sup = Sxx_out.shape[1] - K Sxx_out = Sxx_out[:, :-sup] tn = tn[:-sup] # Remove the last frequency bin in order to obtain nperseg/2 frequency bins # instead of nperseg/2 + 1 Sxx_out = Sxx_out[:-1, :] fn = fn[:-1] if verbose: print('spectrogram dimension Nx=%d Ny=%d' % (Sxx_complex.shape[0], Sxx_complex.shape[1])) # Crop the image in order to analyzed only a portion of it if (flims or tlims) is not None: if verbose: print('Crop the spectrogram along time axis and frequency axis') Sxx_out, tn, fn = crop_image(Sxx_out, tn, fn, flims, tlims) if verbose: print('max value of the spectrogram %.5f' % Sxx_out.max()) # Extent extent = [tn[0], tn[-1], fn[0], fn[-1]] # dt and df resolution dt = tn[1]-tn[0] df = fn[1]-fn[0] if verbose == True: print("*************************************************************") print(" Time resolution dt=%.2fs | Frequency resolution df=%.2fHz " % (dt, df)) print("*************************************************************") # Display if display: ylabel = kwargs.pop('ylabel', 'Frequency [Hz]') xlabel = kwargs.pop('xlabel', 'Time [sec]') title = kwargs.pop('title', 'Spectrogram') cmap = kwargs.pop('cmap', 'gray') figsize = kwargs.pop('figsize', (4, 0.33*(extent[1]-extent[0]))) db_range = kwargs.pop('db_range', 96) # convert into dB if mode == 'psd': Sxx_disp = power2dB(Sxx_out, db_range=db_range) if mode == 'amplitude': Sxx_disp = amplitude2dB(Sxx_out, db_range=db_range) if mode == 'complex': Sxx_disp = amplitude2dB(Sxx_out, db_range=db_range) vmin = kwargs.pop('vmin', -db_range) vmax = kwargs.pop('vmax', Sxx_disp.max()) _, fig = plot2d(Sxx_disp, extent=extent, figsize=figsize, title=title, ylabel=ylabel, xlabel=xlabel, vmin=vmin, vmax=vmax, cmap=cmap, **kwargs) # SAVE FIGURE if savefig is not None: dpi = kwargs.pop('dpi', 96) bbox_inches = kwargs.pop('bbox_inches', 'tight') format = kwargs.pop('format', 'png') savefilename = kwargs.pop('savefilename', '_spectrogram') filename = savefig+savefilename+'.'+format print('\n''save figure : %s' % filename) fig.savefig(fname=filename, dpi=dpi, bbox_inches=bbox_inches, format=format, **kwargs) return Sxx_out, tn, fn, extent
# %%
[docs] def linear_to_octave(X, fn, thirdOctave=True, display=False, **kwargs): """ Transform a linear spectrum (1d) or Spectrogram (2d into octave or 1/3 octave spectrum (1d) or Spectrogram (2d). Our advice is to work with PSD (amplitude²) for energy conservation. Parameters ---------- X : ndarray of floats Linear spectrum (1d) or Spectrogram (2d). Work with PSD to be consistent with energy conservation fn : 1d ndarray of floats Frequency vector of the linear spectrum/spectrogram thirdOctave : Boolean, default is True choose between Octave or thirdOctave frequency resolution display : boolean, default is False Display the octave spectrum/spectrogram ** kwargs : optional. This parameter is used by plt.plot Returns ------- X_octave : ndarray of floats Octave or 1/3 octave Spectrum (1d) or Spectrogram (2d) bin_octave : vector of floats New frequency vector (octave or 1/3 octave frequency repartition) Examples -------- >>> w, fs = maad.sound.load('../data/rock_savanna.wav') >>> Sxx_power,tn,fn, ext = maad.sound.spectrogram (w, fs, nperseg=8192) >>> maad.sound.linear_to_octave(Sxx_power, fn, display=True, extent=ext, vmin=-50) """ # define the third octave or octave frequency vector in Hz. if thirdOctave: bin_octave = np.array([16, 20, 25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000, 1250, 1600, 2000, 2500, 3150, 4000, 5000, 6300, 8000, 10000, 12500, 16000, 20000]) # third octave band. else: bin_octave = np.array( [16, 31, 63, 125, 250, 500, 1000, 2000, 4000, 8000, 16000]) # octave # get the corresponding octave from fn bin_octave = bin_octave[(bin_octave >= np.min(fn)) & (bin_octave <= np.max(fn))] # Bins limit bin_octave_low = bin_octave/(2**0.1666666) bin_octave_up = bin_octave*(2**0.1666666) # select the indices corresponding to the frequency bins range X_octave = [] for ii in np.arange(len(bin_octave)): ind = (fn >= bin_octave_low[ii]) & (fn <= bin_octave_up[ii]) X_octave.append(np.sum(X[ind, ], axis=0)) X_octave = np.asarray(X_octave) if display: X_octave_dB = power2dB(X_octave) if np.ndim(X_octave_dB) == 2: extent = kwargs.pop('extent', None) if extent is not None: xlabel = 'Time [sec]' duration = extent[1] - extent[0] deltaT = int(duration/10) xticks = (np.floor(np.arange(0, X.shape[1], X.shape[1]/10)*10)/10, np.floor(np.arange(extent[0], extent[1], deltaT)*10)/10) figsize = (4, 0.33*(extent[1]-extent[0])) else: xlabel = 'pseudoTime [points]' xticks = np.arange(0, X.shape[1], 100), figsize = (4, 13) fig_kwargs = {'vmax': kwargs.pop('vmax', np.max(X_octave_dB)), 'vmin': kwargs.pop('vmin', np.min(X_octave_dB)), 'xticks': xticks, 'figsize': kwargs.pop('figsize', figsize), 'yticks': (np.arange(len(bin_octave)), bin_octave), 'title': 'Octave Spectrogram', 'xlabel': xlabel, 'ylabel': 'Frequency [Hz]', } plot2d(X_octave_dB, **fig_kwargs) elif np.ndim(X_octave_dB) == 1: fig_kwargs = { 'title': 'Octave Spectrum', 'xlabel': kwargs.pop('xlabel', 'Frequency [Hz]'), 'ylabel': kwargs.pop('ylabel', 'Amplitude [dB]'), } plot1d(bin_octave, X_octave_dB, **fig_kwargs) return X_octave, bin_octave
# %%
[docs] def avg_power_spectro(Sxx_power): """ Computes the average of a power spectrogram along the time axis. Parameters ---------- Sxx_power : 2d ndarray of floats Power spectrogram [Matrix] Returns ------- S_power_mean : 1d ndarray of floats Power spectrum [Vector] See Also -------- avg_amplitude_spectro Examples -------- >>> w, fs = maad.sound.load('../data/rock_savanna.wav') >>> Sxx_power,tn, fn, ext = maad.sound.spectrogram (w, fs) Compute mean power spectrogram >>> S_power_mean = maad.sound.avg_power_spectro(Sxx_power) >>> S_power_mean array([2.91313297e-04, 9.16802665e-04, 2.89179556e-04, 1.04767281e-04, 6.66154492e-05, 4.59636926e-05, 3.59688131e-05, 3.30869371e-05, ... """ S_power_mean = np.mean(Sxx_power, axis=1) return S_power_mean
# %%
[docs] def avg_amplitude_spectro(Sxx_ampli): """ Computes the average of an amplitude spectrogram along the time axis. Parameters ---------- Sxx_ampli : 2d ndarray of floats Amplitude spectrogram [Matrix] Returns ------- S_ampli_mean : 1d ndarray of floats Amplitude spectrum [Vector] See Also -------- avg_power_spectro Examples -------- >>> w, fs = maad.sound.load('../data/rock_savanna.wav') >>> Sxx_amplitude,tn, fn, ext = maad.sound.spectrogram (w, fs, mode="amplitude") Compute mean amplitude spectrogram >>> S_amplitude_mean = maad.sound.avg_amplitude_spectro(Sxx_amplitude) >>> S_amplitude_mean array([0.0170679 , 0.03027875, 0.01700528, 0.01023559, 0.00816183, 0.00677965, 0.0059974 , 0.00575212, 0.00700752, 0.00926279, ... """ # average the amplitude spectrogram taking the PSD for energy conservation S_ampli_mean = np.sqrt(np.mean(Sxx_ampli**2, axis=1)) return S_ampli_mean