# Source code for maad.util.parser

#!/usr/bin/env python
""" Utility functions to parse and read audio and text files. """
#
# Authors:  Juan Sebastian ULLOA <lisofomia@gmail.com>
#           Sylvain HAUPERT <sylvain.haupert@mnhn.fr>
#
# License: New BSD License

#%%
# =============================================================================
# Load the modules
# =============================================================================
# Import external modules
import numpy as np 
import pandas as pd
import re
import os
import glob
from datetime import datetime
from pathlib import Path # in order to be Windows/linux/MacOS compatible


#%%
# =============================================================================
# Private functions
# =============================================================================
def _date_from_filename(filename):
    """
    Extract date and time from the filename. Return a datetime object

    Parameters
    ----------
    filename : string
        Name of the file. The date is read at fixed character positions
        (year at characters 9-12, month 13-14, day 15-16, hour 18-19,
        minutes 20-21, seconds 22-23), so the name must follow this format :
        XXXXXXXX_yyyymmdd_hhmmss.wav
        with XXXXXXXX : 8 characters prefix / yyyy : year / mm : month /
        dd : day / hh : hour (24hours) / mm : minutes / ss : seconds

    Returns
    -------
    date : object datetime
        Date of creation of the file extracted from the filename.
        The default date 1900-01-01 00:00:00 is returned when one of the
        fields is not made of digits or when the fields do not form a valid
        date (for instance month 13 or hour 25).
    """
    # date by default
    default_date = datetime(1900, 1, 1, 0, 0, 0, 0)

    # (start, stop) positions of year, month, day, hour, minute and second
    field_slices = ((9, 13), (13, 15), (15, 17), (18, 20), (20, 22), (22, 24))
    fields = [filename[start:stop] for start, stop in field_slices]

    # test if it is possible to extract the recording date from the filename
    if not all(field.isdigit() for field in fields):
        return default_date

    try:
        yy, mm, dd, HH, MM, SS = (int(field) for field in fields)
        # datetime() raises ValueError when a field is out of range
        return datetime(year=yy, month=mm, day=dd, hour=HH, minute=MM,
                        second=SS, microsecond=0)
    except ValueError:
        return default_date


#%%
# =============================================================================
# Public functions
# =============================================================================

# [docs]
def read_audacity_annot (audacity_filename):
    """
    Read Audacity annotations file (or labeling file) and return a Pandas Dataframe
    with the bounding box and the label of each region of interest (ROI). Allows to
    read annotations with standard Audacity style (temporal selection) and with
    spectral selection style (spectro-temporal selection). If the file cannot be
    read or parsed (for instance because it has no annotations), the function
    returns an empty dataframe.

    Parameters
    ----------
    audacity_filename : String
        Path to the audacity file

    Returns
    -------
    tab_out : Pandas Dataframe
        Region of interest with time-frequency limits and manual annotation label.
        Columns are 'label', 'min_t', 'min_f', 'max_t', 'max_f'. For temporal
        annotations 'min_f' and 'max_f' are filled with NaN. The dataframe has
        no rows and no columns when reading or parsing failed.

    References
    ----------
    https://manual.audacityteam.org/man/label_tracks.html

    Examples
    --------
    >>> from maad import sound
    >>> from maad.util import power2dB, read_audacity_annot, format_features, overlay_rois
    >>> s, fs = sound.load('../data/cold_forest_daylight.wav')
    >>> Sxx_power, tn, fn, ext = sound.spectrogram(s, fs, nperseg=1024, noverlap=1024//2)
    >>> Sxx_db = power2dB(Sxx_power) + 96
    >>> df_rois = read_audacity_annot('../data/cold_forest_daylight_label.txt')
    >>> df_rois = format_features(df_rois, tn, fn)
    >>> overlay_rois(Sxx_db, df_rois, **{'vmin':0,'vmax':96,'extent':ext})

    """
    # try to read file with tab delimiter (if the file is not empty)
    try:
        tab_in = pd.read_csv(audacity_filename, delimiter='\t', header=None)

        # test if time-frequency annotation (1st column contains a backslash)
        # Hack to force the type of the column to be string in order to test if
        # the column contains a character
        tab_in[0] = tab_in[0].astype('str')
        has_freq_rows = tab_in[0].str.contains(r"\\", na = False).sum() > 0

        if has_freq_rows:
            # even rows hold the time limits and the label
            t_info = tab_in.loc[np.arange(0, len(tab_in), 2), :]
            t_info = t_info.rename(index=str, columns={
                                0: 'min_t', 1: 'max_t', 2: 'label'})
            t_info = t_info.reset_index(drop=True)

            # odd rows hold the backslash marker and the frequency limits.
            # With an odd number of rows the selection fails, which ends in
            # the empty dataframe returned by the except clause
            f_info = tab_in.loc[np.arange(1, len(tab_in)+1, 2), :]
            f_info = f_info.rename(index=str, columns={
                                0: 'slash', 1: 'min_f', 2: 'max_f'})
            f_info = f_info.reset_index(drop=True)

            # return dataframe
            tab_out = pd.concat(
                                [t_info['label'].astype('str'),
                                t_info['min_t'].astype('float32'),
                                f_info['min_f'].astype('float32'),
                                t_info['max_t'].astype('float32'),
                                f_info['max_f'].astype('float32')],  axis=1)
        else:
            # temporal annotations only : no frequency limits
            tab_in = tab_in.rename(index=str, columns={
                                0: 'min_t', 1: 'max_t', 2: 'label'})
            tab_in['min_f'] = np.nan
            tab_in['max_f'] = np.nan

            # return dataframe
            tab_out = pd.concat([
                                tab_in['label'].astype('str'),
                                tab_in['min_t'].astype('float32'),
                                tab_in['min_f'].astype('float32'),
                                tab_in['max_t'].astype('float32'),
                                tab_in['max_f'].astype('float32')],  axis=1)
    except Exception:
        # best effort : any reading or parsing error gives an empty dataframe.
        # KeyboardInterrupt and SystemExit are not Exception subclasses and
        # are therefore not swallowed
        tab_out = pd.DataFrame()

    return tab_out


#%%

# [docs]
def write_audacity_annot(fname, df_rois, save_file=True):
    """
    Write audio segmentation to text file in Audacity format, a file that can be imported
    and modified with Audacity. If the dataframe has no frequency delimiters, annotations
    are saved with standard Audacity format (temporal segmentation). If the dataframe has
    temporal and frequencial delimiters, the annotations are saved as spectral selection
    style (spectro-temporal selection). If the dataframe is empty, the function saves an
    empty file.

    Parameters
    ----------
    fname: str
        filename to save the segmentation
    df_rois: pandas dataframe
        Dataframe containing the coordinates corresponding to sound signatures
        In case of only temporal annotations : df_rois must contain at least
        the columns 'min_t', 'max_t'
        In case of bounding box (temporal and frequency limits) : df_rois
        must contain at least the columns 'min_t', 'max_t', 'min_f', 'max_f'
        If the column 'label' is missing, incremental integers (0, 1, 2...)
        are used as labels. df_rois itself is not modified.
    save_file: bool, optional, default=True
        If True, the file is saved. If False, the file is not saved.

    Returns
    -------
    df_to_save
        Dataframe that has been saved (or that would have been saved when
        save_file is False)

    Examples
    --------
    >>> s, fs = maad.sound.load('../data/cold_forest_daylight.wav')
    >>> Sxx_power, tn, fn, ext = maad.sound.spectrogram(s, fs)
    >>> Sxx_db = maad.util.power2dB(Sxx_power) + 96
    >>> Sxx_power_noNoise= maad.sound.median_equalizer(Sxx_power)
    >>> Sxx_db_noNoise = maad.util.power2dB(Sxx_power_noNoise)
    >>> Sxx_db_noNoise_smooth = maad.sound.smooth(Sxx_db_noNoise, std=0.5)
    >>> im_mask = maad.rois.create_mask(im=Sxx_db_noNoise_smooth, mode_bin ='relative',
                                        bin_std=8, bin_per=0.5)
    >>> im_rois, df_rois = maad.rois.select_rois(im_mask, min_roi=25, max_roi=None)
    >>> df_rois = maad.util.format_features(df_rois, tn, fn)

    Change path to save the file containing the labels position

    >>> df_to_save = maad.util.write_audacity_annot('save.txt', df_rois)

    Import the wav file then the label file in Audacity

    """
    if df_rois.empty:  # empty DataFrame
        print(f'{fname} > No detection found')
        df_to_save = pd.DataFrame(data=None)

    else:
        # if there is no label, create a vector with incremental values.
        # The labels are kept in a local Series (aligned on the index of
        # df_rois) so that the dataframe of the caller is left untouched
        if 'label' in df_rois:
            labels = df_rois['label']
        else:
            labels = pd.Series(np.arange(0, len(df_rois)), index=df_rois.index)

        has_freq = ('min_f' in df_rois) and ('max_f' in df_rois)

        if not has_freq:
            # no frequency coordinates, only temporal annotations
            df_to_save = pd.DataFrame({'min_t': df_rois.min_t,
                                       'max_t': df_rois.max_t,
                                       'label': labels})
        else:
            n_rows = len(df_rois) * 2
            # rows 0, 2, 4... : time limits and label
            time_rows = pd.DataFrame({'index': np.arange(0, n_rows, 2),
                                      'min_t': df_rois.min_t,
                                      'max_t': df_rois.max_t,
                                      'label': labels})
            # rows 1, 3, 5... : backslash marker and frequency limits
            freq_rows = pd.DataFrame({'index': np.arange(1, n_rows, 2),
                                      'min_t': '\\',
                                      'max_t': df_rois.min_f,
                                      'label': df_rois.max_f})
            # interleave both sets of rows
            df_to_save = pd.concat([time_rows, freq_rows])
            df_to_save = df_to_save.set_index('index')
            df_to_save = df_to_save.sort_index()

    if save_file:
        df_to_save.to_csv(fname, index=False, header=False, sep='\t')

    return df_to_save


#%% 

# [docs]
def read_raven_annot(raven_filename):
    """
    Read Raven annotations file (or labeling file) and return a Pandas Dataframe
    with the bounding box and the label of each region of interest (ROI). If the file
    exists but has no annotations, the function returns an empty dataframe.

    Parameters
    ----------
    raven_filename : string
        Path to the annotation file (tab-separated text with a header row)

    Returns
    -------
    df_out : Pandas Dataframe
        Content of the annotation file, with the column names found in its
        header row. The dataframe is empty (no rows, no columns) when the
        file contains no data at all.

    References
    ----------
    http://ravensoundsoftware.com/wp-content/uploads/2017/11/Raven14UsersManual.pdf

    """
    try:
        df_out = pd.read_csv(raven_filename, sep='\t')
    except pd.errors.EmptyDataError:
        # a file without any content (no header, no rows) has no annotations
        df_out = pd.DataFrame()
    return df_out

    
#%%

# [docs]
def write_raven_annot(fname, df_rois, save_file=True):
    """
    Write audio segmentation to text file in Raven format, a file that can be imported
    and modified with Raven. The columns 'min_t', 'max_t', 'min_f', 'max_f' are renamed
    with the Raven column names and the basic Raven columns ('Selection', 'View',
    'Channel') are added when they are missing. If the dataframe is empty, the
    function saves an empty file.

    Parameters
    ----------
    fname: str
        filename to save the segmentation
    df_rois: pandas dataframe
        Dataframe containing the coordinates corresponding to sound signatures
        For bounding box (temporal and frequency limits) : df_rois
        must contain at least the columns 'min_t', 'max_t', 'min_f', 'max_f'
        df_rois itself is not modified (the function works on a copy).
    save_file: bool, optional, default=True
        If True, the file is saved. If False, the file is not saved.

    Returns
    -------
    df_out: pandas dataframe
        Dataframe in Raven format : the Raven columns come first, followed by
        the other columns of df_rois. Raven columns that cannot be filled
        from df_rois contain NaN. If df_rois is empty, a copy of df_rois is
        returned.

    Examples
    --------
    >>> from maad import sound, rois, util
    >>> s, fs = sound.load('../data/spinetail.wav')
    >>> df_rois = rois.find_rois_cwt(s, fs, flims=(3000,8000), tlen=2, th=0)
    >>> df_rois['Label'] = 'Spinetail'
    >>> df_raven = util.write_raven_annot('spinetail_annotations.txt', df_rois)

    """
    df_out = df_rois.copy()

    # Save empty file if dataframe is empty
    if df_out.size == 0:
        print(fname, '> No detection found')
        if save_file:
            empty_df = pd.DataFrame(data=None)
            empty_df.to_csv(fname, sep='\t', header=False, index=False)
        return df_out

    # add basic raven columns if needed
    if 'Selection' not in df_out.columns:
        df_out['Selection'] = np.arange(1, len(df_out)+1)

    if 'View' not in df_out.columns:
        df_out['View'] = 'Spectrogram 1'

    if 'Channel' not in df_out.columns:
        df_out['Channel'] = 1

    # change column names
    df_out = df_out.rename(columns={
                    'min_t': 'Begin Time (s)',
                    'max_t': 'End Time (s)',
                    'min_f': 'Low Freq (Hz)',
                    'max_f': 'High Freq (Hz)'})

    # reorder column names : raven columns first, then the remaining columns
    # in their original order
    colname_raven = ['Selection', 'View', 'Channel', 'Begin Time (s)',
                    'End Time (s)', 'Low Freq (Hz)', 'High Freq (Hz)']
    colname_other = [col for col in df_out.columns if col not in colname_raven]
    df_out = df_out.reindex(columns=colname_raven + colname_other)

    if save_file:
        df_out.to_csv(fname, sep='\t', index=False)

    return df_out


#%%

# [docs]
def date_parser(datadir, dateformat='%Y%m%d_%H%M%S', extension='.wav', prefix = '', verbose=False):
    """
    Extracts dates from filenames in a given folder and subfolders.

    Parameters
    ----------
    datadir : str
        Path to the folder to search for files.
    dateformat : str, optional
        Format string specifying the datetime pattern to extract.
        The default is '%Y%m%d_%H%M%S'
        For more information about the format codes, refer to the
        `strftime format documentation <https://strftime.org/>`_.
        Two special values are also accepted :
        'SM4' : the date is read at fixed positions in the filename
        (XXXXXXXX_yyyymmdd_hhmmss.wav).
        'POSIX' : the filename without extension is read as a hexadecimal
        number of seconds since 1970-01-01 00:00:00 (UTC).
    extension : str, optional,
        File extension to filter files by (e.g., '.wav', '.mp3').
        The default is '.wav'.
    prefix : str, optional,
        Prefix of the filenames to match.
        The default is ''.
    verbose : bool, optional
        If True, print the filenames in which a date pattern is found
        (only used when dateformat is a format string).
        The default is False.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the extracted dates as the index 'Date',
        and the full file paths in a 'file' column, sorted by date.
        The DataFrame is empty if no file is found.

    Raises
    ------
    ValueError
        If dateformat is 'POSIX' and a filename (without extension) is not
        a hexadecimal number.

    Notes
    -----
    This function searches for files in the specified folder and its subfolders
    that have the given extension and match the specified prefix. It extracts
    the dates from the filenames using the provided dateformat.

    When dateformat is a format string and no valid date is found in a
    filename, a message is printed and the default date 1900-01-01 00:00:00
    is used for this file.

    The extracted dates are set as the index of the resulting DataFrame. The
    'file' column contains the full file paths.

    Examples
    --------
    >>> folder_path = '../../data/indices/'
    >>> ext = '.wav'
    >>> datetime_format = '%Y%m%d_%H%M%S'
    >>> df = maad.util.date_parser(datadir=folder_path, dateformat=datetime_format, extension=ext)
    >>> df
                                                                    file
    Date
    2019-05-22 00:00:00  ../../data/indices/S4A03895_20190522_000000.wav
    2019-05-22 00:15:00  ../../data/indices/S4A03895_20190522_001500.wav
    2019-05-22 00:30:00  ../../data/indices/S4A03895_20190522_003000.wav
    2019-05-22 00:45:00  ../../data/indices/S4A03895_20190522_004500.wav
    2019-05-22 01:00:00  ../../data/indices/S4A03895_20190522_010000.wav
                    ...                                              ...
    2019-05-22 22:45:00  ../../data/indices/S4A03895_20190522_224500.wav
    2019-05-22 23:00:00  ../../data/indices/S4A03895_20190522_230000.wav
    2019-05-22 23:15:00  ../../data/indices/S4A03895_20190522_231500.wav
    2019-05-22 23:30:00  ../../data/indices/S4A03895_20190522_233000.wav
    2019-05-22 23:45:00  ../../data/indices/S4A03895_20190522_234500.wav

    >>> df = maad.util.date_parser("../../data/indices/", dateformat='SM4', verbose=False)
    >>> df
                                                                    file
    Date
    2019-05-22 00:00:00  ../../data/indices/S4A03895_20190522_000000.wav
    2019-05-22 00:15:00  ../../data/indices/S4A03895_20190522_001500.wav
    2019-05-22 00:30:00  ../../data/indices/S4A03895_20190522_003000.wav
    2019-05-22 00:45:00  ../../data/indices/S4A03895_20190522_004500.wav
    2019-05-22 01:00:00  ../../data/indices/S4A03895_20190522_010000.wav
                    ...                                              ...
    2019-05-22 22:45:00  ../../data/indices/S4A03895_20190522_224500.wav
    2019-05-22 23:00:00  ../../data/indices/S4A03895_20190522_230000.wav
    2019-05-22 23:15:00  ../../data/indices/S4A03895_20190522_231500.wav
    2019-05-22 23:30:00  ../../data/indices/S4A03895_20190522_233000.wav
    2019-05-22 23:45:00  ../../data/indices/S4A03895_20190522_234500.wav

    """
    # local import : only needed to convert POSIX timestamps
    from datetime import timezone

    # date used when no valid date can be extracted from a filename
    default_date = datetime(1900, 1, 1, 0, 0, 0)

    file_pattern = os.path.join(datadir, f'**/{prefix}*{extension}')
    file_list = glob.glob(file_pattern, recursive=True)

    # The regex pattern depends only on dateformat : build it once for all
    # the files instead of once per file
    use_regex = dateformat not in ('SM4', 'POSIX')
    pattern = _construct_pattern(dateformat) if (use_regex and file_list) else None

    data = []
    for file_path in file_list:

        # Extract the filename from the full path
        filename = os.path.basename(file_path)

        if dateformat == 'SM4':
            date = _date_from_filename(filename)

        elif dateformat == 'POSIX':
            # the stem is the number of seconds since the epoch, in hexadecimal
            posix_time = int(Path(filename).stem, 16)
            # naive datetime expressed in UTC
            date = datetime.fromtimestamp(posix_time, tz=timezone.utc).replace(tzinfo=None)

        else:
            # Search for the date in the filename
            match = pattern.search(filename)
            if match:
                if verbose:
                    print(f'File: {filename}')
                # Extract the date from the filename
                date_str = match.group()
                # Parse the date string
                try:
                    date = datetime.strptime(date_str, dateformat)
                except ValueError:
                    print(f"Error parsing date: {date_str} in file: {filename}. The default date and time 1900-01-01 00:00:00 will be used.")
                    date = default_date
            else:
                print(f"No date found in file: {file_path}. The default date and time 1900-01-01 00:00:00 will be used.")
                date = default_date

        data.append({'Date': date, 'file': file_path})

    if len(data) > 0:
        df = pd.DataFrame(data)
        df.set_index('Date', inplace=True)
        # convert index to datetime
        df.index = pd.DatetimeIndex(df.index)
        # sort dataframe by date
        df = df.sort_index(axis=0)
    else:
        df = pd.DataFrame()

    return df


def _construct_pattern(datetime_format):
    """
    Build a regular expression that finds, in a string, the text described
    by a strftime-like format.

    Parameters
    ----------
    datetime_format : str
        Format made of directives (e.g. '%Y', '%m', '%d') and of literal
        characters (e.g. '_', '-', '.'), for instance '%Y%m%d_%H%M%S'.

    Returns
    -------
    pattern : compiled regular expression
        Each known directive is replaced by a group matching its values and
        '%%' by a literal '%'. Literal characters and unknown directives
        must be found as they are in the string.
    """
    format_dict = {
        '%Y': r'(\d{4})',
        '%y': r'(\d{2})',
        '%m': r'(0[1-9]|1[0-2])',
        '%d': r'(0[1-9]|1\d|2[0-9]|3[01])',
        '%H': r'([01]\d|2[0-3])',
        '%I': r'(0[1-9]|1[0-2])',
        '%p': r'(AM|PM)',
        '%M': r'([0-5]\d)',
        '%S': r'([0-5]\d)',
        '%f': r'(\d{6})',
        '%j': r'(\d{3})',
        '%U': r'(\d{2})',
        '%W': r'(\d{2})',
        '%w': r'(\d)',
        '%A': r'(\w+)',
        '%a': r'(\w+)',
        '%B': r'(\w+)',
        '%b': r'(\w+)',
        '%c': r'(.+)',
        '%x': r'(.+)',
        '%X': r'(.+)',
        '%%': r'%',
    }
    # Splitting on a capturing group keeps the directives in the result :
    # even positions hold literal text, odd positions hold '%x' directives.
    # Reading the format in a single pass makes '%%Y' a literal '%Y' instead
    # of a '%' followed by a year.
    pieces = re.split(r'(%.)', datetime_format, flags=re.DOTALL)

    regex_parts = []
    for position, piece in enumerate(pieces):
        is_directive = position % 2 == 1
        if is_directive and piece in format_dict:
            regex_parts.append(format_dict[piece])
        else:
            # literal text : escape it so that characters such as '.', '('
            # or '+' are not interpreted as regular expression syntax
            regex_parts.append(re.escape(piece))

    return re.compile(''.join(regex_parts))