Source code for maad.util.parser

#!/usr/bin/env python
""" Utilitary functions to parse and read audio and text files. """
#
# Authors:  Juan Sebastian ULLOA <lisofomia@gmail.com>
#           Sylvain HAUPERT <sylvain.haupert@mnhn.fr>
#
# License: New BSD License

#%%
# =============================================================================
# Load the modules
# =============================================================================
# Import external modules
import numpy as np 
import pandas as pd
import re
import os
import glob
from datetime import datetime
from pathlib import Path # in order to be Windows/linux/MacOS compatible


#%%
# =============================================================================
# Private functions
# =============================================================================
def _date_from_filename (filename):
    """
    Extract date and time from the filename. Return a datetime object
    
    Parameters
    ----------
    filename : string
    The filename must follow this format :
    XXXX_yyyymmdd_hhmmss.wav
    with yyyy : year / mm : month / dd: day / hh : hour (24hours) /
    mm : minutes / ss : seconds
            
    Returns
    -------
    date : object datetime
        This object contains the date of creation of the file extracted from
        the filename postfix. 
    """
    # date by default
    date = datetime(1900,1,1,0,0,0,0)
    # test if it is possible to extract the recording date from the filename
    if filename[9:13].isdigit(): 
        yy=int(filename[9:13])
    else:
        return date
    if filename[13:15].isdigit(): 
        mm=int(filename[13:15])
    else:
        return date
    if filename[15:17].isdigit(): 
        dd=int(filename[15:17])
    else:
        return date
    if filename[18:20].isdigit(): 
        HH=int(filename[18:20])
    else:
        return date
    if filename[20:22].isdigit(): 
        MM=int(filename[20:22])
    else:
        return date
    if filename[22:24].isdigit(): 
        SS=int(filename[22:24])
    else:
        return date

    # extract date and time from the filename
    date = datetime(year=yy, month=mm, day=dd, hour=HH, minute=MM, second=SS, 
                    microsecond=0)
    
    return date


#%%
# =============================================================================
# Public functions
# =============================================================================
[docs] def read_audacity_annot (audacity_filename): """ Read Audacity annotations file (or labeling file) and return a Pandas Dataframe with the bounding box and the label of each region of interest (ROI). Allows to read annotations with standard Audacity style (temporal selection) and with spectral selection style (spectro-temporal selection). If the file exists but has no annotations, the function returns and empty dataframe. Parameters ---------- audacity_filename : String Path to the audacity file Returns ------- tab_out : Pandas Dataframe Region of interest with time-frequency limits and manual annotation label References ---------- https://manual.audacityteam.org/man/label_tracks.html Examples -------- >>> from maad import sound >>> from maad.util import power2dB, read_audacity_annot, format_features, overlay_rois >>> s, fs = sound.load('../data/cold_forest_daylight.wav') >>> Sxx_power, tn, fn, ext = sound.spectrogram(s, fs, nperseg=1024, noverlap=1024//2) >>> Sxx_db = power2dB(Sxx_power) + 96 >>> df_rois = read_audacity_annot('../data/cold_forest_daylight_label.txt') >>> df_rois = format_features(df_rois, tn, fn) >>> overlay_rois(Sxx_db, df_rois, **{'vmin':0,'vmax':96,'extent':ext}) """ # try to read file with tab delimiter (if the file is not empty) try: tab_in = pd.read_csv(audacity_filename, delimiter='\t', header=None) # test if time-frequency annotation (1st column contain '/') # Hack to force the type of the column to be string in order to test if # the column contains a character tab_in[0] = tab_in[0].astype('str') if (tab_in[0].str.contains(r"\\", na = False).sum() > 0) : # arrange data t_info = tab_in.loc[np.arange(0, len(tab_in), 2), :] t_info = t_info.rename(index=str, columns={ 0: 'min_t', 1: 'max_t', 2: 'label'}) t_info = t_info.reset_index(drop=True) f_info = tab_in.loc[np.arange(1, len(tab_in)+1, 2), :] f_info = f_info.rename(index=str, columns={ 0: 'slash', 1: 'min_f', 2: 'max_f'}) f_info = f_info.reset_index(drop=True) # return dataframe tab_out = pd.concat( [t_info['label'].astype('str'), t_info['min_t'].astype('float32'), f_info['min_f'].astype('float32'), t_info['max_t'].astype('float32'), f_info['max_f'].astype('float32')], axis=1) else : tab_in = tab_in.rename(index=str, columns={ 0: 'min_t', 1: 'max_t', 2: 'label'}) tab_in['min_f'] = np.nan tab_in['max_f'] = np.nan # return dataframe tab_out = pd.concat([ tab_in['label'].astype('str'), tab_in['min_t'].astype('float32'), tab_in['min_f'].astype('float32'), tab_in['max_t'].astype('float32'), tab_in['max_f'].astype('float32')], axis=1) except : tab_out = pd.DataFrame() return tab_out
#%%
[docs] def write_audacity_annot(fname, df_rois, save_file=True): """ Write audio segmentation to text file in Audacity format, a file that can be imported and modified with Audacity. If the dataframe has no frequency delimiters, annotations are saved with standard Audacity format (temporal segmentation). If the dataframe has temporal and frequencial delimiters, the annotations are saved as spectral selection style (spectro-temporal selection). If the dataframe is empty, the function saves an empty file. Parameters ---------- fname: str filename to save the segmentation df_rois: pandas dataframe Dataframe containing the coordinates corresponding to sound signatures In case of only temporal annotations : df_rois must contain at least the columns 'mint_t', 'max_t' In case of bounding box (temporal eand frequency limits) :: df_rois must contain at least the columns 'min_t', 'max_t', 'min_f', 'max_f' save_file: bool, optional, default=True If True, the file is saved. If False, the file is not saved. Returns ------- df_to_save Dataframe that has been saved Examples -------- >>> s, fs = maad.sound.load('../data/cold_forest_daylight.wav') >>> Sxx_power, tn, fn, ext = maad.sound.spectrogram(s, fs) >>> Sxx_db = maad.util.power2dB(Sxx_power) + 96 >>> Sxx_power_noNoise= maad.sound.median_equalizer(Sxx_power) >>> Sxx_db_noNoise = maad.util.power2dB(Sxx_power_noNoise) >>> Sxx_db_noNoise_smooth = maad.sound.smooth(Sxx_db_noNoise, std=0.5) >>> im_mask = maad.rois.create_mask(im=Sxx_db_noNoise_smooth, mode_bin ='relative', bin_std=8, bin_per=0.5) >>> im_rois, df_rois = maad.rois.select_rois(im_mask, min_roi=25, max_roi=None) >>> df_rois = maad.util.format_features(df_rois, tn, fn) Change path to save the file containing the labels position >>> df_to_save = maad.util.write_audacity_annot('save.txt', df_rois) Import the wav file then the label file in Audacity """ if df_rois.empty: # empty DataFrame print(f'{fname} > No detection found') df_to_save = pd.DataFrame(data=None) else: # if there is no label, create a vector with incremental values if 'label' not in df_rois: df_rois['label'] = np.arange(0,len(df_rois)) # if no frequency coordinates, only temporal annotations if ('min_f' not in df_rois) or ('max_f' not in df_rois): df_to_save = pd.DataFrame({'min_t':df_rois.min_t, 'max_t':df_rois.max_t, 'label':df_rois.label}) else: df_to_save_odd = pd.DataFrame({'index': np.arange(0,len(df_rois)*2,2), 'min_t':df_rois.min_t, 'max_t':df_rois.max_t, 'label':df_rois.label}) df_to_save_even = pd.DataFrame({'index': np.arange(1,len(df_rois)*2,2), 'min_t':'\\', 'max_t':df_rois.min_f, 'label':df_rois.max_f}) df_to_save = pd.concat([df_to_save_odd,df_to_save_even]) df_to_save = df_to_save.set_index('index') df_to_save = df_to_save.sort_index() if save_file: df_to_save.to_csv(fname, index=False, header=False, sep='\t') else: pass return df_to_save
#%%
[docs] def read_raven_annot(raven_filename): """ Read Raven annotations file (or labeling file) and return a Pandas Dataframe with the bounding box and the label of each region of interest (ROI). If the file exists but has no annotations, the function returns and empty dataframe. Parameters ---------- raven_filename : string Path to the annotation file Returns ------- tab_out : Pandas Dataframe Region of interest with time-frequency limits and manual annotation label References ---------- http://ravensoundsoftware.com/wp-content/uploads/2017/11/Raven14UsersManual.pdf """ df_out = pd.read_csv(raven_filename, sep='\t') return df_out
#%%
[docs] def write_raven_annot(fname, df_rois, save_file=True): """ Write audio segmentation to text file in Raven format, a file that can be imported and modified with Raven. If the dataframe has no frequency delimiters, annotations are saved with standard Audacity format (temporal segmentation). If the dataframe has temporal and frequencial delimiters, the annotations are saved as spectral selection style (spectro-temporal selection). If the dataframe is empty, the function saves an empty file. Parameters ---------- fname: str filename to save the segmentation df_rois: pandas dataframe Dataframe containing the coordinates corresponding to sound signatures For bounding box (temporal eand frequency limits) :: df_rois must contain at least the columns 'min_t', 'max_t', 'min_f', 'max_f' Returns ------- df_out: pandas dataframe Dataframe that has been saved in Raven format Examples -------- >>> from maad import sound, rois, util >>> s, fs = sound.load('../data/spinetail.wav') >>> df_rois = rois.find_rois_cwt(s, fs, flims=(3000,8000), tlen=2, th=0) >>> df_rois['Label'] = 'Spinetail' >>> df_raven = util.write_raven_annot('spinetail_annotations.txt', df_rois) """ df_out = df_rois.copy() # Save empty file if dataframe is empty if df_out.size==0: print(fname, '> No detection found') df = pd.DataFrame(data=None) df.to_csv(fname, sep='\t', header=False, index=False) else: # Format dataframe and save # add basic raven columns if needed if not('Selection' in df_out.columns): df_out['Selection'] = np.arange(1, len(df_out)+1) if not('View' in df_out.columns): df_out['View'] = 'Spectrogram 1' if not('Channel' in df_out.columns): df_out['Channel'] = 1 # change column names df_out.rename(columns={ 'min_t': 'Begin Time (s)', 'max_t': 'End Time (s)', 'min_f': 'Low Freq (Hz)', 'max_f': 'High Freq (Hz)'}, inplace=True) # reorder column names colname_raven = ['Selection', 'View', 'Channel', 'Begin Time (s)', 'End Time (s)', 'Low Freq (Hz)', 'High Freq (Hz)'] colname_order = colname_raven + df_out.columns[~df_out.columns.isin(colname_raven)].tolist() df_out = df_out.reindex(columns=colname_order) if save_file: df_out.to_csv(fname, sep='\t', index=False) else: pass return df_out
#%%
[docs] def date_parser(datadir, dateformat='%Y%m%d_%H%M%S', extension='.wav', prefix = '', verbose=False): """ Extracts dates from filenames in a given folder and subfolders. Parameters ---------- datadir : str Path to the folder to search for files. dateformat : str, optional Format string specifying the datetime pattern to extract. The default is'%Y%m%d_%H%M%S' For more information about the format codes, refer to the `strftime format documentation <https://strftime.org/>`_. extension : str, optional, File extension to filter files by (e.g., '.wav', '.mp3'). The default is '.wav'. prefix : str, optional, Prefix of the filenames to match. The default is ''. verbose : bool, optional If True, print the filenames as they are processed. The default is False. Returns ------- pandas.DataFrame DataFrame containing the extracted dates as the index 'Date', and the full file paths in a 'file' column. Raises ------ ValueError If the datetime_format is invalid or does not match the filenames. Notes ----- This function searches for files in the specified folder and its subfolders that have the given extension and match the specified prefix. It extracts the dates from the filenames using the provided datetime_format. The extracted dates are set as the index of the resulting DataFrame. The 'file' column contains the full file paths. Examples -------- >>> folder_path = '../../data/indices/' >>> ext = '.wav' >>> datetime_format = '%Y%m%d_%H%M%S' >>> df = maad.util.date_parser(datadir=folder_path, dateformat=datetime_format, extension=ext) >>> df file Date 2019-05-22 00:00:00 ../../data/indices/S4A03895_20190522_000000.wav 2019-05-22 00:15:00 ../../data/indices/S4A03895_20190522_001500.wav 2019-05-22 00:30:00 ../../data/indices/S4A03895_20190522_003000.wav 2019-05-22 00:45:00 ../../data/indices/S4A03895_20190522_004500.wav 2019-05-22 01:00:00 ../../data/indices/S4A03895_20190522_010000.wav ... ... 2019-05-22 22:45:00 ../../data/indices/S4A03895_20190522_224500.wav 2019-05-22 23:00:00 ../../data/indices/S4A03895_20190522_230000.wav 2019-05-22 23:15:00 ../../data/indices/S4A03895_20190522_231500.wav 2019-05-22 23:30:00 ../../data/indices/S4A03895_20190522_233000.wav 2019-05-22 23:45:00 ../../data/indices/S4A03895_20190522_234500.wav >>> df = maad.util.date_parser("../../data/indices/", dateformat='SM4', verbose=False) >>> list(df) >>> df file Date 2019-05-22 00:00:00 ../../data/indices/S4A03895_20190522_000000.wav 2019-05-22 00:15:00 ../../data/indices/S4A03895_20190522_001500.wav 2019-05-22 00:30:00 ../../data/indices/S4A03895_20190522_003000.wav 2019-05-22 00:45:00 ../../data/indices/S4A03895_20190522_004500.wav 2019-05-22 01:00:00 ../../data/indices/S4A03895_20190522_010000.wav ... ... 2019-05-22 22:45:00 ../../data/indices/S4A03895_20190522_224500.wav 2019-05-22 23:00:00 ../../data/indices/S4A03895_20190522_230000.wav 2019-05-22 23:15:00 ../../data/indices/S4A03895_20190522_231500.wav 2019-05-22 23:30:00 ../../data/indices/S4A03895_20190522_233000.wav 2019-05-22 23:45:00 ../../data/indices/S4A03895_20190522_234500.wav """ file_pattern = os.path.join(datadir, f'**/{prefix}*{extension}') file_list = glob.glob(file_pattern, recursive=True) data = [] for file_path in file_list: # Extract the filename from the full path filename = os.path.basename(file_path) if dateformat == 'SM4': date =_date_from_filename(filename) data.append({'Date': date, 'file': file_path}) elif dateformat == 'POSIX': posix_time = int(Path(filename).stem, 16) date = datetime.utcfromtimestamp(posix_time).strftime('%Y-%m-%d %H:%M:%S') data.append({'Date': date, 'file': file_path}) else: # Construct a regex pattern to extract the date from the filename pattern = _construct_pattern(dateformat) # Search for the date in the filename match = pattern.search(filename) # If a match is found, extract the date if match: if verbose: print(f'File: {filename}') # Extract the date from the filename date_str = match.group() # Parse the date string try: date = datetime.strptime(date_str, dateformat) data.append({'Date': date, 'file': file_path}) except ValueError: print(f"Error parsing date: {date_str} in file: {filename}. The default date and time 1900-01-01 00:00:00 will be used.") # date by default data.append({'Date': "1900-01-01 00:00:01", 'file': file_path}) else: print(f"No date found in file: {file_path}. The default date and time 1900-01-01 00:00:00 will be used.") # date by default 1900-01-01 00:00:00 data.append({'Date': "1900-01-01 00:00:01", 'file': file_path}) if len(data) > 0: df = pd.DataFrame(data) df.set_index('Date', inplace=True) # convert index to datetime df.index = pd.DatetimeIndex(df.index) # sort dataframe by date df = df.sort_index(axis=0) else: df = pd.DataFrame() return df
def _construct_pattern(datetime_format): format_dict = { '%Y': r'(\d{4})', '%y': r'(\d{2})', '%m': r'(0[1-9]|1[0-2])', '%d': r'(0[1-9]|1\d|2[0-9]|3[01])', '%H': r'([01]\d|2[0-3])', '%I': r'(0[1-9]|1[0-2])', '%p': r'(AM|PM)', '%M': r'([0-5]\d)', '%S': r'([0-5]\d)', '%f': r'(\d{6})', '%j': r'(\d{3})', '%U': r'(\d{2})', '%W': r'(\d{2})', '%w': r'(\d)', '%A': r'(\w+)', '%a': r'(\w+)', '%B': r'(\w+)', '%b': r'(\w+)', '%c': r'(.+)', '%x': r'(.+)', '%X': r'(.+)', '%%': r'%', } pattern = datetime_format for code, regex in format_dict.items(): pattern = pattern.replace(code, regex) pattern = re.compile(pattern) return pattern