Source code for maad.util.audio_metadata_utilities

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Functions to get audio metadata from files

Warning for Windows users: due to problems with slashes and backslashes, paths must be raw strings
instead of regular strings. To convert a regular string into a raw string, simply add an r before the string.
For example:
    raw string:        r'C:/Users/Documents/Folder/SubFolder/file.wav'
"""

import wave
import glob
import os
import pandas as pd
from pathlib import Path
import numpy as np

#%%
def _ensure_directory(path_str):
    """
    Return ``path_str`` as a ``pathlib.Path`` after checking it is a directory.

    Parameters
    ----------
    path_str : str
        Path expected to point to an existing directory.

    Raises
    ------
    ValueError
        If the path exists but is not a directory, or if it does not exist.

    Returns
    -------
    path : pathlib.Path
        Path object built from ``path_str``.
    """
    path = Path(path_str)

    if path.is_dir():
        return path

    # pathlib drops empty segments, so appending '' (a trailing slash) to the
    # path cannot change the result of is_dir(); no retry is attempted and the
    # two failure modes are reported directly.
    if path.exists():
        raise ValueError(f"'{path_str}' exists but is not a directory.")
    raise ValueError(f"'{path_str}' does not exist as a directory.")

#%%

[docs]
def check_file_format(path_audio):
    """
    Check Wave file consistency. Check if WAVE format is correct and if file name
    follows standard format. The standard format is SITENAME_DATE_TIME.WAV, with
    DATE as YYYYMMDD and TIME as HHMMSS.

    The WAVE check is performed first, so a file that fails both checks is
    reported with error code 1.

    Parameters
    ----------
    path_audio : str
        Location of audio filename.

    Raises
    ------
    FileNotFoundError
        If file does not exist.

    Returns
    -------
    error : int
        0 if no error is found, 1 if WAVE format is incorrect and 2 if filename has no
        standard format (wrong number of fields, wrong field length, non-digit
        characters, or a DATE/TIME that is not a valid calendar date and time).

    """
    from datetime import datetime

    basename = os.path.basename(path_audio)

    # Check Wave format: the header must be readable by the wave module.
    try:
        with wave.open(path_audio, 'rb') as f:
            f.getparams()
    except FileNotFoundError:
        # A missing file is a caller error, not a format problem.
        raise
    except Exception:
        # Any other failure while reading the header means the file is not a
        # usable WAVE file. KeyboardInterrupt/SystemExit are not swallowed.
        return 1

    # Check file name format:
    # 1. File name must have 3 fields separated by underscore '_'
    fields = basename.split('_')
    if len(fields) != 3:
        return 2

    _, date_str, time_field = fields
    time_str = time_field[:-4]  # drop the 4-character extension ('.WAV')

    # 2. Second field (date) must have 8 digits
    # 3. Third field (time) must have 6 digits + 4 characters = 10 ('.WAV')
    if (len(date_str) != 8
            or len(time_field) != 10
            or not date_str.isdigit()
            or not time_str.isdigit()):
        return 2

    # 4. The digits must form a real date and time (e.g. month 13 or hour 25
    #    are rejected), so the fields can later be converted to a timestamp.
    try:
        datetime.strptime(date_str + time_str, '%Y%m%d%H%M%S')
    except ValueError:
        return 2

    return 0


#%% 

[docs]
def audio_header(path_audio):
    """
    Read the header of a WAVE file and return its basic properties.

    The returned information covers sample rate, number of channels, bit
    depth, number of samples, file size on disk and duration.

    Parameters
    ----------
    path_audio : str
        Location of audio file.

    Returns
    -------
    metadata : dictionary
        Header information with keys 'path_audio', 'fname', 'sample_rate',
        'channels', 'bits', 'samples', 'fsize' and 'length'. 'bits' is the
        sample width in bytes times 8, and 'length' is the number of frames
        divided by the sample rate.

    Examples
    --------
    >>> from maad import util
    >>> dic_metadata = util.audio_header('../data/spinetail.wav')
    >>> print(dic_metadata)
    {'path_audio': '../data/spinetail.wav', 'fname': 'spinetail.wav', 'sample_rate': 44100, 'channels': 1, 'bits': 16, 'samples': 861799, 'fsize': 1723642, 'length': 19.541927437641725}
    """
    fname = os.path.basename(path_audio)

    # Pull the individual header fields while the file is open.
    with wave.open(path_audio, 'rb') as wav:
        n_channels = wav.getnchannels()
        sample_width = wav.getsampwidth()
        rate = wav.getframerate()
        n_frames = wav.getnframes()

    size_bytes = os.path.getsize(path_audio)
    duration = n_frames / rate

    return {
        'path_audio': path_audio,
        'fname': fname,
        'sample_rate': rate,
        'channels': n_channels,
        'bits': sample_width * 8,
        'samples': n_frames,
        'fsize': size_bytes,
        'length': duration,
    }


#%% 

[docs]
def filename_info(path_audio, verbose=False):
    """
    Extract sensor name, date and time from a file name in standard format.
    The standard format is SITENAME_DATE_TIME.WAV, with DATE as YYYYMMDD and
    TIME as HHMMSS.

    Parameters
    ----------
    path_audio : str
        Location of audio file.
    verbose : boolean, optional
        Currently not used by this function. The default is False.

    Raises
    ------
    TypeError
        If check_file_format does not return 0 for the file, i.e. the file is
        not a readable WAVE file or its name is not in standard format.

    Returns
    -------
    metadata : dictionary
        file name information, with keys 'path_audio', 'fname', 'sensor_name',
        'date' (result of pandas.to_datetime) and 'time' (HHMMSS string).

    """
    # Guard clause: anything other than a clean check is rejected.
    if check_file_format(path_audio) != 0:
        raise TypeError(
            'File name format not supported. The standard format must be SITENAME_DATE_TIME.WAV, with DATE as YYYYMMDD and TIME as HHMMSS.')

    fname = os.path.basename(path_audio)
    # A successful check guarantees exactly three '_'-separated fields.
    sensor, day, clock = fname.split("_")

    # Rebuild the fields as 'YYYY-MM-DD HH:MM:SS' for pandas.
    timestamp = (f"{day[0:4]}-{day[4:6]}-{day[6:8]} "
                 f"{clock[0:2]}:{clock[2:4]}:{clock[4:6]}")

    return {'path_audio': path_audio,
            'fname': fname,
            'sensor_name': sensor,
            'date': pd.to_datetime(timestamp),
            'time': clock[0:6]}


#%%

[docs]
def get_metadata_file(path_audio, verbose=False):
    """
    Get metadata associated with an audio file. Metadata includes basic
    information of the audio file format (sample rate, number of channels, bit depth and
    file size), and date information from the filename. Note however, that this function
    is intended for use only with audio files with a self-describing header.

    Parameters
    ----------
    path_audio : str or path-like
        Path to the audio file name. Backslashes are replaced by forward
        slashes before the file is accessed.
    verbose : boolean, optional
        Display error messages. The default is False.

    Returns
    -------
    metadata : dictionary
        Dictionary with metadata. The keys are always, in this order:
        'path_audio', 'fname', 'sample_rate', 'channels', 'bits', 'samples',
        'length', 'fsize', 'sensor_name', 'date' and 'time'. If the file is
        not a readable WAVE file, every field except 'path_audio' and 'fname'
        is NaN. If only the file name is not in standard format,
        'sensor_name', 'date' and 'time' are NaN.

    """
    # os.fspath lets pathlib.Path objects through; str input is unchanged.
    path_audio = os.fspath(path_audio).replace('\\', '/')  # for compatibility with Windows

    flag = check_file_format(path_audio)

    # Start from an all-null record so every outcome shares the same keys in
    # the same order, whichever checks the file passes.
    metadata = {'path_audio': path_audio,
                'fname': os.path.basename(path_audio),
                'sample_rate': np.nan,
                'channels': np.nan,
                'bits': np.nan,
                'samples': np.nan,
                'length': np.nan,
                'fsize': np.nan,
                'sensor_name': np.nan,
                'date': np.nan,
                'time': np.nan}

    if flag == 1:  # unreadable audio file
        if verbose:
            print('Incorrect name or wave format. Return null values for: ', path_audio)
        return metadata

    # The header is readable for every remaining case.
    info_header = audio_header(path_audio)
    for key in ('fname', 'sample_rate', 'channels', 'bits', 'samples',
                'length', 'fsize'):
        metadata[key] = info_header[key]

    # flag == 2 means a filename format error (for passive acoustic monitoring
    # only): the filename fields stay null. They are filled only when no
    # error was found.
    if flag == 0:
        info_fname = filename_info(path_audio)
        for key in ('sensor_name', 'date', 'time'):
            metadata[key] = info_fname[key]

    return metadata


# %%

[docs]
def get_metadata_dir(path_dir, verbose=False):
    """
    Get metadata associated with audio recordings in a directory. Metadata includes basic
    information of the audio file format (sample rate, number of channels, bit depth and
    file size), and date information from the filename. Note however, that this function
    is intended for use only with audio files with a self-describing header.

    Parameters
    ----------
    path_dir : str
        Path of an existing directory. All files with a 'wav' extension (in
        any letter case) found in it are selected. The search for files is
        performed recursively.
    verbose : boolean, optional
        Output file progress. The default is False.

    Raises
    ------
    ValueError
        If path_dir does not exist or is not a directory.

    Returns
    -------
    df_metadata : pandas.DataFrame
        Dataframe with metadata, files as rows (sorted by path) and metadata
        as columns. Empty if no wav file is found.

    See Also
    --------
    maad.util.get_metadata_file, maad.util.audio_header, maad.util.filename_info

    Examples
    --------
    >>> from maad import util
    >>> df_metadata = util.get_metadata_dir('../data/indices/')
    """
    # Verify that input is a directory
    path_dir = _ensure_directory(path_dir)

    # List all files recursively and select only wav files. The directory part
    # is escaped so that characters such as '[', ']', '*' or '?' in folder
    # names are matched literally instead of being read as glob wildcards.
    pattern = f"{glob.escape(str(path_dir))}/**/*.[Ww][Aa][Vv]"
    # glob returns files in arbitrary order; sort for a reproducible output.
    flist_wav = sorted(glob.glob(pattern, recursive=True))

    # Get metadata for each file. Records are collected in a list and turned
    # into a DataFrame once, instead of growing a DataFrame inside the loop.
    n_files = len(flist_wav)
    records = []
    for count, file in enumerate(flist_wav, start=1):
        if verbose:
            print(f'{count} / {n_files} : {os.path.basename(file)}', end='\r')

        records.append(get_metadata_file(file, verbose))

    df_metadata = pd.DataFrame.from_records(records)

    return df_metadata




#%% Examples of use
"""
Test functions 


# 1. Read metadata from a valid file with correct name format -> read ok
path_audio = './test_data/S4A03895_20190522_000000.wav'
get_metadata_file(path_audio)
audio_header(path_audio)
filename_info(path_audio)
check_file_format(path_audio)  # should be 0

# 2. Read metadata from a valid file with incorrect name format -> header fields only, null sensor_name/date/time
path_audio = './test_data/spinetail_20220219_30222L.wav'
get_metadata_file(path_audio)
audio_header(path_audio)
filename_info(path_audio)
check_file_format(path_audio) # should be error=2

# 3. Read metadata from a wav file with incorrect format and correct name format -> null output
path_audio = './test_data/NOHEADER_20190522_000000.wav'
get_metadata_file(path_audio)
audio_header(path_audio)
filename_info(path_audio)
check_file_format(path_audio)  # should be error=1

# 4. Read metadata from a file with no header and incorrect name format -> null output
path_audio = './test_data/NOHEADER_BAD_FNAME_FORMAT.wav'
get_metadata_file(path_audio)
audio_header(path_audio)
filename_info(path_audio)
check_file_format(path_audio)  # should be error=1

"""