#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Source code for maad.util.audio_metadata_utilities
"""
Functions to get audio metadata from files
Warning for Windows users. Due to problems using slashes and backslashes, Paths must be raw strings
instead of regular strings, to convert a regular string into a raw string simply add an r before the string.
for example:
raw string: r'C:/Users/Documents/Folder/SubFolder/file.wav'
"""
import wave
import glob
import os
import pandas as pd
from pathlib import Path
import numpy as np
#%%
def _ensure_directory(path_str):
# Create a Path object from the input string
path = Path(path_str)
# Check if the path exists and is a directory
if path.is_dir():
return path
elif path.exists():
# If the path exists but is not a directory, raise an error
raise ValueError(f"'{path_str}' exists but is not a directory.")
else:
# If the path does not exist, try adding a '/' at the end and check again
path_with_slash = path / ''
if path_with_slash.is_dir():
return path_with_slash
else:
raise ValueError(f"'{path_str}' does not exist as a directory.")
#%%
def check_file_format(path_audio):
    """
    Check Wave file consistency. Check if WAVE format is correct and if file name
    follows standard format. The standard format is SITENAME_DATE_TIME.WAV, with
    DATE as YYYYMMDD and TIME as HHMMSS.

    Parameters
    ----------
    path_audio : str
        Location of audio filename.

    Raises
    ------
    FileNotFoundError
        If file does not exist.

    Returns
    -------
    error : int
        0 if no error is found, 1 if WAVE format is incorrect and 2 if filename has no
        standard format (wrong number of fields, wrong field length, non-digit
        characters, or a DATE/TIME that is not a real calendar date / clock time).
    """
    # Local import: the module header does not import datetime.
    from datetime import datetime

    basename = os.path.basename(path_audio)

    # Check Wave format: the file must open and expose its header parameters.
    try:
        with wave.open(path_audio, 'rb') as f:
            f.getparams()
    except FileNotFoundError:
        # A missing file is a caller error, not a format problem: propagate it.
        raise
    except Exception:
        # Any other failure to read the header means "not a usable WAVE file".
        # ``Exception`` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit through.
        return 1

    # Check file name format:
    # 1. File name must have 3 fields separated by underscore '_'
    # 2. Second field (date) must have 8 characters
    # 3. Third field (time) must have 6 characters + 4 = 10 ('.WAV')
    fields = basename.split('_')
    if len(fields) != 3:
        return 2

    date_str, time_str = fields[1], fields[2]
    time_digits = time_str[:-4]  # drop the 4-character extension ('.wav')
    if (len(date_str) != 8
            or len(time_str) != 10
            or not date_str.isdecimal()
            or not time_digits.isdecimal()):
        return 2

    # Digits alone are not enough: '20191340' or '256199' would pass the checks
    # above but cannot be converted to a timestamp later on.
    try:
        datetime.strptime(date_str + time_digits, '%Y%m%d%H%M%S')
    except ValueError:
        return 2

    return 0
#%%
def audio_header(path_audio):
    """
    Read the header of a WAVE file and return its contents as a dictionary.

    The dictionary holds the sample rate, bit depth, number of channels,
    number of samples, file size and duration.

    Parameters
    ----------
    path_audio : str
        Location of audio file.

    Returns
    -------
    metadata : dictionary
        header information, with keys 'path_audio', 'fname', 'sample_rate',
        'channels', 'bits', 'samples', 'fsize' and 'length'.

    Examples
    --------
    >>> from maad import util
    >>> dic_metadata = util.audio_header('../data/spinetail.wav')
    >>> print(dic_metadata)
    {'path_audio': '../data/spinetail.wav', 'fname': 'spinetail.wav', 'sample_rate': 44100, 'channels': 1, 'bits': 16, 'samples': 861799, 'fsize': 1723642, 'length': 19.541927437641725}
    """
    file_name = os.path.basename(path_audio)

    # Pull the individual header fields while the file is open.
    with wave.open(path_audio, 'rb') as reader:
        frame_rate = reader.getframerate()
        n_channels = reader.getnchannels()
        bytes_per_sample = reader.getsampwidth()
        n_frames = reader.getnframes()

    size_on_disk = os.path.getsize(path_audio)

    return {
        'path_audio': path_audio,
        'fname': file_name,
        'sample_rate': frame_rate,
        'channels': n_channels,
        'bits': bytes_per_sample * 8,  # sample width is reported in bytes
        'samples': n_frames,
        'fsize': size_on_disk,
        'length': n_frames / frame_rate,  # duration in seconds
    }
#%%
def filename_info(path_audio, verbose=False):
    """
    Extract sensor name, date and time from a file name in standard format.

    The standard format is SITENAME_DATE_TIME.WAV, with DATE as YYYYMMDD and
    TIME as HHMMSS.

    Parameters
    ----------
    path_audio : str
        Location of audio file.
    verbose : boolean, optional
        Not read by this function. The default is False.

    Raises
    ------
    TypeError
        If ``check_file_format`` does not return 0 for ``path_audio``.

    Returns
    -------
    metadata : dictionary
        file name information, with keys 'path_audio', 'fname', 'sensor_name',
        'date' (result of ``pd.to_datetime``) and 'time' (HHMMSS string).
    """
    # Guard clause: refuse anything the format check does not fully accept.
    if check_file_format(path_audio) != 0:
        raise TypeError(
            'File name format not supported. The standard format must be SITENAME_DATE_TIME.WAV, with DATE as YYYYMMDD and TIME as HHMMSS.')

    fname = os.path.basename(path_audio)
    parts = fname.split("_")
    sensor, day, clock = parts[0], parts[1], parts[2]

    # Rebuild an ISO-like 'YYYY-MM-DD HH:MM:SS' string for pandas to parse.
    timestamp = (f"{day[0:4]}-{day[4:6]}-{day[6:8]} "
                 f"{clock[0:2]}:{clock[2:4]}:{clock[4:6]}")

    return {
        'path_audio': path_audio,
        'fname': fname,
        'sensor_name': sensor,
        'date': pd.to_datetime(timestamp),
        'time': clock[0:6],  # strip the file extension from the last field
    }
#%%
def get_metadata_file(path_audio, verbose=False):
    """
    Get metadata associated with audio recordings in audio file. Metadata includes basic
    information of the audio file format (sample rate, number of channels, bit depth and
    file size), and date information from the filename. Note however, that this function
    is intended for use only with audio files with a self-describing header.

    Parameters
    ----------
    path_audio : str or os.PathLike
        Path to the audio file name. Backslashes are replaced by forward
        slashes before the path is used.
    verbose : boolean, optional
        Display error messages. The default is False.

    Raises
    ------
    FileNotFoundError
        Propagated from ``check_file_format`` when the file does not exist.

    Returns
    -------
    metadata : dictionary
        Dictionary with metadata. The keys are always, in this order:
        'path_audio', 'fname', 'sample_rate', 'channels', 'bits', 'samples',
        'length', 'fsize', 'sensor_name', 'date', 'time'. Fields that cannot
        be determined are set to ``np.nan``: every field except the two paths
        when the file is unreadable, and 'sensor_name', 'date', 'time' when
        only the file name is non-standard.
    """
    # os.fspath lets pathlib.Path objects through; calling .replace on a Path
    # directly would invoke its rename method instead of a string substitution.
    path_audio = os.fspath(path_audio).replace('\\', '/')  # for compatibility with Windows
    flag = check_file_format(path_audio)

    # Start from an all-null record so every branch returns the same keys in
    # the same order; this keeps DataFrame columns stable across files.
    metadata = {'path_audio': path_audio,
                'fname': os.path.basename(path_audio),
                'sample_rate': np.nan,
                'channels': np.nan,
                'bits': np.nan,
                'samples': np.nan,
                'length': np.nan,
                'fsize': np.nan,
                'sensor_name': np.nan,
                'date': np.nan,
                'time': np.nan}

    if flag == 1:  # unreadable audio file
        if verbose:
            print('Incorrect name or wave format. Return null values for: ', path_audio)
        return metadata

    # The header is readable in every remaining case.
    info_header = audio_header(path_audio)
    for key in ('fname', 'sample_rate', 'channels', 'bits', 'samples',
                'length', 'fsize'):
        metadata[key] = info_header[key]

    # flag == 2: filename format error (for passive acoustic monitoring only),
    # so the name-derived fields stay null. Otherwise parse them.
    if flag != 2:
        info_fname = filename_info(path_audio)
        for key in ('sensor_name', 'date', 'time'):
            metadata[key] = info_fname[key]

    return metadata
# %%
def get_metadata_dir(path_dir, verbose=False):
    """
    Get metadata associated with audio recordings in a directory. Metadata includes basic
    information of the audio file format (sample rate, number of channels, bit depth and
    file size), and date information from the filename. Note however, that this function
    is intended for use only with audio files with a self-describing header.

    Parameters
    ----------
    path_dir : str
        Path of a directory. All wav files (extension matched case-insensitively)
        found in it are selected. The search for files is performed recursively.
    verbose : boolean, optional
        Output file progress. The default is False.

    Raises
    ------
    ValueError
        If ``path_dir`` is not an existing directory.

    Returns
    -------
    df_metadata : pandas.DataFrame
        Dataframe with metadata, files as rows and metadata as columns.
        The dataframe is empty when no wav file is found.

    See Also
    --------
    maad.util.get_metadata_file, maad.util.audio_header, maad.util.filename_info

    Examples
    --------
    >>> from maad import util
    >>> df_metadata = util.get_metadata_dir('../data/indices/')
    """
    # Verify that input is a directory
    path_dir = _ensure_directory(path_dir)

    # List all files recursively and select only wav files. The directory part
    # is escaped so that '[', '*' or '?' in a folder name are taken literally.
    pattern = f"{glob.escape(str(path_dir))}/**/*.[Ww][Aa][Vv]"
    flist_wav = glob.glob(pattern, recursive=True)
    n_files = len(flist_wav)

    # Collect one record per file and build the dataframe once at the end;
    # growing a dataframe with pd.concat inside the loop copies all previous
    # rows on every iteration.
    records = []
    for count, file in enumerate(flist_wav, start=1):
        if verbose:
            print(f'{count} / {n_files} : {os.path.basename(file)}', end='\r')
        records.append(get_metadata_file(file, verbose))

    df_metadata = pd.DataFrame.from_records(records)
    return df_metadata
#%% Examples of use
"""
Test functions
# 1. Read metadata from a valid file with correct name format -> read ok
path_audio = './test_data/S4A03895_20190522_000000.wav'
get_metadata_file(path_audio)
audio_header(path_audio)
filename_info(path_audio)
check_file_format(path_audio) # should be 0
# 2. Read metadata from a valid file with incorrect name format -> null output
path_audio = './test_data/spinetail_20220219_30222L.wav'
get_metadata_file(path_audio)
audio_header(path_audio)
filename_info(path_audio)
check_file_format(path_audio) # should be error=2
# 3. Read metadata from a wav file with incorrect format and correct name format -> null output
path_audio = './test_data/NOHEADER_20190522_000000.wav'
get_metadata_file(path_audio)
audio_header(path_audio)
filename_info(path_audio)
check_file_format(path_audio) # should be error=1
# 4. Read metadata from a file with no header and incorrect name format -> null output
path_audio = './test_data/NOHEADER_BAD_FNAME_FORMAT.wav'
get_metadata_file(path_audio)
audio_header(path_audio)
filename_info(path_audio)
check_file_format(path_audio) # should be error=1
"""