#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Collection of functions to send queries to www.xeno-canto.org, get dataframe
with all xeno-canto fields and eventually download bird sound files with
JSON metadata.
"""
#
# Authors: original author Karoliina Oksanen, 2014
# Updated to python 3.7.4, Agnieszka Mikolajczyk, 2019
# Modified for scikit-maad by Sylvain HAUPERT, 2021
#
# License: New BSD License
import urllib.request
import json
import pandas as pd
from pathlib import Path
import numpy as np
import os
# %%
# [docs]  (Sphinx doc-build artifact; commented out so the module imports)
def xc_query(searchTerms,
             max_nb_files=None,
             format_time=False,
             format_date=False,
             random_seed=1979,
             verbose=False):
    """
    Query metadata from Xeno-Canto website depending on the search terms. The
    audio recordings metadata are grouped and stored in a dataframe.

    Parameters
    ----------
    searchTerms : list
        list of search terms to perform the query
        The main search terms are :
        - grp : birds
        - gen : genus
        - ssp : subspecies
        - en : english name
        - q : quality
        - cnt : country
        - len : length
        - area : continent (europe, africa, america, asia)
        see more here : https://www.xeno-canto.org/help/search
    max_nb_files : integer, optional
        Maximum number of audio files requested. The default is None (no limit)
    format_time : boolean, optional
        Time in Xeno-Canto is not always present nor correctly formatted.
        If True, time will be correctly formatted ('HH:MM') to be processed as
        DateTime format. When formatting is not possible, the row is dropped.
        The default is False
    format_date : boolean, optional
        Date in Xeno-Canto is not always present nor correctly formatted.
        If True, rows with an incorrect format of date are dropped.
    random_seed : integer, optional
        Fix the random seed in order to get the same result every time the
        function is called
    verbose : boolean, optional
        Print messages during the execution of the function. The default is False.

    Returns
    -------
    df_dataset : pandas DataFrame
        Dataframe containing all the recordings metadata matching search terms
    """
    # pagination state: numPages is refreshed from every API answer
    numPages = 1
    page = 1
    df_dataset = pd.DataFrame()
    while page < numPages + 1:
        if verbose:
            print("Loading page " + str(page) + "...")
        url = 'https://www.xeno-canto.org/api/2/recordings?query={0}&page={1}'.format(
            '%20'.join(searchTerms), page)
        if verbose:
            print(url)
        # download and parse the JSON answer of the API for the current page
        jsonPage = urllib.request.urlopen(url)
        jsondata = json.loads(jsonPage.read().decode('utf-8'))
        # check number of pages
        numPages = jsondata['numPages']
        # Append the recordings of the current page to the dataframe
        df_dataset = pd.concat([df_dataset, pd.DataFrame(jsondata['recordings'])])
        # increment the current page
        page = page + 1
    # post-process only if the dataset is not empty
    if len(df_dataset) > 0:
        # convert latitude and longitude coordinates into float
        df_dataset['lat'] = df_dataset['lat'].astype(float)
        df_dataset['lng'] = df_dataset['lng'].astype(float)
        # rearrange index to be sure to have unique and increasing index
        df_dataset.reset_index(drop=True, inplace=True)
        # the format of length is not always correct (missing 0 before
        # 0:45 => 00:45) : pad 4-character lengths ('M:SS') with a leading 0.
        # (.loc assignment replaces the deprecated inplace .where idiom)
        mask_short_length = df_dataset['length'].str.len() == 4
        df_dataset.loc[mask_short_length, 'length'] = (
            '0' + df_dataset.loc[mask_short_length, 'length'])
        if format_time:
            # rearrange index to be sure to have unique and increasing index
            df_dataset.reset_index(drop=True, inplace=True)
            # the format of time is not always correct :
            # replace . by : and remove spaces.
            # BUGFIX: the result of .replace() was previously discarded
            # (neither assigned nor inplace), so the normalisation had no
            # effect; it is now assigned back to the column.
            df_dataset['time'] = df_dataset['time'].replace(
                to_replace='[.]', value=':', regex=True)
            df_dataset['time'] = df_dataset['time'].replace(
                to_replace='[ ]', value='', regex=True)
            # drop rows where there is no valid time information that can be corrected
            df_dataset = df_dataset[
                (df_dataset.time.str.match('^(0[0-9]|1[0-9]|2[0-3])[:]([0-5][0-9])$')) |
                (df_dataset.time.str.match('^([0-9])[:]([0-5][0-9])$'))]
            # Correct the format of time when 0 is missing (0:45 => 00:45)
            # by adding a 0. .loc avoids the chained assignment of the
            # original (df['time'][mask] = ...) which is unreliable.
            mask_short_time = df_dataset.time.str.match('^([0-9])[:]([0-5][0-9])$')
            df_dataset.loc[mask_short_time, 'time'] = (
                '0' + df_dataset.loc[mask_short_time, 'time'])
            if verbose:
                print("Keeped metadata for", len(df_dataset), "files after formating time")
        if format_date:
            # rearrange index to be sure to have unique and increasing index
            df_dataset.reset_index(drop=True, inplace=True)
            # drop rows where there is no valid date information
            df_dataset = df_dataset[df_dataset.date.str.match(
                r'^(20[0-9][0-9]|19[0-9][0-9])-(0[1-9]|1[0-2])-([1-9]|1[0-9]|2[0-9]|3[0-1])$')]
            if verbose:
                print("Keeped metadata for", len(df_dataset), "files after formating date")
        if format_time and format_date:
            # add a column with the week number
            df_dataset['week'] = pd.to_datetime(df_dataset['date']).dt.isocalendar()['week']  # type: ignore
            # add a column with datetime in DateTime format
            df_dataset['datetime'] = pd.to_datetime(
                df_dataset['time'] + ' ' + df_dataset['date'], format="%H:%M %Y-%m-%d")
        # if a limit on the number of files is requested, subsample
        # reproducibly (random_state fixed by random_seed)
        if max_nb_files is not None:
            # test if the number of files is greater than the maximum number
            # of requested files
            if len(df_dataset) > max_nb_files:
                df_dataset = df_dataset.sample(n=max_nb_files,
                                               random_state=random_seed)
    if verbose:
        print("Found", numPages, "pages in total.")
        print("Saved metadata for", len(df_dataset), "files")
    # rearrange index to be sure to have unique and increasing index
    df_dataset.reset_index(drop=True, inplace=True)
    return df_dataset
# %%
# [docs]  (Sphinx doc-build artifact; commented out so the module imports)
def xc_multi_query(df_query,
                   max_nb_files=None,
                   format_time=False,
                   format_date=False,
                   random_seed=1979,
                   verbose=False):
    """
    Perform one Xeno-Canto query per row of the input dataframe and gather
    every result into a single dataframe.

    Parameters
    ----------
    df_query : pandas DataFrame
        Dataframe with search terms. Each row corresponds to a new query.
        Columns correspond to the search terms allowed by Xeno-Canto
    max_nb_files : integer, optional
        Maximum number of audio files requested. The default is None
    format_time : boolean, optional
        Time in Xeno-Canto is not always present nor correctly formatted.
        If True, time will be correctly formatted to be processed as DateTime
        format. When formatting is not possible, the row is dropped.
        The default is False
    format_date : boolean, optional
        Date in Xeno-Canto is not always present nor correctly formatted.
        If True, rows with an incorrect format of date are dropped.
    random_seed : integer, optional
        Fix the random seed in order to get the same result every time the
        function is called
    verbose : boolean, optional
        Print messages during the execution of the function. The default is False.

    Returns
    -------
    df_dataset : pandas DataFrame
        Dataframe containing all the recordings metadata matching
        the search terms.
    """
    # run xc_query once per row; each row holds one list of search terms
    partial_results = [
        xc_query(query_row.tolist(),
                 max_nb_files,
                 format_time,
                 format_date,
                 random_seed,
                 verbose)
        for _, query_row in df_query.iterrows()
    ]
    if not partial_results:
        return pd.DataFrame()
    # ignore_index guarantees a unique, monotonically increasing index
    return pd.concat(partial_results, ignore_index=True)
#%%
# [docs]  (Sphinx doc-build artifact; commented out so the module imports)
def xc_selection(df_dataset,
                 max_nb_files=100,
                 max_length='01:00',
                 min_length='00:10',
                 min_quality='B',
                 verbose=False):
    """
    Select a maximum number of recordings depending on their quality and
    duration in order to create an homogeneous dataset.

    Parameters
    ----------
    df_dataset : pandas DataFrame
        Dataframe containing all the recordings metadata
    max_nb_files : int, optional
        Max number of audio files per species. The default is 100.
    max_length : string, optional
        Max duration of the audio files ('MM:SS'). The default is '01:00'.
    min_length : string, optional
        Min duration of the audio files ('MM:SS'). The default is '00:10'.
    min_quality : string, optional
        Min quality of the audio files ('A' best ... 'E' worst).
        The default is 'B'.
    verbose : boolean, optional
        Print messages during the execution of the function. The default is False.

    Returns
    -------
    df_dataset_out : pandas DataFrame
        Dataframe containing the selected recordings metadata

    Notes
    -----
    Durations are compared as zero-padded 'MM:SS' strings, which is only
    correct when both operands are zero padded.
    """
    df_dataset_out = pd.DataFrame()
    # accepted qualities, from best ('A') down to min_quality included.
    # 'E' or any unknown value accepts every quality (as in the original
    # if/elif chain, replaced here by a slice of the ordered list).
    all_qualities = ['A', 'B', 'C', 'D', 'E']
    if min_quality in all_qualities:
        quality = all_qualities[:all_qualities.index(min_quality) + 1]
    else:
        quality = all_qualities
    # process one species at a time (scientific name = genus + epithet)
    unique_species = pd.unique(df_dataset.gen + ' ' + df_dataset.sp)
    for name in unique_species:
        if verbose:
            print(name)
        # extract the genus and species from the scientific name
        gen = name.rpartition(' ')[0]
        sp = name.rpartition(' ')[2]
        # select the rows corresponding to the species
        # !! the string test is case sensitive (the genus starts with an
        # upper case). .copy() so the later drop() never touches (or warns
        # about) the caller's dataframe.
        subdf_dataset = df_dataset[(df_dataset.gen == gen) &
                                   (df_dataset.sp == sp)].copy()
        # sort the dataframe corresponding to the species by audio quality
        subdf_dataset = subdf_dataset.sort_values(by='q')
        # Counter initialization
        current_nb_files = 0
        current_quality = 0
        # fill the per-species quota, best quality first
        while (current_nb_files < max_nb_files) and (current_quality < len(quality)):
            requested_nb_files = max_nb_files - current_nb_files
            q = quality[current_quality]
            if verbose:
                print(' ... request %2.0f files of quality %s' %
                      (requested_nb_files, q))
            # candidates: current quality, duration within bounds
            mask = ((subdf_dataset.q == q) &
                    (subdf_dataset.length <= max_length) &
                    (subdf_dataset.length >= min_length))
            # take the longest recordings first, at most the remaining quota.
            # .iloc handles both the "enough candidates" and "not enough"
            # cases, which removes the duplicated if/else of the original.
            df_temp = subdf_dataset[mask].sort_values(
                by='length', ascending=False).iloc[0:requested_nb_files]
            # add the rows to the output dataframe
            df_dataset_out = pd.concat([df_dataset_out, df_temp], axis=0)
            # drop the selected rows to avoid future selection
            subdf_dataset = subdf_dataset.drop(df_temp.index, axis=0)
            if verbose:
                print(' --> found %2.0f files of quality %s and %s<length<%s' %
                      (len(df_temp), q, min_length, max_length))
            current_nb_files += len(df_temp)
            current_quality += 1
        if verbose:
            print(' total files : %2.f' % current_nb_files)
            print("-----------------------------------------")
    return df_dataset_out
#%%
# [docs]  (Sphinx doc-build artifact; commented out so the module imports)
def xc_download(df,
                rootdir,
                dataset_name='dataset',
                overwrite=False,
                save_csv=False,
                verbose=False):
    """
    Download the audio files from Xeno-Canto based on the input dataframe
    It will create directories for each species if needed

    Parameters
    ----------
    df : pandas DataFrame
        Dataframe containing the selected recordings metadata
    rootdir : string
        Path to the directory where the whole dataset will be saved
    dataset_name : string, optional
        Name of the dataset that will be created as a parent directory.
        The default is 'dataset'.
    overwrite : boolean, optional
        Test if the directory where the audio files will be downloaded already
        exists. if True, it will download the data in the directory anyway.
        Otherwise, if False, it will not download audio files.
    save_csv : boolean, optional
        if True, the csv corresponding to the species will be saved in the
        directory of the species. The default is False.
    verbose : boolean, optional
        Print messages during the execution of the function. The default is False.

    Returns
    -------
    df : pandas DataFrame
        Dataframe similar to df but without the rows of the audio recordings
        that were not downloaded.
        Add a new column "fullfilename" with the paths to the newly downloaded
        audio files (always stored as strings).
    """
    # format rootdir as path
    rootdir = Path(rootdir)
    # list of the full paths to the audios (str only, for a uniform column)
    fullpath_list = []
    # Use the Xeno-Canto 'id' as index so filenames become 'XC<id>.mp3'.
    # KeyError is raised when 'id' is absent (e.g. already set as index);
    # the original bare except also hid KeyboardInterrupt/SystemExit.
    try:
        df.set_index('id', inplace=True)
    except KeyError:
        pass
    #--------------------------------------------------------------------------
    # Check whether the specified path is an existing directory or not
    dataset_dir = rootdir / dataset_name
    isdir = os.path.exists(dataset_dir)
    if (not isdir) or overwrite:
        if overwrite:
            if verbose:
                print(
                    "The directory "
                    + str(dataset_dir)
                    + " already exists and will be overwritten")
        if verbose:
            numfiles = len(df)
            print("A total of", numfiles, "files will be downloaded")
        count = 1
        for index, row in df.iterrows():
            #------------------------------------------------------------------
            # one sub-directory per species: '<gen> <sp>_<english name>'
            name_dir = row.gen + ' ' + row.sp + '_' + row.en
            path = dataset_dir / name_dir
            if not os.path.exists(path):
                if verbose:
                    print("Creating subdirectory " +
                          str(path) +
                          " for downloaded files...")
                os.makedirs(path)
            #------------------------------------------------------------------
            # filename built from the Xeno-Canto id (the dataframe index)
            filename = 'XC' + str(index) + '.mp3'
            # test if the mp3 file already exists
            if os.path.exists(path / filename):
                # keep the existing file; str() so the 'fullfilename' column
                # has a uniform type (the original mixed Path and str)
                fullpath_list += [str(path / filename)]
                if verbose:
                    print(filename + " already exists")
            else:
                #--------------------------------------------------------------
                # get website recording http download address
                fileaddress = row.file
                # try to download the audio recording; best effort: a missing
                # recording must not abort the whole loop
                try:
                    fullpath, _ = urllib.request.urlretrieve(fileaddress, path / filename)
                    fullpath_list += [str(fullpath)]
                    if verbose:
                        numfiles = len(df)
                        print("Saving file ", count, "/", numfiles, ": " + fileaddress)
                except Exception:
                    # can't download the audio file (it does not exist (anymore) in
                    # xeno-canto)
                    if verbose:
                        numfiles = len(df)
                        print("***WARNING*** Can't save the file ",
                              count, "/", numfiles, ": " + fileaddress)
                    # drop the row of this recording
                    df.drop(index, inplace=True)
            #------------------------------------------------------------------
            # save/update the per-species metadata csv (best effort; note the
            # row may have been dropped above, in which case df.loc raises)
            if save_csv:
                filename_csv = str(path / 'metadata.csv')
                # test if the csv file doesn't exist
                if not os.path.exists(filename_csv):
                    # try to create a file with the row of the current index
                    try:
                        df.loc[index].to_frame().T.to_csv(filename_csv,
                                                          sep=";",
                                                          index=True,
                                                          index_label='id')
                    except Exception:
                        pass
                # if the csv file exists, concat both dataframes
                else:
                    # try to append the current row and drop duplicates
                    try:
                        pd.concat([pd.read_csv(filename_csv, sep=';', index_col='id'),
                                   df.loc[index].to_frame().T],
                                  ignore_index=False).drop_duplicates().to_csv(filename_csv,
                                                                               sep=";",
                                                                               index=True,
                                                                               index_label='id')
                    except Exception:
                        pass
            # increment the counter
            count += 1
        # add a new column with the downloaded files' paths
        df['fullfilename'] = fullpath_list
    else:
        if verbose:
            print(
                "***WARNING*** : The directory "
                + str(rootdir)
                + " already exists"
            )
    return df
#%%
# def xc_save_csv(
# df,
# rootdir = os.getcwd(),
# filename = "xc_metadata.csv",
# overwrite = False,
# verbose = False):
# """
# Save audio recordings metadata collected from xeno-canto into a csv file
# Parameters
# ----------
# df : pandas DataFrame
# Dataframe containing the selected recordings metadata
# rootdir : string, optional
# Path to the directory. The default is the current directory
# filename : string, optional
# Name of the csv file. The default is "xc_metadata.csv"
# overwrite : boolean, optional
# # Overwrite the csv file if it already exists
# verbose : boolean, optional
# Print messages during the execution of the function. The default is False.
# Returns
# -------
# fullpath : string
# Returns the full path of the csv file
# """
# # format rootdir to Path
# rootdir = Path(rootdir)
# # Check whether the specified path is an existing file or not
# isfile = os.path.isfile(rootdir / filename)
# if (isfile == False) or ((isfile == True) and (overwrite == True)) :
# if (overwrite == True):
# if verbose:
# print(
# "The file "
# + filename
# + " already exists in "
# + str(rootdir)
# + " and will be overwritten" )
# df.to_csv(rootdir / filename,
# sep=';',
# index=True,
# header=True)
# fullpath = rootdir / filename
# else:
# if verbose:
# print(
# "***WARNING*** : The file "
# + filename
# + " already exists in "
# + str(rootdir)
# )
# fullpath = []
# return fullpath
# #%%
# def xc_read_csv(
# filename,
# rootdir = os.getcwd(),
# verbose = False):
# """
# Read audio recordings metadata collected from xeno-canto and saved into
# a csv file
# Parameters
# ----------
# filename : string
# Name of the csv file.
# rootdir : string, optional
# Path to the directory. The default is the current directory
# verbose : boolean, optional
# Print messages during the execution of the function. The default is False.
# Returns
# -------
# df_dataset : pandas DataFrame
# Dataframe containing the audio recordings metadata
# """
# # format rootdir to Path
# rootdir = Path(rootdir)
# try :
# df_dataset = pd.read_csv(rootdir / filename, sep=';', index='id')
# except:
# df_dataset = pd.DataFrame()
# if verbose :
# print(
# "***WARNING : The file "
# + filename
# + " does not exist in "
# + str(rootdir))
# return df_dataset
# %%
if __name__ == '__main__':
    # Demo: query a handful of species on Xeno-Canto, then download a small
    # dataset with per-species metadata csv files.
    scientific_names = ['Agelaius phoeniceus',
                        'psittacula krameri',
                        'Ardea herodias']
    df_species = pd.DataFrame({'scientific name': scientific_names})
    # split each scientific name into genus / species epithet search terms
    genus_terms = [name.rpartition(' ')[0]
                   for name in df_species['scientific name']]
    species_terms = [name.rpartition(' ')[2]
                     for name in df_species['scientific name']]
    df_query = pd.DataFrame({'gen': genus_terms, 'sp': species_terms})
    # restrict to top quality and 10-60 s recordings
    df_query['q'] = 'q:A'
    df_query['len'] = 'len:"10-60"'
    # one query per row, at most 5 files each
    df_dataset = xc_multi_query(df_query,
                                max_nb_files=5,
                                verbose=True)
    print('number of files in the dataset is %d' % (len(df_dataset)))
    print(df_dataset.head(15))
    # download the audio files into ./my_dataset/<species>/
    xc_download(df_dataset,
                rootdir=os.getcwd(),
                dataset_name='my_dataset',
                save_csv=True,
                overwrite=False,
                verbose=True)