
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" 
Collection of functions to send queries to www.xeno-canto.org, get dataframe
with all xeno-canto fields and eventually download bird sound files with
JSON metadata.

"""   
#
# Authors:  original author Karoliina Oksanen, 2014
#           Updated to python 3.7.4, Agnieszka Mikolajczyk, 2019
#           Modified for scikit-maad by Sylvain HAUPERT, 2021        
#
# License: New BSD License


import urllib.request
import json
import pandas as pd
from pathlib import Path
import numpy as np
import os

# %%
def xc_query(searchTerms,
             max_nb_files=None,
             format_time=False,
             format_date=False,
             random_seed=1979,
             verbose=False):
    """
    Query metadata from the Xeno-Canto website depending on the search terms.
    The audio recordings metadata are grouped and stored in a dataframe.

    Parameters
    ----------
    searchTerms : list
        List of search terms used to perform the query.
        The main search terms are :

        - grp : group (e.g. birds)
        - gen : genus
        - ssp : subspecies
        - en : English name
        - q : quality
        - cnt : country
        - len : length
        - area : continent (europe, africa, america, asia)

        See more here : https://www.xeno-canto.org/help/search
    max_nb_files : integer, optional
        Maximum number of audio files requested. The default is None.
    format_time : boolean, optional
        Time in Xeno-Canto is not always present nor correctly formatted.
        If True, the time is reformatted so that it can be processed as a
        DateTime format. When reformatting is not possible, the row is
        dropped. The default is False.
    format_date : boolean, optional
        Date in Xeno-Canto is not always present nor correctly formatted.
        If True, rows with an incorrectly formatted date are dropped.
        The default is False.
    random_seed : integer, optional
        Fix the random seed in order to get the same result every time the
        function is called. The default is 1979.
    verbose : boolean, optional
        Print messages during the execution of the function.
        The default is False.

    Returns
    -------
    df_dataset : pandas DataFrame
        Dataframe containing all the recordings metadata matching the
        search terms.
    """
    # *** HACK *** to remove the parameter 'type' from the query as it did
    # not work as of 10 Nov 2022
    # params = searchTerms
    # searchTerms = []
    # if params is not None:
    #     for param in params:
    #         if 'type' not in param:
    #             searchTerms.append(param)
    # *** END HACK ***

    # initialization
    numPages = 1
    page = 1
    df_dataset = pd.DataFrame()

    while page < numPages + 1:
        if verbose:
            print("Loading page " + str(page) + "...")
        url = 'https://www.xeno-canto.org/api/2/recordings?query={0}&page={1}'.format(
            '%20'.join(searchTerms), page)
        if verbose:
            print(url)
        jsonPage = urllib.request.urlopen(url)
        jsondata = json.loads(jsonPage.read().decode('utf-8'))

        # check the number of pages
        numPages = jsondata['numPages']

        # append the records of the current page to the dataframe
        df_dataset = pd.concat([df_dataset,
                                pd.DataFrame(jsondata['recordings'])])

        # increment the current page
        page = page + 1

    # test if the dataset is not empty
    if len(df_dataset) > 0:

        # *** HACK *** to filter the dataset with the parameter 'type' as it
        # did not work for a regular query as of 10 Nov 2022
        # if verbose:
        #     print("searchTerms {}".format(searchTerms))
        # if params is not None:
        #     for param in params:
        #         if 'type' in param:
        #             value = param.split(':')[1]
        #             df_dataset = df_dataset[df_dataset.type.apply(
        #                 lambda type: value in type)]
        # *** END HACK ***

        # convert latitude and longitude coordinates into float
        df_dataset['lat'] = df_dataset['lat'].astype(float)
        df_dataset['lng'] = df_dataset['lng'].astype(float)

        # rearrange the index to be sure to have a unique, increasing index
        df_dataset.reset_index(drop=True, inplace=True)

        # The format of length is not always correct (missing 0 before
        # 0:45 => 00:45). Correct the format of lengths shorter than 9:59
        # (4 characters) by adding a leading 0.
        mask = df_dataset.length.str.len() == 4
        df_dataset.loc[mask, 'length'] = '0' + df_dataset.loc[mask, 'length']

        if format_time:
            # rearrange the index to be sure to have a unique, increasing index
            df_dataset.reset_index(drop=True, inplace=True)
            # the format of time is not always correct :
            # replace . by : and remove spaces
            df_dataset['time'] = df_dataset['time'].replace(
                to_replace='[.]', value=':', regex=True)
            df_dataset['time'] = df_dataset['time'].replace(
                to_replace='[ ]', value='', regex=True)
            # drop rows where there is no valid time information that can
            # be corrected
            df_dataset = df_dataset[
                (df_dataset.time.str.match('^(0[0-9]|1[0-9]|2[0-3])[:]([0-5][0-9])$'))
                | (df_dataset.time.str.match('^([0-9])[:]([0-5][0-9])$'))]
            # correct the format of time when the leading 0 is missing
            # (0:45 => 00:45)
            mask = df_dataset.time.str.match('^([0-9])[:]([0-5][0-9])$')
            df_dataset.loc[mask, 'time'] = '0' + df_dataset.loc[mask, 'time']
            if verbose:
                print("Kept metadata for", len(df_dataset),
                      "files after formatting time")

        if format_date:
            # rearrange the index to be sure to have a unique, increasing index
            df_dataset.reset_index(drop=True, inplace=True)
            # drop rows where there is no valid date information
            df_dataset = df_dataset[df_dataset.date.str.match(
                r'^(20[0-9][0-9]|19[0-9][0-9])-(0[1-9]|1[0-2])-([1-9]|1[0-9]|2[0-9]|3[0-1])$')]
            if verbose:
                print("Kept metadata for", len(df_dataset),
                      "files after formatting date")

        if format_time and format_date:
            # add a column with the week number
            df_dataset['week'] = pd.to_datetime(
                df_dataset['date']).dt.isocalendar()['week']
            # add a column with the datetime in DateTime format
            df_dataset['datetime'] = pd.to_datetime(
                df_dataset['time'] + ' ' + df_dataset['date'],
                format="%H:%M %Y-%m-%d")

        # if a limit on the number of files is requested
        if max_nb_files is not None:
            # test if the number of files is greater than the maximum
            # number of requested files
            if len(df_dataset) > max_nb_files:
                df_dataset = df_dataset.sample(n=max_nb_files,
                                               random_state=random_seed)

    if verbose:
        print("Found", numPages, "pages in total.")
        print("Saved metadata for", len(df_dataset), "files")

    # rearrange the index to be sure to have a unique, increasing index
    df_dataset.reset_index(drop=True, inplace=True)

    return df_dataset
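# %%
# A minimal usage sketch for xc_query. The search terms below are
# illustrative, not from the original source. Commented out so that
# importing the module has no side effects (the call requires an internet
# connection):
#
#     df = xc_query(['Sitta', 'europaea', 'q:A'],
#                   max_nb_files=10,
#                   format_time=True,
#                   format_date=True,
#                   verbose=True)
#     print(df[['gen', 'sp', 'q', 'length', 'datetime']].head())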
# %%
def xc_multi_query(df_query,
                   max_nb_files=None,
                   format_time=False,
                   format_date=False,
                   random_seed=1979,
                   verbose=False):
    """
    Perform multiple queries following the search terms defined in the
    input dataframe.

    Parameters
    ----------
    df_query : pandas DataFrame
        Dataframe with the search terms. Each row corresponds to a new query.
        Columns correspond to the search terms allowed by Xeno-Canto.
    max_nb_files : integer, optional
        Maximum number of audio files requested. The default is None.
    format_time : boolean, optional
        Time in Xeno-Canto is not always present nor correctly formatted.
        If True, the time is reformatted so that it can be processed as a
        DateTime format. When reformatting is not possible, the row is
        dropped. The default is False.
    format_date : boolean, optional
        Date in Xeno-Canto is not always present nor correctly formatted.
        If True, rows with an incorrectly formatted date are dropped.
        The default is False.
    random_seed : integer, optional
        Fix the random seed in order to get the same result every time the
        function is called. The default is 1979.
    verbose : boolean, optional
        Print messages during the execution of the function.
        The default is False.

    Returns
    -------
    df_dataset : pandas DataFrame
        Dataframe containing all the recordings metadata matching the
        search terms.
    """
    df_dataset = pd.DataFrame()
    for index, row in df_query.iterrows():
        searchTerms = row.tolist()
        df_dataset = pd.concat([df_dataset,
                                xc_query(searchTerms,
                                         max_nb_files,
                                         format_time,
                                         format_date,
                                         random_seed,
                                         verbose)])

    # rearrange the index to be sure to have a unique, increasing index
    df_dataset.reset_index(drop=True, inplace=True)

    return df_dataset
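# %%
# A minimal usage sketch for xc_multi_query (hypothetical species, not from
# the original source). Each row of df_query becomes one query, so the two
# species below are fetched in two consecutive requests:
#
#     df_query = pd.DataFrame({'gen': ['Turdus', 'Erithacus'],
#                              'sp': ['merula', 'rubecula'],
#                              'q': 'q:A'})
#     df_dataset = xc_multi_query(df_query, max_nb_files=5, verbose=True)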
#%%
def xc_selection(df_dataset,
                 max_nb_files=100,
                 max_length='01:00',
                 min_length='00:10',
                 min_quality='B',
                 verbose=False):
    """
    Select a maximum number of recordings per species depending on their
    quality and duration, in order to create a homogeneous dataset.

    Parameters
    ----------
    df_dataset : pandas DataFrame
        Dataframe containing all the recordings metadata.
    max_nb_files : int, optional
        Maximum number of audio files per species. The default is 100.
    max_length : string, optional
        Maximum duration of the audio files. The default is '01:00'.
    min_length : string, optional
        Minimum duration of the audio files. The default is '00:10'.
    min_quality : string, optional
        Minimum quality of the audio files. The default is 'B'.
    verbose : boolean, optional
        Print messages during the execution of the function.
        The default is False.

    Returns
    -------
    df_dataset_out : pandas DataFrame
        Dataframe containing the selected recordings metadata.
    """
    df_dataset_out = pd.DataFrame()

    quality = ['A', 'B', 'C', 'D', 'E']
    if min_quality == 'A':
        quality = ['A']
    elif min_quality == 'B':
        quality = ['A', 'B']
    elif min_quality == 'C':
        quality = ['A', 'B', 'C']
    elif min_quality == 'D':
        quality = ['A', 'B', 'C', 'D']

    unique_species = pd.unique(df_dataset.gen + ' ' + df_dataset.sp)

    for name in unique_species:
        if verbose:
            print(name)

        # extract the genus and the species from the scientific name
        gen = name.rpartition(' ')[0]
        sp = name.rpartition(' ')[2]

        # select the rows corresponding to the species
        # !! the string test is case sensitive (the genus starts with an
        # upper case letter)
        subdf_dataset = df_dataset[(df_dataset.gen == gen)
                                   & (df_dataset.sp == sp)]

        # sort the dataframe corresponding to the species by audio quality
        subdf_dataset = subdf_dataset.sort_values(by='q')

        # counters initialization
        current_nb_files = 0
        requested_nb_files = 0
        current_quality = 0

        while (current_nb_files < max_nb_files) and (current_quality < len(quality)):
            requested_nb_files = max_nb_files - current_nb_files
            q = quality[current_quality]
            if verbose:
                print(' ... request %2.0f files of quality %s'
                      % (requested_nb_files, q))

            mask = ((subdf_dataset.q == q)
                    & (subdf_dataset.length <= max_length)
                    & (subdf_dataset.length >= min_length))

            # create a temporary dataframe with the matching rows. If more
            # files match than requested, keep only the longest ones.
            df_temp = subdf_dataset[mask].sort_values(by='length',
                                                      ascending=False)
            if len(df_temp) >= requested_nb_files:
                df_temp = df_temp.iloc[0:requested_nb_files]

            # add the rows to the output dataframe
            df_dataset_out = pd.concat([df_dataset_out, df_temp], axis=0)

            # drop the selected rows to avoid selecting them again
            subdf_dataset.drop(df_temp.index, axis=0, inplace=True)

            if verbose:
                print(' --> found %2.0f files of quality %s and %s<length<%s'
                      % (len(df_temp), q, min_length, max_length))

            current_nb_files += len(df_temp)
            requested_nb_files = max_nb_files - current_nb_files
            current_quality += 1

        if verbose:
            print(' total files : %2.f' % current_nb_files)
            print("-----------------------------------------")

    return df_dataset_out
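# %%
# A minimal usage sketch for xc_selection (illustrative thresholds, not from
# the original source). Starting from a dataframe returned by xc_query or
# xc_multi_query, keep at most 20 recordings per species, of quality A or B,
# lasting between 10 s and 1 min:
#
#     df_selection = xc_selection(df_dataset,
#                                 max_nb_files=20,
#                                 max_length='01:00',
#                                 min_length='00:10',
#                                 min_quality='B',
#                                 verbose=True)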
#%%
def xc_download(df,
                rootdir,
                dataset_name='dataset',
                overwrite=False,
                save_csv=False,
                verbose=False):
    """
    Download the audio files from Xeno-Canto based on the input dataframe.
    A directory is created for each species if needed.

    Parameters
    ----------
    df : pandas DataFrame
        Dataframe containing the selected recordings metadata.
    rootdir : string
        Path to the directory where the whole dataset will be saved.
    dataset_name : string, optional
        Name of the dataset that will be created as a parent directory.
        The default is 'dataset'.
    overwrite : boolean, optional
        Test if the directory where the audio files will be downloaded
        already exists. If True, the data are downloaded into the directory
        anyway. Otherwise, if False, no audio file is downloaded.
        The default is False.
    save_csv : boolean, optional
        If True, the csv corresponding to each species is saved in the
        directory of that species. The default is False.
    verbose : boolean, optional
        Print messages during the execution of the function.
        The default is False.

    Returns
    -------
    df : pandas DataFrame
        Dataframe similar to the input df but without the rows of the audio
        recordings that could not be downloaded. A new column "fullfilename"
        is added with the paths to the newly downloaded audio files.
    """
    # format rootdir as a Path
    rootdir = Path(rootdir)

    # list of the full paths to the audio files
    fullpath_list = []

    # try to set 'id' as the index
    try:
        df.set_index('id', inplace=True)
    except KeyError:
        pass

    # --------------------------------------------------------------------
    # check whether the specified path is an existing directory or not
    isdir = os.path.exists(rootdir / dataset_name)
    if (not isdir) or overwrite:
        if isdir and overwrite:
            if verbose:
                print("The directory " + str(rootdir / dataset_name)
                      + " already exists and will be overwritten")

        if verbose:
            numfiles = len(df)
            print("A total of", numfiles, "files will be downloaded")

        count = 1
        for index, row in df.iterrows():
            # --------------------------------------------------------------
            # create a name for the species directory
            name_dir = row.gen + ' ' + row.sp + '_' + row.en
            # create a directory for the species if needed
            path = rootdir / dataset_name / name_dir
            if not os.path.exists(path):
                if verbose:
                    print("Creating subdirectory " + str(path)
                          + " for downloaded files...")
                os.makedirs(path)

            # --------------------------------------------------------------
            # build the filename from the recording id
            filename = 'XC' + str(index) + '.mp3'

            # test if the mp3 file already exists
            if os.path.exists(path / filename):
                fullpath_list += [str(path / filename)]
                if verbose:
                    print(filename + " already exists")
            else:
                # ----------------------------------------------------------
                # get the http download address of the recording
                fileaddress = row.file
                # try to download the audio recording
                try:
                    fullpath, _ = urllib.request.urlretrieve(fileaddress,
                                                             path / filename)
                    fullpath_list += [str(fullpath)]
                    if verbose:
                        numfiles = len(df)
                        print("Saving file ", count, "/", numfiles,
                              ": " + fileaddress)
                except Exception:
                    # the audio file cannot be downloaded (it does not
                    # exist (anymore) on xeno-canto)
                    if verbose:
                        numfiles = len(df)
                        print("***WARNING*** Can't save the file ", count,
                              "/", numfiles, ": " + fileaddress)
                    # drop the row of this recording
                    df.drop(index, inplace=True)

            # --------------------------------------------------------------
            # save the csv
            if save_csv:
                filename_csv = str(path / 'metadata.csv')
                # if the csv file does not exist yet, try to create it with
                # a row corresponding to the current index
                if not os.path.exists(filename_csv):
                    try:
                        df.loc[index].to_frame().T.to_csv(filename_csv,
                                                          sep=";",
                                                          index=True,
                                                          index_label='id')
                    except Exception:
                        pass
                # if the csv file exists, try to read it and concatenate
                # both dataframes
                else:
                    try:
                        pd.concat([pd.read_csv(filename_csv, sep=';',
                                               index_col='id'),
                                   df.loc[index].to_frame().T],
                                  ignore_index=False
                                  ).drop_duplicates().to_csv(filename_csv,
                                                             sep=";",
                                                             index=True,
                                                             index_label='id')
                    except Exception:
                        pass

            # increment the counter
            count += 1

        # add a new column with the full path to each downloaded file
        df['fullfilename'] = fullpath_list

    else:
        if verbose:
            print("***WARNING*** : The directory "
                  + str(rootdir / dataset_name) + " already exists")

    return df
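# %%
# A minimal usage sketch for xc_download (hypothetical destination
# directory, not from the original source). The selected recordings are
# downloaded into one subdirectory per species, and the returned dataframe
# carries the local paths in the 'fullfilename' column:
#
#     df_downloaded = xc_download(df_selection,
#                                 rootdir='./data',
#                                 dataset_name='my_dataset',
#                                 save_csv=True,
#                                 verbose=True)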
#%%
# def xc_save_csv(df,
#                 rootdir=os.getcwd(),
#                 filename="xc_metadata.csv",
#                 overwrite=False,
#                 verbose=False):
#     """
#     Save audio recordings metadata collected from Xeno-Canto into a csv file.
#
#     Parameters
#     ----------
#     df : pandas DataFrame
#         Dataframe containing the selected recordings metadata.
#     rootdir : string, optional
#         Path to the directory. The default is the current directory.
#     filename : string, optional
#         Name of the csv file. The default is "xc_metadata.csv".
#     overwrite : boolean, optional
#         Overwrite the csv file if it already exists.
#     verbose : boolean, optional
#         Print messages during the execution of the function.
#         The default is False.
#
#     Returns
#     -------
#     fullpath : string
#         Full path to the csv file.
#     """
#     # format rootdir as a Path
#     rootdir = Path(rootdir)
#
#     # check whether the specified path is an existing file or not
#     isfile = os.path.isfile(rootdir / filename)
#     if (not isfile) or overwrite:
#         if isfile and overwrite:
#             if verbose:
#                 print("The file " + filename + " already exists in "
#                       + str(rootdir) + " and will be overwritten")
#         df.to_csv(rootdir / filename, sep=';', index=True, header=True)
#         fullpath = rootdir / filename
#     else:
#         if verbose:
#             print("***WARNING*** : The file " + filename
#                   + " already exists in " + str(rootdir))
#         fullpath = []
#     return fullpath

#%%
# def xc_read_csv(filename,
#                 rootdir=os.getcwd(),
#                 verbose=False):
#     """
#     Read audio recordings metadata collected from Xeno-Canto and saved
#     into a csv file.
#
#     Parameters
#     ----------
#     filename : string
#         Name of the csv file.
#     rootdir : string, optional
#         Path to the directory. The default is the current directory.
#     verbose : boolean, optional
#         Print messages during the execution of the function.
#         The default is False.
#
#     Returns
#     -------
#     df_dataset : pandas DataFrame
#         Dataframe containing the audio recordings metadata.
#     """
#     # format rootdir as a Path
#     rootdir = Path(rootdir)
#     try:
#         df_dataset = pd.read_csv(rootdir / filename, sep=';', index_col='id')
#     except FileNotFoundError:
#         df_dataset = pd.DataFrame()
#         if verbose:
#             print("***WARNING*** : The file " + filename
#                   + " does not exist in " + str(rootdir))
#     return df_dataset

# %%
if __name__ == '__main__':

    df_query = pd.DataFrame()
    df_species = pd.DataFrame()

    # species
    df_species['scientific name'] = ['Agelaius phoeniceus',
                                     'Psittacula krameri',
                                     'Ardea herodias']

    # build the query
    gen = []
    sp = []
    for name in df_species['scientific name']:
        gen.append(name.rpartition(' ')[0])
        sp.append(name.rpartition(' ')[2])

    df_query['gen'] = gen
    df_query['sp'] = sp
    df_query['q'] = 'q:A'
    df_query['len'] = 'len:"10-60"'

    df_dataset = xc_multi_query(df_query, max_nb_files=5, verbose=True)

    print('number of files in the dataset is %d' % (len(df_dataset)))
    print(df_dataset.head(15))

    xc_download(df_dataset,
                rootdir=os.getcwd(),
                dataset_name='my_dataset',
                save_csv=True,
                overwrite=False,
                verbose=True)