Extract only bird sounds from audio

This paper introduces a method for extracting only the segments that contain bird sound from an audio file. Since the paper did not provide any code, I wrote an implementation myself.

Here is the Python implementation:

import cv2
import time
import torch
import librosa
import soundfile as sf
import numpy as np

from torchlibrosa.stft import LogmelFilterBank, Spectrogram

class CFG:
    """Spectrogram and log-mel settings used to build the feature extractors."""

    n_fft = 2048         # FFT size; also passed as the STFT window length
    hop_length = 512     # samples between successive STFT frames
    sample_rate = 32000  # Hz
    n_mels = 64          # number of mel bands
    fmin = 150           # lowest mel-filter frequency, Hz
    # Highest mel-filter frequency, Hz.  The previous value (150000) was far
    # above the Nyquist frequency (sample_rate / 2 = 16000 Hz), which leaves
    # every mel filter placed above Nyquist empty.  15000 is the most likely
    # intended value (one stray zero) and stays below Nyquist.
    fmax = 15000

class SignalExtractor:
    """Cut the likely-signal portions out of a mono waveform.

    The waveform is converted to a log-mel spectrogram.  A time-frequency
    cell counts as signal when it exceeds ``factor`` times both the median of
    its row and the median of its column.  Every frame that lies within the
    dilation window of such a cell is kept, and the audio belonging to the
    kept frames is concatenated.
    """

    def __init__(self, factors=(2.0, 1.8, 1.6, 1.4, 1.2, 1.1), kernel_size=15,
                 sn_threshold=0.2):
        """Build the feature extractors and store the selection settings.

        Args:
            factors: Median multipliers to try, in order (strictest first).
            kernel_size: Width, in frames, of the dilation window applied to
                the signal mask.  Must be at least 1.
            sn_threshold: Minimum kept-samples / total-samples ratio at which
                ``extract`` stops trying further factors.
        """
        self.spectrogram_extractor = Spectrogram(
            n_fft=CFG.n_fft, hop_length=CFG.hop_length, win_length=CFG.n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=True)
        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=CFG.sample_rate, n_fft=CFG.n_fft, n_mels=CFG.n_mels,
            fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)
        self.factors = list(factors)
        self.kernel_size = kernel_size
        self.sn_threshold = sn_threshold

    def extract(self, input):
        """Return the signal portion of ``input`` and the fraction kept.

        Args:
            input: 1-D array of audio samples.  (The name shadows the builtin
                but is kept so keyword callers continue to work.)

        Returns:
            ``(sound, sn_ratio)``.  ``sound`` is the concatenated kept audio,
            or ``None`` when nothing was detected or the input is empty;
            ``sn_ratio`` is ``len(sound) / len(input)`` (``0.0`` for ``None``).
            Factors are tried in order until the ratio reaches
            ``sn_threshold``; if none does, the result for the last factor is
            returned.
        """
        samples = np.asarray(input)
        if samples.size == 0:
            return None, 0.0

        # Pure feature extraction: no autograd bookkeeping is needed.
        with torch.no_grad():
            batch = torch.from_numpy(samples)[None, :].float()
            logmel = self.logmel_extractor(self.spectrogram_extractor(batch))

        # Drop the two leading singleton dims and transpose; presumably this
        # yields (mel bins, frames) given torchlibrosa's output layout -- the
        # column axis is treated as time below.
        x = logmel.squeeze(0).squeeze(0).permute(1, 0).numpy()
        # Shift so the minimum is 0; multiplying medians by a factor is only
        # meaningful on non-negative values.
        x = x - np.amin(x)

        sound, sn_ratio = None, 0.0  # defined even if ``factors`` is empty
        for factor in self.factors:
            sound, sn_ratio = self._factor_extract(samples, x, factor)
            if sn_ratio >= self.sn_threshold:
                break

        return sound, sn_ratio

    def _factor_extract(self, input, x, factor: float):
        """Select the audio whose frames contain cells above the medians.

        Args:
            input: 1-D array of audio samples.
            x: 2-D non-negative spectrogram; columns are time frames.
            factor: Multiplier applied to the row and column medians.

        Returns:
            ``(sound, ratio)`` where ``sound`` is the kept audio (``None`` if
            no cell qualifies) and ``ratio`` is ``len(sound) / len(input)``.
        """
        total = input.shape[0]
        if total == 0:
            return None, 0.0

        # Broadcasting replaces explicitly tiled threshold matrices.
        row_limit = np.median(x, axis=1, keepdims=True) * factor
        col_limit = np.median(x, axis=0, keepdims=True) * factor
        mask = (x > row_limit) & (x > col_limit)

        active = mask.any(axis=0)
        if not active.any():
            return None, 0.0

        # Only the per-column result is needed, so a 1-D window over
        # ``active`` suffices.  This is equivalent to dilating the 2-D mask
        # with a kernel_size x kernel_size all-ones kernel (OpenCV anchor
        # convention: anchor = kernel_size // 2) and checking every column
        # for any hit.
        width = self.kernel_size
        anchor = width // 2
        hits = np.convolve(active.astype(np.int64),
                           np.ones(width, dtype=np.int64))
        start = width - 1 - anchor
        keep = hits[start:start + active.shape[0]] > 0

        # Each frame is mapped to an equal-length run of samples; the few
        # samples left over by the integer division are not considered.
        n_frames = keep.shape[0]
        chunk_size = total // n_frames
        sample_mask = np.repeat(keep, chunk_size)
        sound = input[:n_frames * chunk_size][sample_mask]
        return sound, sound.shape[0] / total

Python

import cv2

import time

import torch

import librosa

import soundfile as sf

import numpy as np

from torchlibrosa.stft import LogmelFilterBank, Spectrogram

class CFG:
    """Spectrogram and log-mel settings used to build the feature extractors."""

    n_fft = 2048         # FFT size; also passed as the STFT window length
    hop_length = 512     # samples between successive STFT frames
    sample_rate = 32000  # Hz
    n_mels = 64          # number of mel bands
    fmin = 150           # lowest mel-filter frequency, Hz
    # Highest mel-filter frequency, Hz.  The previous value (150000) was far
    # above the Nyquist frequency (sample_rate / 2 = 16000 Hz), which leaves
    # every mel filter placed above Nyquist empty.  15000 is the most likely
    # intended value (one stray zero) and stays below Nyquist.
    fmax = 15000

class SignalExtractor:
    """Cut the likely-signal portions out of a mono waveform.

    The waveform is converted to a log-mel spectrogram.  A time-frequency
    cell counts as signal when it exceeds ``factor`` times both the median of
    its row and the median of its column.  Every frame that lies within the
    dilation window of such a cell is kept, and the audio belonging to the
    kept frames is concatenated.
    """

    def __init__(self, factors=(2.0, 1.8, 1.6, 1.4, 1.2, 1.1), kernel_size=15,
                 sn_threshold=0.2):
        """Build the feature extractors and store the selection settings.

        Args:
            factors: Median multipliers to try, in order (strictest first).
            kernel_size: Width, in frames, of the dilation window applied to
                the signal mask.  Must be at least 1.
            sn_threshold: Minimum kept-samples / total-samples ratio at which
                ``extract`` stops trying further factors.
        """
        self.spectrogram_extractor = Spectrogram(
            n_fft=CFG.n_fft, hop_length=CFG.hop_length, win_length=CFG.n_fft,
            window="hann", center=True, pad_mode="reflect",
            freeze_parameters=True)
        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=CFG.sample_rate, n_fft=CFG.n_fft, n_mels=CFG.n_mels,
            fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)
        self.factors = list(factors)
        self.kernel_size = kernel_size
        self.sn_threshold = sn_threshold

    def extract(self, input):
        """Return the signal portion of ``input`` and the fraction kept.

        Args:
            input: 1-D array of audio samples.  (The name shadows the builtin
                but is kept so keyword callers continue to work.)

        Returns:
            ``(sound, sn_ratio)``.  ``sound`` is the concatenated kept audio,
            or ``None`` when nothing was detected or the input is empty;
            ``sn_ratio`` is ``len(sound) / len(input)`` (``0.0`` for ``None``).
            Factors are tried in order until the ratio reaches
            ``sn_threshold``; if none does, the result for the last factor is
            returned.
        """
        samples = np.asarray(input)
        if samples.size == 0:
            return None, 0.0

        # Pure feature extraction: no autograd bookkeeping is needed.
        with torch.no_grad():
            batch = torch.from_numpy(samples)[None, :].float()
            logmel = self.logmel_extractor(self.spectrogram_extractor(batch))

        # Drop the two leading singleton dims and transpose; presumably this
        # yields (mel bins, frames) given torchlibrosa's output layout -- the
        # column axis is treated as time below.
        x = logmel.squeeze(0).squeeze(0).permute(1, 0).numpy()
        # Shift so the minimum is 0; multiplying medians by a factor is only
        # meaningful on non-negative values.
        x = x - np.amin(x)

        sound, sn_ratio = None, 0.0  # defined even if ``factors`` is empty
        for factor in self.factors:
            sound, sn_ratio = self._factor_extract(samples, x, factor)
            if sn_ratio >= self.sn_threshold:
                break

        return sound, sn_ratio

    def _factor_extract(self, input, x, factor: float):
        """Select the audio whose frames contain cells above the medians.

        Args:
            input: 1-D array of audio samples.
            x: 2-D non-negative spectrogram; columns are time frames.
            factor: Multiplier applied to the row and column medians.

        Returns:
            ``(sound, ratio)`` where ``sound`` is the kept audio (``None`` if
            no cell qualifies) and ``ratio`` is ``len(sound) / len(input)``.
        """
        total = input.shape[0]
        if total == 0:
            return None, 0.0

        # Broadcasting replaces explicitly tiled threshold matrices.
        row_limit = np.median(x, axis=1, keepdims=True) * factor
        col_limit = np.median(x, axis=0, keepdims=True) * factor
        mask = (x > row_limit) & (x > col_limit)

        active = mask.any(axis=0)
        if not active.any():
            return None, 0.0

        # Only the per-column result is needed, so a 1-D window over
        # ``active`` suffices.  This is equivalent to dilating the 2-D mask
        # with a kernel_size x kernel_size all-ones kernel (OpenCV anchor
        # convention: anchor = kernel_size // 2) and checking every column
        # for any hit.
        width = self.kernel_size
        anchor = width // 2
        hits = np.convolve(active.astype(np.int64),
                           np.ones(width, dtype=np.int64))
        start = width - 1 - anchor
        keep = hits[start:start + active.shape[0]] > 0

        # Each frame is mapped to an equal-length run of samples; the few
        # samples left over by the integer division are not considered.
        n_frames = keep.shape[0]
        chunk_size = total // n_frames
        sample_mask = np.repeat(keep, chunk_size)
        sound = input[:n_frames * chunk_size][sample_mask]
        return sound, sound.shape[0] / total

Extract only birds sound from audio

Extract only birds sound from audio

Recommend

EU to use blockchain for educational and professional credential verification

Google launches News Showcase in US this summer

Runtime error !! need help pls

Preparing the stage for 6G: A fast and compact transceiver for sub-THz frequenci...

Top 3 Ontario Sportsbooks 2023: Best Legit Sports Betting Sites in ON

YACRB – Yet Another CoRoutines Blog

Studying art history to understand AI evolution

Wear OS 4 may finally allow you to switch phones without factory resetting your...

预见2023：《2023年中国全钒液流电池行业全景图谱》(附市场现状、竞争格局和发展趋势...

US Debt Likely To Skyrocket to $50,000,000,000,000, Says deVere Group CEO Nigel...

About Joyk