from dataset_files import MultiThreadedAudioFeatureExtractor
from pathlib import Path
from panns_inference import AudioTagging
import logging
import numpy as np
import queue
import concurrent.futures
import threading
import time
import audiopreprocessing
#import torch
#import gc

class mtafe_panns():

    __audio_queue: queue.Queue[                       # Queue of ...
        tuple[                                        # Pair of chunked audio and its path
            list[tuple[np.ndarray, float, int]],      # Chunked audio
            Path                                      # Path to original audio
        ]
    ]                                                 # Queue of chunked/resampled audio
    __audio_loader_threads: int                       # Number of audio feeder threads
    __feature_extractor_threads: int                  # Number of feature extractor threads (if the method allows)
    __audio_paths_list: queue.Queue[Path]             # Queue of paths to the audio files
    __max_audio_in_queue: int                         # Maximum number of audio files in the queue
    __desired_sr: int
    __mono: bool
    __chunk_length: float
    __overlap: float
    __features: dict[Path, list[tuple[np.ndarray, float, int]]]  # This is a crime, I know
    __features_lock: threading.Lock
    __audio_loader_threadpool: list[concurrent.futures.Future]
    __feature_extractor_threadpool: list[concurrent.futures.Future]
    __at: AudioTagging
    __batch_size: int

    def __init__(self,
                 audio_paths: list[Path],
                 max_audio_in_queue: int = 16,
                 audio_feeder_threads: int = 8,
                 feature_extractor_threads: int = 8,
                 desired_sr: int = 32000,
                 force_mono: bool = False,
                 chunk_length: float = 15.0,
                 chunk_overlap: float = 2.0,
                 batch_size: int = 20
                 ):
        # Check that every path passed in is a valid file and add it to the queue
        self.__audio_paths_list = queue.Queue()
        for p in audio_paths:
            if not p.is_file():
                raise FileNotFoundError(f"Path '{p.absolute()}' is NOT a valid file!")
            self.__audio_paths_list.put(p)
            #self.__audio_paths_list.task_done()
logging.info(f"[MTAFE] [Constructor] Queued {self.__audio_paths_list.qsize()} files")
|
|
|
|
# Set up private attributes
|
|
## Audio preprocessing parameters
|
|
self.__desired_sr = desired_sr
|
|
self.__mono = force_mono
|
|
self.__chunk_length = chunk_length
|
|
self.__overlap = chunk_overlap
|
|
|
|
## Extractor/Feeder settings
|
|
self.__max_audio_in_queue = max_audio_in_queue
|
|
self.__audio_loader_threads = audio_feeder_threads
|
|
self.__feature_extractor_threads = feature_extractor_threads
|
|
|
|
## Set up runtime conditions
|
|
self.__audio_queue = queue.Queue(maxsize=max_audio_in_queue)
|
|
self.__features = {}
|
|
self.__features_lock = threading.Lock()
|
|
self.__audio_loader_threadpool = []
|
|
self.__feature_extractor_threadpool = []
|
|
|
|
logging.info(f"[MTAFE] [Constructor] Extraction parameters: {desired_sr}Hz, Mono: {force_mono}, Divide into {chunk_length}s chunks with {chunk_overlap}s of overlap")
|
|
logging.info(f"[MTAFE] [Constructor] Using {audio_feeder_threads} threads for preprocessing audio and {feature_extractor_threads} threads for feature extraction. Max queue size of {max_audio_in_queue} files")
|
|
|
|
logging.info(f"[MTAFE] [Constructor] Initializing PANNs")
|
|
logging.info(f"[MTAFE] [Constructor] Inferencing with batch size {batch_size}")
|
|
self.__at = AudioTagging(checkpoint_path=None, device='cuda')
|
|
self.__batch_size = batch_size
|
|
|
|

    def __chunks(self, lst, n):
        # Stolen straight from Stackoverflow
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
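
    # For illustration (this call is not in the original code): chunking a
    # 5-element list with n=2 yields [1, 2], [3, 4], [5]. The last chunk may
    # be short, so the final inference batch below can be smaller than
    # __batch_size.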

    def __audio_inference_embedding(self, audio: list[tuple[np.ndarray, float, int]]) -> list[tuple[np.ndarray, float, int]]:
        audio_chunk_list = []
        timepos_list = []
        channel_id_list = []
        embedding_list = []

        # Split the (chunk, time position, channel) triples into equal-sized lists
        for audio_chunk, timepos, channel in audio:
            audio_chunk_list.append(audio_chunk)
            timepos_list.append(timepos)
            channel_id_list.append(channel)

        # Stack audio_chunk_list into a (num_chunks, samples) numpy array
        # (requires all chunks to have the same length)
        audio_chunk_list = np.array(audio_chunk_list)

        #logging.info("[MTAFE] [PANNs] Inferencing...")
        try:
            for i, batch in enumerate(self.__chunks(audio_chunk_list, self.__batch_size)):
                (clipwise_output, embedding) = self.__at.inference(batch)
                for vect in embedding:  # vect: np.ndarray
                    embedding_list.append(vect)
                logging.info(f"[MTAFE] [PANNs] Inferenced batch {i}")

            assert len(audio_chunk_list) == len(timepos_list) == len(channel_id_list) == len(embedding_list)
        except Exception as e:
            logging.critical("[MTAFE] [PANNs] ERROR! INFERENCE FAILED!!! OR LIST SIZE MISMATCH")
            logging.critical(e)
            embedding_list = [None for _ in audio_chunk_list]  # Clearing embedding_list and filling it with None
        # (embedding, time position, channel id) triples, matching the declared return type
        return list(zip(embedding_list, timepos_list, channel_id_list))

    def __audio_feeder_thread(self, thread_id):
        while True:
            # get_nowait() instead of empty()+get(): with several feeder
            # threads, checking empty() first is a race that can leave a
            # thread blocked on get() forever once the path queue runs dry
            try:
                new_audio_path = self.__audio_paths_list.get_nowait()
            except queue.Empty:
                break
            self.__audio_paths_list.task_done()
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Preprocess: {new_audio_path.absolute()}")
            new_audio = audiopreprocessing.load_preprocessed_audio(
                new_audio_path,
                self.__desired_sr,
                self.__mono,
                self.__chunk_length,
                self.__overlap
            )
            self.__audio_queue.put((new_audio, new_audio_path))
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Feed: {new_audio_path.absolute()}")
        logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")

    def __check_all_audiofeed_thread_finished(self) -> bool:
        for ft in self.__audio_loader_threadpool:
            if ft.running():
                return False
        return True

    def __check_all_featureextractor_thread_finished(self) -> bool:
        for ft in self.__feature_extractor_threadpool:
            if ft.running():
                return False
        return True

    def __feature_extractor_thread(self, thread_id):
        while (not self.__check_all_audiofeed_thread_finished() or not self.__audio_queue.empty()):
            # Bounded get() instead of empty()+get(): another extractor can
            # drain the queue between the check and the get, which would
            # block this thread forever
            try:
                audio_to_process, audio_path = self.__audio_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            self.__audio_queue.task_done()
            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {audio_path}")
            features_to_add = self.__audio_inference_embedding(audio_to_process)
            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracted: {len(features_to_add)} features")
            with self.__features_lock:
                self.__features[audio_path] = features_to_add
            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature Extraction complete for {audio_path} w/ {len(features_to_add)} features")
        logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Thread finished!")

    def __count_running_threads(self) -> tuple[int, int]:
        running_extractors = 0
        running_feeders = 0
        for ft in self.__feature_extractor_threadpool:
            if ft.running(): running_extractors += 1
        for ft in self.__audio_loader_threadpool:
            if ft.running(): running_feeders += 1
        return (running_feeders, running_extractors)

    @property
    def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
        return self.__features
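
    # Shape of the result, for illustration (the path is a hypothetical key):
    #   features[Path("clip.wav")] -> [(embedding, time_pos, channel_id), ...]
    # One (np.ndarray, float, int) triple per chunk; the embeddings are None
    # for any file whose inference failed.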

    def extract(self):
        total_amount = self.__audio_paths_list.qsize()
        logging.info(f"[MTAFE] [Main] Starting feature extraction for {total_amount} file(s)")
        t_start = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=(self.__audio_loader_threads + self.__feature_extractor_threads)) as executor:
            for i in range(self.__audio_loader_threads):
                ld_ft = executor.submit(self.__audio_feeder_thread, i)
                self.__audio_loader_threadpool.append(ld_ft)
            for i in range(self.__feature_extractor_threads):
                ext_ft = executor.submit(self.__feature_extractor_thread, i)
                self.__feature_extractor_threadpool.append(ext_ft)
            # Report progress until the extractors finish; the feeders always
            # finish first, so only the extractor pool needs to be watched
            while (not self.__check_all_featureextractor_thread_finished()):
                nfeeder, nextract = self.__count_running_threads()
                print(f"[MTAFE Progress] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()}, LD:{nfeeder}/EXT:{nextract})", end="\r")
                time.sleep(0.5)  # Avoid busy-spinning on the progress check
        t_stop = time.perf_counter()
        logging.info(f"[MTAFE] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum(len(chunk_features) for chunk_features in self.__features.values())
        logging.info(f"[MTAFE] Extraction complete. Took {delta_t:.2f} seconds. Added {total_features} vectors/embeddings")