import concurrent.futures
import logging
import os
import pickle
import platform
import queue
import random
import threading
import time
from pathlib import Path

import numpy as np

import audiopreprocessing

# Seconds between two progress reports printed by the extract() methods.
_PROGRESS_INTERVAL_SECONDS = 1.0
# Seconds a feature-extractor worker waits on an empty audio queue before it
# re-checks whether the feeder threads are finished.
_QUEUE_POLL_SECONDS = 0.5


def serialize_dict_obj(path : Path, object : dict) -> int:
    """Serializes Python Dictionary object to a file via Pickle.

    Args:
        path (Path): Path to store the file
        object (dict): Dictionary object to serialize

    Returns:
        int: size in bytes written
    """
    # NOTE: pickle is convenient but unsafe; never unpickle such a file when
    # it comes from an untrusted source.
    with path.open("wb") as fp:
        pickle.dump(object, fp)
        fp.seek(0, os.SEEK_END)
        size = fp.tell()
    return size


def _scan_subset(subset_root: Path) -> tuple[int, list[Path], list[Path]]:
    """Walks one dataset subset and collects its size, files and folders.

    Args:
        subset_root (Path): Root directory of the subset

    Returns:
        tuple[int, list[Path], list[Path]]: total size in bytes of all files,
            list of file paths, list of folder paths (the subset root itself
            is not part of the folder list)
    """
    total_size = 0
    found_files = []
    found_folders = []
    subset_root_abs = subset_root.absolute()
    for root, _dirs, files in subset_root.walk():  # Root will iterate through all folders
        if root.absolute() != subset_root_abs:  # Skip root of the subset
            found_folders.append(root)
        for fname in files:  # Iterate through all files in current root
            file = root / fname
            assert file.is_file()
            found_files.append(file)
            total_size += file.stat().st_size
    return total_size, found_files, found_folders


logging.info("Reading local dataset directory structure...")
ASMRThreePath = Path("C:\\ASMRThree")
ASMRTwoPath = Path("D:\\ASMRTwo")
ASMROnePath = Path("E:\\ASMROne")
if (platform.system() == 'Linux'):
    ASMROnePath = Path('/mnt/Scratchpad/ASMROne')
    ASMRTwoPath = Path('/mnt/MyStuffz/ASMRTwo')
    ASMRThreePath = Path('/mnt/Windows11/ASMRThree')

# Statistic calculation for ASMROne, ASMRTwo and ASMRThree
size_one, files_one, folders_one = _scan_subset(ASMROnePath)
size_two, files_two, folders_two = _scan_subset(ASMRTwoPath)
size_three, files_three, folders_three = _scan_subset(ASMRThreePath)

DataSubsetPaths = [ASMROnePath, ASMRTwoPath, ASMRThreePath]
DLSiteWorksPaths = []
# Collect ASMR Works (RJ ID, Paths)
for ASMRSubsetPath in DataSubsetPaths:
    for WorkPaths in ASMRSubsetPath.iterdir():
        DLSiteWorksPaths.append(WorkPaths)

fileExt2fileType = {
    ".TXT": "Document",
    ".WAV": "Audio",
    ".MP3": "Audio",
    ".PNG": "Image",
    ".JPG": "Image",
    ".VTT": "Subtitle",
    ".PDF": "Document",
    ".FLAC": "Audio",
    ".MP4": "Video",
    ".LRC": "Subtitle",
    ".SRT": "Subtitle",
    ".JPEG": "Image",
    ".ASS": "Subtitle",
    "": "NO EXTENSION",
    ".M4A": "Audio",
    ".MKV": "Video"
}
fileext_stat = {}
file_list = files_one + files_two + files_three
file_list_count = len(file_list)

for file in file_list:
    f_ext = file.suffix.upper()
    f_size = file.stat().st_size
    ext_entry = fileext_stat.get(f_ext)
    if ext_entry is None:
        media_type = fileExt2fileType.get(f_ext)
        if media_type is None:
            # An extension missing from the table used to abort the whole
            # module with a KeyError; classify it as "Unknown" instead.
            logging.warning(f"Unknown file extension '{f_ext}' (first seen: {file})")
            media_type = "Unknown"
        ext_entry = fileext_stat[f_ext] = {
            'Count': 0,
            'List': [],
            'ExtensionMass': 0,  # The total sum of sizes of the same file extension
            'MediaType': media_type,
        }
    ext_entry['Count'] += 1
    ext_entry['List'].append(file)
    ext_entry['ExtensionMass'] += f_size

audio_paths = []
for extension in fileext_stat:
    if fileext_stat[extension]['MediaType'] == "Audio":
        audio_paths += fileext_stat[extension]['List']


def random_audio_chunk(n : int, seed : int = 177013) -> list[Path]:
    """Returns a random selection of audio files

    Note: this re-seeds the global `random` generator on every call.

    Args:
        n (int): Amount of files to return
        seed (int, optional): Seed for RNG. Defaults to 177013.

    Returns:
        list[Path]: List of randomly selected audio paths (using Path object)

    Raises:
        ValueError: raised by random.sample if n is larger than the amount of
            known audio files (or negative)
    """
    random.seed(seed)
    # random.sample draws without repetition (random.choices would repeat elements)
    return random.sample(audio_paths, k=n)


class AudioFeatureExtractor():
    """Two-thread pipeline: one thread preprocesses audio files into a queue,
    the other takes audio from the queue and extracts one embedding per chunk.
    """
    __audio_queue: list[  # List of ...
        tuple[  # Pair of chunked audio and its path
            list[tuple[np.ndarray, float, int]],  # Chunked audio
            Path  # Path to original audio
        ]
    ]  # List of Chunked/Resampled audio
    __feeder_future: concurrent.futures.Future
    __extractor_future: concurrent.futures.Future
    __audio_paths_list: list[Path]
    __max_audio_in_queue: int
    __queue_lock: threading.Lock
    __desired_sr: int
    __mono: bool
    __chunk_length: float
    __overlap: float
    __features: dict[Path, list[tuple[np.ndarray, float, int]]]
    # { audioPath:
    #     [(embedding, pos, channel)...]
    # }

    def __embedding_inference(self, audio_ndarray: np.ndarray) -> np.ndarray:
        """Uses embedding model to inference an audio. Returns embedding vectors.
        Placeholder meant to be replaced by a real model. Returns np.zeros(32).

        NOTE(review): the double underscore name is mangled by Python, so a
        subclass method with the same name will NOT replace this one; confirm
        how overriding is intended to work.

        Args:
            audio_ndarray (np.ndarray): audio samples of one chunk

        Returns:
            np.ndarray: embedding vector (32 zeros in this placeholder)
        """
        return np.zeros(32)

    def __embedding_extract(self, audio: tuple[np.ndarray, float, int]) -> tuple[np.ndarray, float, int, np.ndarray]:
        """Receives a tuple of audio, position, and channel ID, then adds the embedding to the tuple

        Args:
            audio (tuple[np.ndarray, float, int]): tuple of audio, position, channel id

        Returns:
            tuple[np.ndarray, float, int, np.ndarray]: audio, position, channel id, embedding vector
        """
        audio_chunk, pos, channel_id = audio
        return (audio_chunk, pos, channel_id, self.__embedding_inference(audio_chunk))

    def __audio_queue_feeder(self):  # TODO: Upgrade to multithreaded loader?
        """Internal thread function. Preprocess and load the audio continuously to
        audio_queue until the end of the audio_paths_list.

        A path is removed from the path list only after its audio is in the
        queue (both under the lock), so the extractor thread never observes
        "no paths left and queue empty" while a file is still being loaded.
        """
        while self.__audio_paths_list:  # While there are still Path elements in path list
            if len(self.__audio_queue) >= self.__max_audio_in_queue:
                logging.info("[AFE] [Audio Queue Thread]: Queue Full, feeder thread sleeping for 5 seconds")
                time.sleep(5)
                continue
            new_audio_path = self.__audio_paths_list[0]
            try:
                new_audio = audiopreprocessing.load_preprocessed_audio(
                    new_audio_path,
                    self.__desired_sr,
                    self.__mono,
                    self.__chunk_length,
                    self.__overlap
                )
            except Exception:
                # A file that cannot be loaded must not kill this thread:
                # the path would stay in the list forever and the extractor
                # thread would never terminate. Log it and skip the file.
                logging.exception(f"[AFE] [Audio Queue Thread]: Failed to load audio {new_audio_path}, skipping")
                with self.__queue_lock:
                    self.__audio_paths_list.pop(0)
                continue
            with self.__queue_lock:
                self.__audio_queue.append(
                    (new_audio, new_audio_path)
                )
                pop_path = self.__audio_paths_list.pop(0)
            logging.info(f"[AFE] [Audio Queue Thread]: Added new audio to queue {pop_path}")
        logging.info("[AFE] [Audio Queue Thread]: DONE. All audio files fed")

    def __audio_queue_feature_extractor(self):
        """Internal thread function. Get audio from audio queue. And extract embedding vector for all audio chunks.
        Stores the resulting embedding into self.__features.
        With Original Audio's Path as key, and list[tuple[np.ndarray, float, int]]
        (list of tuple of embedding vector, position, channel id)
        """
        while self.__audio_paths_list or self.__audio_queue:  # While there are still audio to be processed
            if not self.__audio_queue:
                # If audio queue is empty, wait
                logging.info("[AFE] [Feature Extractor Thread]: Queue Empty, extractor thread sleeping for 5 seconds")
                time.sleep(5)
                continue
            with self.__queue_lock:
                audio_to_process, audio_path = self.__audio_queue.pop(0)  # Get audio from queue
            logging.info(f"[AFE] [Feature Extractor Thread]: Extracting {len(audio_to_process)} features from audio {audio_path}")
            try:
                extracted = []
                for audio_chunk in audio_to_process:
                    _chunk, timepos, channel_id, embedd_vect = self.__embedding_extract(audio_chunk)
                    extracted.append((embedd_vect, timepos, channel_id))
            except Exception:
                # Keep the thread alive; otherwise the feeder would fill the
                # queue and then wait forever.
                logging.exception(f"[AFE] [Feature Extractor Thread]: Failed to extract features from {audio_path}, skipping")
                continue
            if extracted:  # Audio without chunks gets no entry in the result
                self.__features.setdefault(audio_path, []).extend(extracted)
        logging.info("[AFE] [Feature Extractor Thread]: DONE. Extracted all features from all audio files")

    def __init__(
            self,
            audio_paths_list: list[Path],
            max_audio_in_queue: int,
            desired_sr: int,
            mono: bool,
            chunk_length: float = 15.0,
            overlap: float = 2.0
    ):
        """Sets up the extractor.

        Args:
            audio_paths_list (list[Path]): audio files to process (the list is copied, not consumed)
            max_audio_in_queue (int): maximum number of preprocessed audio files held in the queue
            desired_sr (int): sample rate passed to the audio preprocessing
            mono (bool): mono flag passed to the audio preprocessing
            chunk_length (float, optional): chunk length passed to the audio preprocessing. Defaults to 15.0.
            overlap (float, optional): chunk overlap passed to the audio preprocessing. Defaults to 2.0.

        Raises:
            ValueError: if max_audio_in_queue is smaller than 1 (the feeder could never make progress)
        """
        if max_audio_in_queue < 1:
            raise ValueError("max_audio_in_queue must be at least 1")
        self.__audio_queue = []
        # Work on a private copy: the feeder thread pops paths off this list.
        self.__audio_paths_list = list(audio_paths_list)
        self.__max_audio_in_queue = max_audio_in_queue
        self.__queue_lock = threading.Lock()
        self.__desired_sr = desired_sr
        self.__mono = mono
        self.__chunk_length = chunk_length
        self.__overlap = overlap
        self.__features = {}

    @property
    def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
        """Extracted features: audio path -> list of (embedding, position, channel id)."""
        return self.__features

    def extract(self):
        """Runs the feeder and the extractor thread until every audio file is
        processed, printing a progress line roughly once per second.
        """
        print("Starting feature extraction for", len(self.__audio_paths_list), "file(s)")
        total_amount = len(self.__audio_paths_list)
        t_start = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            self.__feeder_future = executor.submit(self.__audio_queue_feeder)
            self.__extractor_future = executor.submit(self.__audio_queue_feature_extractor)
            # Wait on the futures themselves: Future.running() is False for a
            # future that has not started yet, which could end the loop early.
            pending = {self.__feeder_future, self.__extractor_future}
            while pending:
                _done, pending = concurrent.futures.wait(pending, timeout=_PROGRESS_INTERVAL_SECONDS)
                print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W:{len(self.__audio_paths_list)})", end="\r")
        t_stop = time.perf_counter()
        print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W:{len(self.__audio_paths_list)} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum(len(vectors) for vectors in self.__features.values())
        print()
        print("Extraction completed")
        print(f"Took {delta_t} seconds. Added {total_features} vectors/embeddings")


class MultiThreadedAudioFeatureExtractor():
    """Pipeline with several audio feeder threads and several feature
    extractor threads connected by a bounded queue.
    """
    __audio_queue: queue.Queue[  # Queue of ...
        tuple[  # Pair of chunked audio and its path
            list[tuple[np.ndarray, float, int]],  # Chunked audio
            Path  # Path to original audio
        ]
    ]  # Queue of Chunked/Resampled audio
    __audio_loader_threads: int  # Amount of audio feeder threads
    __feature_extractor_threads: int  # Amount of feature extractor threads (if the method allows)
    __audio_paths_list: queue.Queue[Path]  # Path list to audio
    __max_audio_in_queue: int  # Maximum audio in queue
    # Audio Feeder parameter
    __desired_sr: int  # Desired Sample Rate (Resampling)
    __mono: bool  # Force load audio in mono mode
    __chunk_length: float  # Audio chunk length
    __overlap: float  # Overlap between audio chunks
    # Result
    __features: dict[Path, list[tuple[np.ndarray, float, int]]]
    __features_lock: threading.Lock
    # __features: { audioPath:
    #     [(embedding1, pos1, channel1),
    #      (embedding2, pos2, channel1)]
    #     ...
    # }
    # Runtime
    __audio_loader_threadpool: list[concurrent.futures.Future]
    __feature_extractor_threadpool: list[concurrent.futures.Future]
    __audio_feed_condition: threading.Condition  # Currently unused by the worker threads

    def __audio_inference_embedding(self, audio: list[tuple[np.ndarray, float, int]]) -> list[tuple[np.ndarray, float, int]]:
        """Receives a list of audio chunks, and then extracts embeddings for all audio chunks,
        returns the resulting embedding as a list of tuples(embedding, time, channel_id)

        Placeholder implementation (to be overridden): every embedding is
        np.zeros(32) and a short sleep stands in for the inference time.

        NOTE(review): the double underscore name is mangled by Python, so a
        subclass method with the same name will NOT replace this one; confirm
        how overriding is intended to work.

        Args:
            audio (list[tuple[np.ndarray, float, int]]): list of audio chunks

        Returns:
            list[tuple[np.ndarray, float, int]]: List of (embedding vector, timepos, channel id)
        """
        features = []
        for audio_chunk in audio:
            _samples, timepos, channel_id = audio_chunk
            zero = np.zeros(32)
            features.append((zero, timepos, channel_id))
            time.sleep(0.01)
        return features

    def __audio_feeder_thread(self, thread_id):
        """Worker: takes paths off the path queue, preprocesses the audio and
        puts it on the audio queue until no path is left.

        Args:
            thread_id: identifier used in the log messages
        """
        while True:
            # get_nowait() instead of empty() + get(): with several feeders
            # another thread can take the last path between the two calls,
            # and a blocking get() would then never return.
            try:
                new_audio_path = self.__audio_paths_list.get_nowait()
            except queue.Empty:
                break
            self.__audio_paths_list.task_done()
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Preprocess: {new_audio_path.absolute()}")
            try:
                new_audio = audiopreprocessing.load_preprocessed_audio(
                    new_audio_path,
                    self.__desired_sr,
                    self.__mono,
                    self.__chunk_length,
                    self.__overlap
                )
            except Exception:
                # One unreadable file must not stop this worker.
                logging.exception(f"[MTAFE] [Audio Feeder {thread_id}] Failed to preprocess: {new_audio_path.absolute()}, skipping")
                continue
            # Blocks while the bounded audio queue is full, which limits how
            # much preprocessed audio is held in memory at once.
            self.__audio_queue.put((new_audio, new_audio_path))
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Feed: {new_audio_path.absolute()}")
        logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")

    def __check_all_audiofeed_thread_finished(self) -> bool:
        """Returns True when every submitted audio feeder future is done."""
        # done() rather than "not running()": a future that has not started
        # yet is not running either, but it is certainly not finished.
        return all(ft.done() for ft in self.__audio_loader_threadpool)

    def __check_all_featureextractor_thread_finished(self) -> bool:
        """Returns True when every submitted feature extractor future is done."""
        return all(ft.done() for ft in self.__feature_extractor_threadpool)

    def __feature_extractor_thread(self, thread_id):
        """Worker: takes preprocessed audio off the audio queue, extracts the
        features and stores them in self.__features under the audio's path.
        Ends when all feeders are finished and the audio queue is empty.

        Args:
            thread_id: identifier used in the log messages
        """
        while (not self.__check_all_audiofeed_thread_finished() or not self.__audio_queue.empty()):
            # A timed get() avoids both busy-waiting and blocking forever
            # when another extractor takes the last queued audio first.
            try:
                audio_to_process, audio_path = self.__audio_queue.get(timeout=_QUEUE_POLL_SECONDS)
            except queue.Empty:
                continue
            self.__audio_queue.task_done()
            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {audio_path}")
            try:
                features_to_add = self.__audio_inference_embedding(audio_to_process)
            except Exception:
                # Keep consuming: a dead extractor would leave the feeders
                # blocked on a full queue.
                logging.exception(f"[MTAFE] [Feature Extractor {thread_id}] Failed to extract: {audio_path}, skipping")
                continue
            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracted: {len(features_to_add)} features")
            with self.__features_lock:
                self.__features[audio_path] = features_to_add
            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature Extraction complete for {audio_path} w/ {len(features_to_add)} features")
        logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Thread finished!")

    def __count_running_threads(self) -> tuple[int, int]:
        """Returns (running feeder futures, running extractor futures)."""
        running_extractors = sum(1 for ft in self.__feature_extractor_threadpool if ft.running())
        running_feeders = sum(1 for ft in self.__audio_loader_threadpool if ft.running())
        return (running_feeders, running_extractors)

    @property
    def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
        """Extracted features: audio path -> list of (embedding, position, channel id)."""
        return self.__features

    def extract(self):
        """Starts all feeder and extractor threads, prints a progress line
        until every one of them is finished and logs a summary.
        """
        total_amount = self.__audio_paths_list.qsize()
        logging.info(f"[MTAFE] [Main] Starting feature extraction for {total_amount} file(s)")
        t_start = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=(self.__audio_loader_threads + self.__feature_extractor_threads)) as executor:
            # Feeders are submitted first so that every feeder future is
            # registered before an extractor checks whether feeders are done.
            for i in range(self.__audio_loader_threads):
                ld_ft = executor.submit(self.__audio_feeder_thread, i)
                self.__audio_loader_threadpool.append(ld_ft)
            for i in range(self.__feature_extractor_threads):
                ld_ft = executor.submit(self.__feature_extractor_thread, i)
                self.__feature_extractor_threadpool.append(ld_ft)
            # Report progress until ALL workers are done (feeders and
            # extractors), sleeping inside wait() instead of spinning.
            pending = set(self.__audio_loader_threadpool + self.__feature_extractor_threadpool)
            while pending:
                _done, pending = concurrent.futures.wait(pending, timeout=_PROGRESS_INTERVAL_SECONDS)
                nfeeder, nextract = self.__count_running_threads()
                print(f"[MTAFE Progress] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()}, LD:{nfeeder}/EXT:{nextract})", end="\r")
        t_stop = time.perf_counter()
        logging.info(f"[MTAFE] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum(len(vectors) for vectors in self.__features.values())
        logging.info(f"[MTAFE] Extraction complete. Took {delta_t} seconds. Added {total_features} vectors/embeddings")

    def __init__(
        self,
        audio_paths: list[Path],
        max_audio_in_queue: int = 16,
        audio_feeder_threads: int = 8,
        feature_extractor_threads: int = 8,
        desired_sr: int = 32000,
        force_mono: bool = False,
        chunk_length: float = 15.0,
        chunk_overlap: float = 2.0,
    ):
        """Validates the input paths and sets up queues, locks and settings.

        Args:
            audio_paths (list[Path]): audio files to process
            max_audio_in_queue (int, optional): capacity of the audio queue;
                a value of 0 or less makes the queue unbounded. Defaults to 16.
            audio_feeder_threads (int, optional): number of feeder threads. Defaults to 8.
            feature_extractor_threads (int, optional): number of extractor threads. Defaults to 8.
            desired_sr (int, optional): sample rate passed to the audio preprocessing. Defaults to 32000.
            force_mono (bool, optional): mono flag passed to the audio preprocessing. Defaults to False.
            chunk_length (float, optional): chunk length passed to the audio preprocessing. Defaults to 15.0.
            chunk_overlap (float, optional): chunk overlap passed to the audio preprocessing. Defaults to 2.0.

        Raises:
            ValueError: if a thread count is smaller than 1, or a path is not an existing file
        """
        if audio_feeder_threads < 1 or feature_extractor_threads < 1:
            # Without extractors the feeders would block forever on the
            # bounded queue; without feeders nothing is ever processed.
            raise ValueError("audio_feeder_threads and feature_extractor_threads must be at least 1")
        # Check if the paths passed in are all valid and add them to queue
        self.__audio_paths_list = queue.Queue()
        for p in audio_paths:
            if not p.is_file():
                raise ValueError(f"Path '{p.absolute()}' is NOT a valid file!")
            self.__audio_paths_list.put(p)
        logging.info(f"[MTAFE] [Constructor] Queued {self.__audio_paths_list.qsize()} files")

        # Set up private attributes
        ## Audio preprocessing parameters
        self.__desired_sr = desired_sr
        self.__mono = force_mono
        self.__chunk_length = chunk_length
        self.__overlap = chunk_overlap

        ## Extractor/Feeder settings
        self.__max_audio_in_queue = max_audio_in_queue
        self.__audio_loader_threads = audio_feeder_threads
        self.__feature_extractor_threads = feature_extractor_threads

        ## Set up runtime conditions
        # Bounded queue, so max_audio_in_queue is actually enforced.
        self.__audio_queue = queue.Queue(maxsize=max_audio_in_queue)
        self.__features = {}
        self.__features_lock = threading.Lock()
        self.__audio_loader_threadpool = []
        self.__feature_extractor_threadpool = []
        self.__audio_feed_condition = threading.Condition()

        logging.info(f"[MTAFE] [Constructor] Extraction parameters: {desired_sr}Hz, Mono: {force_mono}, Divide into {chunk_length}s chunks with {chunk_overlap}s of overlap")
        logging.info(f"[MTAFE] [Constructor] Using {audio_feeder_threads} threads for preprocessing audio and {feature_extractor_threads} threads for feature extraction. Max queue size of {max_audio_in_queue} files")

    # More audio embeddings specific code below (To be overridden)