some test data
This commit is contained in:
@@ -1,9 +1,43 @@
|
||||
import platform
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
import concurrent.futures
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
import audiopreprocessing
|
||||
|
||||
DEBUG=True
|
||||
|
||||
def serialize_dict_obj(path : Path, object : dict) -> int:
    """Serializes Python Dictionary object to a file via Pickle.

    Args:
        path (Path): Path to store the file
        object (dict): Dictionary object to serialize

    Returns:
        int: size in bytes written
    """
    # SECURITY NOTE: pickle is unsafe to *load* from untrusted sources.
    # Acceptable here because this file is both written and read locally.
    with path.open("wb") as fp:
        pickle.dump(object, fp)
        # pickle.dump leaves the stream position at the end of the written
        # data, and "wb" truncated the file, so end-of-data == end-of-file.
        # tell() therefore already reports the bytes written — the previous
        # seek(0, os.SEEK_END) round-trip was redundant.
        return fp.tell()
|
||||
|
||||
print("Reading local dataset directory structure...")

# Local dataset roots. Defaults assume the Windows drive layout; on Linux
# the same datasets live under /mnt mount points instead.
ASMROnePath = Path(r"E:\ASMROne")
ASMRTwoPath = Path(r"D:\ASMRTwo")
ASMRThreePath = Path(r"C:\ASMRThree")

if platform.system() == 'Linux':
    ASMROnePath = Path('/mnt/Scratchpad/ASMROne')
    ASMRTwoPath = Path('/mnt/MyStuffz/ASMRTwo')
    ASMRThreePath = Path('/mnt/Windows11/ASMRThree')

# Per-dataset accumulators: total byte counts, file lists, folder lists.
size_one, size_two, size_three = 0, 0, 0
files_one, files_two, files_three = [], [], []
folders_one, folders_two, folders_three = [], [], []
|
||||
@@ -78,4 +112,161 @@ for file in file_list:
|
||||
fileext_stat[f_ext]['Count'] = 1
|
||||
fileext_stat[f_ext]['List'] = [file]
|
||||
fileext_stat[f_ext]['ExtensionMass'] = file.stat().st_size # The total sum of sizes of the same file extension
|
||||
fileext_stat[f_ext]['MediaType'] = fileExt2fileType[f_ext]
|
||||
fileext_stat[f_ext]['MediaType'] = fileExt2fileType[f_ext]
|
||||
|
||||
# Flatten the per-extension stats built above into a single list of every
# file whose extension was classified as audio (same order as the manual
# loop this comprehension replaces).
audio_paths = [
    audio_file
    for stats in fileext_stat.values()
    if stats['MediaType'] == "Audio"
    for audio_file in stats['List']
]
|
||||
|
||||
def random_audio_chunk(n : int, seed : int = 177013, paths = None) -> list[Path]:
    """Returns a random selection of audio files (no repeated elements).

    Args:
        n (int): Amount of files to return
        seed (int, optional): Seed for RNG. Defaults to 177013.
        paths (list[Path], optional): Pool to sample from. Defaults to the
            module-level ``audio_paths`` list.

    Returns:
        list[Path]: List of randomly selected audio paths (using Path object)

    Raises:
        ValueError: If n is larger than the available pool.
    """
    if paths is None:
        paths = audio_paths
    # A dedicated Random instance avoids reseeding (and thereby clobbering)
    # the global `random` module state for other callers. With the same seed
    # it yields exactly the same selection as the previous
    # random.seed(seed) + random.sample(...) implementation.
    # random.sample (unlike random.choices) never repeats an element.
    return random.Random(seed).sample(paths, k=n)
|
||||
|
||||
class AudioFeatureExtractor():
    """Two-thread pipeline that extracts embedding vectors from audio files.

    A feeder thread preprocesses files from ``audio_paths_list`` into a
    bounded in-memory queue; an extractor thread drains the queue and runs
    the embedding model over every chunk, accumulating results in
    ``features``. Subclass and override ``__embedding_inference`` to plug in
    a real model (the default returns zeros).
    """

    # Bounded queue shared between feeder and extractor threads.
    __audio_queue: list[ # List of ...
        tuple[ # Pair of chunked audio and its path
            list[tuple[np.ndarray, float, int]], # Chunked audio
            Path # Path to original audio
        ]
    ] # Listed of Chunked/Resampled audio
    __feeder_future: concurrent.futures.Future
    __extractor_future: concurrent.futures.Future
    # Remaining files to process; feeder pops from the front as it goes.
    __audio_paths_list: list[Path]
    __max_audio_in_queue: int
    # Guards mutation of __audio_queue across the two worker threads.
    __queue_lock: threading.Lock
    __desired_sr: int
    __mono: bool
    __chunk_length: float
    __overlap: float
    __features: dict[Path, list[tuple[np.ndarray, float, int]]]
    # { audioPath:
    #   [(embedding, pos, channel)...]
    # }

    def __embedding_inference(self, audio_ndarray: np.ndarray) -> np.ndarray:
        """Uses embedding model to inference an audio. Returns embedding vectors.
        Function to be overrided. Returns np.zeros(32).

        Args:
            audio_ndarray (np.ndarray): one chunk of audio samples

        Returns:
            np.ndarray: embedding vector (placeholder: 32 zeros)
        """
        return np.zeros(32)

    def __embedding_extract(self, audio: tuple[np.ndarray, float, int]) -> tuple[np.ndarray, float, int, np.ndarray]:
        """Receives a tuple of audio, position, and channel ID, then adding the embedding to the tuple

        Args:
            audio (tuple[np.ndarray, float, int]): tuple of audio, position, channel id

        Returns:
            tuple[np.ndarray, float, int, np.ndarray]: audio, position, channel id, embedding vector
        """
        audio_chunk, pos, channel_id = audio
        return (audio_chunk, pos, channel_id, self.__embedding_inference(audio_chunk))

    def __audio_queue_feeder(self): # TODO: Upgrade to multithreaded loader?
        """Internal thread function. Preprocess and load the audio continuously to
        audio_queue until the end of the audio_paths_list
        """
        while (self.__audio_paths_list): # While there are still Path elements in path list
            if (not (len(self.__audio_queue) < self.__max_audio_in_queue)):
                if DEBUG: print("Audio Queue Thread: Queue Full, feeder thread sleeping for 5 seconds")
                time.sleep(5)
            while(len(self.__audio_queue) < self.__max_audio_in_queue): # While the audio queue is not full
                # NOTE(review): if the path list empties while the queue still
                # has free slots, this [0] raises IndexError on the next pass —
                # the inner loop should probably also guard on
                # self.__audio_paths_list. Confirm and fix separately.
                new_audio_path = self.__audio_paths_list[0]
                new_audio = audiopreprocessing.load_preprocessed_audio(
                    new_audio_path,
                    self.__desired_sr,
                    self.__mono,
                    self.__chunk_length,
                    self.__overlap
                )
                with self.__queue_lock:
                    self.__audio_queue.append(
                        (new_audio, new_audio_path)
                    )
                # Pop only AFTER the queue append: the extractor's loop
                # condition (paths or queue) then never sees both empty while
                # work is still in flight.
                pop_path = self.__audio_paths_list.pop(0)
                if DEBUG: print("Audio Queue Thread: Added new audio to queue", pop_path)
        if DEBUG: print("Audio Queue Thread: DONE. All audio files fed")

    def __audio_queue_feature_extractor(self):
        """Internal thread function. Get audio from audio queue. And extract embedding vector
        for all audio chunks. Stores the resulting embedding into self.__features.
        With Original Audio's Path as key, and list[tuple[np.ndarray, float, int]] (list of tuple of embedding vector, position, channel id)
        """
        while (self.__audio_paths_list or self.__audio_queue): # While there are still audio to be processed
            if (self.__audio_queue): # If audio queue is not empty
                with self.__queue_lock:
                    audio_to_process, audio_path = self.__audio_queue.pop(0) # Get audio from queue
                if DEBUG: print(f"Feature Extractor Thread: Extracting {len(audio_to_process)} features from audio", audio_path)
                for audio_chunk in audio_to_process:
                    same_audio_chunk, timepos, channel_id, embedd_vect = self.__embedding_extract(audio_chunk)
                    if (audio_path not in self.__features.keys()):
                        # First chunk for this file: start its feature list.
                        #if DEBUG: print("Adding new vector to", audio_path.name)
                        self.__features[audio_path] = [(embedd_vect, timepos, channel_id)]
                    else:
                        #if DEBUG: print("Adding vector to", audio_path.name)
                        self.__features[audio_path].append(
                            (embedd_vect, timepos, channel_id)
                        )
            else:
                if DEBUG: print("Feature Extractor Thread: Queue Empty, extractor thread sleeping for 5 seconds") # If audio queue is empty, wait
                time.sleep(5)
        if DEBUG: print("Feature Extractor Thread: DONE. Extracted all features from all audio files")

    def __init__(
        self,
        audio_paths_list: list[Path],
        max_audio_in_queue: int,
        desired_sr: int,
        mono: bool,
        chunk_length: float = 15.0,
        overlap: float = 2.0
    ):
        """Configure the extractor.

        Args:
            audio_paths_list (list[Path]): Files to process. NOTE(review):
                stored by reference and mutated (popped) during extraction —
                callers should pass a copy if they need the list afterwards.
            max_audio_in_queue (int): Max preprocessed files held in memory.
            desired_sr (int): Target sample rate for preprocessing.
            mono (bool): Whether to downmix to mono.
            chunk_length (float, optional): Chunk length in seconds. Defaults to 15.0.
            overlap (float, optional): Chunk overlap in seconds. Defaults to 2.0.
        """
        self.__audio_queue = []
        self.__audio_paths_list = audio_paths_list
        self.__max_audio_in_queue = max_audio_in_queue
        self.__queue_lock = threading.Lock()
        self.__desired_sr = desired_sr
        self.__mono = mono
        self.__chunk_length = chunk_length
        self.__overlap = overlap
        self.__features = {}

    @property
    def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
        # Extracted results: {audio path: [(embedding, position, channel), ...]}.
        return self.__features

    def extract(self):
        """Run the feeder and extractor threads to completion, printing progress.

        Blocks until every file in the path list has been processed; results
        are then available via the ``features`` property.
        """
        print("Starting feature extraction for", len(self.__audio_paths_list), "file(s)")
        total_amount = len(self.__audio_paths_list)
        t_start = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            self.__feeder_future = executor.submit(self.__audio_queue_feeder)
            self.__extractor_future = executor.submit(self.__audio_queue_feature_extractor)
            # NOTE(review): Future.running() is False for a still-pending
            # future, so this progress loop can exit before the workers even
            # start; the `with` block's implicit shutdown(wait=True) is what
            # actually guarantees completion. Consider `while not (done() and
            # done())` if accurate progress display matters.
            while (self.__feeder_future.running() or self.__extractor_future.running()):
                print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W{len(self.__audio_paths_list)})", end="\r")
                time.sleep(1)

        t_stop = time.perf_counter()
        print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W:{len(self.__audio_paths_list)} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum( [len(self.__features[path]) for path in self.__features] )
        print()
        print("Extraction completed")
        print(f"Took {delta_t} seconds. Added {total_features} vectors/embeddings")
|
||||
|
||||
Reference in New Issue
Block a user