I'm burnt out, I can't get the multithreaded audio feature extractor to work :(

2025-04-19 17:47:09 +02:00
parent b855b7e255
commit 37b6a3c5e7
9 changed files with 563 additions and 180 deletions
--- a/FeatureExtraction/dataset_files.py
+++ b/FeatureExtraction/dataset_files.py
@@ -2,6 +2,7 @@ import platform
 import os
 import pickle
 import random
+import multiprocessing
 import threading
 import time
 import concurrent.futures
@@ -133,155 +134,159 @@ def random_audio_chunk(n : int, seed : int = 177013) -> list[Path]:
    #return random.choices(audio_paths, k=n) # Contains repeated elements
    return random.sample(audio_paths, k=n)

-class AudioFeatureExtractor():
-    __audio_queue: list[ # List of ...
-        tuple[ # Pair of chunked audio and its path
-            list[tuple[np.ndarray, float, int]], # Chunked audio
-            Path # Path to original audio
-        ]
-    ] # Listed of Chunked/Resampled audio
-    __feeder_future: concurrent.futures.Future
-    __extractor_future: concurrent.futures.Future
-    __audio_paths_list: list[Path]
-    __max_audio_in_queue: int
-    __queue_lock: threading.Lock
-    __desired_sr: int
-    __mono: bool
-    __chunk_length: float
-    __overlap: float
-    __features: dict[Path, list[tuple[np.ndarray, float, int]]] # This is a crime, I know
-    # { audioPath:
-    #   [(embedding, pos, channel)...]
-    # }
+# class AudioFeatureExtractor():
+#     __audio_queue: list[ # List of ...
+#         tuple[ # Pair of chunked audio and its path
+#             list[tuple[np.ndarray, float, int]], # Chunked audio
+#             Path # Path to original audio
+#         ]
+#     ] # Listed of Chunked/Resampled audio
+#     __feeder_future: concurrent.futures.Future
+#     __extractor_future: concurrent.futures.Future
+#     __audio_paths_list: list[Path]
+#     __max_audio_in_queue: int
+#     __queue_lock: threading.Lock
+#     __desired_sr: int
+#     __mono: bool
+#     __chunk_length: float
+#     __overlap: float
+#     __features: dict[Path, list[tuple[np.ndarray, float, int]]] # This is a crime, I know
+#     # { audioPath:
+#     #   [(embedding, pos, channel)...]
+#     # }
    
-    def __embedding_inference(self, audio_ndarray: np.ndarray) -> np.ndarray:
-        """Uses embedding model to inference an audio. Returns embedding vectors.
-        Function to be overrided. Returns np.zeros(32).
+#     def __embedding_inference(self, audio_ndarray: np.ndarray) -> np.ndarray:
+#         """Uses embedding model to inference an audio. Returns embedding vectors.
+#         Function to be overrided. Returns np.zeros(32).

-        Args:
-            audio_ndarray (np.ndarray): 
+#         Args:
+#             audio_ndarray (np.ndarray): 

-        Returns:
-            np.ndarray: _description_
-        """
-        return np.zeros(32)
+#         Returns:
+#             np.ndarray: _description_
+#         """
+#         return np.zeros(32)
    
-    def __embedding_extract(self, audio: tuple[np.ndarray, float, int]) -> tuple[np.ndarray, float, int, np.ndarray]:
-        """Receives a tuple of audio, position, and channel ID, then adding the embedding to the tuple
+#     def __embedding_extract(self, audio: tuple[np.ndarray, float, int]) -> tuple[np.ndarray, float, int, np.ndarray]:
+#         """Receives a tuple of audio, position, and channel ID, then adding the embedding to the tuple

-        Args:
-            audio (tuple[np.ndarray, float, int]): tuple of audio, position, channel id
+#         Args:
+#             audio (tuple[np.ndarray, float, int]): tuple of audio, position, channel id

-        Returns:
-            tuple[np.ndarray, float, int, np.ndarray]: audio, position, channel id, embedding vector
-        """
-        audio_chunk, pos, channel_id = audio
-        return (audio_chunk, pos, channel_id, self.__embedding_inference(audio_chunk))
+#         Returns:
+#             tuple[np.ndarray, float, int, np.ndarray]: audio, position, channel id, embedding vector
+#         """
+#         audio_chunk, pos, channel_id = audio
+#         return (audio_chunk, pos, channel_id, self.__embedding_inference(audio_chunk))
    
-    def __audio_queue_feeder(self): # TODO: Upgrade to multithreaded loader?
-        """Internal thread function. Preprocess and load the audio continuously to
-        audio_queue until the end of the audio_paths_list
-        """
-        while (self.__audio_paths_list): # While there are still Path elements in path list
-            if (not (len(self.__audio_queue) < self.__max_audio_in_queue)):
-                logging.info("[AFE] [Audio Queue Thread]: Queue Full, feeder thread sleeping for 5 seconds")
-                time.sleep(5)
-            while(len(self.__audio_queue) < self.__max_audio_in_queue): # While the audio queue is not full
-                new_audio_path = self.__audio_paths_list[0]
-                new_audio = audiopreprocessing.load_preprocessed_audio(
-                        new_audio_path,
-                        self.__desired_sr,
-                        self.__mono,
-                        self.__chunk_length,
-                        self.__overlap
-                    )
-                with self.__queue_lock:
-                    self.__audio_queue.append(
-                        (new_audio, new_audio_path)
-                    )
-                    pop_path = self.__audio_paths_list.pop(0)
-                    logging.info(f"[AFE] [Audio Queue Thread]: Added new audio to queue {pop_path}")
-        logging.info("[AFE] [Audio Queue Thread]: DONE. All audio files fed")
+#     def __audio_queue_feeder(self): # TODO: Upgrade to multithreaded loader?
+#         """Internal thread function. Preprocess and load the audio continuously to
+#         audio_queue until the end of the audio_paths_list
+#         """
+#         while (self.__audio_paths_list): # While there are still Path elements in path list
+#             if (not (len(self.__audio_queue) < self.__max_audio_in_queue)):
+#                 logging.info("[AFE] [Audio Queue Thread]: Queue Full, feeder thread sleeping for 5 seconds")
+#                 time.sleep(5)
+#             while(len(self.__audio_queue) < self.__max_audio_in_queue): # While the audio queue is not full
+#                 new_audio_path = self.__audio_paths_list[0]
+#                 new_audio = audiopreprocessing.load_preprocessed_audio(
+#                         new_audio_path,
+#                         self.__desired_sr,
+#                         self.__mono,
+#                         self.__chunk_length,
+#                         self.__overlap
+#                     )
+#                 with self.__queue_lock:
+#                     self.__audio_queue.append(
+#                         (new_audio, new_audio_path)
+#                     )
+#                     pop_path = self.__audio_paths_list.pop(0)
+#                     logging.info(f"[AFE] [Audio Queue Thread]: Added new audio to queue {pop_path}")
+#         logging.info("[AFE] [Audio Queue Thread]: DONE. All audio files fed")
                    
-    def __audio_queue_feature_extractor(self):
-        """Internal thread function. Get audio from audio queue. And extract embedding vector
-        for all audio chunks. Stores the resulting embedding into self.__features.
-        With Original Audio's Path as key, and list[tuple[np.ndarray, float, int]] (list of tuple of embedding vector, position, channel id)
-        """
-        while (self.__audio_paths_list or self.__audio_queue): # While there are still audio to be processed
-            if (self.__audio_queue): # If audio queue is not empty
-                with self.__queue_lock: 
-                    audio_to_process, audio_path = self.__audio_queue.pop(0) # Get audio from queue
-                    logging.info(f"[AFE] [Feature Extractor Thread]: Extracting {len(audio_to_process)} features from audio {audio_path}")
-                for audio_chunk in audio_to_process:
-                    same_audio_chunk, timepos, channel_id, embedd_vect = self.__embedding_extract(audio_chunk)
-                    if (audio_path not in self.__features.keys()):
-                        #if DEBUG: print("Adding new vector to", audio_path.name)
-                        self.__features[audio_path] = [(embedd_vect, timepos, channel_id)]
-                    else:
-                        #if DEBUG: print("Adding vector to", audio_path.name)
-                        self.__features[audio_path].append(
-                            (embedd_vect, timepos, channel_id)
-                        )
-            else:
-                logging.info("[AFE] [Feature Extractor Thread]: Queue Empty, extractor thread sleeping for 5 seconds") # If audio queue is empty, wait
-                time.sleep(5)
-        logging.info("[AFE] [Feature Extractor Thread]: DONE. Extracted all features from all audio files")
+#     def __audio_queue_feature_extractor(self):
+#         """Internal thread function. Get audio from audio queue. And extract embedding vector
+#         for all audio chunks. Stores the resulting embedding into self.__features.
+#         With Original Audio's Path as key, and list[tuple[np.ndarray, float, int]] (list of tuple of embedding vector, position, channel id)
+#         """
+#         while (self.__audio_paths_list or self.__audio_queue): # While there are still audio to be processed
+#             if (self.__audio_queue): # If audio queue is not empty
+#                 with self.__queue_lock: 
+#                     audio_to_process, audio_path = self.__audio_queue.pop(0) # Get audio from queue
+#                     logging.info(f"[AFE] [Feature Extractor Thread]: Extracting {len(audio_to_process)} features from audio {audio_path}")
+#                 for audio_chunk in audio_to_process:
+#                     same_audio_chunk, timepos, channel_id, embedd_vect = self.__embedding_extract(audio_chunk)
+#                     if (audio_path not in self.__features.keys()):
+#                         #if DEBUG: print("Adding new vector to", audio_path.name)
+#                         self.__features[audio_path] = [(embedd_vect, timepos, channel_id)]
+#                     else:
+#                         #if DEBUG: print("Adding vector to", audio_path.name)
+#                         self.__features[audio_path].append(
+#                             (embedd_vect, timepos, channel_id)
+#                         )
+#             else:
+#                 logging.info("[AFE] [Feature Extractor Thread]: Queue Empty, extractor thread sleeping for 5 seconds") # If audio queue is empty, wait
+#                 time.sleep(5)
+#         logging.info("[AFE] [Feature Extractor Thread]: DONE. Extracted all features from all audio files")
            
-    def __init__(
-            self, 
-            audio_paths_list: list[Path],
-            max_audio_in_queue: int,
-            desired_sr: int,
-            mono: bool,
-            chunk_length: float = 15.0,
-            overlap: float = 2.0
-        ):
-        self.__audio_queue = []
-        self.__audio_paths_list = audio_paths_list
-        self.__max_audio_in_queue = max_audio_in_queue
-        self.__queue_lock = threading.Lock()
-        self.__desired_sr = desired_sr
-        self.__mono = mono
-        self.__chunk_length = chunk_length
-        self.__overlap = overlap
-        self.__features = {}
+#     def __init__(
+#             self, 
+#             audio_paths_list: list[Path],
+#             max_audio_in_queue: int,
+#             desired_sr: int,
+#             mono: bool,
+#             chunk_length: float = 15.0,
+#             overlap: float = 2.0
+#         ):
+#         self.__audio_queue = []
+#         self.__audio_paths_list = audio_paths_list
+#         self.__max_audio_in_queue = max_audio_in_queue
+#         self.__queue_lock = threading.Lock()
+#         self.__desired_sr = desired_sr
+#         self.__mono = mono
+#         self.__chunk_length = chunk_length
+#         self.__overlap = overlap
+#         self.__features = {}
    
-    @property
-    def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
-        return self.__features
+#     @property
+#     def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
+#         return self.__features
    
-    def extract(self):
-        print("Starting feature extraction for", len(self.__audio_paths_list), "file(s)")
-        total_amount = len(self.__audio_paths_list)
-        t_start = time.perf_counter()
-        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
-            self.__feeder_future = executor.submit(self.__audio_queue_feeder)
-            self.__extractor_future = executor.submit(self.__audio_queue_feature_extractor)
-            while (self.__feeder_future.running() or self.__extractor_future.running()):
-                print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W{len(self.__audio_paths_list)})", end="\r")
-                time.sleep(1)
+#     def extract(self):
+#         print("Starting feature extraction for", len(self.__audio_paths_list), "file(s)")
+#         total_amount = len(self.__audio_paths_list)
+#         t_start = time.perf_counter()
+#         with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+#             self.__feeder_future = executor.submit(self.__audio_queue_feeder)
+#             self.__extractor_future = executor.submit(self.__audio_queue_feature_extractor)
+#             while (self.__feeder_future.running() or self.__extractor_future.running()):
+#                 print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W{len(self.__audio_paths_list)})", end="\r")
+#                 time.sleep(1)
               
-        t_stop = time.perf_counter()
-        print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W:{len(self.__audio_paths_list)} COMPLETE)")
-        delta_t = t_stop - t_start
-        total_features = sum( [len(self.__features[path]) for path in self.__features] ) 
-        print()
-        print("Extraction completed")
-        print(f"Took {delta_t} seconds. Added {total_features} vectors/embeddings")
+#         t_stop = time.perf_counter()
+#         print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W:{len(self.__audio_paths_list)} COMPLETE)")
+#         delta_t = t_stop - t_start
+#         total_features = sum( [len(self.__features[path]) for path in self.__features] ) 
+#         print()
+#         print("Extraction completed")
+#         print(f"Took {delta_t} seconds. Added {total_features} vectors/embeddings")

 class MultiThreadedAudioFeatureExtractor():
+    # This is the third time I am rewriting this, please send help. Multithreaded apps are pure hell to develop and debug.
+    # After testing: this hangs on the last audio file, precisely while it is being preprocessed. I suspect that the GIL hurts performance
+    # so much that the preprocessing routine cannot get any share of the CPU execution time.
    __audio_queue: queue.Queue[ # List of ...
        tuple[ # Pair of chunked audio and its path
-            list[tuple[np.ndarray, float, int]], # Chunked audio
+            list[tuple[np.ndarray, float, int]], # Chunked audio list of (ndarray, time position of chunk relative to original audio, channel_id)
            Path # Path to original audio
        ]
    ] # Listed of Chunked/Resampled audio
-    __audio_loader_threads: int # Amount of audio feeder threads
+    __audio_feeder_threads: int # Amount of audio feeder threads
    __feature_extractor_threads: int # Amount of feature extractor threads (if the method allows)
    __audio_paths_list: queue.Queue[Path] # Path list to audio
    __max_audio_in_queue: int # Maximum audio in queue
-    # Audio Feeeder parameter
+    __audio_feeder_barrier: threading.Barrier # Synchronization barrier for all audio feeder threads
+    # Audio Feeder parameter
    __desired_sr: int # Desired Sample Rate (Resampling)
    __mono: bool # Force load audio in mono mode
    __chunk_length: float # Audio chunk length
@@ -295,9 +300,8 @@ class MultiThreadedAudioFeatureExtractor():
    # ...
    # }
    # Runtime
-    __audio_loader_threadpool: list[concurrent.futures.Future]
+    __audio_feeder_threadpool: list[concurrent.futures.Future]
    __feature_extractor_threadpool: list[concurrent.futures.Future]
-    __audio_feed_condition: threading.Condition
    
    def __audio_inference_embedding(self, audio: list[tuple[np.ndarray, float, int]]) -> list[tuple[np.ndarray, float, int]]:
        """Receives a list of audio chunks, and then extracts embeddings for all audio chunks, returns the resulting embedding as a list of tuples(embedding, time, channel_id)
@@ -313,18 +317,21 @@ class MultiThreadedAudioFeatureExtractor():
            audio, timepos, channel_id = audio_chunk
            zero = np.zeros(32)
            features.append( (zero, timepos, channel_id) )
-        time.sleep(0.01)
+        time.sleep(0.01) # Simulate effort, change to simulate spent seconds in each audio file
        return features
        # To be overridden
    
-    def __audio_feeder_thread(self, thread_id):
-        # If there is still audio in paths list
-        # Is the audio queue not full?
-        while (not self.__audio_paths_list.empty()):
-            if (not self.__audio_queue.full()):
-                # Feed audio
+    def __audio_feeder_thread(self, thread_id: int, barrier: threading.Barrier):
+        try:
+            while True:
+                # Attempt to get audio path from audio path queue 
                new_audio_path = self.__audio_paths_list.get()
-                self.__audio_paths_list.task_done()
+                # Check thread exit condition (If the queue returns None, that means the audio path queue is now empty and the thread should end itself)
+                if (new_audio_path is None):
+                    self.__audio_paths_list.put(new_audio_path) # Put None back to notify other audio feeder threads
+                    # Sentinel received ("you are already dead"): no paths are left, so this feeder leaves the loop
+                    break # Si la ETSISI ve esto seguramente me echarán de la escuela
+                # Now that the audio path queue is not empty, try preprocessing an audio
                logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Preprocess: {new_audio_path.absolute()}")
                new_audio = audiopreprocessing.load_preprocessed_audio(
                    new_audio_path,
@@ -333,17 +340,34 @@ class MultiThreadedAudioFeatureExtractor():
                    self.__chunk_length,
                    self.__overlap
                )
-                self.__audio_queue.put((new_audio, new_audio_path))
-                #self.__audio_queue.task_done()
-                #with self.__audio_feed_condition: self.__audio_feed_condition.notify_all()
+                self.__audio_queue.put((new_audio, new_audio_path)) # In theory, this should block this audio feeder thread when the audio queue is full
                logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Feed: {new_audio_path.absolute()}")
-            #else:
-            #    logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Audio queue full ({self.__audio_queue.qsize()} <= {self.__max_audio_in_queue} FALSE): waiting")
-            #    with self.__audio_feed_condition:
-            #        logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Audio queue full: waiting")
-            #        self.__audio_feed_condition.wait_for(lambda: not self.__audio_queue.qsize() <= self.__max_audio_in_queue) # This consumes way too much CPU power
-            #        self.__audio_feed_condition.wait(10)
-        logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")
+            logging.info("[MTAFE] [Audio Feeder {thread_id}] Waiting for other threads to finish")
+            barrier.wait()
+            if (thread_id == 0):
+                self.__audio_queue.put(None) # None to signal audio_queue has no more elements to process
+            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")
+        except Exception as e:
+            logging.error(f"[MTAFE] [Audio Feeder {thread_id}] An exception occurred! Committing seppuku!")
+            logging.exception(e)
+            return
+
+        # while (not self.__audio_paths_list.empty()):
+        #     if (not self.__audio_queue.full()):
+        #         # Feed audio
+        #         new_audio_path = self.__audio_paths_list.get()
+        #         self.__audio_paths_list.task_done()
+        #         logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Preprocess: {new_audio_path.absolute()}")
+        #         new_audio = audiopreprocessing.load_preprocessed_audio(
+        #             new_audio_path,
+        #             self.__desired_sr,
+        #             self.__mono,
+        #             self.__chunk_length,
+        #             self.__overlap
+        #         )
+        #         self.__audio_queue.put((new_audio, new_audio_path))
+        #         logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Feed: {new_audio_path.absolute()}")
+        # logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")

    #def testfeedthread(self, nthreads):
        # t1 = threading.Thread(target=self.__audio_feeder_thread, args=(1,))
@@ -358,7 +382,7 @@ class MultiThreadedAudioFeatureExtractor():
        #         self.__audio_loader_threadpool.append(ft)

    def __check_all_audiofeed_thread_finished(self) -> bool:
-        for ft in self.__audio_loader_threadpool:
+        for ft in self.__audio_feeder_threadpool:
            if ft.running():
                return False
        return True
@@ -370,17 +394,33 @@ class MultiThreadedAudioFeatureExtractor():
        return True
    
    def __feature_extractor_thread(self, thread_id):
-        while (not self.__check_all_audiofeed_thread_finished() or not self.__audio_queue.empty()):
-            if (not self.__audio_queue.empty()):
-                audio_to_process, audio_path = self.__audio_queue.get()
-                self.__audio_queue.task_done()
-                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {audio_path}")
-                features_to_add = self.__audio_inference_embedding(audio_to_process)
-                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracted: {len(features_to_add)} features")
-                with self.__features_lock:
-                    self.__features[audio_path] = features_to_add
-                #with self.__audio_feed_condition: self.__audio_feed_condition.notify_all()
-                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature Extraction complete for {audio_path} w/ {len(features_to_add)} features")
+        while True:
+            # Attempt to get next audio chunks to process
+            next_audio_tuple = self.__audio_queue.get()
+            # Check thread exit condition
+            if (next_audio_tuple is None):
+                self.__audio_queue.put(next_audio_tuple) # Put the None back to notify other threads
+                break # unalive urself
+            else: # Assuming we got more tuples
+                current_audio_to_process, current_audio_path = next_audio_tuple # Deconstruct tuple 
+            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {current_audio_path}")
+            features_to_add = self.__audio_inference_embedding(current_audio_to_process)
+            with self.__features_lock:
+                self.__features[current_audio_path] = features_to_add
+            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature Extraction complete for {current_audio_path} w/ {len(features_to_add)} features")
+        logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Thread finished!")
+            
+        # while (not self.__check_all_audiofeed_thread_finished() or not self.__audio_queue.empty()):
+        #     if (not self.__audio_queue.empty()):
+        #         audio_to_process, audio_path = self.__audio_queue.get()
+        #         self.__audio_queue.task_done()
+        #         logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {audio_path}")
+        #         features_to_add = self.__audio_inference_embedding(audio_to_process)
+        #         logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracted: {len(features_to_add)} features")
+        #         with self.__features_lock:
+        #             self.__features[audio_path] = features_to_add
+        #         #with self.__audio_feed_condition: self.__audio_feed_condition.notify_all()
+        #         logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature Extraction complete for {audio_path} w/ {len(features_to_add)} features")
            #else:
            #    if (not self.__check_all_audiofeed_thread_finished()):
            #        with self.__audio_feed_condition:
@@ -388,14 +428,13 @@ class MultiThreadedAudioFeatureExtractor():
            #            self.__audio_feed_condition.wait(10)
            #            self.__audio_feed_condition.wait_for(lambda: not self.__audio_queue.empty())
        
-        logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Thread finished!")
        
    def __count_running_threads(self) -> tuple[int, int]:
        running_extractors = 0
        running_feeders = 0
        for ft in self.__feature_extractor_threadpool:
            if ft.running(): running_extractors += 1
-        for ft in self.__audio_loader_threadpool:
+        for ft in self.__audio_feeder_threadpool:
            if ft.running(): running_feeders += 1
        return (running_feeders, running_extractors)
    
@@ -404,21 +443,26 @@ class MultiThreadedAudioFeatureExtractor():
        return self.__features
    
    def extract(self):
-        total_amount = self.__audio_paths_list.qsize()
+        total_amount = self.__audio_paths_list.qsize() - 1 # Account for None to indicate queue end
        logging.info(f"[MTAFE] [Main] Starting feature extraction for {total_amount} file(s)")
-        t_start = time.perf_counter()
-        with concurrent.futures.ThreadPoolExecutor(max_workers=(self.__audio_loader_threads + self.__feature_extractor_threads)) as executor:
-            for i in range(self.__audio_loader_threads):
-                ld_ft = executor.submit(self.__audio_feeder_thread, i)
-                self.__audio_loader_threadpool.append(ld_ft)
+        t_start = time.perf_counter() # Timer
+        with concurrent.futures.ProcessPoolExecutor(max_workers=(self.__audio_feeder_threads + self.__feature_extractor_threads)) as executor:
+            # Audio feeder threads
+            for i in range(self.__audio_feeder_threads):
+                logging.info(f"[MTAFE] Started audio feeder thread {i}")
+                ld_ft = executor.submit(self.__audio_feeder_thread, i, self.__audio_feeder_barrier)
+                self.__audio_feeder_threadpool.append(ld_ft)
+            # Feature extractor threads
            for i in range(self.__feature_extractor_threads):
-                ld_ft = executor.submit(self.__feature_extractor_thread, i)
-                self.__feature_extractor_threadpool.append(ld_ft)
+                logging.info(f"[MTAFE] Started feature extractor thread {i}")
+                ex_ft = executor.submit(self.__feature_extractor_thread, i)
+                self.__feature_extractor_threadpool.append(ex_ft)
+            # Progress checking
            while ( (not self.__check_all_audiofeed_thread_finished()) and (not self.__check_all_featureextractor_thread_finished()) ):
                nfeeder, nextract = self.__count_running_threads()
                print(f"[MTAFE Progress] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()}, LD:{nfeeder}/EXT:{nextract})", end="\r")
        t_stop = time.perf_counter()
-        logging.info(f"[MTAFE] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()} COMPLETE)")
+        logging.info(f"[MTAFE] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize() - 1}/W:{self.__audio_paths_list.qsize() - 1} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum( [len(self.__features[path]) for path in self.__features] )
        logging.info(f"[MTAFE] Extraction complete. Took {delta_t} seconds. Added {total_features} vectors/embeddings")
@@ -435,15 +479,15 @@ class MultiThreadedAudioFeatureExtractor():
        chunk_overlap: float = 2.0,
    ):
        # Check if the paths passed in are all valid and add them to queue
-        self.__audio_paths_list = queue.Queue()
+        self.__audio_paths_list = multiprocessing.Queue()
        for p in audio_paths:
            if not p.is_file():
                raise Exception(f"Path '{p.absolute()}' is NOT a valid file!")
            else:
                self.__audio_paths_list.put(p)
-        #self.__audio_paths_list.task_done()
+        self.__audio_paths_list.put(None) # To signal to the producer that the audio path list is empty, since Queue.empty() is unreliable
        
-        logging.info(f"[MTAFE] [Constructor] Queued {self.__audio_paths_list.qsize()} files")
+        logging.info(f"[MTAFE] [Constructor] Queued {self.__audio_paths_list.qsize() - 1} files")
        
        # Set up private attributes
        ## Audio preprocessing parameters
@@ -454,16 +498,16 @@ class MultiThreadedAudioFeatureExtractor():
        
        ## Extractor/Feeder settings
        self.__max_audio_in_queue = max_audio_in_queue
-        self.__audio_loader_threads = audio_feeder_threads
+        self.__audio_feeder_threads = audio_feeder_threads
        self.__feature_extractor_threads = feature_extractor_threads
        
        ## Set up runtime conditions
-        self.__audio_queue = queue.Queue()
+        self.__audio_queue = multiprocessing.Queue(maxsize=self.__max_audio_in_queue)
        self.__features = {}
-        self.__features_lock = threading.Lock()
-        self.__audio_loader_threadpool = []
+        self.__features_lock = multiprocessing.Lock()
+        self.__audio_feeder_barrier = multiprocessing.Barrier(self.__audio_feeder_threads)
+        self.__audio_feeder_threadpool = []
        self.__feature_extractor_threadpool = []
-        self.__audio_feed_condition = threading.Condition()
        
        logging.info(f"[MTAFE] [Constructor] Extraction parameters: {desired_sr}Hz, Mono: {force_mono}, Divide into {chunk_length}s chunks with {chunk_overlap}s of overlap")
        logging.info(f"[MTAFE] [Constructor] Using {audio_feeder_threads} threads for preprocessing audio and {feature_extractor_threads} threads for feature extraction. Max queue size of {max_audio_in_queue} files")
--- a/FeatureExtraction/mtafe.py
+++ b/FeatureExtraction/mtafe.py
@@ -0,0 +1,8 @@
+import dataset_files
+import multiprocessing
+import logging
+import numpy as np
+import threading
+import queue
+from pathlib import Path
+
--- a/FeatureExtraction/test.py
+++ b/FeatureExtraction/test.py
@@ -5,12 +5,13 @@ logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s"

 from dataset_files import MultiThreadedAudioFeatureExtractor, random_audio_chunk
 mtafe = MultiThreadedAudioFeatureExtractor(
-    audio_paths=random_audio_chunk(200),
+    audio_paths=random_audio_chunk(8),
    max_audio_in_queue=8,
    audio_feeder_threads=8,
    feature_extractor_threads=1,
    desired_sr=32000,
    force_mono=False,
    chunk_length=15,
-    chunk_overlap=2)
+    chunk_overlap=2
+)
 mtafe.extract()
--- a/FeatureExtraction/test_mtafe.py
+++ b/FeatureExtraction/test_mtafe.py
@@ -0,0 +1,17 @@
+#import mtafe
+import logging
+#import dataset_files
+logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.DEBUG)
+
+logging.info("Running tests")
+    # m = mtafe.mtafe(
+    #     audio_paths=dataset_files.random_audio_chunk(2),
+    #     max_audio_in_queue=8,
+    #     audio_feeder_threads=8,
+    #     feature_extractor_threads=1,
+    #     desired_sr=32000,
+    #     force_mono=False,
+    #     chunk_length=15,
+    #     chunk_overlap=2
+    # )
+    # m.run()