I'm burnt out, I can't get multithreaded audio feature extractor to work :(

2025-04-19 17:47:09 +02:00
parent b855b7e255
commit 37b6a3c5e7
9 changed files with 563 additions and 180 deletions
--- a/mtafe_lab/audiopreprocessing.py
+++ b/mtafe_lab/audiopreprocessing.py
@@ -0,0 +1,95 @@
+import librosa
+import pickle
+import os
+import numpy as np
+from pathlib import Path
+import logging
+
+# Module-level logger, named after this module via __name__
+logger = logging.getLogger(__name__)
+
+def triggerlog():
+    """Emit a single CRITICAL-level test record through this module's logger."""
+    test_message = "Testing: info"
+    logger.critical(test_message)
+
+def resample_load(input_path : Path, target_sr : int = 16000, mono_audio : bool = False) -> np.ndarray: # AI
+    """Read an audio file and return its samples at `target_sr`.
+
+    Args:
+        input_path (Path): pathlib.Path object pointing to the audio file
+        target_sr (int, optional): Sample rate the returned audio should have. Defaults to 16000.
+        mono_audio (bool, optional): Passed to librosa as `mono`; load the audio in mono mode. Defaults to False.
+
+    Returns:
+        np.ndarray: The loaded samples, resampled only when the file's own
+            sample rate differs from `target_sr`.
+    """
+    logger.info(f"[resample_load] Loading audio {input_path}")
+    # sr=None keeps the file's original sample rate so it can be compared below
+    samples, native_sr = librosa.load(input_path, sr=None, mono=mono_audio)
+
+    # Nothing to do when the file is already at the requested rate
+    if native_sr == target_sr:
+        return samples
+
+    logger.info(f"[resample_load] Resampling to {target_sr}")
+    return librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)
+
+def chunk_audio(audio : np.ndarray, sr: int, chunk_length: float = 10.0, overlap: float = 2.0) -> tuple[list[np.ndarray], list[float], int]: # AI
+    """
+    Chunks loaded audio into overlapping segments. Only pass in mono audio here.
+
+    Only windows that fit completely inside the audio are produced, so samples
+    after the last full window are not part of any chunk. If not even one
+    window fits, the whole audio is returned as a single chunk at position 0.0.
+
+    Args:
+        audio: Loaded audio ndarray (one channel only)
+        sr: Sample rate of `audio`
+        chunk_length: Length of each chunk in seconds
+        overlap: Overlap between consecutive chunks in seconds; must be
+            smaller than `chunk_length`
+
+    Returns:
+        List of audio chunks, list of chunk start positions in seconds, and given sample rate
+
+    Raises:
+        ValueError: If `chunk_length` and `overlap` leave a hop of less than
+            one sample between consecutive chunk starts.
+    """
+    logger.info(f"[chunk_audio] Chunking audio ({len(audio) / sr}s)")
+    # Calculate chunk size and hop length in samples
+    chunk_size = int(chunk_length * sr)
+    hop_length = int((chunk_length - overlap) * sr)
+    if hop_length <= 0:
+        # A zero hop makes range() fail with an unrelated-looking message and a
+        # negative hop silently produces no windows at all; reject both here.
+        raise ValueError(
+            f"hop length must be at least one sample: overlap ({overlap}s) "
+            f"must be smaller than chunk_length ({chunk_length}s)"
+        )
+
+    # Start offsets (in samples) of every window that fits inside the audio
+    starts = range(0, len(audio) - chunk_size + 1, hop_length)
+    chunks = [audio[start:start + chunk_size] for start in starts]
+    positions = [start / sr for start in starts]
+
+    if not chunks: # The full audio length is less than chunk_length
+        chunks = [audio]
+        positions = [0.0]
+        logger.info(f"[chunk_audio] Audio less than chunk_length. Returning original audio as chunk\r")
+    else:
+        logger.info(f"[chunk_audio] Audio is split into {len(chunks)} chunks")
+
+    return chunks, positions, sr
+
+def load_preprocessed_audio(
+    path: Path,
+    desired_sr: int,
+    mono: bool = False,
+    chunk_length: float = 15.0,
+    overlap: float = 2.0) -> list[tuple[np.ndarray, float, int]]:
+    """Load an audio file, resample it and split every channel into chunks.
+
+    Args:
+        path: pathlib.Path object to the audio file
+        desired_sr: Sample rate the audio is resampled to before chunking
+        mono: Load the audio in mono mode. Defaults to False.
+        chunk_length: Length of each chunk in seconds
+        overlap: Overlap between chunks in seconds
+
+    Returns:
+        A list of `(chunk, position, channel)` tuples, where `position` is the
+        chunk position reported by `chunk_audio` and `channel` is the channel
+        index, or -1 when the audio was handled as a single mono channel.
+    """
+    # Load and resample audio
+    audio = resample_load(path, desired_sr, mono) # Stereo 2D matrix, Mono 1D array
+    if mono or (audio.ndim == 1):
+        # Mono requested, or the loaded file only has one channel: -1 is the mono indicator
+        channels = [(-1, audio)]
+    else:
+        # Stereo/multichannel: iterate channel by channel
+        channels = enumerate(audio)
+
+    result = []
+    for channel_id, channel_audio in channels:
+        chunks, positions, _ = chunk_audio(channel_audio, desired_sr, chunk_length, overlap)
+        assert len(chunks) == len(positions)
+        # (ndarray_chunk1, pos1, channel_id): first audio chunk, position1, channel indicator
+        result.extend((chunk, position, channel_id) for chunk, position in zip(chunks, positions))
+    # Use the module logger like the other functions in this file, not the root logger
+    logger.info(f"[load_preprocessed_audio] Loaded audio {path} ({desired_sr}Hz, Chunk {chunk_length}s with overlap {overlap}s) MONO:{mono}")
+    return result
--- a/mtafe_lab/dataset.py
+++ b/mtafe_lab/dataset.py
@@ -0,0 +1,135 @@
+import platform
+import os
+import pickle
+import random
+import multiprocessing
+import threading
+import time
+import concurrent.futures
+import numpy as np
+from pathlib import Path
+import audiopreprocessing
+import logging
+import queue
+
+def serialize_dict_obj(path : Path, object : dict) -> int:
+    """Pickle a dictionary into a file and report how large the file is.
+
+    Args:
+        path (Path): Path of the file to write (opened in binary write mode)
+        object (dict): Dictionary object to serialize
+    Returns:
+        int: file position at the end of the written file, i.e. its size in bytes
+    """
+    # NOTE: pickle is used for convenience only; it is not a safe format for
+    # data that might later be loaded from an untrusted source.
+    with path.open("wb") as handle:
+        pickle.dump(object, handle)
+        # Move to the end of the file so tell() reports the total size
+        handle.seek(0, os.SEEK_END)
+        return handle.tell()
+
+logging.info("Reading local dataset directory structure...")
+
+# Roots of the three dataset subsets: the Linux mount points when running on
+# Linux, otherwise the Windows drive locations.
+if platform.system() == 'Linux':
+    ASMROnePath = Path('/mnt/Scratchpad/ASMROne')
+    ASMRTwoPath = Path('/mnt/MyStuffz/ASMRTwo')
+    ASMRThreePath = Path('/mnt/Windows11/ASMRThree')
+else:
+    ASMROnePath = Path("E:\\ASMROne")
+    ASMRTwoPath = Path("D:\\ASMRTwo")
+    ASMRThreePath = Path("C:\\ASMRThree")
+
+def _scan_subset(subset_root : Path) -> tuple[list[Path], list[Path], int]:
+    """Walk one dataset subset and collect its folders, files and total file size.
+
+    Args:
+        subset_root (Path): Root directory of the subset to walk
+
+    Returns:
+        tuple: (folders below the root, all files found, sum of file sizes in bytes)
+    """
+    folders, found_files, total_size = [], [], 0
+    subset_abs = subset_root.absolute()
+    for root, _dirs, fnames in subset_root.walk(): # Root will iterate through all folders
+        if root.absolute() != subset_abs: # Skip the subset root itself
+            folders.append(root)
+        for fname in fnames: # Iterate through all files in current root
+            file = root/fname
+            assert file.is_file()
+            found_files.append(file)
+            total_size += file.stat().st_size
+    return folders, found_files, total_size
+
+# Statistic calculation for ASMROne, ASMRTwo and ASMRThree
+folders_one, files_one, size_one = _scan_subset(ASMROnePath)
+folders_two, files_two, size_two = _scan_subset(ASMRTwoPath)
+folders_three, files_three, size_three = _scan_subset(ASMRThreePath)
+        
+DataSubsetPaths = [ASMROnePath, ASMRTwoPath, ASMRThreePath]
+# Collect ASMR Works (RJ ID, Paths): every direct child of each subset root
+DLSiteWorksPaths = [
+    work_path
+    for subset_path in DataSubsetPaths
+    for work_path in subset_path.iterdir()
+]
+        
+# Maps an upper-cased file suffix (including the leading dot, "" for files
+# without a suffix) to the media category used in the extension statistics.
+fileExt2fileType = {
+    ".TXT": "Document",
+    ".WAV": "Audio",
+    ".MP3": "Audio",
+    ".PNG": "Image",
+    ".JPG": "Image",
+    ".VTT": "Subtitle",
+    ".PDF": "Document",
+    ".FLAC": "Audio",
+    ".MP4": "Video",
+    ".LRC": "Subtitle",
+    ".SRT": "Subtitle",
+    ".JPEG": "Image",
+    ".ASS": "Subtitle",
+    "": "NO EXTENSION",
+    ".M4A": "Audio",
+    ".MKV": "Video"
+}
+# Per-extension statistics, keyed by upper-cased file suffix. Each record holds
+# 'Count', 'List' (the file paths), 'ExtensionMass' (sum of file sizes in
+# bytes) and 'MediaType'.
+fileext_stat = {}
+file_list = files_one + files_two + files_three
+file_list_count = len(file_list)
+
+for file in file_list:
+    f_ext = file.suffix.upper()
+    if f_ext not in fileext_stat:
+        # First file with this extension: create an empty record for it
+        fileext_stat[f_ext] = {
+            'Count': 0,
+            'List': [],
+            'ExtensionMass': 0, # The total sum of sizes of the same file extension
+            # An extension missing from fileExt2fileType is labelled "Unknown"
+            # instead of aborting the whole import with a KeyError.
+            'MediaType': fileExt2fileType.get(f_ext, "Unknown"),
+        }
+    ext_record = fileext_stat[f_ext]
+    ext_record['Count'] += 1
+    ext_record['List'].append(file)
+    ext_record['ExtensionMass'] += file.stat().st_size
+
+# Flatten the file lists of every extension whose media type is "Audio"
+audio_paths = [
+    audio_file
+    for ext_record in fileext_stat.values()
+    if ext_record['MediaType'] == "Audio"
+    for audio_file in ext_record['List']
+]
+        
+def random_audio_chunk(n : int, seed : int = 177013) -> list[Path]:
+    """Returns a random selection of audio files without repeated elements
+
+    Args:
+        n (int): Amount of files to return
+        seed (int, optional): Seed for RNG. Defaults to 177013.
+
+    Returns:
+        list[Path]: List of randomly selected audio paths (using Path object)
+
+    Raises:
+        ValueError: If `n` is negative or larger than the number of known audio files
+    """
+    # A private generator gives the same selection for the same seed without
+    # reseeding the process-wide `random` state as a side effect.
+    rng = random.Random(seed)
+    # sample() rather than choices(): choices() may return repeated elements
+    return rng.sample(audio_paths, k=n)
--- a/mtafe_lab/mtafe.py
+++ b/mtafe_lab/mtafe.py
@@ -0,0 +1,32 @@
+import logging
+logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.INFO)
+
+import multiprocessing
+import multiprocessing.process
+import dataset
+import audiopreprocessing
+from pathlib import Path
+
+def copy_worker(origin_queue, target_queue):
+    """Take one audio path from `origin_queue`, preprocess it and put the result on `target_queue`.
+
+    The audio is loaded at 32000 Hz in mono mode. The function handles exactly
+    one path and then returns.
+    """
+    audio_path = origin_queue.get()
+    logging.info(f"Processing: {audio_path}")
+    preprocessed = audiopreprocessing.load_preprocessed_audio(audio_path, 32000, True)
+    print("Preprocess complete, putting it into queue")
+    # Per the multiprocessing docs, put() hands the object to a background
+    # feeder thread and the process will not exit until that data has been
+    # flushed to the pipe. The parent therefore has to read from target_queue
+    # before it joins this process, otherwise the process appears to hang here.
+    target_queue.put(preprocessed)
+
+if __name__ == "__main__":
+    # Number of audio files to process. One worker process is started per
+    # file because copy_worker handles exactly one path and then returns.
+    n_files = 1
+
+    audio_path_queue = multiprocessing.Queue()
+    audio_queue = multiprocessing.Queue()
+
+    rand_paths = dataset.random_audio_chunk(n_files)
+    for p in rand_paths:
+        audio_path_queue.put(p)
+
+    print("Files queued")
+
+    processes = [multiprocessing.Process(target=copy_worker, args=(audio_path_queue, audio_queue)) for _ in range(n_files)]
+    for p in processes: p.start()
+
+    # Receive the results BEFORE joining. A process that has put data on a
+    # multiprocessing.Queue does not terminate until that data has been flushed
+    # to the underlying pipe, so joining first deadlocks as soon as a result is
+    # larger than the pipe buffer: the child waits for a reader, the parent
+    # waits for the child.
+    # NOTE: get() blocks forever if a worker dies before calling put().
+    results = [audio_queue.get() for _ in processes]
+
+    for p in processes: p.join()
+
+    print("Joined")
+    for chunks in results:
+        logging.info(f"Received {len(chunks)} preprocessed chunks")
--- a/mtafe_lab/test_mp.py
+++ b/mtafe_lab/test_mp.py
@@ -0,0 +1,30 @@
+import logging
+logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.INFO)
+
+import multiprocessing
+from dataset import random_audio_chunk
+import audiopreprocessing
+from time import sleep
+
+# origin_queue carries the audio paths to process, target_queue carries the
+# preprocessed results back; both are handed to the workers as arguments.
+origin_queue = multiprocessing.Queue()
+target_queue = multiprocessing.Queue()
+
+def worker(orig, targ):
+    """Take one audio path from `orig`, preprocess it and put the result on `targ`.
+
+    The audio is loaded at 16000 Hz in mono mode.
+    """
+    audio_path = orig.get()
+    preprocessed = audiopreprocessing.load_preprocessed_audio(audio_path, 16000, True)
+    # Per the multiprocessing docs, put() hands the object to a background
+    # feeder thread and the process will not exit until that data has been
+    # flushed to the pipe. The parent therefore has to read from `targ` before
+    # it joins this process, otherwise the process appears to hang here.
+    targ.put(preprocessed)
+
+if __name__ == "__main__":
+    # Number of audio files; one worker process is started per file because
+    # each worker handles exactly one path.
+    K = 2
+
+    for p in random_audio_chunk(K):
+        origin_queue.put(p)
+
+    processes = [multiprocessing.Process(target=worker, args=(origin_queue, target_queue)) for _ in range(K)]
+    for p in processes: p.start()
+
+    # Receive the results BEFORE joining. A process that has put data on a
+    # multiprocessing.Queue does not terminate until that data has been flushed
+    # to the underlying pipe, so joining first deadlocks as soon as a result is
+    # larger than the pipe buffer (a short string fits, a list of audio arrays
+    # does not).
+    # NOTE: get() blocks forever if a worker dies before calling put().
+    results = [target_queue.get() for _ in range(K)]
+
+    for p in processes: p.join()
+
+    logging.critical("Successfully terminated all threads")
+
+    for result in results: print(result)
--- a/mtafe_lab/test_mtafe.py
+++ b/mtafe_lab/test_mtafe.py
@@ -0,0 +1,21 @@
+import logging
+logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.INFO)
+
+import mtafe
+from dataset import random_audio_chunk
+
+# Script: pick two random audio files and pass them to mtafe together with
+# the queue, thread, sample-rate and chunking settings below, then run
+# mtafe.test_feeder() (presumably a check of the feeder stage — confirm).
+# NOTE(review): mtafe_lab/mtafe.py as added in this same change defines only
+# copy_worker; initialize_parameters() and test_feeder() are not defined
+# there, so these calls would raise AttributeError against it. They presumably
+# target a different revision of mtafe — confirm before running.
+logging.info("Generating random audio path list")
+rdpl = random_audio_chunk(2)
+
+logging.info("Initializing MTAFE")
+mtafe.initialize_parameters(
+        paudio_paths=rdpl,
+        pmax_audio_in_queue=4,
+        paudio_feeder_threads=2,
+        pfeature_extractor_threads=1,
+        pdesired_sr=32000,
+        pforce_mono=False,
+        pchunk_length=15,
+        pchunk_overlap=2
+)
+mtafe.test_feeder()