# DLSiteFSearch/FeatureExtraction/dataset_files.py
import platform
import os
import pickle
import random
import threading
import time
import concurrent.futures
import numpy as np
from pathlib import Path
import audiopreprocessing
import logging
import queue
def serialize_dict_obj(path: Path, obj: dict) -> int:
    """Serializes a Python dictionary to a file via pickle.
    Args:
        path (Path): Path to store the file
        obj (dict): Dictionary object to serialize
    Returns:
        int: Size in bytes written
    """
    # Horrible practice, horrible security, but it will work for now
    with path.open("wb") as fp:
        pickle.dump(obj, fp)
        size = fp.tell()  # pickle.dump leaves the file position at the end of the written data
    return size
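# Hypothetical counterpart sketch (not in the original file): loads a dictionary
# written by serialize_dict_obj. The same pickle caveat applies; only load files you trust.
def deserialize_dict_obj(path: Path) -> dict:
    """Deserializes a dictionary previously written by serialize_dict_obj."""
    with path.open("rb") as fp:
        return pickle.load(fp)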
logging.info("Reading local dataset directory structure...")
ASMRThreePath = Path("C:\\ASMRThree")
ASMRTwoPath = Path("D:\\ASMRTwo")
ASMROnePath = Path("E:\\ASMROne")
if (platform.system() == 'Linux'):
ASMROnePath = Path('/mnt/Scratchpad/ASMROne')
ASMRTwoPath = Path('/mnt/MyStuffz/ASMRTwo')
ASMRThreePath = Path('/mnt/Windows11/ASMRThree')
def scan_subset(subset_path: Path) -> tuple[list[Path], list[Path], int]:
    """Walks a dataset subset, returning its folders, files, and total size in bytes."""
    folders, files_found, total_size = [], [], 0
    for root, dirs, files in subset_path.walk():  # root iterates through all folders
        if root.absolute() != subset_path.absolute():  # Skip the subset root itself
            folders.append(root)
        for fname in files:  # Iterate through all files in the current root
            file = root / fname
            assert file.is_file()
            files_found.append(file)
            total_size += file.stat().st_size
    return folders, files_found, total_size

# Statistic calculation for each subset
folders_one, files_one, size_one = scan_subset(ASMROnePath)
folders_two, files_two, size_two = scan_subset(ASMRTwoPath)
folders_three, files_three, size_three = scan_subset(ASMRThreePath)
DataSubsetPaths = [ASMROnePath, ASMRTwoPath, ASMRThreePath]
DLSiteWorksPaths = []
# Collect ASMR works (one folder per RJ ID)
for ASMRSubsetPath in DataSubsetPaths:
    for WorkPath in ASMRSubsetPath.iterdir():
        DLSiteWorksPaths.append(WorkPath)
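# Hypothetical sketch (assumes a naming convention the code above does not enforce):
# if each work folder is named after its DLSite RJ ID (e.g. "RJ123456"), a lookup
# table from ID to path could be built like this:
# rjid_to_path = {p.name.upper(): p for p in DLSiteWorksPaths if p.name.upper().startswith("RJ")}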
fileExt2fileType = {
".TXT": "Document",
".WAV": "Audio",
".MP3": "Audio",
".PNG": "Image",
".JPG": "Image",
".VTT": "Subtitle",
".PDF": "Document",
".FLAC": "Audio",
".MP4": "Video",
".LRC": "Subtitle",
".SRT": "Subtitle",
".JPEG": "Image",
".ASS": "Subtitle",
"": "NO EXTENSION",
".M4A": "Audio",
".MKV": "Video"
}
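# Example (hypothetical): classify a single file by its extension.
# fileExt2fileType.get(Path("track01.flac").suffix.upper(), "UNKNOWN")  # -> "Audio"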
fileext_stat = {}
file_list = files_one + files_two + files_three
file_list_count = len(file_list)
for file in file_list:
    f_ext = file.suffix.upper()
    f_size = file.stat().st_size  # stat once per file instead of twice
    if f_ext in fileext_stat:
        fileext_stat[f_ext]['Count'] += 1
        fileext_stat[f_ext]['List'].append(file)
        fileext_stat[f_ext]['ExtensionMass'] += f_size
    else:
        fileext_stat[f_ext] = {
            'Count': 1,
            'List': [file],
            'ExtensionMass': f_size,  # Total size of all files sharing this extension
            # Unlisted extensions are tagged instead of raising KeyError
            'MediaType': fileExt2fileType.get(f_ext, "UNKNOWN"),
        }
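# Quick sanity-check sketch (hypothetical, safe to run): prints a per-extension
# summary sorted by total size.
# for ext, stat in sorted(fileext_stat.items(), key=lambda kv: -kv[1]['ExtensionMass']):
#     print(f"{ext or '(none)':>6} {stat['MediaType']:<12} {stat['Count']:>7} files, {stat['ExtensionMass'] / 2**30:.2f} GiB")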
audio_paths = []
for extension in fileext_stat:  # I can't be bothered to convert this into a list comprehension
    if fileext_stat[extension]['MediaType'] == "Audio":
        audio_paths += fileext_stat[extension]['List']
def random_audio_chunk(n: int, seed: int = 177013) -> list[Path]:
    """Returns a random selection of audio files.
    Args:
        n (int): Number of files to return
        seed (int, optional): Seed for RNG. Defaults to 177013.
    Returns:
        list[Path]: List of randomly selected audio paths (as Path objects)
    """
    random.seed(seed)
    # random.choices(audio_paths, k=n) could repeat elements; sample draws without replacement
    return random.sample(audio_paths, k=n)
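# Usage sketch (hypothetical): grab a reproducible subset for quick experiments.
# sample = random_audio_chunk(100)           # default seed => same sample every run
# sample = random_audio_chunk(100, seed=42)  # different seed => different sample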
class AudioFeatureExtractor():
    __audio_queue: list[  # List of ...
        tuple[  # Pair of chunked audio and its path
            list[tuple[np.ndarray, float, int]],  # Chunked audio
            Path  # Path to original audio
        ]
    ]  # List of chunked/resampled audio
__feeder_future: concurrent.futures.Future
__extractor_future: concurrent.futures.Future
__audio_paths_list: list[Path]
__max_audio_in_queue: int
__queue_lock: threading.Lock
__desired_sr: int
__mono: bool
__chunk_length: float
__overlap: float
__features: dict[Path, list[tuple[np.ndarray, float, int]]] # This is a crime, I know
# { audioPath:
# [(embedding, pos, channel)...]
# }
    def __embedding_inference(self, audio_ndarray: np.ndarray) -> np.ndarray:
        """Runs the embedding model on an audio chunk and returns its embedding vector.
        Placeholder to be overridden; this base version returns np.zeros(32).
        Args:
            audio_ndarray (np.ndarray): Audio chunk to embed
        Returns:
            np.ndarray: Embedding vector
        """
        return np.zeros(32)
    def __embedding_extract(self, audio: tuple[np.ndarray, float, int]) -> tuple[np.ndarray, float, int, np.ndarray]:
        """Receives a tuple of (audio, position, channel ID) and appends the embedding to the tuple.
        Args:
            audio (tuple[np.ndarray, float, int]): Tuple of audio, position, channel ID
        Returns:
            tuple[np.ndarray, float, int, np.ndarray]: Audio, position, channel ID, embedding vector
        """
        audio_chunk, pos, channel_id = audio
        return (audio_chunk, pos, channel_id, self.__embedding_inference(audio_chunk))
    def __audio_queue_feeder(self):  # TODO: Upgrade to multithreaded loader?
        """Internal thread function. Continuously preprocesses audio and loads it into
        audio_queue until audio_paths_list is exhausted.
        """
        while self.__audio_paths_list:  # While there are still Path elements in the path list
            if len(self.__audio_queue) >= self.__max_audio_in_queue:
                logging.info("[AFE] [Audio Queue Thread]: Queue full, feeder thread sleeping for 5 seconds")
                time.sleep(5)
            # Guard on the path list too: without it, popping the last path mid-loop
            # would make audio_paths_list[0] raise IndexError
            while self.__audio_paths_list and len(self.__audio_queue) < self.__max_audio_in_queue:
                new_audio_path = self.__audio_paths_list[0]
                new_audio = audiopreprocessing.load_preprocessed_audio(
                    new_audio_path,
                    self.__desired_sr,
                    self.__mono,
                    self.__chunk_length,
                    self.__overlap
                )
                with self.__queue_lock:
                    self.__audio_queue.append(
                        (new_audio, new_audio_path)
                    )
                pop_path = self.__audio_paths_list.pop(0)
                logging.info(f"[AFE] [Audio Queue Thread]: Added new audio to queue {pop_path}")
        logging.info("[AFE] [Audio Queue Thread]: DONE. All audio files fed")
    def __audio_queue_feature_extractor(self):
        """Internal thread function. Takes audio off the audio queue and extracts an embedding
        vector for every audio chunk. Stores the results in self.__features, keyed by the
        original audio's Path, as list[tuple[np.ndarray, float, int]]
        (a list of (embedding vector, position, channel ID) tuples).
        """
        while self.__audio_paths_list or self.__audio_queue:  # While there is still audio to be processed
            if self.__audio_queue:  # If the audio queue is not empty
                with self.__queue_lock:
                    audio_to_process, audio_path = self.__audio_queue.pop(0)  # Get audio from queue
                logging.info(f"[AFE] [Feature Extractor Thread]: Extracting {len(audio_to_process)} features from audio {audio_path}")
                for audio_chunk in audio_to_process:
                    same_audio_chunk, timepos, channel_id, embedd_vect = self.__embedding_extract(audio_chunk)
                    if audio_path not in self.__features:
                        self.__features[audio_path] = [(embedd_vect, timepos, channel_id)]
                    else:
                        self.__features[audio_path].append(
                            (embedd_vect, timepos, channel_id)
                        )
            else:
                logging.info("[AFE] [Feature Extractor Thread]: Queue empty, extractor thread sleeping for 5 seconds")  # If audio queue is empty, wait
                time.sleep(5)
        logging.info("[AFE] [Feature Extractor Thread]: DONE. Extracted all features from all audio files")
def __init__(
self,
audio_paths_list: list[Path],
max_audio_in_queue: int,
desired_sr: int,
mono: bool,
chunk_length: float = 15.0,
overlap: float = 2.0
):
self.__audio_queue = []
self.__audio_paths_list = audio_paths_list
self.__max_audio_in_queue = max_audio_in_queue
self.__queue_lock = threading.Lock()
self.__desired_sr = desired_sr
self.__mono = mono
self.__chunk_length = chunk_length
self.__overlap = overlap
self.__features = {}
@property
def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
return self.__features
    def extract(self):
        total_amount = len(self.__audio_paths_list)
        print("Starting feature extraction for", total_amount, "file(s)")
        t_start = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            self.__feeder_future = executor.submit(self.__audio_queue_feeder)
            self.__extractor_future = executor.submit(self.__audio_queue_feature_extractor)
            while self.__feeder_future.running() or self.__extractor_future.running():
                print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W:{len(self.__audio_paths_list)})", end="\r")
                time.sleep(1)
        t_stop = time.perf_counter()
        print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W:{len(self.__audio_paths_list)} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum(len(chunks) for chunks in self.__features.values())
        print()
        print("Extraction completed")
        print(f"Took {delta_t:.2f} seconds. Added {total_features} vectors/embeddings")
class MultiThreadedAudioFeatureExtractor():
    __audio_queue: queue.Queue[  # Queue of ...
        tuple[  # Pair of chunked audio and its path
            list[tuple[np.ndarray, float, int]],  # Chunked audio
            Path  # Path to original audio
        ]
    ]  # Queue of chunked/resampled audio
    __audio_loader_threads: int  # Number of audio feeder threads
    __feature_extractor_threads: int  # Number of feature extractor threads (if the method allows)
    __audio_paths_list: queue.Queue[Path]  # Queue of paths to audio files
    __max_audio_in_queue: int  # Maximum number of audio files in the queue
    # Audio feeder parameters
    __desired_sr: int  # Desired sample rate (resampling)
    __mono: bool  # Force loading audio in mono
    __chunk_length: float  # Audio chunk length
    __overlap: float  # Overlap between consecutive chunks
# Result
__features: dict[Path, list[tuple[np.ndarray, float, int]]] # This is a crime, I know
__features_lock: threading.Lock
# __features: { audioPath:
# [(embedding1, pos1, channel1),
# (embedding2, pos2, channel1)]
# ...
# }
# Runtime
__audio_loader_threadpool: list[concurrent.futures.Future]
__feature_extractor_threadpool: list[concurrent.futures.Future]
__audio_feed_condition: threading.Condition
    def __audio_inference_embedding(self, audio: list[tuple[np.ndarray, float, int]]) -> list[tuple[np.ndarray, float, int]]:
        """Receives a list of audio chunks and extracts an embedding for each chunk,
        returning the results as a list of (embedding, time, channel_id) tuples.
        To be overridden; this base version returns zero vectors and sleeps to simulate inference.
        Args:
            audio (list[tuple[np.ndarray, float, int]]): List of audio chunks
        Returns:
            list[tuple[np.ndarray, float, int]]: List of (embedding vector, timepos, channel ID)
        """
        features = []
        for audio_chunk in audio:
            chunk_ndarray, timepos, channel_id = audio_chunk  # Don't shadow the 'audio' parameter being iterated
            zero = np.zeros(32)  # Placeholder embedding
            features.append((zero, timepos, channel_id))
            time.sleep(0.01)  # Simulated inference latency
        return features
    def __audio_feeder_thread(self, thread_id):
        """Internal thread function. Pulls paths off the path queue, preprocesses the
        audio, and feeds the result into the audio queue until no paths remain."""
        while not self.__audio_paths_list.empty():
            try:
                # Non-blocking get: another feeder may drain the queue between
                # the empty() check and this call
                new_audio_path = self.__audio_paths_list.get_nowait()
            except queue.Empty:
                break
            self.__audio_paths_list.task_done()
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Preprocess: {new_audio_path.absolute()}")
            new_audio = audiopreprocessing.load_preprocessed_audio(
                new_audio_path,
                self.__desired_sr,
                self.__mono,
                self.__chunk_length,
                self.__overlap
            )
            # put() blocks while the queue is full, which enforces max_audio_in_queue as
            # backpressure. An earlier threading.Condition-based wait was dropped because
            # its wait_for predicate consumed way too much CPU.
            self.__audio_queue.put((new_audio, new_audio_path))
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Feed: {new_audio_path.absolute()}")
        logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")
    def __check_all_audiofeed_thread_finished(self) -> bool:
        """True once no audio feeder thread is still running."""
        return not any(ft.running() for ft in self.__audio_loader_threadpool)
    def __check_all_featureextractor_thread_finished(self) -> bool:
        """True once no feature extractor thread is still running."""
        return not any(ft.running() for ft in self.__feature_extractor_threadpool)
    def __feature_extractor_thread(self, thread_id):
        """Internal thread function. Takes preprocessed audio off the queue, extracts its
        features, and stores them in self.__features until the feeders finish and the queue drains."""
        while not self.__check_all_audiofeed_thread_finished() or not self.__audio_queue.empty():
            try:
                # Timed get: blocking indefinitely could deadlock if another
                # extractor takes the last item first
                audio_to_process, audio_path = self.__audio_queue.get(timeout=5)
            except queue.Empty:
                continue
            self.__audio_queue.task_done()
            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {audio_path}")
            features_to_add = self.__audio_inference_embedding(audio_to_process)
            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracted: {len(features_to_add)} features")
            with self.__features_lock:
                self.__features[audio_path] = features_to_add
            logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature extraction complete for {audio_path} w/ {len(features_to_add)} features")
        logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Thread finished!")
    def __count_running_threads(self) -> tuple[int, int]:
        """Returns (running feeder threads, running extractor threads)."""
        running_feeders = sum(ft.running() for ft in self.__audio_loader_threadpool)
        running_extractors = sum(ft.running() for ft in self.__feature_extractor_threadpool)
        return (running_feeders, running_extractors)
@property
def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
return self.__features
    def extract(self):
        total_amount = self.__audio_paths_list.qsize()
        logging.info(f"[MTAFE] [Main] Starting feature extraction for {total_amount} file(s)")
        t_start = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=(self.__audio_loader_threads + self.__feature_extractor_threads)) as executor:
            for i in range(self.__audio_loader_threads):
                ld_ft = executor.submit(self.__audio_feeder_thread, i)
                self.__audio_loader_threadpool.append(ld_ft)
            for i in range(self.__feature_extractor_threads):
                ex_ft = executor.submit(self.__feature_extractor_thread, i)
                self.__feature_extractor_threadpool.append(ex_ft)
            # 'or', not 'and': keep reporting while either thread group is still running,
            # and sleep so the progress loop does not busy-spin
            while (not self.__check_all_audiofeed_thread_finished()) or (not self.__check_all_featureextractor_thread_finished()):
                nfeeder, nextract = self.__count_running_threads()
                print(f"[MTAFE Progress] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()}, LD:{nfeeder}/EXT:{nextract})", end="\r")
                time.sleep(1)
        t_stop = time.perf_counter()
        logging.info(f"[MTAFE] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum(len(chunks) for chunks in self.__features.values())
        logging.info(f"[MTAFE] Extraction complete. Took {delta_t:.2f} seconds. Added {total_features} vectors/embeddings")
def __init__(
self,
audio_paths: list[Path],
max_audio_in_queue: int = 16,
audio_feeder_threads: int = 8,
feature_extractor_threads: int = 8,
desired_sr: int = 32000,
force_mono: bool = False,
chunk_length: float = 15.0,
chunk_overlap: float = 2.0,
):
        # Check that all passed-in paths are valid files and add them to the queue
        self.__audio_paths_list = queue.Queue()
        for p in audio_paths:
            if not p.is_file():
                raise FileNotFoundError(f"Path '{p.absolute()}' is NOT a valid file!")
            self.__audio_paths_list.put(p)
        logging.info(f"[MTAFE] [Constructor] Queued {self.__audio_paths_list.qsize()} files")
# Set up private attributes
## Audio preprocessing parameters
self.__desired_sr = desired_sr
self.__mono = force_mono
self.__chunk_length = chunk_length
self.__overlap = chunk_overlap
## Extractor/Feeder settings
self.__max_audio_in_queue = max_audio_in_queue
self.__audio_loader_threads = audio_feeder_threads
self.__feature_extractor_threads = feature_extractor_threads
        ## Set up runtime state
        # Bounded queue: without maxsize, max_audio_in_queue would never be enforced
        self.__audio_queue = queue.Queue(maxsize=max_audio_in_queue)
        self.__features = {}
        self.__features_lock = threading.Lock()
        self.__audio_loader_threadpool = []
        self.__feature_extractor_threadpool = []
logging.info(f"[MTAFE] [Constructor] Extraction parameters: {desired_sr}Hz, Mono: {force_mono}, Divide into {chunk_length}s chunks with {chunk_overlap}s of overlap")
logging.info(f"[MTAFE] [Constructor] Using {audio_feeder_threads} threads for preprocessing audio and {feature_extractor_threads} threads for feature extraction. Max queue size of {max_audio_in_queue} files")
# More audio embeddings specific code below (To be overridden)
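# Minimal usage sketch (hypothetical, not part of the original file): a subclass
# would override __audio_inference_embedding with a real model (note the same
# name-mangling caveat described for AudioFeatureExtractor above):
#
#   mtafe = MultiThreadedAudioFeatureExtractor(
#       random_audio_chunk(32),
#       max_audio_in_queue=16,
#       audio_feeder_threads=8,
#       feature_extractor_threads=8,
#       desired_sr=32000,
#   )
#   mtafe.extract()
#   serialize_dict_obj(Path("features.pkl"), mtafe.features)  # "features.pkl" is a placeholder name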