some test data

2025-04-10 09:01:24 +02:00
parent a9d3d10da9
commit 6fc6df87b2
10 changed files with 25401 additions and 22798 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,180 @@
 .obsidian
 DLSiteFSearchPython_venv
-pypy_venv
+pypy_venv
+vggish
+*.pkl
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
--- a/DLSiteFSearchObsidian/Audio
+++ b/DLSiteFSearchObsidian/Audio
@@ -0,0 +1,27 @@
+For the current approach, I need a method that is fast, accurate, and, if possible, low-resource to convert all of the approximately 9000 audio files into feature vectors.
+
+I was originally going to use `PANNs` or `VGGish` for audio embedding generation. But `PANNs` crashed on me with `CUDA out of memory` errors, and `VGGish` looks kind of complicated.
+
+Anyway, I asked Claude Sonnet for directions. It gave me more results than searching Google for `Audio Embedding Generation` did. It recommended the following embedding models:
+1. CLAP
+2. BYOL-A
+3. PANNs
+4. wav2vec 2.0
+5. MERT
+6. VGGish
+I had never heard of any of these options before. I discovered `PANNs` through an Elasticsearch article. `Zilliz` (the company behind `Milvus`) has also published an article ranking the embedding models, which is why I wanted to try out these three models: `PANNs`, `wav2vec`, and `VGGish`.
+
+Each model has its own quirks to run. Although `Towhee` offers a uniform way to use all of these embedding models, I have my doubts about this project: it seems to be inactive, and there are also allegations that it used improper ways to gain more stars on GitHub.
+
+I will have to set up a comparison between searching with all of these embedding models.
+
+Claude Sonnet also recommended chopping the audio up into smaller 10-second chunks. I was wondering why I was getting `CUDA out of memory` errors: it is because I had not chunked my audio into smaller pieces, and most of the audio files are around 30 minutes long. It also recommended overlapping the chunks. Please see the exported `JSON` chat for details.
+
+The audio must be pre-processed:
+1. Load all channels of the audio into memory
+2. Resample audio according to the model's instruction or training parameter
+3. Split the audio into chunks of 10-15 seconds
+Each chunk may have its metadata associated with the position (time in full track audio) and channel information (L, R)
+
+# Benchmark
+Using 200 randomly selected audio clips, each audio embedding model mentioned above must have its processing time for the 200 clips recorded and its resulting vectors stored on disk.
--- a/FeatureExtraction/AudioFeatureExtraction.ipynb
+++ b/FeatureExtraction/AudioFeatureExtraction.ipynb
--- a/FeatureExtraction/ExtractionFrameworkThroughputTest.ipynb
+++ b/FeatureExtraction/ExtractionFrameworkThroughputTest.ipynb
--- a/FeatureExtraction/ImageFeatureExtraction.ipynb
+++ b/FeatureExtraction/ImageFeatureExtraction.ipynb
--- a/FeatureExtraction/TestAudioFeatureExtractionPANNS.ipynb
+++ b/FeatureExtraction/TestAudioFeatureExtractionPANNS.ipynb
--- a/FeatureExtraction/audiopreprocessing.py
+++ b/FeatureExtraction/audiopreprocessing.py
@@ -0,0 +1,80 @@
+import librosa
+import pickle
+import os
+import numpy as np
+from pathlib import Path
+
+DEBUG=True
+
+def resample_load(input_path : Path, target_sr : int = 16000, mono_audio : bool = False) -> np.ndarray: # AI
+    """Load an audio file and return its samples at ``target_sr``.
+
+    The file is read at its native sample rate with ``librosa.load`` and is
+    only resampled when that rate differs from ``target_sr``. Nothing is
+    written to disk.
+
+    Args:
+        input_path: Path of the audio file to load
+        target_sr: Sample rate the returned audio should have
+        mono_audio: Forwarded to ``librosa.load`` as its ``mono`` argument
+
+    Returns:
+        The loaded (and, when needed, resampled) audio samples
+    """
+    if DEBUG: print("[resample_load] Loading audio", input_path)
+    samples, native_sr = librosa.load(input_path, sr=None, mono=mono_audio)
+
+    # Already at the requested rate: hand the samples back untouched
+    if native_sr == target_sr:
+        return samples
+
+    if DEBUG: print("[resample_load] Resampling to", target_sr)
+    return librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)
+
+def chunk_audio(audio : np.ndarray, sr: int, chunk_length: float = 10.0, overlap: float = 2.0) -> tuple[list[np.ndarray], list[float], int]: # AI
+    """
+    Chunks audio into overlapping segments. Only pass in mono audio here.
+
+    Only full-length windows are emitted: samples after the last full window
+    are not part of any chunk. If the audio is shorter than one chunk, the
+    whole audio is returned as a single chunk at position 0.0.
+
+    Args:
+        audio: Loaded audio ndarray
+        sr: Sample rate for the given audio
+        chunk_length: Length of each chunk in seconds
+        overlap: Overlap between consecutive chunks in seconds
+
+    Returns:
+        List of audio chunks, list of chunk start positions (in seconds), and given sample rate
+
+    Raises:
+        ValueError: If the chunk size is not at least one sample, or if the
+            overlap leaves no forward step between consecutive chunks
+    """
+    if DEBUG: print("[chunk_audio] Chunking audio")
+    # Calculate chunk size and hop length in samples
+    chunk_size = int(chunk_length * sr)
+    hop_length = int((chunk_length - overlap) * sr)
+
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_length ({chunk_length}) * sr ({sr}) must be at least one sample")
+    # A zero hop would make range() fail with an unrelated message, and a negative hop
+    # would silently produce no windows and return the whole audio as one chunk.
+    if hop_length <= 0:
+        raise ValueError(f"overlap ({overlap}) must be smaller than chunk_length ({chunk_length})")
+
+    # Generate chunks
+    chunks = []
+    positions = []
+    window_starts = range(0, len(audio) - chunk_size + 1, hop_length)
+    for k, start in enumerate(window_starts, start=1):
+        chunks.append(audio[start:start + chunk_size])
+        positions.append(start / sr)
+        if DEBUG: print("[chunk_audio] Chunked", k, end="\r")
+    if not chunks: # The full audio length is less than chunk_length
+        chunks = [audio]
+        positions = [0.0]
+
+    return chunks, positions, sr
+
+def load_preprocessed_audio(
+    path: Path,
+    desired_sr: int,
+    mono: bool = False,
+    chunk_length: float = 15.0,
+    overlap: float = 2.0) -> list[tuple[np.ndarray, float, int]]:
+    """Load, resample and chunk one audio file.
+
+    Args:
+        path: Path of the audio file
+        desired_sr: Sample rate the audio is resampled to
+        mono: Whether the audio is loaded as mono
+        chunk_length: Length of each chunk in seconds
+        overlap: Overlap between chunks in seconds
+
+    Returns:
+        List of (chunk, position, channel id) tuples. The channel id is -1 for
+        mono audio, otherwise the index of the channel the chunk was cut from.
+    """
+    # Load and resample audio
+    audio = resample_load(path, desired_sr, mono) # Stereo 2D matrix, Mono 1D array
+
+    if mono or (audio.ndim == 1):
+        # Mono (or the audio file loaded in itself is mono): -1 is the mono channel indicator
+        channels = [(-1, audio)]
+    else:
+        # Stereo/multichannel: one entry per channel, keyed by its index
+        channels = enumerate(audio)
+
+    result = []
+    for channel_id, channel_audio in channels:
+        chunks, positions, _ = chunk_audio(channel_audio, desired_sr, chunk_length, overlap)
+        assert len(chunks) == len(positions)
+        # (ndarray_chunk1, pos1, channel_id): first audio chunk, position1, channel indicator
+        result.extend((chunk, position, channel_id) for chunk, position in zip(chunks, positions))
+
+    return result
--- a/FeatureExtraction/dataset_files.py
+++ b/FeatureExtraction/dataset_files.py
@@ -1,9 +1,43 @@
+import platform
+import os
+import pickle
+import random
+import threading
+import time
+import concurrent.futures
+import numpy as np
 from pathlib import Path
+import audiopreprocessing
+
+DEBUG=True
+
+def serialize_dict_obj(path : Path, object : dict) -> int:
+    """Pickles a Python dictionary to a file and reports how much was written.
+
+    Args:
+        path (Path): Path to store the file
+        object (dict): Dictionary object to serialize
+    Returns:
+        int: size in bytes written
+    """
+    # Horrible practice, horrible security, but it will work for now
+    with path.open("wb") as fp:
+        pickle.dump(object, fp)
+        # seek() returns the new absolute position, i.e. the end-of-file offset
+        return fp.seek(0, os.SEEK_END)
+
+print("Reading local dataset directory structure...")

 ASMRThreePath = Path("C:\\ASMRThree")
 ASMRTwoPath = Path("D:\\ASMRTwo")
 ASMROnePath = Path("E:\\ASMROne")

+# On Linux, replace the Windows drive paths above with the corresponding paths under /mnt
+if (platform.system() == 'Linux'):
+    ASMROnePath = Path('/mnt/Scratchpad/ASMROne')
+    ASMRTwoPath = Path('/mnt/MyStuffz/ASMRTwo')
+    ASMRThreePath = Path('/mnt/Windows11/ASMRThree')
+
 size_one, size_two, size_three = 0, 0, 0
 files_one, files_two, files_three = [], [], []
 folders_one, folders_two, folders_three = [], [], []
@@ -78,4 +112,161 @@ for file in file_list:
        fileext_stat[f_ext]['Count'] = 1
        fileext_stat[f_ext]['List'] = [file]
        fileext_stat[f_ext]['ExtensionMass'] = file.stat().st_size # The total sum of  sizes of the same file extension
-        fileext_stat[f_ext]['MediaType'] = fileExt2fileType[f_ext]
+        fileext_stat[f_ext]['MediaType'] = fileExt2fileType[f_ext]
+
+# Collect the files of every extension whose media type is "Audio" into one flat list
+audio_paths = []
+for extension in fileext_stat: # I can't be bothered to convert this into a list comprehension
+    if fileext_stat[extension]['MediaType'] != "Audio":
+        continue
+    audio_paths.extend(fileext_stat[extension]['List'])
+        
+def random_audio_chunk(n : int, seed : int = 177013) -> list[Path]:
+    """Returns a random selection of audio files
+
+    The selection is reproducible for a given seed. A private random
+    generator is used, so the state of the module-level ``random`` functions
+    is left untouched.
+
+    Args:
+        n (int): Amount of files to return
+        seed (int, optional): Seed for RNG. Defaults to 177013.
+
+    Returns:
+        list[Path]: List of randomly selected audio paths (using Path object)
+
+    Raises:
+        ValueError: If n is larger than the number of known audio files
+    """
+    rng = random.Random(seed)
+    # sample() rather than choices(): the selection must not contain repeated elements
+    return rng.sample(audio_paths, k=n)
+
+class AudioFeatureExtractor():
+    """Two-stage pipeline that turns audio files into per-chunk embeddings.
+
+    A feeder worker preprocesses audio files into a bounded in-memory queue
+    while an extractor worker pops them and computes one embedding per chunk.
+    Results are exposed through the ``features`` property.
+
+    To plug in a real model, subclass and override ``_embedding_inference``.
+    """
+    __audio_queue: list[ # List of ...
+        tuple[ # Pair of chunked audio and its path
+            list[tuple[np.ndarray, float, int]], # Chunked audio
+            Path # Path to original audio
+        ]
+    ] # Listed of Chunked/Resampled audio
+    __feeder_future: concurrent.futures.Future
+    __extractor_future: concurrent.futures.Future
+    __audio_paths_list: list[Path]
+    __max_audio_in_queue: int
+    __queue_lock: threading.Lock
+    __abort: threading.Event # Set when a worker failed, so the other worker stops too
+    __desired_sr: int
+    __mono: bool
+    __chunk_length: float
+    __overlap: float
+    __features: dict[Path, list[tuple[np.ndarray, float, int]]]
+    # { audioPath:
+    #   [(embedding, pos, channel)...]
+    # }
+
+    def _embedding_inference(self, audio_ndarray: np.ndarray) -> np.ndarray:
+        """Uses embedding model to inference an audio. Returns embedding vectors.
+        Hook to be overridden by subclasses. The default returns np.zeros(32).
+
+        Args:
+            audio_ndarray (np.ndarray): One audio chunk
+
+        Returns:
+            np.ndarray: Embedding vector for the chunk
+        """
+        return np.zeros(32)
+
+    def __embedding_inference(self, audio_ndarray: np.ndarray) -> np.ndarray:
+        """Internal entry point for inference; delegates to ``_embedding_inference``.
+
+        Double-underscore names are name-mangled per class, so a subclass
+        cannot override this method under the same name. It is kept so that
+        existing references to the mangled name keep working.
+
+        Args:
+            audio_ndarray (np.ndarray): One audio chunk
+
+        Returns:
+            np.ndarray: Embedding vector for the chunk
+        """
+        return self._embedding_inference(audio_ndarray)
+
+    def __embedding_extract(self, audio: tuple[np.ndarray, float, int]) -> tuple[np.ndarray, float, int, np.ndarray]:
+        """Receives a tuple of audio, position, and channel ID, then adding the embedding to the tuple
+
+        Args:
+            audio (tuple[np.ndarray, float, int]): tuple of audio, position, channel id
+
+        Returns:
+            tuple[np.ndarray, float, int, np.ndarray]: audio, position, channel id, embedding vector
+        """
+        audio_chunk, pos, channel_id = audio
+        return (audio_chunk, pos, channel_id, self.__embedding_inference(audio_chunk))
+
+    def __audio_queue_feeder(self): # TODO: Upgrade to multithreaded loader?
+        """Internal thread function. Preprocess and load the audio continuously to
+        audio_queue until the end of the audio_paths_list
+        """
+        # A single loop re-checks the path list before every load, so the feeder
+        # never indexes into an empty list once the last path has been consumed.
+        while self.__audio_paths_list and not self.__abort.is_set():
+            if len(self.__audio_queue) >= self.__max_audio_in_queue:
+                if DEBUG: print("Audio Queue Thread: Queue Full, feeder thread sleeping for 5 seconds")
+                self.__abort.wait(5) # Sleeps 5 seconds, but wakes up early on abort
+                continue
+            new_audio_path = self.__audio_paths_list[0]
+            new_audio = audiopreprocessing.load_preprocessed_audio(
+                    new_audio_path,
+                    self.__desired_sr,
+                    self.__mono,
+                    self.__chunk_length,
+                    self.__overlap
+                )
+            # Append to the queue BEFORE removing the path, so a pending audio is
+            # always visible in either the path list or the queue.
+            with self.__queue_lock:
+                self.__audio_queue.append(
+                    (new_audio, new_audio_path)
+                )
+                pop_path = self.__audio_paths_list.pop(0)
+                if DEBUG: print("Audio Queue Thread: Added new audio to queue", pop_path)
+        if DEBUG and not self.__abort.is_set(): print("Audio Queue Thread: DONE. All audio files fed")
+
+    def __audio_queue_feature_extractor(self):
+        """Internal thread function. Get audio from audio queue. And extract embedding vector
+        for all audio chunks. Stores the resulting embedding into self.__features.
+        With Original Audio's Path as key, and list[tuple[np.ndarray, float, int]] (list of tuple of embedding vector, position, channel id)
+        """
+        # While there are still audio to be processed
+        while (self.__audio_paths_list or self.__audio_queue) and not self.__abort.is_set():
+            if not self.__audio_queue:
+                if DEBUG: print("Feature Extractor Thread: Queue Empty, extractor thread sleeping for 5 seconds") # If audio queue is empty, wait
+                self.__abort.wait(5) # Sleeps 5 seconds, but wakes up early on abort
+                continue
+            with self.__queue_lock:
+                audio_to_process, audio_path = self.__audio_queue.pop(0) # Get audio from queue
+                if DEBUG: print(f"Feature Extractor Thread: Extracting {len(audio_to_process)} features from audio", audio_path)
+            for audio_chunk in audio_to_process:
+                _, timepos, channel_id, embedd_vect = self.__embedding_extract(audio_chunk)
+                self.__features.setdefault(audio_path, []).append(
+                    (embedd_vect, timepos, channel_id)
+                )
+        if DEBUG and not self.__abort.is_set(): print("Feature Extractor Thread: DONE. Extracted all features from all audio files")
+
+    def __init__(
+            self,
+            audio_paths_list: list[Path],
+            max_audio_in_queue: int,
+            desired_sr: int,
+            mono: bool,
+            chunk_length: float = 15.0,
+            overlap: float = 2.0
+        ):
+        """Sets up the extractor; no work is done until ``extract`` is called.
+
+        Args:
+            audio_paths_list (list[Path]): Audio files to process
+            max_audio_in_queue (int): Maximum number of preprocessed audio files held in memory
+            desired_sr (int): Sample rate the audio is resampled to
+            mono (bool): Whether the audio is loaded as mono
+            chunk_length (float, optional): Chunk length in seconds. Defaults to 15.0.
+            overlap (float, optional): Overlap between chunks in seconds. Defaults to 2.0.
+
+        Raises:
+            ValueError: If max_audio_in_queue is smaller than 1
+        """
+        if max_audio_in_queue < 1:
+            # With no room in the queue the feeder could never make progress
+            raise ValueError("max_audio_in_queue must be at least 1")
+        self.__audio_queue = []
+        # Work on a copy: the feeder consumes this list, the caller's list must stay intact
+        self.__audio_paths_list = list(audio_paths_list)
+        self.__max_audio_in_queue = max_audio_in_queue
+        self.__queue_lock = threading.Lock()
+        self.__abort = threading.Event()
+        self.__desired_sr = desired_sr
+        self.__mono = mono
+        self.__chunk_length = chunk_length
+        self.__overlap = overlap
+        self.__features = {}
+
+    @property
+    def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
+        """Extracted features: audio path -> list of (embedding, position, channel id)."""
+        return self.__features
+
+    def extract(self):
+        """Runs the feeder and extractor workers until all audio files are processed.
+
+        Prints progress once per second and a summary at the end.
+
+        Raises:
+            Exception: Any exception raised inside a worker is re-raised here
+        """
+        print("Starting feature extraction for", len(self.__audio_paths_list), "file(s)")
+        total_amount = len(self.__audio_paths_list)
+        self.__abort.clear()
+        t_start = time.perf_counter()
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            self.__feeder_future = executor.submit(self.__audio_queue_feeder)
+            self.__extractor_future = executor.submit(self.__audio_queue_feature_extractor)
+            workers = (self.__feeder_future, self.__extractor_future)
+            try:
+                while not all(worker.done() for worker in workers):
+                    # If one worker died, tell the other one to stop instead of polling forever
+                    if any(worker.done() and worker.exception() is not None for worker in workers):
+                        self.__abort.set()
+                    print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W{len(self.__audio_paths_list)})", end="\r")
+                    time.sleep(1)
+            except BaseException:
+                # e.g. KeyboardInterrupt: stop the workers so the executor can shut down
+                self.__abort.set()
+                raise
+
+        t_stop = time.perf_counter()
+        # Surface worker failures instead of leaving them unread inside the futures
+        for worker in workers:
+            worker.result()
+        print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W:{len(self.__audio_paths_list)} COMPLETE)")
+        delta_t = t_stop - t_start
+        total_features = sum(len(vectors) for vectors in self.__features.values())
+        print()
+        print("Extraction completed")
+        print(f"Took {delta_t} seconds. Added {total_features} vectors/embeddings")
+            
--- a/FeatureExtraction/test.py
+++ b/FeatureExtraction/test.py
@@ -0,0 +1,3 @@
+from dataset_files import AudioFeatureExtractor, random_audio_chunk
+# Smoke run: 32 random audio files, at most 16 preprocessed files queued, 32 kHz, all channels kept
+afe = AudioFeatureExtractor(
+    audio_paths_list=random_audio_chunk(32),
+    max_audio_in_queue=16,
+    desired_sr=32000,
+    mono=False,
+)
+afe.extract()
--- a/LocalDatasetAnalysis.ipynb
+++ b/LocalDatasetAnalysis.ipynb