Compare commits
5 commits: af81c82d18...main

| Author | SHA1 | Date |
| --- | --- | --- |
|  | b14a0a2a17 |  |
|  | 37b6a3c5e7 |  |
|  | b855b7e255 |  |
|  | 6fc6df87b2 |  |
|  | a9d3d10da9 |  |
.gitignore (vendored, 179 lines changed)
@@ -1 +1,180 @@
```
.obsidian
DLSiteFSearchPython_venv
pypy_venv
vggish
*.pkl

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc
```
@@ -31,6 +31,7 @@ Another path is this paper I found while searching:

This paper employed the ORB algorithm on the spectrogram image, which is interesting. But the paper specifically says it was tested for music identification, not ASMR audio. I am sure a spectrogram is just another image to the ORB algorithm, but the usual length of ASMR audio ranges from a few minutes to hours, and I am not sure ORB can handle such extreme image proportions (extremely large images, with the audio length proportional to the X dimension of the image).

One of the ways I came up with is to chop the audio into pieces and then run the ORB algorithm to extract the features; that way we don't end up with extraordinary image sizes for the spectrogram. But I am not sure of its effectiveness, so I will also have to experiment with that.
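A minimal sketch of that chunked-spectrogram idea, assuming `librosa` and OpenCV (`opencv-python`) are available; the function name and parameters are my own invention, not from any existing code:

```python
# Hypothetical sketch: chunk audio, render each chunk's log-mel spectrogram
# as an 8-bit grayscale image, and run ORB on it.
import cv2
import librosa
import numpy as np

def orb_features_per_chunk(path: str, chunk_s: float = 30.0, sr: int = 22050):
    """Yield (offset_seconds, Nx32 uint8 ORB descriptors) per audio chunk."""
    audio, _ = librosa.load(path, sr=sr, mono=True)
    orb = cv2.ORB_create(nfeatures=500)
    samples_per_chunk = int(chunk_s * sr)
    for start in range(0, len(audio), samples_per_chunk):
        chunk = audio[start:start + samples_per_chunk]
        if len(chunk) < sr:  # skip sub-second tails
            continue
        mel = librosa.feature.melspectrogram(y=chunk, sr=sr)
        img = librosa.power_to_db(mel, ref=np.max)
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
        keypoints, descriptors = orb.detectAndCompute(img, None)
        yield start / sr, descriptors
```

With a fixed chunk length, the spectrogram width stays bounded regardless of the track length, which is the point of the experiment.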
We could always just use existing music fingerprinting programs like Shazam or Chromaprint, but I highly doubt their effectiveness here.

So my current approach will be to experiment with these two methods using the local DLSite audio that I have, and compare the results against each other.
DLSiteFSearchObsidian/Audio Embedding generation.md (Normal file, 27 lines)
@@ -0,0 +1,27 @@
For the current approach, I need a method that is fast, accurate, and low-resource if possible, to convert all of the approximately 9000 audio files into feature vectors.

I was originally going to use `PANNs` or `VGGish` for audio embedding generation. But `PANNs` crashed on me with `CUDA out of memory` errors, and `VGGish` looks kind of complicated.

Anyway, I asked Claude Sonnet for directions. It did give me more results than searching Google for `Audio Embedding Generation`. It recommended the following embedding models:

1. CLAP
2. BYOL-A
3. PANNs
4. wav2vec 2.0
5. MERT
6. VGGish

I had never heard of any of these options. I discovered `PANNs` through an Elasticsearch article. Also, `Zilliz` (or `Milvus`) has published an article ranking the embedding models, which is why I wanted to try out these three models: `PANNs`, `wav2vec`, `VGGish`.

Each model has its own quirks to run. Although `Towhee` has a uniform way to use all of these embedding models, I have my doubts about that project, which seems to be inactive and also has allegations of using inadequate ways to gain more stars on GitHub.

I will have to set up a comparison between searching with each of these embedding models.

Also, Claude Sonnet recommended chopping the audio into smaller 10-second chunks. I was wondering why I was getting `CUDA Out of memory` errors; it's because I hadn't chunked my audio into smaller pieces, and most of the audio files are around 30 minutes long, which explains the error. It also recommended overlapping the chunks. Please see the exported `JSON` chat for details.

The audio must be pre-processed:

1. Load all channels of the audio into memory
2. Resample the audio according to the model's instructions or training parameters
3. Split the audio into chunks of 10-15 seconds

Each chunk may have metadata associated with it: the position (time within the full track) and channel information (L, R).
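To sanity-check the chunk arithmetic: with 15 s chunks and 2 s overlap (the settings used later in this diff), the hop is 13 s, so a 30-minute (1800 s) mono track yields roughly $\lfloor (1800 - 15) / 13 \rfloor + 1 = 138$ chunks; a stereo track doubles that, since each channel is chunked separately.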
# Benchmark

With 200 randomly selected audio clips, every audio embedding model mentioned above must have its time for processing the 200 clips recorded, and its vector results stored on disk.
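A minimal timing harness for that benchmark might look like the sketch below. The `embed_fn` callables and `clips` list are placeholders for whatever per-model wrappers end up being needed; nothing here is from an existing file.

```python
# Hypothetical benchmark harness; model wrappers are stand-ins.
import pickle
import time
from pathlib import Path

def benchmark(models: dict, clips: list[Path], out_dir: Path) -> dict[str, float]:
    """Time each embedding model over the same clips; persist vectors to disk."""
    timings = {}
    out_dir.mkdir(parents=True, exist_ok=True)
    for name, embed_fn in models.items():  # embed_fn: Path -> embedding array
        start = time.perf_counter()
        vectors = {str(clip): embed_fn(clip) for clip in clips}
        timings[name] = time.perf_counter() - start
        with (out_dir / f"{name}_vectors.pkl").open("wb") as fp:
            pickle.dump(vectors, fp)
    return timings
```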
DLSiteFSearchObsidian/Implementation attempt for MTAFE.md (Normal file, 15 lines)
@@ -0,0 +1,15 @@
My implementation attempt for a Multi-Threaded Audio Feature Extractor... my attempt ended in misery.

My vision is a multi-threaded program that does audio pre-processing and feature extraction in different threads. There should be `i` threads doing pre-processing on all given audio file paths, and `j` threads doing feature extraction. If the audio pre-processing pipeline is single-threaded, it becomes a bottleneck for the entire program. But the feature extractor itself is also a bottleneck: since all of the audio embedding extractors rely on GPU inference, the feature extraction process must be single-threaded on my computer.

I was trying to adapt the program to use multiple threads for audio pre-processing AND multiple threads for feature extraction (for a beefier GPU that can handle more inference threads).

Unfortunately... all my attempts have ended in misery; my multi-threaded code is littered with performance issues and deadlocks. Python isn't exactly the best language for multi-threaded code due to the existence of the GIL. I am trying to implement a multi-producer, multi-consumer model here. My best attempt hangs for a long time waiting for the producers (audio feeders) to pre-process the audio and put it on the shared queue; it locks up for a really long time, but after that it processes everything at light speed. Near the end, though, there is a great chance the program deadlocks itself. I wasn't able to debug it, and the profiler didn't yield any results that were useful to me.
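For the record, the shape I am aiming for is roughly the following: a bounded queue with one sentinel per producer for shutdown, which is the standard way to avoid the end-of-stream deadlock. All names here are illustrative, not my actual code:

```python
# Hypothetical sketch of a bounded multi-producer / single-consumer pipeline.
import queue
import threading
from pathlib import Path

SENTINEL = None  # one per producer, so the consumer knows when all are done

def producer(paths: list[Path], q: queue.Queue) -> None:
    for path in paths:
        q.put((path, f"preprocessed({path.name})"))  # stand-in for real work
    q.put(SENTINEL)

def consumer(q: queue.Queue, n_producers: int) -> None:
    finished = 0
    while finished < n_producers:
        item = q.get()
        if item is SENTINEL:
            finished += 1
            continue
        path, audio = item
        print(f"extracting features for {path}")  # stand-in for GPU inference

q: queue.Queue = queue.Queue(maxsize=8)  # bounded: applies backpressure
paths = [Path(f"clip_{i}.wav") for i in range(16)]
producers = [threading.Thread(target=producer, args=(paths[i::4], q)) for i in range(4)]
c = threading.Thread(target=consumer, args=(q, len(producers)))
for t in producers:
    t.start()
c.start()
for t in producers:
    t.join()
c.join()
```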
At one point I even relied on AI, and I still wasn't getting consistent results. The AI generated code that was significantly faster, with fewer deadlocks, but it had the issue of skipping audio files that were not pre-processed in time. I could implement additional logic to catch processing errors and retry where possible, but I am really burnt out, and I would rather look for better alternatives.

The next thing I am going to try is to separate this program into two. The current program attempts to do pre-processing AND feature extraction at the same time; I would split the process in two: one program (preferably multi-threaded) that does all the audio pre-processing (resampling, chunking, etc.) and outputs the pre-processed audio into a serialized pickle file, or some other serialization format.

I can see various issues with this approach, the most important of which is space. I am basically taking all of those audio files (which is NOT a small amount) and re-encoding them without any compression. Even though I have decided to lower the audio's sample rate (from the typical 48000 Hz or 192000 Hz down to just 32000 Hz, or for specific embedding extraction models 8000 Hz or 16000 Hz), this will still take up a lot of space.

Also, pickle isn't the best format for storing all of that audio; safety is one issue. The alternative, encoding each chunk into a FLAC/MP3 compressed format, would be very heavy on the file system. Even though I do have an SSD, I am uncertain whether a filesystem handling hundreds of thousands of audio chunk files would take a hit in performance, or shorten the life of the SSD.
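One middle ground I might try is NumPy's own compressed container, which avoids pickle's arbitrary-code-execution problem and packs all the chunks of one work into a single file. A hedged sketch; the file layout and key names are my own invention:

```python
# Hypothetical: store all chunks of one work in a single compressed .npz file.
import numpy as np
from pathlib import Path

def save_chunks(out_path: Path, chunks: list[np.ndarray]) -> None:
    # zlib-compressed, and loads back without unpickling arbitrary objects
    np.savez_compressed(out_path, **{f"chunk_{i}": c for i, c in enumerate(chunks)})

def load_chunks(path: Path) -> list[np.ndarray]:
    with np.load(path) as data:
        return [data[k] for k in sorted(data.files, key=lambda k: int(k.split("_")[1]))]
```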
But at least this will be a lot easier to implement.
DLSiteFSearchObsidian/Local dataset analysis.md (Normal file, 35 lines)
@@ -0,0 +1,35 @@
Due to space (disk partition) constraints, the local dataset is split into three subsets:

- ASMROne
- ASMRTwo
- ASMRThree

There are no substantial differences between the subsets.

Subset sizes and audio work counts:

- ASMR One --> 119 audio works, 470 GiB / 504 791 391 855 bytes
- ASMR Two --> 90 audio works, 439 GiB / 471 683 782 635 bytes
- ASMR Three --> 121 audio works, 499 GiB / 536 552 753 022 bytes

Total: 330 audio works, 1409 GiB / 1 513 027 927 512 bytes

There are works in different languages (audio language, or including translated subtitle files), of different sizes, with different audio encoding formats, etc.

Basic statistics at the filesystem level:

| Subset     | File count | Folder count |
| ---------- | ---------- | ------------ |
| ASMR One   | 6317       | 1017         |
| ASMR Two   | 7435       | 760          |
| ASMR Three | 6694       | 1066         |

Average audio work size:

$1409 \, \text{GiB} \div 330 \, \text{Works} = 4.2\overline{69} \, \text{GiB/Work}$

Avg.: approximately 4.27 GiB per work

In this project we will be indexing only the following types of files:

- Audio
- Image
- Document

An in-depth analysis of the contents of the dataset is located in `LocalDatasetAnalysis.ipynb`.
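A hedged sketch of how that type filtering might be done by file suffix; the suffix sets below are illustrative, the real ones live in the analysis notebook:

```python
# Hypothetical suffix-based classifier for the three indexed file types.
from pathlib import Path

AUDIO = {".mp3", ".wav", ".flac", ".ogg", ".m4a"}
IMAGE = {".png", ".jpg", ".jpeg", ".webp", ".gif"}
DOCUMENT = {".txt", ".pdf", ".md", ".vtt", ".srt"}

def classify(path: Path) -> str | None:
    """Return the index category for a file, or None if it is not indexed."""
    suffix = path.suffix.lower()
    if suffix in AUDIO:
        return "audio"
    if suffix in IMAGE:
        return "image"
    if suffix in DOCUMENT:
        return "document"
    return None
```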
@@ -18,3 +18,98 @@ Also it kind of interests me how the C++ <--> Rust integration works in this pro
What puzzles me is that ORB_SLAM3 itself also depends heavily on OpenCV, so if Rust is using an OpenCV wrapper, which OpenCV is ORB_SLAM3 supposed to use? Especially since ORB_SLAM3 returns vectors that are OpenCV types, and Rust may not understand C++ OpenCV types.

After a bit of digging, I found that `imsearch` uses a premade wrapper for OpenCV, which is fine. During the compilation of `imsearch`, linking OpenCV is the step that most often fails (because the OpenCV-Rust binding requires you to bring your own OpenCV, or your system package's OpenCV). My hypothesis is that ORB_SLAM3 in the `imsearch` code links against the same OpenCV library that is used in the Rust calls; they can pass raw pointers to each other, which is allowed by the Rust OpenCV binding. The presence of `imsearch/src/ORB_SLAM3/ocvrs_common.hpp` indicates that `ORB_SLAM3` and `imsearch` are passing pointers around; the custom wrapper is `imsearch/src/ORB_SLAM3/ORBwrapper.cc`.
# Search method

First, `imsearch` needs a source dataset: an image database against which future queries will be compared.

During `imsearch add-image (directory)`, the program loops through all image files in the directory and extracts all feature vectors using ORB_SLAM3.

The "feature vector", which is actually the source descriptor (an OpenCV descriptor), is stored in RocksDB. A source descriptor is a matrix of size $\text{Number of Features} \times 32$, each entry a `uint8` value. (See test_slamorb.ipynb)

All the features are stored in an internal key-value database using RocksDB. There are several tables used in `imsearch`.

The first table is the Features table. Each feature has its own ID, assigned in incremental order. Note that it does not store the original image path, which is the only important metadata.

The second table is the Image Path table, which stores all image paths (the only metadata about images) added to `imsearch`. Each image path has its own corresponding ID.

The third table is a Relation table. For every stored feature in RocksDB, it establishes a relation between Feature ID and Image Path ID.

For example, running ORB_SLAM3 on an image may return a 480x32 matrix, meaning the image has 480 feature vectors. When adding a single image to the database, `imsearch` extracts all the feature vectors and stores each of them (all 480) in the Features table, then inserts the image path into the Image Path table. Finally, it inserts into the Relation table all of the image's Feature IDs with their corresponding Image Path ID.
```mermaid
erDiagram
    fid[ImageFeatureColumn] {
        uint64 FeatureID
        vector feature
    }
    ipid[ImagePathColumn] {
        uint64 ImagePathID
        string path
    }
    fid2ipid[FeatureID2ImagePathID] {
        uint64 FeatureID
        uint64 ImagePathID
    }
    fid |{ -- }| fid2ipid : Relation
    ipid || -- || fid2ipid : Relation
```

This establishes a many-to-one relationship between features and image paths.
After adding all the features to RocksDB, `imsearch export-data` is called, and all the feature vectors are exported into a serialized NumPy array.

After exporting, the Python script provided in `utils/train.py` creates a new FAISS index using `IndexBinaryIVF` with dimension 256 (a uint8 is 8 bits and there are 32 uint8 values in one feature vector, so a single feature vector uses 256 bits; thus the dimension of the binary vector index is 256). `k`, or `nlist`, is at the discretion of the user, depending on the number of features in the database: the `nlist` parameter divides [all feature vectors into clusters](https://github.com/facebookresearch/faiss/wiki/Faster-search), and indicates the number of clusters to form. This kind of index requires training, so the script trains the index on all the feature vectors contained in the serialized NumPy array exported by `imsearch`. After training is complete, the newly created FAISS index is serialized and saved into `~/.config/imsearch/`.
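For flavor, the FAISS side of that training step looks roughly like the sketch below (not the actual `utils/train.py`; the array name and `nlist` value are made up):

```python
# Hypothetical sketch of training a binary IVF index, as utils/train.py does.
import faiss
import numpy as np

d = 256                             # bits per vector: 32 uint8 bytes * 8
vectors = np.load("features.npy")   # shape (N, 32), dtype uint8
nlist = 1024                        # number of IVF clusters, user-chosen

quantizer = faiss.IndexBinaryFlat(d)
index = faiss.IndexBinaryIVF(quantizer, d, nlist)
index.train(vectors)                # learns clusters; the index itself stays empty
faiss.write_index_binary(index, "imsearch.index")
```

A later `index.add(vectors)` would then correspond to the separate `imsearch build-index` step described next.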
Afterwards, running `imsearch build-index` actually adds all the vectors to the index. During the training process the actual index stays empty; training exists to better cluster the feature vectors that will be added afterwards, and to make the KNN search much more performant.

After index building is complete, `imsearch` can finally be used for reverse-image search, either via the CLI or the Web API.

During search, a query image is passed in, and ORB_SLAM3 extracts all of its feature vectors, yielding a 2D matrix: if an image has 480 feature vectors, the matrix is 480x32, with one row per feature vector of the image.

`imsearch` then performs a KNN search for every feature present in the image (all 480 feature vectors are searched), returning each one's neighbor vectors (their indices and their distances).
After getting all the neighbor vectors (IDs) and their distances, we look up each neighbor's vector ID and its corresponding image file path in the RocksDB FeatureID2ImagePathID table, and assign a score for the similarity of each feature based on its distance to the neighbor vector. We essentially build a statistical chart: a HashMap with image path as key, and as value a list of similarity scores, one per comparison between a query image vector and a neighbor vector (and its image). Please see [`lolishinshi/imsearch/src/imdb.rs`](https://github.com/lolishinshi/imsearch/blob/master/src/imdb.rs#L185)

Usually, if an image has a match, it will find various image paths with various feature-vector similarity scores attached under each path. If all the scores are high and there are plenty of scores attached under the same image path, then it's probably the original image we are trying to find. If not, then either the scores will be low, or the image paths will be completely different, with low similarity scores on each feature-vector/neighbor-vector comparison.

Finally, all the scores are weighted using Wilson's score, giving each image path a single overall similarity score. The result is then passed back to the end user.
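As a sketch of that last aggregation step (my own reading of the idea, not imsearch's exact formula), the Wilson score lower bound rewards image paths that have many agreeing per-feature scores over paths with a single lucky match:

```python
# Hypothetical: aggregate per-feature similarity scores (in [0, 1]) into one
# Wilson-score lower bound per image path.
import math

def wilson_lower_bound(scores: list[float], z: float = 1.96) -> float:
    n = len(scores)
    if n == 0:
        return 0.0
    p = sum(scores) / n  # treat mean similarity as the "success" rate
    denom = 1 + z * z / n
    center = p + z * z / (2 * n)
    margin = z * math.sqrt((p * (1 - p) + z * z / (4 * n)) / n)
    return (center - margin) / denom

# More features agreeing yields higher confidence than one strong match:
print(wilson_lower_bound([0.9] * 50))  # ~0.79
print(wilson_lower_bound([0.9]))       # ~0.17
```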
Still, it's not a trivial process; whoever came up with the idea, I must give you my praise. But holy shit, the source code for `lolishinshi/imsearch` is hard to read. It comes with basically no documentation (other than how to use it), and reading Rust code is extremely hard for me, especially when there is some chaining action going on, like this:
```rust
// Fragment of lolishinshi/imsearch/src/index.rs @ L185
pub fn search<M>(&self, points: &M, knn: usize) -> Vec<Vec<Neighbor>>
where
    M: Matrix,
{
    assert_eq!(points.width() * 8, self.d as usize);
    let mut dists = vec![0i32; points.height() * knn];
    let mut indices = vec![0i64; points.height() * knn];
    let start = Instant::now();
    unsafe {
        faiss_IndexBinary_search(
            self.index,
            points.height() as i64,
            points.as_ptr(),
            knn as i64,
            dists.as_mut_ptr(),
            indices.as_mut_ptr(),
        );
    }

    debug!("knn search time: {:.2}s", start.elapsed().as_secs_f32());
    indices
        .into_iter()
        .zip(dists.into_iter())
        .map(|(index, distance)| Neighbor {
            index: index as usize,
            distance: distance as u32,
        })
        .chunks(knn)
        .into_iter()
        .map(|chunk| chunk.collect())
        .collect()
}
```
I had to whip out GitHub Copilot for this hieroglyphic: between the nonexistent code documentation, the ludicrous amount of `into_iter()` and chaining, the `unwrap`s and `Result`s, and the unfamiliar macros, reading the code is definitely a frustrating experience if you are not a Rust developer.

I will be adapting this image search method to Python and Milvus. Thank you, `lolishinshi`.
FeatureExtraction/ExtractionFrameworkThroughputTest.ipynb (Normal file, 541 lines)
@@ -0,0 +1,541 @@
**Markdown cell**

# Extraction Framework Throughput Test

This notebook tests the throughput of audio loading and preprocessing (resampling, chunking).

All chunked audio is assigned the placeholder embedding `np.zeros(32)` for demonstration purposes.

**Code cell [1]**

```python
from dataset_files import MultiThreadedAudioFeatureExtractor, random_audio_chunk
import audiopreprocessing
import logging
logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.INFO)

mtafe = MultiThreadedAudioFeatureExtractor(
    audio_paths=random_audio_chunk(128),
    max_audio_in_queue=8,
    audio_feeder_threads=8,
    feature_extractor_threads=1,
    desired_sr=32000,
    force_mono=False,
    chunk_length=15,
    chunk_overlap=2
)
audio_map = [audiopreprocessing.load_preprocessed_audio(p, 8000) for p in random_audio_chunk(200)]
```

stderr (deduplicated; the following messages were repeated many times):

```
[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!
[src/libmpg123/id3.c:process_comment():587] error: No comment text / valid description?
/home/qt/Repository/DLSiteFSearch/FeatureExtraction/audiopreprocessing.py:26: UserWarning: PySoundFile failed. Trying audioread instead.
  audio, orig_sr = librosa.load(input_path, sr=None, mono=mono_audio)
/home/qt/Repository/DLSiteFSearch/.venv/lib/python3.12/site-packages/librosa/core/audio.py:184: FutureWarning: librosa.core.audio.__audioread_load
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
```

**Code cell [5]**

```python
audio_map_len = [len(m) for m in audio_map]
audio_map_len
```

Output (number of chunks per file):

```
[156, 134, 150, 134, 258, 228, 226, 58, 238, 2, 188, 12, 70, 228, 10, 330, 80, 98,
 112, 114, 8, 172, 98, 124, 206, 40, 114, 130, 10, 17, 92, 70, 8, 24, 336, 46, 8,
 186, 232, 282, 136, 68, 66, 134, 58, 48, 36, 78, 72, 20, 172, 142, 170, 114, 12,
 644, 180, 94, 78, 188, 56, 204, 34, 232, 124, 162, 262, 198, 258, 26, 170, 120,
 10, 84, 4, 152, 10, 214, 222, 58, 86, 104, 256, 192, 30, 230, 150, 172, 194, 38,
 10, 168, 154, 4, 168, 234, 86, 230, 158, 300, 160, 6, 12, 22, 98, 126, 106, 8,
 180, 34, 152, 118, 10, 10, 128, 210, 124, 158, 174, 204, 166, 186, 180, 110, 44,
 126, 172, 220, 236, 122, 184, 54, 182, 234, 192, 262, 138, 124, 126, 230, 194,
 48, 658, 118, 88, 242, 266, 220, 190, 138, 192, 56, 180, 192, 150, 196, 160, 16,
 62, 110, 16, 38, 408, 192, 400, 208, 1, 136, 212, 164, 84, 262, 28, 270, 86, 184,
 122, 206, 256, 4, 48, 168, 194, 210, 122, 144, 8, 32, 232, 1, 8, 272, 302, 44,
 200, 48, 176, 180, 258, 4]
```

**Code cell [3]**

```python
random_audio_chunk(32)
audio_map[0]
```

Output (32 `PosixPath` entries sampled from the dataset):

```
[PosixPath('/mnt/Scratchpad/ASMROne/RJ01312393/03mp3_効果音あり/○TR2:一回ヤったからって気まずくなるとかw.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01068516/本篇/#7.在漫咖侍寝.wav'),
 PosixPath('/mnt/MyStuffz/ASMRTwo/RJ01192303/01_舔舐活同伴課程篇/mp3版/06_萌音×梨亞的延長課程.mp3'),
 PosixPath('/mnt/MyStuffz/ASMRTwo/RJ291279/3_FLAC/アレンジ/04射精音なし.flac'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ01068246/02:wav/track04_想让你为了姐姐大量射精.wav'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01109943/★萝莉义妹J◯♪豪华特典★/03.兎月りりむ。Freetalk/2-.wav版【最高音质・推荐下载鉴赏】/EX.萝莉义妹@兎月りりむ。Freetalk.wav'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01263522/「游戏迷女友」系列总集篇/边玩游戏边让你自由地使用小穴的游戏迷女友【双声道立体声】/1.音声文件/WAV/2.『顺便帮你舔耳』.wav'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01037597/mp3/07_特典トラック・公募音声.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ271498/mp3/10_【朝フェラ】リラックス朝フェラ.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01269607/完成音声/mp3-无SE/tr00_报幕.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ304908/mp3/07 酔っ払ったお姉ちゃんとあまあま対面座位セックス.mp3'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ01153369/02_個別データ/SEなし/08.芦屋もこ/7 挿入⇒喘ぎ.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01282581/2-.mp3版【圧縮形式・軽量バージョン】/1.SEあり(おすすめ)/6.キスされまくり甘々ピロートーク_SE.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01070779/wav/06_绀的陪睡(哄睡).wav'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ01153369/02_個別データ/SEなし/19.星羅あかね/10 寝息.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01194525/mp3/TR03_先輩、襲ってもいいんですよ?-.mp3'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ01167343/1.音声ファイル/環境音無し/7.おまけ『オカズ淫語をたくさん申し上げます♪』.mp3'),
 PosixPath('/mnt/MyStuffz/ASMRTwo/RJ01205182/wav_no SE/02. 按摩一下耳朵,讓你睡得更舒服~(耳朵按摩)no SE.wav'),
 PosixPath('/mnt/MyStuffz/ASMRTwo/RJ387999/2.背徳(通常)ルート/2.wav/3.強制愛撫.wav'),
 PosixPath('/mnt/MyStuffz/ASMRTwo/RJ325846/3_FLAC/ex13【極】脳髄舐め into the abyss.flac'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ400619/wav/04_PVボイス/フェアリーのキャシー編PV.wav'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ406548/3-.wav版【ハイレゾ品質・ダウンロード視聴にオススメ】/2.SEなし/04 嘘オホ喘ぎww&天使特製オモチャで亀頭いじめ_NoSE.wav'),
 PosixPath('/mnt/MyStuffz/ASMRTwo/RJ01205182/mp3_no SE/02. 按摩一下耳朵,讓你睡得更舒服~(耳朵按摩)no SE.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01050049/MP3/03 変態搾精フェラと発情汁まみれオナニー.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01058640/2-.FLAC版【低損耗形式・推薦線上收聽】/1.SE(推薦)/02.使用性騷擾按摩&戴套性愛來抑制治療小穴性慾♪_SE.flac'),
 PosixPath('/mnt/MyStuffz/ASMRTwo/RJ01252490/「你要是敢内射我就杀了你!」态度嚣张但尤其敏感的超可爱JK喵子/mp3/Tr.7 附赠音轨.mp3'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ437868/本編(これがメイン)/限界お漏らしルート/SE一部あり\u3000お漏らしルート.wav'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ01123987/1-.mp3版【压缩・适合在线视听】/1.含SE(推荐)/02.去个不停♪连续高潮与肉感紧致小穴性爱♪_SE.mp3'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ01153369/02_個別データ/SEなし/18.竹早芽衣/2 耳舐め.mp3'),
 PosixPath('/mnt/Windows11/ASMRThree/RJ437868/ルート別切り抜き/排泄シーンのみ/SEなし\u3000限界排泄のみ.wav'),
 PosixPath('/mnt/MyStuffz/ASMRTwo/RJ347971/SE有り/MP3/【8】おまけ:両耳舐めループ.mp3'),
 PosixPath('/mnt/Scratchpad/ASMROne/RJ01282581/1-.wav版【最高品質・リリムワークス謹製】/1.SEあり(おすすめ)/6.キスされまくり甘々ピロートーク_SE.wav')]
```

**Markdown cell**

After small-scale testing with three threads (main, the `audio_feed` thread, and the `extractor` thread), processing 32 files took 60 seconds. Extrapolating, processing 9000 files would take approximately 16875 seconds, which is about 4.7 hours. It is possible that we need to upgrade both the `audio_feed` and `extractor` sides to multithreading.

After a second round of testing:

```
Processed 200/200 (L:0/W:0 COMPLETE)
Extraction completed
Took 338.271537993 seconds. Added 27835 vectors/embeddings
```

The throughput is 200 files in 338 seconds, which is 0.59 files per second, or 1.69 seconds per file.

**Code cell [3]**

```python
from dataset_files import serialize_dict_obj
from pathlib import Path

serialize_dict_obj(Path("./testfeature.pkl").resolve(), afe.features)
```

Output:

```
8430449
```

(Notebook metadata: kernel `.venv`, Python 3.12.3.)
FeatureExtraction/ImageFeatureExtraction.ipynb (Normal file, 5664 lines)
File diff suppressed because one or more lines are too long

FeatureExtraction/TestAudioFeatureExtractionPANNS.ipynb (Normal file, 7958 lines)
File diff suppressed because it is too large
FeatureExtraction/audiopreprocessing.py (Normal file, 95 lines)
@@ -0,0 +1,95 @@
|
|||||||
|
import librosa
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def triggerlog():
|
||||||
|
logger.critical("Testing: info")
|
||||||
|
|
||||||
|
def resample_load(input_path : Path, target_sr : int = 16000, mono_audio : bool = False) -> np.ndarray: # AI
|
||||||
|
"""Load and resamples the audio into `target_sr`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_path (Path): pathlib.Path object to audio file
|
||||||
|
target_sr (int, optional): Target Sample Rate to resample. Defaults to 16000.
|
||||||
|
mono_audio (bool, optional): Load the audio in mono mode. Defaults to False.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
np.ndarray: _description_
|
||||||
|
"""
|
||||||
|
# Load audio file with original sample rate
|
||||||
|
logger.info(f"[resample_load] Loading audio {input_path}")
|
||||||
|
audio, orig_sr = librosa.load(input_path, sr=None, mono=mono_audio)
|
||||||
|
|
||||||
|
# Resample if necessary
|
||||||
|
if orig_sr != target_sr:
|
||||||
|
logger.info(f"[resample_load] Resampling to {target_sr}")
|
||||||
|
audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
|
||||||
|
|
||||||
|
return audio
|
||||||
|
|
||||||
|
def chunk_audio(audio : np.ndarray, sr: int, chunk_length: float = 10.0, overlap: float = 2.0) -> tuple[list[np.ndarray], list[float], int]: # AI
|
||||||
|
"""
|
||||||
|
Chunks audio file into overlapping segments. Only pass in mono audio here.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
audio_file: Loaded audio ndarray (one channel only)
|
||||||
|
sr: Sample rate for the given audio file
|
||||||
|
chunk_length: Length of each chunk in seconds
|
||||||
|
overlap: Overlap between chunks in seconds
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of audio chunks, list of chunk positions, and given sample rate
|
||||||
|
"""
|
||||||
|
logger.info(f"[chunk_audio] Chunking audio ({len(audio) / sr}s)")
|
||||||
|
# Calculate chunk size and hop length in samples
|
||||||
|
chunk_size = int(chunk_length * sr)
|
||||||
|
hop_length = int((chunk_length - overlap) * sr)
|
||||||
|
|
||||||
|
# Generate chunks
|
||||||
|
chunks = []
|
||||||
|
positions = []
|
||||||
|
k = 0
|
||||||
|
for i in range(0, len(audio) - chunk_size + 1, hop_length):
|
||||||
|
chunk = audio[i:i + chunk_size]
|
||||||
|
chunks.append(chunk)
|
||||||
|
positions.append(i / sr)
|
||||||
|
k += 1
|
||||||
|
if k == 0: # The full audio length is less than chunk_length
|
||||||
|
chunks = [audio]
|
||||||
|
positions = [0.0]
|
||||||
|
logger.info(f"[chunk_audio] Audio less than chunk_length. Returning original audio as chunk\r")
|
||||||
|
else:
|
||||||
|
logger.info(f"[chunk_audio] Audio is split into {k} chunks")
|
||||||
|
|
||||||
|
return chunks, positions, sr
|
||||||
|
|
||||||
|
def load_preprocessed_audio(
        path: Path,
        desired_sr: int,
        mono: bool = False,
        chunk_length: float = 15.0,
        overlap: float = 2.0) -> list[tuple[np.ndarray, float, int]]:
    """Loads, resamples and chunks an audio file, returning (chunk, start position, channel id) tuples."""
    result = []
    # Load and resample audio
    audio = resample_load(path, desired_sr, mono)  # Stereo: 2D matrix, Mono: 1D array
    if mono or (audio.ndim == 1):
        # Chunk audio: mono (or the audio file itself is mono)
        chunks, positions, _ = chunk_audio(audio, desired_sr, chunk_length, overlap)
        assert len(chunks) == len(positions)
        result.extend(zip(chunks, positions, [-1 for _ in range(len(chunks))]))
        # (ndarray_chunk1, pos1, -1): first audio chunk, position1, -1 (mono channel indicator)
    else:
        # Chunk audio: stereo/multichannel
        for channel_id, channel_audio in enumerate(audio):
            chunks, positions, _ = chunk_audio(channel_audio, desired_sr, chunk_length, overlap)
            assert len(chunks) == len(positions)
            result.extend(zip(chunks, positions, [channel_id for _ in range(len(chunks))]))
            # (ndarray_chunk1, pos1, 0): first audio chunk, position1, 0 (channel 0)

    return result
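

# A minimal usage sketch (hypothetical file path; the parameter values mirror
# the defaults used elsewhere in this repo). Each returned tuple carries the
# chunk samples, the chunk start time in seconds, and a channel id (-1 = mono).
if __name__ == "__main__":
    demo = load_preprocessed_audio(
        Path("some_voice_work.wav"),
        desired_sr=32000,
        mono=False,
        chunk_length=15.0,
        overlap=2.0,
    )
    for samples, start_s, channel_id in demo[:3]:
        print(samples.shape, start_s, channel_id)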
515
FeatureExtraction/dataset_files.py
Normal file
@@ -0,0 +1,515 @@
import platform
import os
import pickle
import random
import multiprocessing
import threading
import time
import concurrent.futures
import numpy as np
from pathlib import Path
import audiopreprocessing
import logging
import queue

def serialize_dict_obj(path: Path, obj: dict) -> int:
    """Serializes a Python dictionary object to a file via pickle.

    Args:
        path (Path): Path to store the file
        obj (dict): Dictionary object to serialize
    Returns:
        int: size in bytes written
    """
    # Horrible practice, horrible security, but it will work for now
    with path.open("wb") as fp:
        pickle.dump(obj, fp)
        fp.seek(0, os.SEEK_END)  # pickle.dump leaves the position at the end; the seek is belt-and-braces before tell()
        size = fp.tell()
    return size

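# Round-trip sketch for the helper above (hypothetical path): pickle.load is
# the counterpart used to read the dictionary back. Only unpickle trusted files.
#
#     with Path("features.pkl").open("rb") as fp:
#         features = pickle.load(fp)
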
logging.info("Reading local dataset directory structure...")

ASMRThreePath = Path("C:\\ASMRThree")
ASMRTwoPath = Path("D:\\ASMRTwo")
ASMROnePath = Path("E:\\ASMROne")

if (platform.system() == 'Linux'):
    ASMROnePath = Path('/mnt/Scratchpad/ASMROne')
    ASMRTwoPath = Path('/mnt/MyStuffz/ASMRTwo')
    ASMRThreePath = Path('/mnt/Windows11/ASMRThree')


def scan_subset(subset_path: Path) -> tuple[list[Path], list[Path], int]:
    """Walks one dataset subset, returning its folders, files, and total size in bytes."""
    folders, subset_files, total_size = [], [], 0
    for root, dirs, files in subset_path.walk():  # Root iterates through all folders (Path.walk() requires Python 3.12+)
        if root.absolute() != subset_path.absolute():  # Skip the subset root itself
            folders.append(root)  # Add folder to list
        for fname in files:  # Iterate through all files in current root
            file = root / fname  # Get file path
            assert file.is_file()
            subset_files.append(file)
            total_size += file.stat().st_size  # Accumulate file size
    return folders, subset_files, total_size


# Statistic calculation per subset (previously three copy-pasted loops)
folders_one, files_one, size_one = scan_subset(ASMROnePath)
folders_two, files_two, size_two = scan_subset(ASMRTwoPath)
folders_three, files_three, size_three = scan_subset(ASMRThreePath)

DataSubsetPaths = [ASMROnePath, ASMRTwoPath, ASMRThreePath]
DLSiteWorksPaths = []
# Collect ASMR Works (RJ ID, Paths)
for ASMRSubsetPath in DataSubsetPaths:
    for WorkPaths in ASMRSubsetPath.iterdir():
        DLSiteWorksPaths.append(WorkPaths)

fileExt2fileType = {
    ".TXT": "Document",
    ".WAV": "Audio",
    ".MP3": "Audio",
    ".PNG": "Image",
    ".JPG": "Image",
    ".VTT": "Subtitle",
    ".PDF": "Document",
    ".FLAC": "Audio",
    ".MP4": "Video",
    ".LRC": "Subtitle",
    ".SRT": "Subtitle",
    ".JPEG": "Image",
    ".ASS": "Subtitle",
    "": "NO EXTENSION",
    ".M4A": "Audio",
    ".MKV": "Video"
}
fileext_stat = {}
file_list = files_one + files_two + files_three
file_list_count = len(file_list)

for file in file_list:
    f_ext = file.suffix.upper()
    if f_ext in fileext_stat:
        fileext_stat[f_ext]['Count'] += 1
        fileext_stat[f_ext]['List'].append(file)
        fileext_stat[f_ext]['ExtensionMass'] += file.stat().st_size
    else:
        fileext_stat[f_ext] = {}
        fileext_stat[f_ext]['Count'] = 1
        fileext_stat[f_ext]['List'] = [file]
        fileext_stat[f_ext]['ExtensionMass'] = file.stat().st_size  # The total sum of sizes of files sharing this extension
        fileext_stat[f_ext]['MediaType'] = fileExt2fileType[f_ext]

audio_paths = []
for extension in fileext_stat:  # I can't be bothered to convert this into a list comprehension
    if fileext_stat[extension]['MediaType'] == "Audio":
        audio_paths += fileext_stat[extension]['List']

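# For reference, the equivalent list-comprehension form of the loop above (a sketch):
#
#     audio_paths = [
#         f
#         for stat in fileext_stat.values()
#         if stat['MediaType'] == "Audio"
#         for f in stat['List']
#     ]
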
def random_audio_chunk(n: int, seed: int = 177013) -> list[Path]:
    """Returns a random selection of audio files.

    Args:
        n (int): Number of files to return
        seed (int, optional): Seed for RNG. Defaults to 177013.

    Returns:
        list[Path]: List of randomly selected audio paths (as Path objects)
    """
    random.seed(seed)
    #return random.choices(audio_paths, k=n)  # Contains repeated elements
    return random.sample(audio_paths, k=n)  # Sampling without replacement

# class AudioFeatureExtractor():
#     __audio_queue: list[                          # List of ...
#         tuple[                                    # Pair of chunked audio and its path
#             list[tuple[np.ndarray, float, int]],  # Chunked audio
#             Path                                  # Path to original audio
#         ]
#     ]                                             # List of chunked/resampled audio
#     __feeder_future: concurrent.futures.Future
#     __extractor_future: concurrent.futures.Future
#     __audio_paths_list: list[Path]
#     __max_audio_in_queue: int
#     __queue_lock: threading.Lock
#     __desired_sr: int
#     __mono: bool
#     __chunk_length: float
#     __overlap: float
#     __features: dict[Path, list[tuple[np.ndarray, float, int]]]  # This is a crime, I know
#     # { audioPath:
#     #     [(embedding, pos, channel)...]
#     # }
#
#     def __embedding_inference(self, audio_ndarray: np.ndarray) -> np.ndarray:
#         """Uses the embedding model to run inference on an audio chunk. Returns embedding vectors.
#         Function to be overridden. Returns np.zeros(32).
#
#         Args:
#             audio_ndarray (np.ndarray): audio chunk samples
#
#         Returns:
#             np.ndarray: embedding vector
#         """
#         return np.zeros(32)
#
#     def __embedding_extract(self, audio: tuple[np.ndarray, float, int]) -> tuple[np.ndarray, float, int, np.ndarray]:
#         """Receives a tuple of audio, position, and channel ID, then adds the embedding to the tuple.
#
#         Args:
#             audio (tuple[np.ndarray, float, int]): tuple of audio, position, channel id
#
#         Returns:
#             tuple[np.ndarray, float, int, np.ndarray]: audio, position, channel id, embedding vector
#         """
#         audio_chunk, pos, channel_id = audio
#         return (audio_chunk, pos, channel_id, self.__embedding_inference(audio_chunk))
#
#     def __audio_queue_feeder(self):  # TODO: Upgrade to multithreaded loader?
#         """Internal thread function. Preprocesses and loads audio continuously into
#         audio_queue until the end of audio_paths_list.
#         """
#         while (self.__audio_paths_list):  # While there are still Path elements in the path list
#             if (not (len(self.__audio_queue) < self.__max_audio_in_queue)):
#                 logging.info("[AFE] [Audio Queue Thread]: Queue full, feeder thread sleeping for 5 seconds")
#                 time.sleep(5)
#             while (len(self.__audio_queue) < self.__max_audio_in_queue):  # While the audio queue is not full
#                 new_audio_path = self.__audio_paths_list[0]
#                 new_audio = audiopreprocessing.load_preprocessed_audio(
#                     new_audio_path,
#                     self.__desired_sr,
#                     self.__mono,
#                     self.__chunk_length,
#                     self.__overlap
#                 )
#                 with self.__queue_lock:
#                     self.__audio_queue.append(
#                         (new_audio, new_audio_path)
#                     )
#                     pop_path = self.__audio_paths_list.pop(0)
#                 logging.info(f"[AFE] [Audio Queue Thread]: Added new audio to queue {pop_path}")
#         logging.info("[AFE] [Audio Queue Thread]: DONE. All audio files fed")
#
#     def __audio_queue_feature_extractor(self):
#         """Internal thread function. Gets audio from the audio queue and extracts an embedding vector
#         for all audio chunks. Stores the resulting embeddings into self.__features,
#         with the original audio's Path as key and list[tuple[np.ndarray, float, int]]
#         (list of tuples of embedding vector, position, channel id) as value.
#         """
#         while (self.__audio_paths_list or self.__audio_queue):  # While there is still audio to be processed
#             if (self.__audio_queue):  # If the audio queue is not empty
#                 with self.__queue_lock:
#                     audio_to_process, audio_path = self.__audio_queue.pop(0)  # Get audio from queue
#                 logging.info(f"[AFE] [Feature Extractor Thread]: Extracting {len(audio_to_process)} features from audio {audio_path}")
#                 for audio_chunk in audio_to_process:
#                     same_audio_chunk, timepos, channel_id, embedd_vect = self.__embedding_extract(audio_chunk)
#                     if (audio_path not in self.__features.keys()):
#                         #if DEBUG: print("Adding new vector to", audio_path.name)
#                         self.__features[audio_path] = [(embedd_vect, timepos, channel_id)]
#                     else:
#                         #if DEBUG: print("Adding vector to", audio_path.name)
#                         self.__features[audio_path].append(
#                             (embedd_vect, timepos, channel_id)
#                         )
#             else:
#                 logging.info("[AFE] [Feature Extractor Thread]: Queue empty, extractor thread sleeping for 5 seconds")  # If audio queue is empty, wait
#                 time.sleep(5)
#         logging.info("[AFE] [Feature Extractor Thread]: DONE. Extracted all features from all audio files")
#
#     def __init__(
#             self,
#             audio_paths_list: list[Path],
#             max_audio_in_queue: int,
#             desired_sr: int,
#             mono: bool,
#             chunk_length: float = 15.0,
#             overlap: float = 2.0
#     ):
#         self.__audio_queue = []
#         self.__audio_paths_list = audio_paths_list
#         self.__max_audio_in_queue = max_audio_in_queue
#         self.__queue_lock = threading.Lock()
#         self.__desired_sr = desired_sr
#         self.__mono = mono
#         self.__chunk_length = chunk_length
#         self.__overlap = overlap
#         self.__features = {}
#
#     @property
#     def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
#         return self.__features
#
#     def extract(self):
#         print("Starting feature extraction for", len(self.__audio_paths_list), "file(s)")
#         total_amount = len(self.__audio_paths_list)
#         t_start = time.perf_counter()
#         with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
#             self.__feeder_future = executor.submit(self.__audio_queue_feeder)
#             self.__extractor_future = executor.submit(self.__audio_queue_feature_extractor)
#             while (self.__feeder_future.running() or self.__extractor_future.running()):
#                 print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W{len(self.__audio_paths_list)})", end="\r")
#                 time.sleep(1)
#
#         t_stop = time.perf_counter()
#         print(f"Processed {len(self.__features)}/{total_amount} (L:{len(self.__audio_queue)}/W:{len(self.__audio_paths_list)} COMPLETE)")
#         delta_t = t_stop - t_start
#         total_features = sum([len(self.__features[path]) for path in self.__features])
#         print()
#         print("Extraction completed")
#         print(f"Took {delta_t} seconds. Added {total_features} vectors/embeddings")


class MultiThreadedAudioFeatureExtractor():
    # This is the third time I am rewriting this, please send help. Multithreaded apps are pure hell to develop and debug
    # After testing: this will hang at the last audio, precisely while preprocessing it. I suspect the GIL hurts performance
    # so much that the preprocessing routine cannot get any share of the CPU execution cycle
    __audio_queue: queue.Queue[                     # Queue of ...
        tuple[                                      # Pair of chunked audio and its path
            list[tuple[np.ndarray, float, int]],    # Chunked audio list of (ndarray, time position of chunk relative to original audio, channel_id)
            Path                                    # Path to original audio
        ]
    ]                                               # Queue of chunked/resampled audio
    __audio_feeder_threads: int                     # Number of audio feeder threads
    __feature_extractor_threads: int                # Number of feature extractor threads (if the method allows)
    __audio_paths_list: queue.Queue[Path]           # Queue of paths to audio
    __max_audio_in_queue: int                       # Maximum audio in queue
    __audio_feeder_barrier: threading.Barrier       # Synchronization barrier for all audio feeder threads
    # Audio feeder parameters
    __desired_sr: int                               # Desired sample rate (resampling)
    __mono: bool                                    # Force load audio in mono mode
    __chunk_length: float                           # Audio chunk length
    __overlap: float
    # Result
    __features: dict[Path, list[tuple[np.ndarray, float, int]]]  # This is a crime, I know
    __features_lock: threading.Lock
    # __features: { audioPath:
    #                 [(embedding1, pos1, channel1),
    #                  (embedding2, pos2, channel1)]
    #               ...
    #             }
    # Runtime
    __audio_feeder_threadpool: list[concurrent.futures.Future]
    __feature_extractor_threadpool: list[concurrent.futures.Future]

    def __audio_inference_embedding(self, audio: list[tuple[np.ndarray, float, int]]) -> list[tuple[np.ndarray, float, int]]:
        """Receives a list of audio chunks, extracts an embedding for every chunk, and returns the results as a list of tuples (embedding, time, channel_id).

        Args:
            audio (list[tuple[np.ndarray, float, int]]): list of audio chunks

        Returns:
            list[tuple[np.ndarray, float, int]]: List of (embedding vector, timepos, channel id)
        """
        features = []
        for audio_chunk in audio:
            chunk, timepos, channel_id = audio_chunk  # Renamed from `audio` to avoid shadowing the loop source
            zero = np.zeros(32)
            features.append((zero, timepos, channel_id))
            time.sleep(0.01)  # Simulate effort; change to simulate seconds spent on each audio file
        return features
        # To be overridden

    def __audio_feeder_thread(self, thread_id: int, barrier: threading.Barrier):
        try:
            while True:
                # Attempt to get an audio path from the audio path queue
                new_audio_path = self.__audio_paths_list.get()
                # Check thread exit condition (if the queue returns None, the audio path queue is now empty and the thread should end itself)
                if (new_audio_path is None):
                    self.__audio_paths_list.put(new_audio_path)  # Put None back to notify the other audio feeder threads
                    # "You are already dead"
                    break  # If ETSISI sees this, they will surely kick me out of school
                # Now that the audio path queue is not empty, try preprocessing an audio file
                logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Preprocess: {new_audio_path.absolute()}")
                new_audio = audiopreprocessing.load_preprocessed_audio(
                    new_audio_path,
                    self.__desired_sr,
                    self.__mono,
                    self.__chunk_length,
                    self.__overlap
                )
                self.__audio_queue.put((new_audio, new_audio_path))  # In theory, this blocks the feeder thread while the audio queue is full
                logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Feed: {new_audio_path.absolute()}")
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Waiting for other threads to finish")
            barrier.wait()
            if (thread_id == 0):
                self.__audio_queue.put(None)  # None signals that audio_queue has no more elements to process
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")
        except Exception as e:
            logging.error(f"[MTAFE] [Audio Feeder {thread_id}] An exception occurred! Committing seppuku!")
            logging.exception(e)
            return

        # while (not self.__audio_paths_list.empty()):
        #     if (not self.__audio_queue.full()):
        #         # Feed audio
        #         new_audio_path = self.__audio_paths_list.get()
        #         self.__audio_paths_list.task_done()
        #         logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Preprocess: {new_audio_path.absolute()}")
        #         new_audio = audiopreprocessing.load_preprocessed_audio(
        #             new_audio_path,
        #             self.__desired_sr,
        #             self.__mono,
        #             self.__chunk_length,
        #             self.__overlap
        #         )
        #         self.__audio_queue.put((new_audio, new_audio_path))
        #         logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Feed: {new_audio_path.absolute()}")
        # logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")

    #def testfeedthread(self, nthreads):
    #    t1 = threading.Thread(target=self.__audio_feeder_thread, args=(1,))
    #    t2 = threading.Thread(target=self.__audio_feeder_thread, args=(2,))
    #    t1.start(); t2.start()
    #    #with self.__audio_feed_condition:
    #    #    self.__audio_feed_condition.notify_all()
    #    t1.join(); t2.join()
    #    with concurrent.futures.ThreadPoolExecutor(max_workers=nthreads) as executor:
    #        for i in range(nthreads):
    #            ft = executor.submit(self.__audio_feeder_thread, i)
    #            self.__audio_loader_threadpool.append(ft)

    def __check_all_audiofeed_thread_finished(self) -> bool:
        for ft in self.__audio_feeder_threadpool:
            if ft.running():
                return False
        return True

    def __check_all_featureextractor_thread_finished(self) -> bool:
        for ft in self.__feature_extractor_threadpool:
            if ft.running():
                return False
        return True

    def __feature_extractor_thread(self, thread_id):
        while True:
            # Attempt to get the next audio chunks to process
            next_audio_tuple = self.__audio_queue.get()
            # Check thread exit condition
            if (next_audio_tuple is None):
                self.__audio_queue.put(next_audio_tuple)  # Put the None back to notify the other threads
                break  # unalive urself
            else:  # Assuming we got more tuples
                current_audio_to_process, current_audio_path = next_audio_tuple  # Unpack tuple
                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {current_audio_path}")
                features_to_add = self.__audio_inference_embedding(current_audio_to_process)
                with self.__features_lock:
                    self.__features[current_audio_path] = features_to_add
                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature extraction complete for {current_audio_path} w/ {len(features_to_add)} features")
        logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Thread finished!")

        # while (not self.__check_all_audiofeed_thread_finished() or not self.__audio_queue.empty()):
        #     if (not self.__audio_queue.empty()):
        #         audio_to_process, audio_path = self.__audio_queue.get()
        #         self.__audio_queue.task_done()
        #         logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {audio_path}")
        #         features_to_add = self.__audio_inference_embedding(audio_to_process)
        #         logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracted: {len(features_to_add)} features")
        #         with self.__features_lock:
        #             self.__features[audio_path] = features_to_add
        #         #with self.__audio_feed_condition: self.__audio_feed_condition.notify_all()
        #         logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature Extraction complete for {audio_path} w/ {len(features_to_add)} features")
        #     else:
        #         if (not self.__check_all_audiofeed_thread_finished()):
        #             with self.__audio_feed_condition:
        #                 logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Audio queue empty: waiting")
        #                 self.__audio_feed_condition.wait(10)
        #                 self.__audio_feed_condition.wait_for(lambda: not self.__audio_queue.empty())

    def __count_running_threads(self) -> tuple[int, int]:
        running_extractors = 0
        running_feeders = 0
        for ft in self.__feature_extractor_threadpool:
            if ft.running(): running_extractors += 1
        for ft in self.__audio_feeder_threadpool:
            if ft.running(): running_feeders += 1
        return (running_feeders, running_extractors)

    @property
    def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
        return self.__features

    def extract(self):
        total_amount = self.__audio_paths_list.qsize() - 1  # Account for the None queue-end marker
        logging.info(f"[MTAFE] [Main] Starting feature extraction for {total_amount} file(s)")
        t_start = time.perf_counter()  # Timer
        # NOTE: bound methods submitted to a ProcessPoolExecutor must be picklable, and each
        # worker process gets its own copy of self.__features; progress/result sharing will
        # not behave like the threaded version.
        with concurrent.futures.ProcessPoolExecutor(max_workers=(self.__audio_feeder_threads + self.__feature_extractor_threads)) as executor:
            # Audio feeder threads
            for i in range(self.__audio_feeder_threads):
                logging.info(f"[MTAFE] Started audio feeder thread {i}")
                ld_ft = executor.submit(self.__audio_feeder_thread, i, self.__audio_feeder_barrier)
                self.__audio_feeder_threadpool.append(ld_ft)
            # Feature extractor threads
            for i in range(self.__feature_extractor_threads):
                logging.info(f"[MTAFE] Started feature extractor thread {i}")
                ex_ft = executor.submit(self.__feature_extractor_thread, i)
                self.__feature_extractor_threadpool.append(ex_ft)
            # Progress checking
            while ((not self.__check_all_audiofeed_thread_finished()) and (not self.__check_all_featureextractor_thread_finished())):
                nfeeder, nextract = self.__count_running_threads()
                print(f"[MTAFE Progress] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()}, LD:{nfeeder}/EXT:{nextract})", end="\r")
                time.sleep(1)  # Avoid busy-waiting on the progress check
        t_stop = time.perf_counter()
        logging.info(f"[MTAFE] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize() - 1}/W:{self.__audio_paths_list.qsize() - 1} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum([len(self.__features[path]) for path in self.__features])
        logging.info(f"[MTAFE] Extraction complete. Took {delta_t} seconds. Added {total_features} vectors/embeddings")

    def __init__(
            self,
            audio_paths: list[Path],
            max_audio_in_queue: int = 16,
            audio_feeder_threads: int = 8,
            feature_extractor_threads: int = 8,
            desired_sr: int = 32000,
            force_mono: bool = False,
            chunk_length: float = 15.0,
            chunk_overlap: float = 2.0,
    ):
        # Check that the paths passed in are all valid and add them to the queue
        self.__audio_paths_list = multiprocessing.Queue()
        for p in audio_paths:
            if not p.is_file():
                raise Exception(f"Path '{p.absolute()}' is NOT a valid file!")
            else:
                self.__audio_paths_list.put(p)
        self.__audio_paths_list.put(None)  # Signals to consumers that the audio path list is exhausted, since Queue.empty() is unreliable

        logging.info(f"[MTAFE] [Constructor] Queued {self.__audio_paths_list.qsize() - 1} files")

        # Set up private attributes
        ## Audio preprocessing parameters
        self.__desired_sr = desired_sr
        self.__mono = force_mono
        self.__chunk_length = chunk_length
        self.__overlap = chunk_overlap

        ## Extractor/Feeder settings
        self.__max_audio_in_queue = max_audio_in_queue
        self.__audio_feeder_threads = audio_feeder_threads
        self.__feature_extractor_threads = feature_extractor_threads

        ## Set up runtime state (multiprocessing primitives, unlike the queue.Queue/threading annotations above)
        self.__audio_queue = multiprocessing.Queue(maxsize=self.__max_audio_in_queue)
        self.__features = {}
        self.__features_lock = multiprocessing.Lock()
        self.__audio_feeder_barrier = multiprocessing.Barrier(self.__audio_feeder_threads)
        self.__audio_feeder_threadpool = []
        self.__feature_extractor_threadpool = []

        logging.info(f"[MTAFE] [Constructor] Extraction parameters: {desired_sr}Hz, Mono: {force_mono}, Divide into {chunk_length}s chunks with {chunk_overlap}s of overlap")
        logging.info(f"[MTAFE] [Constructor] Using {audio_feeder_threads} threads for preprocessing audio and {feature_extractor_threads} threads for feature extraction. Max queue size of {max_audio_in_queue} files")

    # More audio-embedding-specific code below (to be overridden)
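The feeder and extractor threads above shut down via a None sentinel that every consumer puts back before exiting, so its siblings see it too. A minimal standalone sketch of that pattern (the names here are illustrative, not from the repo):

    import queue
    import threading

    q: queue.Queue = queue.Queue()

    def worker(wid: int) -> None:
        while True:
            item = q.get()
            if item is None:  # Sentinel: put it back so sibling workers also see it
                q.put(None)
                break
            print(f"worker {wid} processed item {item}")

    threads = [threading.Thread(target=worker, args=(i,)) for i in range(3)]
    for t in threads:
        t.start()
    for i in range(10):
        q.put(i)
    q.put(None)  # A single sentinel suffices; each worker re-shares it before exiting
    for t in threads:
        t.join()

The advantage over polling empty() is that there is no window between a check and a get() where another worker can steal the last item.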
8
FeatureExtraction/mtafe.py
Normal file
@@ -0,0 +1,8 @@
import dataset_files
import multiprocessing
import logging
import numpy as np
import threading
import queue
from pathlib import Path

193
FeatureExtraction/mtafe_panns.py
Normal file
@@ -0,0 +1,193 @@
from dataset_files import MultiThreadedAudioFeatureExtractor
from pathlib import Path
from panns_inference import AudioTagging
import logging
import numpy as np
import queue
import concurrent.futures
import threading
import time
import audiopreprocessing
#import torch
#import gc


class mtafe_panns():
    __audio_queue: queue.Queue[                     # Queue of ...
        tuple[                                      # Pair of chunked audio and its path
            list[tuple[np.ndarray, float, int]],    # Chunked audio
            Path                                    # Path to original audio
        ]
    ]                                               # Queue of chunked/resampled audio
    __audio_loader_threads: int                     # Number of audio feeder threads
    __feature_extractor_threads: int                # Number of feature extractor threads (if the method allows)
    __audio_paths_list: queue.Queue[Path]           # Queue of paths to audio
    __max_audio_in_queue: int                       # Maximum audio in queue
    __desired_sr: int
    __mono: bool
    __chunk_length: float
    __overlap: float
    __features: dict[Path, list[tuple[np.ndarray, float, int]]]  # This is a crime, I know
    __features_lock: threading.Lock
    __audio_loader_threadpool: list[concurrent.futures.Future]
    __feature_extractor_threadpool: list[concurrent.futures.Future]
    __at: AudioTagging
    __batch_size: int

    def __init__(self,
                 audio_paths: list[Path],
                 max_audio_in_queue: int = 16,
                 audio_feeder_threads: int = 8,
                 feature_extractor_threads: int = 8,
                 desired_sr: int = 32000,
                 force_mono: bool = False,
                 chunk_length: float = 15.0,
                 chunk_overlap: float = 2.0,
                 batch_size: int = 20
                 ):
        # Check that the paths passed in are all valid and add them to the queue
        self.__audio_paths_list = queue.Queue()
        for p in audio_paths:
            if not p.is_file():
                raise Exception(f"Path '{p.absolute()}' is NOT a valid file!")
            else:
                self.__audio_paths_list.put(p)
                #self.__audio_paths_list.task_done()

        logging.info(f"[MTAFE] [Constructor] Queued {self.__audio_paths_list.qsize()} files")

        # Set up private attributes
        ## Audio preprocessing parameters
        self.__desired_sr = desired_sr
        self.__mono = force_mono
        self.__chunk_length = chunk_length
        self.__overlap = chunk_overlap

        ## Extractor/Feeder settings
        self.__max_audio_in_queue = max_audio_in_queue
        self.__audio_loader_threads = audio_feeder_threads
        self.__feature_extractor_threads = feature_extractor_threads

        ## Set up runtime state
        self.__audio_queue = queue.Queue(maxsize=max_audio_in_queue)
        self.__features = {}
        self.__features_lock = threading.Lock()
        self.__audio_loader_threadpool = []
        self.__feature_extractor_threadpool = []

        logging.info(f"[MTAFE] [Constructor] Extraction parameters: {desired_sr}Hz, Mono: {force_mono}, Divide into {chunk_length}s chunks with {chunk_overlap}s of overlap")
        logging.info(f"[MTAFE] [Constructor] Using {audio_feeder_threads} threads for preprocessing audio and {feature_extractor_threads} threads for feature extraction. Max queue size of {max_audio_in_queue} files")

        logging.info("[MTAFE] [Constructor] Initializing PANNs")
        logging.info(f"[MTAFE] [Constructor] Inferencing with batch size {batch_size}")
        self.__at = AudioTagging(checkpoint_path=None, device='cuda')
        self.__batch_size = batch_size
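    # Note: with checkpoint_path=None, panns_inference downloads its default
    # pretrained CNN14 checkpoint on first use, and that model expects 32 kHz
    # input audio, which matches the desired_sr=32000 default above.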

    def __chunks(self, lst, n):
        # Stolen straight from Stack Overflow
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
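    # Illustration (hypothetical values): list(self.__chunks([1, 2, 3, 4, 5], 2))
    # yields [[1, 2], [3, 4], [5]]; with a numpy array input, each batch is a
    # (batch, samples) slice that a single inference call can consume directly.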

    def __audio_inference_embedding(self, audio: list[tuple[np.ndarray, float, int]]) -> list[tuple[np.ndarray, float, int]]:
        audio_chunk_list = []
        timepos_list = []
        channel_id_list = []
        embedding_list = []

        # Split the tuples into parallel, equally sized lists
        for audio_chunk, timepos, channel in audio:
            audio_chunk_list.append(audio_chunk)
            timepos_list.append(timepos)
            channel_id_list.append(channel)

        # Convert audio_chunk_list into a numpy array of shape (n_chunks, samples)
        audio_chunk_list = np.array(audio_chunk_list)

        #logging.info("[MTAFE] [PANNs] Inferencing...")
        try:
            for i, batch in enumerate(self.__chunks(audio_chunk_list, self.__batch_size)):
                (clipwise_output, embedding) = self.__at.inference(batch)
                for vect in embedding:  # vect: np.ndarray
                    embedding_list.append(vect)
                logging.info(f"[MTAFE] [PANNs] Inferenced batch {i}")

            assert len(audio_chunk_list) == len(timepos_list) == len(channel_id_list) == len(embedding_list)
        except Exception as e:
            logging.critical("[MTAFE] [PANNs] ERROR! INFERENCE FAILED OR LIST SIZE MISMATCH!")
            logging.critical(e)
            embedding_list = [None for _ in audio_chunk_list]  # Clear embedding_list and fill it with None
        # Bug fix: the original zipped embedding_list twice and dropped timepos_list,
        # which did not match the declared (embedding, timepos, channel_id) tuple layout
        return list(zip(embedding_list, timepos_list, channel_id_list))

    def __audio_feeder_thread(self, thread_id):
        # Caveat: empty() is only a snapshot; with several feeder threads, a path can be
        # taken between the check and the get(), unlike the sentinel-based version in
        # dataset_files.py.
        while (not self.__audio_paths_list.empty()):
            new_audio_path = self.__audio_paths_list.get()
            self.__audio_paths_list.task_done()
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Preprocess: {new_audio_path.absolute()}")
            new_audio = audiopreprocessing.load_preprocessed_audio(
                new_audio_path,
                self.__desired_sr,
                self.__mono,
                self.__chunk_length,
                self.__overlap
            )
            self.__audio_queue.put((new_audio, new_audio_path))
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Feed: {new_audio_path.absolute()}")
        logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")

    def __check_all_audiofeed_thread_finished(self) -> bool:
        for ft in self.__audio_loader_threadpool:
            if ft.running():
                return False
        return True

    def __check_all_featureextractor_thread_finished(self) -> bool:
        for ft in self.__feature_extractor_threadpool:
            if ft.running():
                return False
        return True

    def __feature_extractor_thread(self, thread_id):
        while (not self.__check_all_audiofeed_thread_finished() or not self.__audio_queue.empty()):
            if (not self.__audio_queue.empty()):
                audio_to_process, audio_path = self.__audio_queue.get()
                self.__audio_queue.task_done()
                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {audio_path}")
                features_to_add = self.__audio_inference_embedding(audio_to_process)
                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracted: {len(features_to_add)} features")
                with self.__features_lock:
                    self.__features[audio_path] = features_to_add
                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature extraction complete for {audio_path} w/ {len(features_to_add)} features")
        logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Thread finished!")

    def __count_running_threads(self) -> tuple[int, int]:
        running_extractors = 0
        running_feeders = 0
        for ft in self.__feature_extractor_threadpool:
            if ft.running(): running_extractors += 1
        for ft in self.__audio_loader_threadpool:
            if ft.running(): running_feeders += 1
        return (running_feeders, running_extractors)

    @property
    def features(self) -> dict[Path, list[tuple[np.ndarray, float, int]]]:
        return self.__features

    def extract(self):
        total_amount = self.__audio_paths_list.qsize()
        logging.info(f"[MTAFE] [Main] Starting feature extraction for {total_amount} file(s)")
        t_start = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=(self.__audio_loader_threads + self.__feature_extractor_threads)) as executor:
            for i in range(self.__audio_loader_threads):
                ld_ft = executor.submit(self.__audio_feeder_thread, i)
                self.__audio_loader_threadpool.append(ld_ft)
            for i in range(self.__feature_extractor_threads):
                ex_ft = executor.submit(self.__feature_extractor_thread, i)
                self.__feature_extractor_threadpool.append(ex_ft)
            while ((not self.__check_all_audiofeed_thread_finished()) and (not self.__check_all_featureextractor_thread_finished())):
                nfeeder, nextract = self.__count_running_threads()
                print(f"[MTAFE Progress] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()}, LD:{nfeeder}/EXT:{nextract})", end="\r")
                time.sleep(1)  # Avoid busy-waiting on the progress check
        t_stop = time.perf_counter()
        logging.info(f"[MTAFE] Processed {len(self.__features)}/{total_amount} (L:{self.__audio_queue.qsize()}/W:{self.__audio_paths_list.qsize()} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum([len(self.__features[path]) for path in self.__features])
        logging.info(f"[MTAFE] Extraction complete. Took {delta_t} seconds. Added {total_features} vectors/embeddings")
17
FeatureExtraction/test.py
Normal file
@@ -0,0 +1,17 @@
import logging
from audiopreprocessing import triggerlog  # Unused; kept for quick manual log testing
#logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.INFO)

from dataset_files import MultiThreadedAudioFeatureExtractor, random_audio_chunk

mtafe = MultiThreadedAudioFeatureExtractor(
    audio_paths=random_audio_chunk(8),
    max_audio_in_queue=8,
    audio_feeder_threads=8,
    feature_extractor_threads=1,
    desired_sr=32000,
    force_mono=False,
    chunk_length=15,
    chunk_overlap=2
)
mtafe.extract()
17
FeatureExtraction/test_mtafe.py
Normal file
@@ -0,0 +1,17 @@
#import mtafe
import logging
#import dataset_files
logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.DEBUG)

logging.info("Running tests")
# m = mtafe.mtafe(
#     audio_paths=dataset_files.random_audio_chunk(2),
#     max_audio_in_queue=8,
#     audio_feeder_threads=8,
#     feature_extractor_threads=1,
#     desired_sr=32000,
#     force_mono=False,
#     chunk_length=15,
#     chunk_overlap=2
# )
# m.run()
24
FeatureExtraction/test_panns.py
Normal file
@@ -0,0 +1,24 @@
import logging
from audiopreprocessing import triggerlog  # Unused; kept for quick manual log testing
#logger = logging.getLogger(__name__)
import sys
logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.INFO, handlers=[logging.FileHandler('test_panns.log'), logging.StreamHandler(sys.stdout)])
from pathlib import Path
from mtafe_panns import mtafe_panns
from dataset_files import random_audio_chunk, serialize_dict_obj

mtafe = mtafe_panns(
    audio_paths=random_audio_chunk(4),
    max_audio_in_queue=4,
    audio_feeder_threads=4,
    feature_extractor_threads=1,
    desired_sr=32000,
    force_mono=False,
    chunk_length=15,
    chunk_overlap=2,
    batch_size=32
)
mtafe.extract()

print("Saving inference results to file...")
p = Path('./test_panns.pkl')
serialize_dict_obj(p, mtafe.features)
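For completeness, a sketch of reading the dump back (the path matches the script above; pickle.load is the counterpart to serialize_dict_obj):

    import pickle
    from pathlib import Path

    # Read the features dict written by test_panns.py (only unpickle files you trust)
    with Path('./test_panns.pkl').open('rb') as fp:
        features = pickle.load(fp)

    for audio_path, vectors in list(features.items())[:2]:
        print(audio_path, len(vectors), "embedding tuples")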
BIN
FeatureExtraction/测试.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.2 MiB
10022
LocalDatasetAnalysis.ipynb
Normal file
File diff suppressed because it is too large
253
milvustests/quickstart.ipynb
Normal file
@@ -0,0 +1,253 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "47471ef9",
   "metadata": {},
   "source": [
    "Creating client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d08ab631",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymilvus import MilvusClient\n",
    "\n",
    "client = MilvusClient(uri=\"http://localhost:19530\", token=\"root:Milvus\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ecf3a2dd",
   "metadata": {},
   "source": [
    "Creating collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7bf82b6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "if client.has_collection(collection_name=\"demo_collection\"):\n",
    "    client.drop_collection(collection_name=\"demo_collection\")\n",
    "\n",
    "client.create_collection(\n",
    "    collection_name=\"demo_collection\",\n",
    "    dimension=768,  # The vectors we will use in this demo have 768 dimensions\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eef3759b",
   "metadata": {},
   "source": [
    "Adding sample vector data using Embeddings to Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7f6083de",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n",
      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-small-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n",
      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-onnx. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dim: 768 (768,)\n",
      "Data has 3 entities, each with fields: dict_keys(['id', 'vector', 'text', 'subject'])\n",
      "Vector dim: 768\n"
     ]
    }
   ],
   "source": [
    "from pymilvus import model\n",
    "# If connection to https://huggingface.co/ failed, uncomment the following path\n",
    "# import os\n",
    "# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
    "\n",
    "# This will download a small embedding model \"paraphrase-albert-small-v2\" (~50MB).\n",
    "embedding_fn = model.DefaultEmbeddingFunction()\n",
    "\n",
    "# Text strings to search from.\n",
    "docs = [\n",
    "    \"Artificial intelligence was founded as an academic discipline in 1956.\",\n",
    "    \"Alan Turing was the first person to conduct substantial research in AI.\",\n",
    "    \"Born in Maida Vale, London, Turing was raised in southern England.\",\n",
    "]\n",
    "\n",
    "vectors = embedding_fn.encode_documents(docs)\n",
    "# The output vector has 768 dimensions, matching the collection that we just created.\n",
    "print(\"Dim:\", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)\n",
    "\n",
    "# Each entity has id, vector representation, raw text, and a subject label that we use\n",
    "# to demo metadata filtering later.\n",
    "data = [\n",
    "    {\"id\": i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"history\"}\n",
    "    for i in range(len(vectors))\n",
    "]\n",
    "\n",
    "print(\"Data has\", len(data), \"entities, each with fields: \", data[0].keys())\n",
    "print(\"Vector dim:\", len(data[0][\"vector\"]))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e89e602",
   "metadata": {},
   "source": [
    "Inserting data to Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e2098f0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'insert_count': 3, 'ids': [0, 1, 2]}\n"
     ]
    }
   ],
   "source": [
    "res = client.insert(collection_name=\"demo_collection\", data=data)\n",
    "\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a0e4a35",
   "metadata": {},
   "source": [
    "Semantic search / Vector search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2a687f94",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data: [\"[{'id': 2, 'distance': 0.5859946012496948, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]\"]\n"
     ]
    }
   ],
   "source": [
    "query_vectors = embedding_fn.encode_queries([\"Who is Alan Turing?\"])\n",
    "# If you don't have the embedding function you can use a fake vector to finish the demo:\n",
    "# query_vectors = [ [ random.uniform(-1, 1) for _ in range(768) ] ]\n",
    "\n",
    "res = client.search(\n",
    "    collection_name=\"demo_collection\",  # target collection\n",
    "    data=query_vectors,  # query vectors\n",
    "    limit=2,  # number of returned entities\n",
    "    output_fields=[\"text\", \"subject\"],  # specifies fields to be returned\n",
    ")\n",
    "\n",
    "print(res)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f8e5ba8",
   "metadata": {},
   "source": [
    "Metadata filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "03d6ae37",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data: ['[]']\n"
     ]
    }
   ],
   "source": [
    "# Insert more docs in another subject.\n",
    "docs = [\n",
    "    \"Machine learning has been used for drug design.\",\n",
    "    \"Computational synthesis with AI algorithms predicts molecular properties.\",\n",
    "    \"DDR1 is involved in cancers and fibrosis.\",\n",
    "]\n",
    "vectors = embedding_fn.encode_documents(docs)\n",
    "data = [\n",
    "    {\"id\": 3 + i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"biology\"}\n",
    "    for i in range(len(vectors))\n",
    "]\n",
    "\n",
    "client.insert(collection_name=\"demo_collection\", data=data)\n",
    "\n",
    "# This will exclude any text in \"history\" subject despite being close to the query vector.\n",
    "res = client.search(\n",
    "    collection_name=\"demo_collection\",\n",
    "    data=embedding_fn.encode_queries([\"tell me AI related information\"]),\n",
    "    filter=\"subject == 'biology'\",\n",
    "    limit=2,\n",
    "    output_fields=[\"text\", \"subject\"],\n",
    ")\n",
    "\n",
    "print(res)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DLSiteFSearchPython_venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
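Bridging the quickstart back to this repo: a sketch of how the (embedding, timepos, channel_id) tuples produced by mtafe_panns could be indexed in Milvus. Assumptions not taken from the source: a local Milvus at the quickstart's URI, 2048-dimensional PANNs CNN14 embeddings, a features dict as produced by test_panns.py, and a hypothetical collection name 'asmr_chunks'.

    from pymilvus import MilvusClient

    client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")
    client.create_collection(collection_name="asmr_chunks", dimension=2048)

    entities = []
    next_id = 0
    for audio_path, vectors in features.items():  # features: as written by test_panns.py
        for embedding, timepos, channel_id in vectors:
            if embedding is None:  # Skip chunks from failed inference batches
                continue
            entities.append({
                "id": next_id,
                "vector": embedding,
                "path": str(audio_path),  # source file kept as metadata
                "timepos": timepos,       # chunk start time in seconds
                "channel": channel_id,    # -1 for mono
            })
            next_id += 1

    client.insert(collection_name="asmr_chunks", data=entities)

The extra metadata fields ride on Milvus's dynamic-field support, the same mechanism the quickstart uses for its text and subject fields.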
95
mtafe_lab/audiopreprocessing.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
import librosa
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def triggerlog():
|
||||||
|
logger.critical("Testing: info")
|
||||||
|
|
||||||
|
def resample_load(input_path : Path, target_sr : int = 16000, mono_audio : bool = False) -> np.ndarray: # AI
|
||||||
|
"""Load and resamples the audio into `target_sr`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_path (Path): pathlib.Path object to audio file
|
||||||
|
target_sr (int, optional): Target Sample Rate to resample. Defaults to 16000.
|
||||||
|
mono_audio (bool, optional): Load the audio in mono mode. Defaults to False.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
np.ndarray: _description_
|
||||||
|
"""
|
||||||
|
# Load audio file with original sample rate
|
||||||
|
logger.info(f"[resample_load] Loading audio {input_path}")
|
||||||
|
audio, orig_sr = librosa.load(input_path, sr=None, mono=mono_audio)
|
||||||
|
|
||||||
|
# Resample if necessary
|
||||||
|
if orig_sr != target_sr:
|
||||||
|
logger.info(f"[resample_load] Resampling to {target_sr}")
|
||||||
|
audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
|
||||||
|
|
||||||
|
return audio
|
||||||
|
|
||||||
|
def chunk_audio(audio : np.ndarray, sr: int, chunk_length: float = 10.0, overlap: float = 2.0) -> tuple[list[np.ndarray], list[float], int]: # AI
|
||||||
|
"""
|
||||||
|
Chunks audio file into overlapping segments. Only pass in mono audio here.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
audio_file: Loaded audio ndarray (one channel only)
|
||||||
|
sr: Sample rate for the given audio file
|
||||||
|
chunk_length: Length of each chunk in seconds
|
||||||
|
overlap: Overlap between chunks in seconds
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of audio chunks, list of chunk positions, and given sample rate
|
||||||
|
"""
|
||||||
|
logger.info(f"[chunk_audio] Chunking audio ({len(audio) / sr}s)")
|
||||||
|
# Calculate chunk size and hop length in samples
|
||||||
|
chunk_size = int(chunk_length * sr)
|
||||||
|
hop_length = int((chunk_length - overlap) * sr)
|
||||||
|
|
||||||
|
# Generate chunks
|
||||||
|
chunks = []
|
||||||
|
positions = []
|
||||||
|
k = 0
|
||||||
|
for i in range(0, len(audio) - chunk_size + 1, hop_length):
|
||||||
|
chunk = audio[i:i + chunk_size]
|
||||||
|
chunks.append(chunk)
|
||||||
|
positions.append(i / sr)
|
||||||
|
k += 1
|
||||||
|
if k == 0: # The full audio length is less than chunk_length
|
||||||
|
chunks = [audio]
|
||||||
|
positions = [0.0]
|
||||||
|
logger.info(f"[chunk_audio] Audio less than chunk_length. Returning original audio as chunk\r")
|
||||||
|
else:
|
||||||
|
logger.info(f"[chunk_audio] Audio is split into {k} chunks")
|
||||||
|
|
||||||
|
return chunks, positions, sr
|
||||||
|
|
||||||
|
def load_preprocessed_audio(
        path: Path,
        desired_sr: int,
        mono: bool = False,
        chunk_length: float = 15.0,
        overlap: float = 2.0) -> list[tuple[np.ndarray, float, int]]:

    result = []
    # Load and resample audio
    audio = resample_load(path, desired_sr, mono)  # Stereo: 2D matrix, Mono: 1D array
    if mono or (audio.ndim == 1):
        # Chunk audio: mono (or the audio file itself is mono)
        chunks, positions, _ = chunk_audio(audio, desired_sr, chunk_length, overlap)
        assert len(chunks) == len(positions)
        result.extend(zip(chunks, positions, [-1] * len(chunks)))
        # (ndarray_chunk1, pos1, -1): first audio chunk, its position, -1 (mono channel indicator)
    else:
        # Chunk audio: stereo/multichannel, one channel at a time
        for channel_id, channel_audio in enumerate(audio):
            chunks, positions, _ = chunk_audio(channel_audio, desired_sr, chunk_length, overlap)
            assert len(chunks) == len(positions)
            result.extend(zip(chunks, positions, [channel_id] * len(chunks)))
            # (ndarray_chunk1, pos1, 0): first audio chunk, its position, 0 (channel 0)
    logging.info(f"[load_preprocessed_audio] Loaded audio {path} ({desired_sr}Hz, Chunk {chunk_length}s with overlap {overlap}s) MONO:{mono}")
    return result

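# A minimal sketch of consuming load_preprocessed_audio's output, assuming a file at
# the hypothetical path below; each tuple is (chunk, start time in seconds,
# channel id), where channel id -1 marks mono audio.
def _demo_load_preprocessed_audio():
    for chunk, timepos, channel_id in load_preprocessed_audio(Path("some_work/track01.wav"), 32000, mono=True):
        print(f"channel {channel_id}: {len(chunk)} samples starting at {timepos:.1f}s")
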
135
mtafe_lab/dataset.py
Normal file
@@ -0,0 +1,135 @@
import platform
import os
import pickle
import random
import multiprocessing
import threading
import time
import concurrent.futures
import numpy as np
from pathlib import Path
import audiopreprocessing
import logging
import queue


def serialize_dict_obj(path: Path, obj: dict) -> int:
    """Serializes a Python dictionary object to a file via pickle.

    Args:
        path (Path): Path to store the file
        obj (dict): Dictionary object to serialize

    Returns:
        int: Size in bytes written
    """
    # Horrible practice, horrible security, but it will work for now
    with path.open("wb") as fp:
        pickle.dump(obj, fp)
        fp.seek(0, os.SEEK_END)
        size = fp.tell()
    return size

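# A round-trip sketch for serialize_dict_obj, assuming a writable temp directory and a
# made-up work ID; the read side is plain pickle.load, mirroring the dump above.
def _demo_serialize_roundtrip():
    p = Path("/tmp/demo_features.pkl")  # Hypothetical output path
    written = serialize_dict_obj(p, {"RJ000000": [1, 2, 3]})
    with p.open("rb") as fp:
        restored = pickle.load(fp)
    assert restored == {"RJ000000": [1, 2, 3]}
    assert written == p.stat().st_size
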
logging.info("Reading local dataset directory structure...")

ASMRThreePath = Path("C:\\ASMRThree")
ASMRTwoPath = Path("D:\\ASMRTwo")
ASMROnePath = Path("E:\\ASMROne")

if platform.system() == 'Linux':
    ASMROnePath = Path('/mnt/Scratchpad/ASMROne')
    ASMRTwoPath = Path('/mnt/MyStuffz/ASMRTwo')
    ASMRThreePath = Path('/mnt/Windows11/ASMRThree')


def scan_subset(subset_path: Path) -> tuple[list[Path], list[Path], int]:
    """Walks one dataset subset and returns its folders, files, and total size in bytes."""
    folders, found_files, total_size = [], [], 0
    for root, dirs, files in subset_path.walk():  # Path.walk() requires Python 3.12+
        if root.absolute() != subset_path.absolute():  # Skip the subset root itself
            folders.append(root)  # Add folder to list
        for fname in files:  # Iterate through all files in the current root
            file = root / fname  # Get file path
            assert file.is_file()
            found_files.append(file)
            total_size += file.stat().st_size  # Accumulate file size
    return folders, found_files, total_size


# Statistic calculation for each subset (previously three copies of the same walk loop)
folders_one, files_one, size_one = scan_subset(ASMROnePath)
folders_two, files_two, size_two = scan_subset(ASMRTwoPath)
folders_three, files_three, size_three = scan_subset(ASMRThreePath)

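# A small reporting sketch, assuming GiB is the desired unit; the size_* variables
# above hold raw byte counts accumulated from file.stat().st_size.
def _demo_report_sizes():
    for name, size in (("ASMROne", size_one), ("ASMRTwo", size_two), ("ASMRThree", size_three)):
        print(f"{name}: {size / 2**30:.2f} GiB")
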
DataSubsetPaths = [ASMROnePath, ASMRTwoPath, ASMRThreePath]
DLSiteWorksPaths = []
# Collect ASMR works (RJ ID, paths)
for ASMRSubsetPath in DataSubsetPaths:
    for WorkPath in ASMRSubsetPath.iterdir():
        DLSiteWorksPaths.append(WorkPath)

fileExt2fileType = {
    ".TXT": "Document",
    ".WAV": "Audio",
    ".MP3": "Audio",
    ".PNG": "Image",
    ".JPG": "Image",
    ".VTT": "Subtitle",
    ".PDF": "Document",
    ".FLAC": "Audio",
    ".MP4": "Video",
    ".LRC": "Subtitle",
    ".SRT": "Subtitle",
    ".JPEG": "Image",
    ".ASS": "Subtitle",
    "": "NO EXTENSION",
    ".M4A": "Audio",
    ".MKV": "Video"
}
fileext_stat = {}
file_list = files_one + files_two + files_three
file_list_count = len(file_list)

for file in file_list:
    f_ext = file.suffix.upper()
    if f_ext in fileext_stat:
        fileext_stat[f_ext]['Count'] += 1
        fileext_stat[f_ext]['List'].append(file)
        fileext_stat[f_ext]['ExtensionMass'] += file.stat().st_size
    else:
        fileext_stat[f_ext] = {
            'Count': 1,
            'List': [file],
            'ExtensionMass': file.stat().st_size,  # Total sum of sizes for this file extension
            'MediaType': fileExt2fileType.get(f_ext, "Unknown")  # Fall back for extensions missing from the map
        }

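# A sketch of dumping the per-extension statistics gathered above, sorted by total
# size on disk; the field names match the fileext_stat layout built in the loop.
def _demo_report_extensions():
    for ext, stat in sorted(fileext_stat.items(), key=lambda kv: kv[1]['ExtensionMass'], reverse=True):
        print(f"{ext or '(none)'}: {stat['Count']} files, {stat['ExtensionMass'] / 2**20:.1f} MiB ({stat['MediaType']})")
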
audio_paths = []
for extension in fileext_stat:  # I can't be bothered to convert this into a list comprehension
    if fileext_stat[extension]['MediaType'] == "Audio":
        audio_paths += fileext_stat[extension]['List']


def random_audio_chunk(n: int, seed: int = 177013) -> list[Path]:
    """Returns a random selection of audio files.

    Args:
        n (int): Number of files to return
        seed (int, optional): Seed for RNG. Defaults to 177013.

    Returns:
        list[Path]: List of randomly selected audio paths (as Path objects)
    """
    random.seed(seed)
    #return random.choices(audio_paths, k=n) # Contains repeated elements
    return random.sample(audio_paths, k=n)  # Sampling without replacement

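# Because the RNG is reseeded on every call, the same (n, seed) pair always yields the
# same selection; a quick sketch of that property, assuming at least eight audio files:
def _demo_random_audio_chunk_is_deterministic():
    assert random_audio_chunk(8) == random_audio_chunk(8)
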
269
mtafe_lab/mtafe.py
Normal file
@@ -0,0 +1,269 @@
import logging
#logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.INFO)

import dataset
import numpy as np
import audiopreprocessing
import threading
import queue
import time
from concurrent.futures import ThreadPoolExecutor, Future
from pathlib import Path


class mtafe:
    # Input
    audio_path_queue: queue.Queue[Path]  # Queue of audio paths to preprocess
    # Feeder/Extractor/Queue threading options
    audio_feeder_threads: int  # Number of audio feeder threads
    feature_extractor_threads: int  # Number of feature extractor threads (if the method allows)
    max_audio_in_queue: int  # Maximum number of audio files in the queue
    # Audio preprocessing parameters
    desired_sr: int  # Desired sample rate (resampling)
    mono: bool  # Force load audio in mono mode
    chunk_length: float  # Audio chunk length
    overlap: float  # Audio chunk overlap
    # Runtime
    audio_queue: queue.Queue[  # Queue of chunked/resampled audio, each entry being:
        tuple[  # A pair of chunked audio and its path
            list[tuple[np.ndarray, float, int]],  # Chunk list: (ndarray, time position of chunk relative to the original audio, channel_id)
            Path  # Path to the original audio
        ]
    ]
    audio_feeder_threadpool: list[Future]
    feature_extractor_threadpool: list[Future]
    features_lock: threading.Lock
    audio_feeder_barrier: threading.Barrier  # Synchronization barrier for all audio feeder threads
    # Output
    features: dict[Path, list[tuple[np.ndarray, float, int]]]

    def __init__(
            self,
            paudio_paths: list[Path],
            pmax_audio_in_queue: int = 16,
            paudio_feeder_threads: int = 8,
            pfeature_extractor_threads: int = 8,
            pdesired_sr: int = 32000,
            pforce_mono: bool = False,
            pchunk_length: float = 15.0,
            pchunk_overlap: float = 2.0
    ):
        # Check that the paths passed in are all valid, then add them to the queue
        self.audio_path_queue = queue.Queue()
        for p in paudio_paths:
            if not p.is_file():
                raise FileNotFoundError(f"Path '{p.absolute()}' is NOT a valid file!")
            else:
                self.audio_path_queue.put(p)
        self.audio_path_queue.put(None)  # Sentinel to signal that the audio path list is exhausted, since Queue.empty() is unreliable

        logging.info(f"[MTAFE] [Constructor] Queued {self.audio_path_queue.qsize() - 1} files")

        # Set up private attributes
        ## Audio preprocessing parameters
        self.desired_sr = pdesired_sr
        self.mono = pforce_mono
        self.chunk_length = pchunk_length
        self.overlap = pchunk_overlap

        ## Extractor/Feeder settings
        self.max_audio_in_queue = pmax_audio_in_queue
        self.audio_feeder_threads = paudio_feeder_threads
        self.feature_extractor_threads = pfeature_extractor_threads

        ## Set up runtime conditions
        self.audio_queue = queue.Queue(maxsize=self.max_audio_in_queue)
        self.features = {}
        self.features_lock = threading.Lock()
        self.audio_feeder_barrier = threading.Barrier(self.audio_feeder_threads)
        self.audio_feeder_threadpool = []
        self.feature_extractor_threadpool = []

        logging.info(f"[MTAFE] [Constructor] Extraction parameters: {pdesired_sr}Hz, Mono: {pforce_mono}, Divide into {pchunk_length}s chunks with {pchunk_overlap}s of overlap")
        logging.info(f"[MTAFE] [Constructor] Using {paudio_feeder_threads} threads for preprocessing audio and {pfeature_extractor_threads} threads for feature extraction. Max queue size of {pmax_audio_in_queue} files")

    def audio_inference_embedding(self, audio: list[tuple[np.ndarray, float, int]]) -> list[tuple[np.ndarray, float, int]]:
        """Receives a list of audio chunks, extracts an embedding for each chunk, and returns the results as a list of tuples (embedding, time, channel_id).

        This stub emits zero vectors and is meant to be overridden.

        Args:
            audio (list[tuple[np.ndarray, float, int]]): List of audio chunks

        Returns:
            list[tuple[np.ndarray, float, int]]: List of (embedding vector, timepos, channel id)
        """
        features = []
        for audio_chunk in audio:
            chunk_data, timepos, channel_id = audio_chunk  # Unpack without rebinding the `audio` parameter mid-iteration
            zero = np.zeros(32)
            features.append((zero, timepos, channel_id))
        time.sleep(1.5)  # Simulate effort; change to simulate seconds spent on each audio file
        return features

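    # The stub above is the extension point: a real subclass would replace the zero
    # vectors with model output. A minimal sketch, assuming a hypothetical
    # embed(chunk, sr) function from whichever embedding model gets plugged in:
    #
    #   class embedding_mtafe(mtafe):
    #       def audio_inference_embedding(self, audio):
    #           return [(embed(chunk, self.desired_sr), timepos, channel_id)
    #                   for chunk, timepos, channel_id in audio]
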
    def audio_feeder_worker(self, thread_id: int, barrier: threading.Barrier): # AI
        try:
            while True:
                # Timeout prevents blocking indefinitely
                try:
                    new_audio_path = self.audio_path_queue.get(timeout=10)
                except queue.Empty:
                    logging.warning(f"[MTAFE] [Audio Feeder {thread_id}] Queue get timeout")
                    continue

                if new_audio_path is None:
                    self.audio_path_queue.put(new_audio_path)  # Put None back so the other feeders also see the sentinel
                    break

                logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Preprocess: {new_audio_path.absolute()}")

                try:
                    new_audio = audiopreprocessing.load_preprocessed_audio(
                        new_audio_path,
                        self.desired_sr,
                        self.mono,
                        self.chunk_length,
                        self.overlap
                    )

                    # Timeout prevents deadlock on a full queue
                    try:
                        self.audio_queue.put((new_audio, new_audio_path), timeout=30)
                        logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Feed: {new_audio_path.absolute()}")
                    except queue.Full:
                        logging.error(f"[MTAFE] [Audio Feeder {thread_id}] Queue full, skipping {new_audio_path}")
                        continue
                except Exception as e:
                    logging.error(f"[MTAFE] [Audio Feeder {thread_id}] Error processing {new_audio_path}: {str(e)}")
                    continue

            # Barrier timeout prevents an indefinite wait
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Waiting for other threads")
            try:
                barrier.wait(timeout=60)
            except threading.BrokenBarrierError:
                logging.error(f"[MTAFE] [Audio Feeder {thread_id}] Barrier broken")

            if thread_id == 0:
                self.audio_queue.put(None)  # Signal the end of the audio queue
            logging.info(f"[MTAFE] [Audio Feeder {thread_id}] Thread finished!")
        except Exception as e:
            logging.error(f"[MTAFE] [Audio Feeder {thread_id}] Fatal exception: {str(e)}")
            logging.exception(e)
            # Ensure the barrier can progress even if a thread fails
            try:
                barrier.abort()
            except Exception:
                pass
            # Ensure the sentinel is added even if threads fail
            if thread_id == 0:
                try:
                    self.audio_queue.put(None, timeout=5)
                except queue.Full:
                    pass

    def feature_extractor_worker(self, thread_id: int):
        while True:
            # Attempt to get the next audio chunks to process
            next_audio_tuple = self.audio_queue.get()
            # Check thread exit condition
            if next_audio_tuple is None:
                self.audio_queue.put(next_audio_tuple)  # Put the None back to notify the other extractors
                break  # Exit this worker thread
            else:  # We got another tuple to process
                current_audio_to_process, current_audio_path = next_audio_tuple  # Deconstruct tuple
                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Extracting: {current_audio_path}")
                features_to_add = self.audio_inference_embedding(current_audio_to_process)
                with self.features_lock:
                    self.features[current_audio_path] = features_to_add
                logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Feature extraction complete for {current_audio_path} w/ {len(features_to_add)} features")
        logging.info(f"[MTAFE] [Feature Extractor {thread_id}] Thread finished!")

    def test_audio_feeder_worker(self):
        total_file_amount = self.audio_path_queue.qsize() - 1
        logging.info("[MTAFE] [test_audio_feeder_worker] Spinning up new threads...")
        with ThreadPoolExecutor(max_workers=self.audio_feeder_threads) as executor:
            for i in range(self.audio_feeder_threads):
                ld_ft = executor.submit(self.audio_feeder_worker, i, self.audio_feeder_barrier)
                self.audio_feeder_threadpool.append(ld_ft)
                logging.info(f"[MTAFE] [test_audio_feeder_worker] Launched audio feeder {i}")
            for i in range(total_file_amount):
                _, p = self.audio_queue.get()
                time.sleep(0.25)
                logging.info(f"[MTAFE] [test_audio_feeder_worker] Popped: {p}")
        logging.info("[MTAFE] [test_audio_feeder_worker] All audio feeder workers joined!")
        #logging.info(f"[MTAFE] [test_audio_feeder_worker] Current audio queue size: {self.audio_queue.qsize()}")

    def count_running_threads(self) -> tuple[int, int]:
        running_extractors = 0
        running_feeders = 0
        for ft in self.feature_extractor_threadpool:
            if ft.running():
                running_extractors += 1
        for ft in self.audio_feeder_threadpool:
            if ft.running():
                running_feeders += 1
        return (running_feeders, running_extractors)

    def check_all_audiofeed_thread_finished(self) -> bool:
        for ft in self.audio_feeder_threadpool:
            if ft.running():
                return False
        return True

    def check_all_featureextractor_thread_finished(self) -> bool:
        for ft in self.feature_extractor_threadpool:
            if ft.running():
                return False
        return True

    def extract(self):
        total_amount = self.audio_path_queue.qsize() - 1  # Account for the None sentinel that marks the queue end
        logging.info(f"[MTAFE] [Main] Starting feature extraction for {total_amount} file(s)")
        t_start = time.perf_counter()  # Timer
        with ThreadPoolExecutor(max_workers=(self.audio_feeder_threads + self.feature_extractor_threads)) as executor:
            # Audio feeder threads
            for i in range(self.audio_feeder_threads):
                logging.info(f"[MTAFE] Started audio feeder thread {i}")
                ld_ft = executor.submit(self.audio_feeder_worker, i, self.audio_feeder_barrier)
                self.audio_feeder_threadpool.append(ld_ft)
            # Feature extractor threads
            for i in range(self.feature_extractor_threads):
                logging.info(f"[MTAFE] Started feature extractor thread {i}")
                ex_ft = executor.submit(self.feature_extractor_worker, i)
                self.feature_extractor_threadpool.append(ex_ft)
            # Progress checking: keep reporting while any feeder or extractor is still running
            while (not self.check_all_audiofeed_thread_finished()) or (not self.check_all_featureextractor_thread_finished()):
                nfeeder, nextract = self.count_running_threads()
                print(f"[MTAFE Progress] Processed {len(self.features)}/{total_amount} (L:{self.audio_queue.qsize()}/W:{self.audio_path_queue.qsize()}, LD:{nfeeder}/EXT:{nextract})", end="\r", flush=True)
                time.sleep(0.5)  # Avoid busy-waiting on the status checks
        t_stop = time.perf_counter()
        logging.info(f"[MTAFE] Processed {len(self.features)}/{total_amount} (L:{self.audio_queue.qsize() - 1}/W:{self.audio_path_queue.qsize() - 1} COMPLETE)")
        delta_t = t_stop - t_start
        total_features = sum(len(self.features[path]) for path in self.features)
        logging.info(f"[MTAFE] Extraction complete. Took {delta_t:.2f} seconds. Added {total_features} vectors/embeddings")

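# A minimal end-to-end sketch, assuming the serialize_dict_obj helper from dataset.py
# and a hypothetical output path; fe.features maps each input Path to its list of
# (embedding, time position, channel_id) tuples once extract() returns.
def _demo_persist_features(my_paths: list[Path]):
    fe = mtafe(paudio_paths=my_paths)
    fe.extract()
    dataset.serialize_dict_obj(Path("features.pkl"), fe.features)
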
22
mtafe_lab/test_mtafe.py
Normal file
@@ -0,0 +1,22 @@
import logging
logging.basicConfig(format="%(asctime)s/%(levelname)s: [%(module)s] %(message)s", level=logging.INFO)

import mtafe
from dataset import random_audio_chunk

logging.info("Generating random audio path list")
rdpl = random_audio_chunk(256)

logging.info("Initializing MTAFE")
m = mtafe.mtafe(
    paudio_paths=rdpl,
    pmax_audio_in_queue=8,
    paudio_feeder_threads=8,
    pfeature_extractor_threads=2,
    pdesired_sr=32000,
    pforce_mono=False,
    pchunk_length=15,
    pchunk_overlap=2
)
#m.test_audio_feeder_worker()
m.extract()
BIN
mtafe_lab/testmtafeprofile.txt
Normal file
Binary file not shown.
8
requirements.txt
Normal file
@@ -0,0 +1,8 @@
opencv-python
python-orb-slam3
pandas
matplotlib
numpy
pymilvus[model]
protobuf
grpcio-tools
BIN
source_keypoints_opencvorb.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 387 KiB
BIN
source_keypoints_slamorb3.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 445 KiB
1411
test.ipynb
File diff suppressed because one or more lines are too long
2180
test_cvorb.ipynb
File diff suppressed because one or more lines are too long
1387
test_slamorb.ipynb
Normal file
File diff suppressed because one or more lines are too long