added more information

This commit is contained in:
2025-04-07 00:40:21 +02:00
parent af81c82d18
commit a9d3d10da9
16 changed files with 27549 additions and 2515 deletions

View File

@@ -0,0 +1,253 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "47471ef9",
"metadata": {},
"source": [
"Creating client"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d08ab631",
"metadata": {},
"outputs": [],
"source": [
"from pymilvus import MilvusClient\n",
"\n",
"client = MilvusClient(uri=\"http://localhost:19530\", token=\"root:Milvus\")"
]
},
{
"cell_type": "markdown",
"id": "ecf3a2dd",
"metadata": {},
"source": [
"Creating collection"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7bf82b6c",
"metadata": {},
"outputs": [],
"source": [
"if client.has_collection(collection_name=\"demo_collection\"):\n",
" client.drop_collection(collection_name=\"demo_collection\")\n",
"\n",
"client.create_collection(\n",
" collection_name=\"demo_collection\",\n",
" dimension=768, # The vectors we will use in this demo has 768 dimensions\n",
")"
]
},
{
"cell_type": "markdown",
"id": "eef3759b",
"metadata": {},
"source": [
"Adding sample vector data using Embeddings to Milvus"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7f6083de",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n",
"d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-small-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
" warnings.warn(message)\n",
"d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-onnx. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
" warnings.warn(message)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dim: 768 (768,)\n",
"Data has 3 entities, each with fields: dict_keys(['id', 'vector', 'text', 'subject'])\n",
"Vector dim: 768\n"
]
}
],
"source": [
"from pymilvus import model\n",
"# If connection to https://huggingface.co/ failed, uncomment the following path\n",
"# import os\n",
"# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
"\n",
"# This will download a small embedding model \"paraphrase-albert-small-v2\" (~50MB).\n",
"embedding_fn = model.DefaultEmbeddingFunction()\n",
"\n",
"# Text strings to search from.\n",
"docs = [\n",
" \"Artificial intelligence was founded as an academic discipline in 1956.\",\n",
" \"Alan Turing was the first person to conduct substantial research in AI.\",\n",
" \"Born in Maida Vale, London, Turing was raised in southern England.\",\n",
"]\n",
"\n",
"vectors = embedding_fn.encode_documents(docs)\n",
"# The output vector has 768 dimensions, matching the collection that we just created.\n",
"print(\"Dim:\", embedding_fn.dim, vectors[0].shape) # Dim: 768 (768,)\n",
"\n",
"# Each entity has id, vector representation, raw text, and a subject label that we use\n",
"# to demo metadata filtering later.\n",
"data = [\n",
" {\"id\": i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"history\"}\n",
" for i in range(len(vectors))\n",
"]\n",
"\n",
"print(\"Data has\", len(data), \"entities, each with fields: \", data[0].keys())\n",
"print(\"Vector dim:\", len(data[0][\"vector\"]))\n"
]
},
{
"cell_type": "markdown",
"id": "4e89e602",
"metadata": {},
"source": [
"Inserting data to Milvus"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e2098f0a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'insert_count': 3, 'ids': [0, 1, 2]}\n"
]
}
],
"source": [
"res = client.insert(collection_name=\"demo_collection\", data=data)\n",
"\n",
"print(res)"
]
},
{
"cell_type": "markdown",
"id": "0a0e4a35",
"metadata": {},
"source": [
"Semantic search / Vector search"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "2a687f94",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: [\"[{'id': 2, 'distance': 0.5859946012496948, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]\"]\n"
]
}
],
"source": [
"query_vectors = embedding_fn.encode_queries([\"Who is Alan Turing?\"])\n",
"# If you don't have the embedding function you can use a fake vector to finish the demo:\n",
"# query_vectors = [ [ random.uniform(-1, 1) for _ in range(768) ] ]\n",
"\n",
"res = client.search(\n",
" collection_name=\"demo_collection\", # target collection\n",
" data=query_vectors, # query vectors\n",
" limit=2, # number of returned entities\n",
" output_fields=[\"text\", \"subject\"], # specifies fields to be returned\n",
")\n",
"\n",
"print(res)\n"
]
},
{
"cell_type": "markdown",
"id": "4f8e5ba8",
"metadata": {},
"source": [
"Metadata filtering"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "03d6ae37",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data: ['[]']\n"
]
}
],
"source": [
"# Insert more docs in another subject.\n",
"docs = [\n",
" \"Machine learning has been used for drug design.\",\n",
" \"Computational synthesis with AI algorithms predicts molecular properties.\",\n",
" \"DDR1 is involved in cancers and fibrosis.\",\n",
"]\n",
"vectors = embedding_fn.encode_documents(docs)\n",
"data = [\n",
" {\"id\": 3 + i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"biology\"}\n",
" for i in range(len(vectors))\n",
"]\n",
"\n",
"client.insert(collection_name=\"demo_collection\", data=data)\n",
"\n",
"# This will exclude any text in \"history\" subject despite close to the query vector.\n",
"res = client.search(\n",
" collection_name=\"demo_collection\",\n",
" data=embedding_fn.encode_queries([\"tell me AI related information\"]),\n",
" filter=\"subject == 'biology'\",\n",
" limit=2,\n",
" output_fields=[\"text\", \"subject\"],\n",
")\n",
"\n",
"print(res)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "DLSiteFSearchPython_venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}