{ "cells": [ { "cell_type": "markdown", "id": "47471ef9", "metadata": {}, "source": [ "Creating client" ] }, { "cell_type": "code", "execution_count": 1, "id": "d08ab631", "metadata": {}, "outputs": [], "source": [ "from pymilvus import MilvusClient\n", "\n", "client = MilvusClient(uri=\"http://localhost:19530\", token=\"root:Milvus\")" ] }, { "cell_type": "markdown", "id": "ecf3a2dd", "metadata": {}, "source": [ "Creating collection" ] }, { "cell_type": "code", "execution_count": 2, "id": "7bf82b6c", "metadata": {}, "outputs": [], "source": [ "if client.has_collection(collection_name=\"demo_collection\"):\n", " client.drop_collection(collection_name=\"demo_collection\")\n", "\n", "client.create_collection(\n", " collection_name=\"demo_collection\",\n", " dimension=768, # The vectors we will use in this demo has 768 dimensions\n", ")" ] }, { "cell_type": "markdown", "id": "eef3759b", "metadata": {}, "source": [ "Adding sample vector data using Embeddings to Milvus" ] }, { "cell_type": "code", "execution_count": 3, "id": "7f6083de", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n", "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-small-v2. 
Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n", "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n", " warnings.warn(message)\n", "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-onnx. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n", "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. 
In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n", " warnings.warn(message)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Dim: 768 (768,)\n", "Data has 3 entities, each with fields: dict_keys(['id', 'vector', 'text', 'subject'])\n", "Vector dim: 768\n" ] } ], "source": [ "from pymilvus import model\n", "# If connection to https://huggingface.co/ fails, uncomment the following lines\n", "# import os\n", "# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n", "\n", "# This will download a small embedding model \"paraphrase-albert-small-v2\" (~50MB).\n", "embedding_fn = model.DefaultEmbeddingFunction()\n", "\n", "# Text strings to search from.\n", "docs = [\n", " \"Artificial intelligence was founded as an academic discipline in 1956.\",\n", " \"Alan Turing was the first person to conduct substantial research in AI.\",\n", " \"Born in Maida Vale, London, Turing was raised in southern England.\",\n", "]\n", "\n", "vectors = embedding_fn.encode_documents(docs)\n", "# The output vector has 768 dimensions, matching the collection that we just created.\n", "print(\"Dim:\", embedding_fn.dim, vectors[0].shape) # Dim: 768 (768,)\n", "\n", "# Each entity has id, vector representation, raw text, and a subject label that we use\n", "# to demo metadata filtering later.\n", "data = [\n", " {\"id\": i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"history\"}\n", " for i in range(len(vectors))\n", "]\n", "\n", "print(\"Data has\", len(data), \"entities, each with fields: \", data[0].keys())\n", "print(\"Vector dim:\", len(data[0][\"vector\"]))\n" ] }, { "cell_type": "markdown", "id": "4e89e602", "metadata": {}, "source": [ "Inserting data to Milvus" ] }, { "cell_type": "code", "execution_count": 4, "id": "e2098f0a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'insert_count': 3, 'ids': [0, 1, 2]}\n" ] } ], 
"source": [ "res = client.insert(collection_name=\"demo_collection\", data=data)\n", "\n", "print(res)" ] }, { "cell_type": "markdown", "id": "0a0e4a35", "metadata": {}, "source": [ "Semantic search / Vector search" ] }, { "cell_type": "code", "execution_count": 5, "id": "2a687f94", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data: [\"[{'id': 2, 'distance': 0.5859946012496948, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]\"]\n" ] } ], "source": [ "query_vectors = embedding_fn.encode_queries([\"Who is Alan Turing?\"])\n", "# If you don't have the embedding function you can use a fake vector to finish the demo:\n", "# query_vectors = [ [ random.uniform(-1, 1) for _ in range(768) ] ]\n", "\n", "res = client.search(\n", " collection_name=\"demo_collection\", # target collection\n", " data=query_vectors, # query vectors\n", " limit=2, # number of returned entities\n", " output_fields=[\"text\", \"subject\"], # specifies fields to be returned\n", ")\n", "\n", "print(res)\n" ] }, { "cell_type": "markdown", "id": "4f8e5ba8", "metadata": {}, "source": [ "Metadata filtering" ] }, { "cell_type": "code", "execution_count": 6, "id": "03d6ae37", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data: ['[]']\n" ] } ], "source": [ "# Insert more docs in another subject.\n", "docs = [\n", " \"Machine learning has been used for drug design.\",\n", " \"Computational synthesis with AI algorithms predicts molecular properties.\",\n", " \"DDR1 is involved in cancers and fibrosis.\",\n", "]\n", "vectors = embedding_fn.encode_documents(docs)\n", "data = [\n", " {\"id\": 3 + i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"biology\"}\n", " for i in range(len(vectors))\n", "]\n", 
"\n", "client.insert(collection_name=\"demo_collection\", data=data)\n", "\n", "# This will exclude any text in \"history\" subject despite close to the query vector.\n", "res = client.search(\n", " collection_name=\"demo_collection\",\n", " data=embedding_fn.encode_queries([\"tell me AI related information\"]),\n", " filter=\"subject == 'biology'\",\n", " limit=2,\n", " output_fields=[\"text\", \"subject\"],\n", ")\n", "\n", "print(res)" ] } ], "metadata": { "kernelspec": { "display_name": "DLSiteFSearchPython_venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 5 }