added more information

2025-04-07 00:40:21 +02:00
parent af81c82d18
commit a9d3d10da9
16 changed files with 27549 additions and 2515 deletions
--- a/milvustests/quickstart.ipynb
+++ b/milvustests/quickstart.ipynb
@@ -0,0 +1,253 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "47471ef9",
+   "metadata": {},
+   "source": [
+    "Creating client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d08ab631",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from pymilvus import MilvusClient\n",
+    "\n",
+    "# Connection settings are read from the environment so that real credentials never\n",
+    "# have to be written into the notebook. The fallbacks are the values this notebook\n",
+    "# originally hard-coded, so it behaves the same when the variables are unset.\n",
+    "MILVUS_URI = os.environ.get(\"MILVUS_URI\", \"http://localhost:19530\")\n",
+    "MILVUS_TOKEN = os.environ.get(\"MILVUS_TOKEN\", \"root:Milvus\")\n",
+    "\n",
+    "client = MilvusClient(uri=MILVUS_URI, token=MILVUS_TOKEN)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ecf3a2dd",
+   "metadata": {},
+   "source": [
+    "Creating collection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "7bf82b6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start from a clean slate: drop the collection if an earlier run left it behind.\n",
+    "demo_name = \"demo_collection\"\n",
+    "already_present = client.has_collection(collection_name=demo_name)\n",
+    "if already_present:\n",
+    "    client.drop_collection(collection_name=demo_name)\n",
+    "\n",
+    "# The vectors we will use in this demo have 768 dimensions.\n",
+    "client.create_collection(collection_name=demo_name, dimension=768)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eef3759b",
+   "metadata": {},
+   "source": [
+    "Preparing sample vector data with an embedding function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7f6083de",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n",
+      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-small-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n",
+      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-onnx. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dim: 768 (768,)\n",
+      "Data has 3 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])\n",
+      "Vector dim: 768\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pymilvus import model\n",
+    "\n",
+    "# If the connection to https://huggingface.co/ fails, uncomment the following lines\n",
+    "# to use a mirror instead:\n",
+    "# import os\n",
+    "# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
+    "\n",
+    "# This will download a small embedding model \"paraphrase-albert-small-v2\" (~50MB).\n",
+    "embedding_fn = model.DefaultEmbeddingFunction()\n",
+    "\n",
+    "# Corpus of text strings to search over.\n",
+    "docs = [\n",
+    "    \"Artificial intelligence was founded as an academic discipline in 1956.\",\n",
+    "    \"Alan Turing was the first person to conduct substantial research in AI.\",\n",
+    "    \"Born in Maida Vale, London, Turing was raised in southern England.\",\n",
+    "]\n",
+    "\n",
+    "# Embed the corpus. Each output vector has 768 dimensions, matching the collection\n",
+    "# that we just created.\n",
+    "vectors = embedding_fn.encode_documents(docs)\n",
+    "print(\"Dim:\", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)\n",
+    "\n",
+    "# One entity per document: id, vector representation, raw text, and a subject label\n",
+    "# that is used later to demo metadata filtering.\n",
+    "data = [\n",
+    "    {\"id\": doc_id, \"vector\": vector, \"text\": text, \"subject\": \"history\"}\n",
+    "    for doc_id, (vector, text) in enumerate(zip(vectors, docs))\n",
+    "]\n",
+    "\n",
+    "print(\"Data has\", len(data), \"entities, each with fields: \", data[0].keys())\n",
+    "print(\"Vector dim:\", len(data[0][\"vector\"]))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4e89e602",
+   "metadata": {},
+   "source": [
+    "Inserting data into Milvus"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e2098f0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'insert_count': 3, 'ids': [0, 1, 2]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Write the entities prepared above into the collection and show what the call returns.\n",
+    "res = client.insert(\n",
+    "    collection_name=\"demo_collection\",\n",
+    "    data=data,\n",
+    ")\n",
+    "\n",
+    "print(res)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a0e4a35",
+   "metadata": {},
+   "source": [
+    "Semantic search / Vector search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "2a687f94",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "data: [\"[{'id': 2, 'distance': 0.5859946012496948, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]\"]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Embed the question with the same embedding function that encoded the documents.\n",
+    "# If you don't have the embedding function you can use a fake vector to finish the demo:\n",
+    "# import random\n",
+    "# query_vectors = [[random.uniform(-1, 1) for _ in range(768)]]\n",
+    "query_vectors = embedding_fn.encode_queries([\"Who is Alan Turing?\"])\n",
+    "\n",
+    "res = client.search(\n",
+    "    # target collection\n",
+    "    collection_name=\"demo_collection\",\n",
+    "    # query vectors\n",
+    "    data=query_vectors,\n",
+    "    # fields to be returned with each hit\n",
+    "    output_fields=[\"text\", \"subject\"],\n",
+    "    # number of returned entities\n",
+    "    limit=2,\n",
+    ")\n",
+    "\n",
+    "print(res)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4f8e5ba8",
+   "metadata": {},
+   "source": [
+    "Metadata filtering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "03d6ae37",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "data: ['[]']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Insert more docs under a second subject so there is something to filter on.\n",
+    "# Separate names keep the history `docs` / `vectors` / `data` from the earlier cell intact,\n",
+    "# so this cell can be re-run without clobbering them.\n",
+    "bio_docs = [\n",
+    "    \"Machine learning has been used for drug design.\",\n",
+    "    \"Computational synthesis with AI algorithms predicts molecular properties.\",\n",
+    "    \"DDR1 is involved in cancers and fibrosis.\",\n",
+    "]\n",
+    "bio_vectors = embedding_fn.encode_documents(bio_docs)\n",
+    "# Ids 0-2 are already used by the history entities, so numbering continues from 3.\n",
+    "bio_data = [\n",
+    "    {\"id\": doc_id, \"vector\": vector, \"text\": text, \"subject\": \"biology\"}\n",
+    "    for doc_id, (vector, text) in enumerate(zip(bio_vectors, bio_docs), start=3)\n",
+    "]\n",
+    "\n",
+    "client.insert(collection_name=\"demo_collection\", data=bio_data)\n",
+    "\n",
+    "# The filter excludes any text in the \"history\" subject even if it is close to the\n",
+    "# query vector.\n",
+    "# The search runs immediately after the insert. Request strong consistency so the\n",
+    "# rows inserted above are visible to it; without this the search can come back\n",
+    "# empty because the new rows are not yet searchable.\n",
+    "res = client.search(\n",
+    "    collection_name=\"demo_collection\",\n",
+    "    data=embedding_fn.encode_queries([\"tell me AI related information\"]),\n",
+    "    filter=\"subject == 'biology'\",\n",
+    "    limit=2,\n",
+    "    output_fields=[\"text\", \"subject\"],\n",
+    "    consistency_level=\"Strong\",\n",
+    ")\n",
+    "\n",
+    "print(res)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "DLSiteFSearchPython_venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}