{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "47471ef9",
   "metadata": {},
   "source": [
    "Creating the client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d08ab631",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymilvus import MilvusClient\n",
    "\n",
    "client = MilvusClient(uri=\"http://localhost:19530\", token=\"root:Milvus\")"
   ]
  },
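  {
   "cell_type": "markdown",
   "id": "1a2b3c4d",
   "metadata": {},
   "source": [
    "Optional sanity check (an added sketch, not part of the original walkthrough): listing the collections on the server is a quick way to confirm that the client above can actually reach Milvus. `list_collections` is a standard `MilvusClient` method; the exact result depends on what already exists on your instance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a2b3c4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: confirm the connection works by listing the collections on the server.\n",
    "# Assumes the Milvus instance at the URI above is running and the token is valid.\n",
    "print(client.list_collections())"
   ]
  },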
  {
   "cell_type": "markdown",
   "id": "ecf3a2dd",
   "metadata": {},
   "source": [
    "Creating a collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7bf82b6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "if client.has_collection(collection_name=\"demo_collection\"):\n",
    "    client.drop_collection(collection_name=\"demo_collection\")\n",
    "\n",
    "client.create_collection(\n",
    "    collection_name=\"demo_collection\",\n",
    "    dimension=768,  # The vectors we will use in this demo have 768 dimensions\n",
    ")"
   ]
  },
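  {
   "cell_type": "markdown",
   "id": "2b3c4d5e",
   "metadata": {},
   "source": [
    "Optionally, you can inspect what the quick setup just created (an added sketch, not part of the original walkthrough). With only `dimension` given, Milvus generates the `id` primary key field and the `vector` field that the data below relies on; `describe_collection` is a standard `MilvusClient` method that returns the resulting schema as a dictionary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b3c4d5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: inspect the schema that the quick setup generated,\n",
    "# including the \"id\" primary key and the 768-dimensional \"vector\" field.\n",
    "print(client.describe_collection(collection_name=\"demo_collection\"))"
   ]
  },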
  {
   "cell_type": "markdown",
   "id": "eef3759b",
   "metadata": {},
   "source": [
    "Adding sample vector data to Milvus using embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7f6083de",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n",
      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-small-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n",
      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-onnx. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dim: 768 (768,)\n",
      "Data has 3 entities, each with fields: dict_keys(['id', 'vector', 'text', 'subject'])\n",
      "Vector dim: 768\n"
     ]
    }
   ],
   "source": [
    "from pymilvus import model\n",
    "# If the connection to https://huggingface.co/ fails, uncomment the following lines:\n",
    "# import os\n",
    "# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
    "\n",
    "# This will download a small embedding model \"paraphrase-albert-small-v2\" (~50MB).\n",
    "embedding_fn = model.DefaultEmbeddingFunction()\n",
    "\n",
    "# Text strings to search from.\n",
    "docs = [\n",
    "    \"Artificial intelligence was founded as an academic discipline in 1956.\",\n",
    "    \"Alan Turing was the first person to conduct substantial research in AI.\",\n",
    "    \"Born in Maida Vale, London, Turing was raised in southern England.\",\n",
    "]\n",
    "\n",
    "vectors = embedding_fn.encode_documents(docs)\n",
    "# Each output vector has 768 dimensions, matching the collection that we just created.\n",
    "print(\"Dim:\", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)\n",
    "\n",
    "# Each entity has an id, a vector representation, the raw text, and a subject label that we use\n",
    "# to demo metadata filtering later.\n",
    "data = [\n",
    "    {\"id\": i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"history\"}\n",
    "    for i in range(len(vectors))\n",
    "]\n",
    "\n",
    "print(\"Data has\", len(data), \"entities, each with fields: \", data[0].keys())\n",
    "print(\"Vector dim:\", len(data[0][\"vector\"]))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e89e602",
   "metadata": {},
   "source": [
    "Inserting data into Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e2098f0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'insert_count': 3, 'ids': [0, 1, 2]}\n"
     ]
    }
   ],
   "source": [
    "res = client.insert(collection_name=\"demo_collection\", data=data)\n",
    "\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a0e4a35",
   "metadata": {},
   "source": [
    "Semantic search / Vector search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2a687f94",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data: [\"[{'id': 2, 'distance': 0.5859946012496948, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]\"]\n"
     ]
    }
   ],
   "source": [
    "query_vectors = embedding_fn.encode_queries([\"Who is Alan Turing?\"])\n",
    "# If you don't have the embedding function, you can use a random vector to finish the demo:\n",
    "# import random\n",
    "# query_vectors = [[random.uniform(-1, 1) for _ in range(768)]]\n",
    "\n",
    "res = client.search(\n",
    "    collection_name=\"demo_collection\",  # target collection\n",
    "    data=query_vectors,  # query vectors\n",
    "    limit=2,  # number of returned entities\n",
    "    output_fields=[\"text\", \"subject\"],  # specifies fields to be returned\n",
    ")\n",
    "\n",
    "print(res)\n"
   ]
  },
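  {
   "cell_type": "markdown",
   "id": "3c4d5e6f",
   "metadata": {},
   "source": [
    "The result printed above contains one list of hits per query vector, and each hit is a dict-like record with `id`, `distance`, and the requested `entity` fields. The cell below is an added sketch (not part of the original walkthrough) showing one way to unpack that structure, assuming the same `res` from the search above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c4d5e70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unpack the search result: one list of hits per query vector,\n",
    "# each hit carrying the matched id, its distance, and the requested output fields.\n",
    "for hits in res:\n",
    "    for hit in hits:\n",
    "        print(hit[\"id\"], hit[\"distance\"], hit[\"entity\"][\"text\"])"
   ]
  },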
  {
   "cell_type": "markdown",
   "id": "4f8e5ba8",
   "metadata": {},
   "source": [
    "Metadata filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "03d6ae37",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data: ['[]']\n"
     ]
    }
   ],
   "source": [
    "# Insert more docs in another subject.\n",
    "docs = [\n",
    "    \"Machine learning has been used for drug design.\",\n",
    "    \"Computational synthesis with AI algorithms predicts molecular properties.\",\n",
    "    \"DDR1 is involved in cancers and fibrosis.\",\n",
    "]\n",
    "vectors = embedding_fn.encode_documents(docs)\n",
    "data = [\n",
    "    {\"id\": 3 + i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"biology\"}\n",
    "    for i in range(len(vectors))\n",
    "]\n",
    "\n",
    "client.insert(collection_name=\"demo_collection\", data=data)\n",
    "\n",
    "# This will exclude any text in the \"history\" subject, even if it is close to the query vector.\n",
    "res = client.search(\n",
    "    collection_name=\"demo_collection\",\n",
    "    data=embedding_fn.encode_queries([\"tell me AI related information\"]),\n",
    "    filter=\"subject == 'biology'\",\n",
    "    limit=2,\n",
    "    output_fields=[\"text\", \"subject\"],\n",
    ")\n",
    "\n",
    "print(res)"
   ]
  },
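  {
   "cell_type": "markdown",
   "id": "4d5e6f70",
   "metadata": {},
   "source": [
    "The filtered search above came back empty in this run, so it can help to double-check what actually landed in the collection. The cell below is an added sketch (not part of the original walkthrough): a scalar `query` retrieves entities by filter expression alone, without any query vector. `query` is a standard `MilvusClient` method; the exact behaviour may vary with your pymilvus and Milvus versions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d5e6f71",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Retrieve entities by filter expression only (no vector search involved),\n",
    "# which is a handy way to verify the rows inserted above.\n",
    "res = client.query(\n",
    "    collection_name=\"demo_collection\",\n",
    "    filter=\"subject == 'biology'\",\n",
    "    output_fields=[\"text\", \"subject\"],\n",
    ")\n",
    "\n",
    "print(res)"
   ]
  }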
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DLSiteFSearchPython_venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}