{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "47471ef9",
   "metadata": {},
   "source": [
    "Creating the client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d08ab631",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymilvus import MilvusClient\n",
    "\n",
    "client = MilvusClient(uri=\"http://localhost:19530\", token=\"root:Milvus\")"
   ]
  },
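  {
   "cell_type": "markdown",
   "id": "1a2b3c4d",
   "metadata": {},
   "source": [
    "Optional sanity check (an added sketch, not part of the original walkthrough): listing the collections on the server is a quick way to confirm that the client above can actually reach Milvus. `list_collections` is a standard `MilvusClient` method; the exact result depends on what already exists on your instance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a2b3c4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: confirm the connection works by listing the collections on the server.\n",
    "# Assumes the Milvus instance at the URI above is running and the token is valid.\n",
    "print(client.list_collections())"
   ]
  },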
  {
   "cell_type": "markdown",
   "id": "ecf3a2dd",
   "metadata": {},
   "source": [
    "Creating a collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7bf82b6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "if client.has_collection(collection_name=\"demo_collection\"):\n",
    "    client.drop_collection(collection_name=\"demo_collection\")\n",
    "\n",
    "client.create_collection(\n",
    "    collection_name=\"demo_collection\",\n",
    "    dimension=768,  # The vectors we will use in this demo have 768 dimensions\n",
    ")"
   ]
  },
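  {
   "cell_type": "markdown",
   "id": "2b3c4d5e",
   "metadata": {},
   "source": [
    "Optionally, you can inspect what the quick setup just created (an added sketch, not part of the original walkthrough). With only `dimension` given, Milvus generates the `id` primary key field and the `vector` field that the data below relies on; `describe_collection` is a standard `MilvusClient` method that returns the resulting schema as a dictionary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b3c4d5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: inspect the schema that the quick setup generated,\n",
    "# including the \"id\" primary key and the 768-dimensional \"vector\" field.\n",
    "print(client.describe_collection(collection_name=\"demo_collection\"))"
   ]
  },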
  {
   "cell_type": "markdown",
   "id": "eef3759b",
   "metadata": {},
   "source": [
    "Adding sample vector data to Milvus using embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7f6083de",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n",
      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-small-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n",
      "d:\\Repository\\DLSiteFSearch\\DLSiteFSearchPython_venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:144: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\qt\\.cache\\huggingface\\hub\\models--GPTCache--paraphrase-albert-onnx. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dim: 768 (768,)\n",
      "Data has 3 entities, each with fields: dict_keys(['id', 'vector', 'text', 'subject'])\n",
      "Vector dim: 768\n"
     ]
    }
   ],
   "source": [
    "from pymilvus import model\n",
    "# If the connection to https://huggingface.co/ fails, uncomment the following lines:\n",
    "# import os\n",
    "# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
    "\n",
    "# This will download a small embedding model \"paraphrase-albert-small-v2\" (~50MB).\n",
    "embedding_fn = model.DefaultEmbeddingFunction()\n",
    "\n",
    "# Text strings to search from.\n",
    "docs = [\n",
    "    \"Artificial intelligence was founded as an academic discipline in 1956.\",\n",
    "    \"Alan Turing was the first person to conduct substantial research in AI.\",\n",
    "    \"Born in Maida Vale, London, Turing was raised in southern England.\",\n",
    "]\n",
    "\n",
    "vectors = embedding_fn.encode_documents(docs)\n",
    "# Each output vector has 768 dimensions, matching the collection that we just created.\n",
    "print(\"Dim:\", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)\n",
    "\n",
    "# Each entity has an id, a vector representation, the raw text, and a subject label that we use\n",
    "# to demo metadata filtering later.\n",
    "data = [\n",
    "    {\"id\": i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"history\"}\n",
    "    for i in range(len(vectors))\n",
    "]\n",
    "\n",
    "print(\"Data has\", len(data), \"entities, each with fields: \", data[0].keys())\n",
    "print(\"Vector dim:\", len(data[0][\"vector\"]))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e89e602",
   "metadata": {},
   "source": [
    "Inserting data into Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e2098f0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'insert_count': 3, 'ids': [0, 1, 2]}\n"
     ]
    }
   ],
   "source": [
    "res = client.insert(collection_name=\"demo_collection\", data=data)\n",
    "\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a0e4a35",
   "metadata": {},
   "source": [
    "Semantic search / Vector search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2a687f94",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data: [\"[{'id': 2, 'distance': 0.5859946012496948, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]\"]\n"
     ]
    }
   ],
   "source": [
    "query_vectors = embedding_fn.encode_queries([\"Who is Alan Turing?\"])\n",
    "# If you don't have the embedding function, you can use a random vector to finish the demo:\n",
    "# import random\n",
    "# query_vectors = [[random.uniform(-1, 1) for _ in range(768)]]\n",
    "\n",
    "res = client.search(\n",
    "    collection_name=\"demo_collection\",  # target collection\n",
    "    data=query_vectors,  # query vectors\n",
    "    limit=2,  # number of returned entities\n",
    "    output_fields=[\"text\", \"subject\"],  # specifies fields to be returned\n",
    ")\n",
    "\n",
    "print(res)\n"
   ]
  },
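  {
   "cell_type": "markdown",
   "id": "3c4d5e6f",
   "metadata": {},
   "source": [
    "The result printed above contains one list of hits per query vector, and each hit is a dict-like record with `id`, `distance`, and the requested `entity` fields. The cell below is an added sketch (not part of the original walkthrough) showing one way to unpack that structure, assuming the same `res` from the search above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c4d5e70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unpack the search result: one list of hits per query vector,\n",
    "# each hit carrying the matched id, its distance, and the requested output fields.\n",
    "for hits in res:\n",
    "    for hit in hits:\n",
    "        print(hit[\"id\"], hit[\"distance\"], hit[\"entity\"][\"text\"])"
   ]
  },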
  {
   "cell_type": "markdown",
   "id": "4f8e5ba8",
   "metadata": {},
   "source": [
    "Metadata filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "03d6ae37",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data: ['[]']\n"
     ]
    }
   ],
   "source": [
    "# Insert more docs in another subject.\n",
    "docs = [\n",
    "    \"Machine learning has been used for drug design.\",\n",
    "    \"Computational synthesis with AI algorithms predicts molecular properties.\",\n",
    "    \"DDR1 is involved in cancers and fibrosis.\",\n",
    "]\n",
    "vectors = embedding_fn.encode_documents(docs)\n",
    "data = [\n",
    "    {\"id\": 3 + i, \"vector\": vectors[i], \"text\": docs[i], \"subject\": \"biology\"}\n",
    "    for i in range(len(vectors))\n",
    "]\n",
    "\n",
    "client.insert(collection_name=\"demo_collection\", data=data)\n",
    "\n",
    "# This will exclude any text in the \"history\" subject, even if it is close to the query vector.\n",
    "res = client.search(\n",
    "    collection_name=\"demo_collection\",\n",
    "    data=embedding_fn.encode_queries([\"tell me AI related information\"]),\n",
    "    filter=\"subject == 'biology'\",\n",
    "    limit=2,\n",
    "    output_fields=[\"text\", \"subject\"],\n",
    ")\n",
    "\n",
    "print(res)"
   ]
  },
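  {
   "cell_type": "markdown",
   "id": "4d5e6f70",
   "metadata": {},
   "source": [
    "The filtered search above came back empty in this run, so it can help to double-check what actually landed in the collection. The cell below is an added sketch (not part of the original walkthrough): a scalar `query` retrieves entities by filter expression alone, without any query vector. `query` is a standard `MilvusClient` method; the exact behaviour may vary with your pymilvus and Milvus versions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d5e6f71",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Retrieve entities by filter expression only (no vector search involved),\n",
    "# which is a handy way to verify the rows inserted above.\n",
    "res = client.query(\n",
    "    collection_name=\"demo_collection\",\n",
    "    filter=\"subject == 'biology'\",\n",
    "    output_fields=[\"text\", \"subject\"],\n",
    ")\n",
    "\n",
    "print(res)"
   ]
  }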
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DLSiteFSearchPython_venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}