Creating client

In [1]:
from pymilvus import MilvusClient

client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")

Creating collection

In [2]:
if client.has_collection(collection_name="demo_collection"):
 client.drop_collection(collection_name="demo_collection")

client.create_collection(
 collection_name="demo_collection",
 dimension=768, # The vectors we will use in this demo has 768 dimensions
)

Adding sample vector data using Embeddings to Milvus

In [3]:
from pymilvus import model
# If connection to https://huggingface.co/ failed, uncomment the following path
# import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# This will download a small embedding model "paraphrase-albert-small-v2" (~50MB).
embedding_fn = model.DefaultEmbeddingFunction()

# Text strings to search from.
docs = [
 "Artificial intelligence was founded as an academic discipline in 1956.",
 "Alan Turing was the first person to conduct substantial research in AI.",
 "Born in Maida Vale, London, Turing was raised in southern England.",
]

vectors = embedding_fn.encode_documents(docs)
# The output vector has 768 dimensions, matching the collection that we just created.
print("Dim:", embedding_fn.dim, vectors[0].shape) # Dim: 768 (768,)

# Each entity has id, vector representation, raw text, and a subject label that we use
# to demo metadata filtering later.
data = [
 {"id": i, "vector": vectors[i], "text": docs[i], "subject": "history"}
 for i in range(len(vectors))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))


 from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Dim: 768 (768,)
Data has 3 entities, each with fields: dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


Inserting data to Milvus

In [4]:
res = client.insert(collection_name="demo_collection", data=data)

print(res)

{'insert_count': 3, 'ids': [0, 1, 2]}


Semantic search / Vector search

In [5]:
query_vectors = embedding_fn.encode_queries(["Who is Alan Turing?"])
# If you don't have the embedding function you can use a fake vector to finish the demo:
# query_vectors = [ [ random.uniform(-1, 1) for _ in range(768) ] ]

res = client.search(
 collection_name="demo_collection", # target collection
 data=query_vectors, # query vectors
 limit=2, # number of returned entities
 output_fields=["text", "subject"], # specifies fields to be returned
)

print(res)


data: ["[{'id': 2, 'distance': 0.5859946012496948, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]"]


Metadata filtering

In [6]:
# Insert more docs in another subject.
docs = [
 "Machine learning has been used for drug design.",
 "Computational synthesis with AI algorithms predicts molecular properties.",
 "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
 {"id": 3 + i, "vector": vectors[i], "text": docs[i], "subject": "biology"}
 for i in range(len(vectors))
]

client.insert(collection_name="demo_collection", data=data)

# This will exclude any text in "history" subject despite close to the query vector.
res = client.search(
 collection_name="demo_collection",
 data=embedding_fn.encode_queries(["tell me AI related information"]),
 filter="subject == 'biology'",
 limit=2,
 output_fields=["text", "subject"],
)

print(res)

data: ['[]']
