Skip to main content

Index Types Module

The index module provides different indexing strategies for document retrieval.

Quick Start

from praisonaiagents.knowledge.index import (
    IndexType,
    IndexStats,
    IndexProtocol,
    IndexRegistry,
    get_index_registry,
    KeywordIndex
)

# The built-in keyword index uses BM25 and needs no external dependencies
index = KeywordIndex()

# Index a small corpus
corpus = [
    "Python is a programming language",
    "Machine learning with Python",
    "Java enterprise development",
]
index.add_documents(corpus)

# Ask for the two best matches and print each one with its score
results = index.query("Python programming", top_k=2)
for result in results:
    print(f"{result['text']} (score: {result['score']})")

Index Types

IndexType Enum

from praisonaiagents.knowledge.index import IndexType

class IndexType(Enum):
    """Indexing strategies available for document retrieval."""

    # Semantic similarity
    VECTOR = "vector"
    # BM25 keyword matching
    KEYWORD = "keyword"
    # Vector + keyword combined
    HYBRID = "hybrid"
    # Knowledge graph (placeholder)
    GRAPH = "graph"

Type Comparison

| Type | Method | Best For |
| --- | --- | --- |
| vector | Semantic embeddings | Conceptual queries |
| keyword | BM25 term matching | Exact term queries |
| hybrid | Combined scoring | General purpose |
| graph | Entity relationships | Connected data |

Classes

IndexStats

Statistics about an index.
@dataclass
class IndexStats:
    """Statistics about an index."""

    # Number of documents held by the index.
    document_count: int
    # Indexing strategy of the index these statistics describe.
    index_type: IndexType
    # Optional extra information; defaults to a fresh empty dict per instance.
    metadata: Dict[str, Any] = field(default_factory=dict)

IndexProtocol

Protocol for index implementations.
class IndexProtocol(Protocol):
    """Structural interface that index implementations are expected to satisfy.

    An implementation exposes a ``name``, an ``index_type`` and the four
    methods below.
    """

    # Identifier of the implementation.
    name: str
    # Indexing strategy the implementation provides.
    index_type: IndexType
    
    def add_documents(
        self,
        documents: List[str],
        metadatas: Optional[List[Dict[str, Any]]] = None,
        ids: Optional[List[str]] = None
    ) -> List[str]:
        """Add documents to the index.

        Args:
            documents: Texts to add.
            metadatas: Optional metadata dicts (presumably one per
                document -- confirm against the implementation).
            ids: Optional document identifiers (presumably one per
                document -- confirm against the implementation).

        Returns:
            A list of strings (presumably the IDs of the added
            documents -- confirm against the implementation).
        """
        ...
    
    def query(
        self,
        query: str,
        top_k: int = 10,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Query the index.

        Args:
            query: The query text.
            top_k: Result-count parameter, 10 by default (presumably the
                maximum number of results to return).
            **kwargs: Additional implementation-specific options.

        Returns:
            A list of result dicts.
        """
        ...
    
    def get_stats(self) -> IndexStats:
        """Get index statistics as an ``IndexStats`` instance."""
        ...
    
    def clear(self) -> None:
        """Clear the index."""
        ...

KeywordIndex

Built-in BM25 keyword index (no external dependencies).
from praisonaiagents.knowledge.index import KeywordIndex

index = KeywordIndex()

# Populate the index
titles = [
    "Introduction to Python programming",
    "Advanced Python techniques",
    "Java for beginners",
]
index.add_documents(titles)

# Query with BM25 scoring; documents come back ranked by BM25 score
results = index.query("Python", top_k=2)

# Inspect the index statistics
stats = index.get_stats()
print(f"Documents: {stats.document_count}")

IndexRegistry

Registry for managing index implementations.
from praisonaiagents.knowledge.index import get_index_registry

# Obtain the index registry
registry = get_index_registry()

# List the names of the available indices
indices = registry.list_indices()  # ['keyword', 'vector', ...]

# Look up an index by its registered name
index = registry.get("keyword")

# Register a custom index class (here MyIndex) under the name "custom"
registry.register("custom", MyIndex)

Using with Knowledge

from praisonaiagents import Agent, Knowledge

# Configure index type
# Select the index type through the knowledge configuration
index_settings = {
    "index_type": "hybrid",  # or "vector", "keyword"
}
agent = Agent(
    instructions="You are a helpful assistant",
    knowledge=["./docs/"],
    knowledge_config=index_settings,
)

response = agent.chat("Find exact term 'API endpoint'")

Creating Custom Indices

from praisonaiagents.knowledge.index import (
    IndexType,
    IndexStats,
    get_index_registry
)
from typing import List, Dict, Any, Optional

class MyIndex:
    """Skeleton of a custom index implementation.

    ``add_documents`` and ``query`` are placeholders to be filled in;
    ``get_stats`` and ``clear`` operate on the ``documents`` list.
    """

    name = "my_index"
    index_type = IndexType.KEYWORD

    def __init__(self, **config):
        """Start with an empty document list; ``config`` is accepted but unused here."""
        self.documents = []

    def add_documents(
        self,
        documents: List[str],
        metadatas: Optional[List[Dict[str, Any]]] = None,
        ids: Optional[List[str]] = None
    ) -> List[str]:
        """Add documents to the index (placeholder)."""
        # Implementation
        ...

    def query(
        self,
        query: str,
        top_k: int = 10,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Query the index (placeholder)."""
        # Implementation
        ...

    def get_stats(self) -> IndexStats:
        """Report the number of stored documents together with the index type."""
        stored = len(self.documents)
        return IndexStats(document_count=stored, index_type=self.index_type)

    def clear(self) -> None:
        """Remove every stored document."""
        self.documents.clear()

# Register the MyIndex class with the index registry under the name "my_index"
registry = get_index_registry()
registry.register("my_index", MyIndex)

Performance

  • KeywordIndex uses a pure-Python BM25 implementation
  • No external dependencies for keyword indexing
  • Vector indices require embedding models (lazy-loaded)