Eval Module

The eval module provides an evaluation framework for testing agents, covering accuracy, performance, reliability across runs, and custom LLM-judged criteria.

Installation

pip install praisonaiagents

Quick Start

from praisonaiagents.eval import AccuracyEvaluator, EvalResult

evaluator = AccuracyEvaluator()
result = evaluator.evaluate(
    expected="Paris",
    actual="The capital of France is Paris."
)
print(result.score)  # 1.0

Classes

AccuracyEvaluator

Evaluates whether a response matches an expected answer, using exact, substring ("contains"), or semantic matching.
from praisonaiagents.eval import AccuracyEvaluator

evaluator = AccuracyEvaluator(
    match_type="contains",  # or "exact", "semantic"
    case_sensitive=False
)
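
For example, the "contains" mode should treat a case-insensitive substring match as a pass; a small sketch using only the options and fields documented on this page:

from praisonaiagents.eval import AccuracyEvaluator

# "contains": the expected value may appear anywhere in the response
evaluator = AccuracyEvaluator(match_type="contains", case_sensitive=False)

result = evaluator.evaluate(
    expected="Paris",
    actual="the capital of france is PARIS",
)
print(result.passed)  # expected: True, since matching here is case-insensitive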

PerformanceEvaluator

Evaluates runtime performance metrics such as latency, token usage, and cost.
from praisonaiagents.eval import PerformanceEvaluator

evaluator = PerformanceEvaluator()
result = evaluator.evaluate(
    latency_ms=150,
    tokens_used=500,
    cost=0.001
)
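
The returned object is an EvalResult like any other; a sketch of inspecting it, assuming the fields documented in the EvalResult section below:

print(result.score)    # 0.0 to 1.0
print(result.passed)   # True/False
print(result.details)  # per-metric breakdown (assumed structure)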

ReliabilityEvaluator

Evaluates the consistency of results across multiple runs of the same task.
from praisonaiagents.eval import ReliabilityEvaluator

evaluator = ReliabilityEvaluator()
result = evaluator.evaluate(
    results=[result1, result2, result3],
    expected_consistency=0.9
)
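
result1, result2, and result3 above are EvalResult objects from repeated runs; a sketch of how they might be produced, where run_agent is a hypothetical stand-in for your own agent call:

from praisonaiagents.eval import AccuracyEvaluator, ReliabilityEvaluator

def run_agent(prompt: str) -> str:
    # Hypothetical placeholder: call your agent here
    return "The capital of France is Paris."

accuracy = AccuracyEvaluator()

# Evaluate the same prompt several times to measure consistency
runs = [
    accuracy.evaluate(expected="Paris", actual=run_agent("What is the capital of France?"))
    for _ in range(3)
]

reliability = ReliabilityEvaluator()
result = reliability.evaluate(results=runs, expected_consistency=0.9)
print(result.passed)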

CriteriaEvaluator

Evaluates a response against custom, natural-language criteria, using an LLM as the judge.
from praisonaiagents.eval import CriteriaEvaluator

evaluator = CriteriaEvaluator(
    criteria=[
        "Response is professional",
        "Response is factually accurate",
        "Response is concise"
    ],
    llm="gpt-4o-mini"
)
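
Evaluation then takes the original question and the agent's response, as in the LLM-based Evaluation example below; a short sketch:

result = evaluator.evaluate(
    question="What is your refund policy?",
    response="You can request a full refund within 30 days of purchase.",
)
print(result.score)    # overall score, 0.0 to 1.0
print(result.details)  # per-criterion verdicts (assumed structure)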

EvalResult

The result object returned by every evaluator.
from praisonaiagents.eval import EvalResult

result.score        # 0.0 to 1.0
result.passed       # True/False
result.details      # Detailed breakdown
result.metadata     # Additional info
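
These fields are enough to drive a pass/fail check in a test suite; a sketch using only the attributes listed above:

from praisonaiagents.eval import AccuracyEvaluator

def test_capital_of_france():
    result = AccuracyEvaluator().evaluate(
        expected="Paris",
        actual="The capital of France is Paris.",
    )
    assert result.passed
    assert result.score >= 0.5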

Usage Examples

Basic Accuracy Test

from praisonaiagents.eval import AccuracyEvaluator

evaluator = AccuracyEvaluator()

# Test agent response
result = evaluator.evaluate(
    expected="42",
    actual="The answer is 42."
)

print(f"Score: {result.score}")
print(f"Passed: {result.passed}")

Batch Evaluation

from praisonaiagents.eval import AccuracyEvaluator, EvalResults

evaluator = AccuracyEvaluator()

test_cases = [
    {"expected": "Paris", "actual": "Paris is the capital"},
    {"expected": "Tokyo", "actual": "The answer is Tokyo"},
]

results = evaluator.evaluate_batch(test_cases)
print(f"Average score: {results.average_score}")

LLM-based Evaluation

from praisonaiagents.eval import CriteriaEvaluator

evaluator = CriteriaEvaluator(
    criteria=[
        "Response answers the question",
        "Response is helpful",
        "Response is accurate"
    ]
)

result = evaluator.evaluate(
    question="What is machine learning?",
    response="Machine learning is a subset of AI..."
)
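
As with the other evaluators, the returned EvalResult exposes the score and breakdown documented above; the exact structure of details for criteria judgments is an assumption:

print(f"Score: {result.score}")
print(f"Passed: {result.passed}")
print(result.details)  # assumed: per-criterion judgments from the LLM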