The Judge class uses an LLM to evaluate outputs with human-like reasoning. This is the recommended approach for most evaluations.
Quick Start
from praisonaiagents.eval import Judge

# Evaluate any output
result = Judge().run(
    output="The capital of France is Paris.",
    expected="Paris is the capital of France."
)
print(f"Score: {result.score}/10")
print(f"Reasoning: {result.reasoning}")
With Agent
from praisonaiagents import Agent
from praisonaiagents.eval import Judge

# Create your agent
agent = Agent(name="assistant", instructions="Answer questions")

# Evaluate agent output
result = Judge().run(
    agent=agent,
    input="What is 2+2?",
    expected="4"
)
print(f"Score: {result.score}/10")
Custom Criteria
from praisonaiagents.eval import Judge

# Define your own evaluation criteria
result = Judge(criteria="Response is helpful, accurate, and concise").run(
    output="Here's a detailed explanation of quantum physics..."
)
print(f"Score: {result.score}/10")
print(f"Passed: {result.passed}")
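Because run() returns a result that exposes score and passed, the judge drops naturally into an automated test suite. A minimal sketch using pytest; the test name and the threshold of 7 are arbitrary choices for illustration, not library defaults:

from praisonaiagents.eval import Judge

def test_capital_answer_scores_well():
    # LLM-as-judge check inside an ordinary pytest test
    result = Judge().run(
        output="The capital of France is Paris.",
        expected="Paris is the capital of France."
    )
    assert result.score >= 7  # arbitrary quality bar for this sketch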
AccuracyJudge
Evaluates output accuracy against an expected answer.

from praisonaiagents.eval import AccuracyJudge

judge = AccuracyJudge()
result = judge.run(
    output="Paris",
    expected="Paris",
    input="What is the capital of France?"
)
# Score: 10/10 - Perfect match
CriteriaJudge
Evaluates against custom criteria.
from praisonaiagents.eval import CriteriaJudge

judge = CriteriaJudge(criteria="Response is professional and helpful")
result = judge.run(output="Hello! How can I assist you today?")
# Score: 9/10 - Professional and helpful
RecipeJudge
Evaluates multi-agent workflow outputs.
from praisonaiagents.eval import RecipeJudge

judge = RecipeJudge()
result = judge.run(
    output=workflow_output,  # final output produced by your multi-agent workflow
    expected="Complete research report"
)
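The snippet above passes a workflow_output variable without showing where it comes from. One way to produce it, as a sketch only: it assumes the Agent and PraisonAIAgents orchestration pattern from the core library and that start() returns the workflow's final output, neither of which is confirmed by this page:

from praisonaiagents import Agent, PraisonAIAgents
from praisonaiagents.eval import RecipeJudge

# Hypothetical two-step research workflow
researcher = Agent(name="researcher", instructions="Research the given topic")
writer = Agent(name="writer", instructions="Write a research report from the findings")

workflow = PraisonAIAgents(agents=[researcher, writer])
workflow_output = workflow.start()  # assumed to return the workflow's final output

result = RecipeJudge().run(
    output=workflow_output,
    expected="Complete research report"
)
print(f"Score: {result.score}/10")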
Judge Registry
Register custom judges and look them up by name.

from praisonaiagents.eval import add_judge, get_judge, list_judges

# Register a custom judge
add_judge("my_judge", MyCustomJudge)

# List all judges
print(list_judges())  # ['accuracy', 'criteria', 'recipe', 'my_judge']

# Get a judge by name
judge = get_judge("my_judge")
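The registry example references MyCustomJudge without defining it. A minimal sketch of one possible shape, assuming a judge only needs a run() method that returns an object with the score, reasoning, and passed fields used elsewhere on this page; the exact interface the registry expects is not documented here:

from dataclasses import dataclass

@dataclass
class SimpleResult:
    # Hypothetical result container mirroring the fields used on this page
    score: float
    reasoning: str
    passed: bool

class MyCustomJudge:
    """Hypothetical judge: passes only when the output contains a required keyword."""

    def __init__(self, keyword="Paris"):
        self.keyword = keyword

    def run(self, output="", expected=None, input=None):
        found = self.keyword.lower() in (output or "").lower()
        return SimpleResult(
            score=10.0 if found else 0.0,
            reasoning=f"Keyword '{self.keyword}' {'found' if found else 'missing'} in output.",
            passed=found,
        )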
ReliabilityEvaluator
Verifies which tools an agent calls: that expected tools are used and forbidden tools are not.

from praisonaiagents.eval import ReliabilityEvaluator

evaluator = ReliabilityEvaluator(
    agent=my_agent,
    input_text="Search for AI news",
    expected_tools=["search_web", "summarize"]
)
result = evaluator.run(print_summary=True)
print(f"Status: {result.status}")  # PASSED or FAILED
print(f"Pass Rate: {result.pass_rate}%")
from praisonaiagents.eval import ReliabilityEvaluator

# Ensure certain tools are NOT called
evaluator = ReliabilityEvaluator(
    agent=my_agent,
    input_text="Answer from memory only",
    forbidden_tools=["search_web", "database_query"]
)
result = evaluator.run(print_summary=True)
from praisonaiagents.eval import ReliabilityEvaluator

# Evaluate without running the agent
evaluator = ReliabilityEvaluator(
    agent=my_agent,
    expected_tools=["search_web"]
)
result = evaluator.evaluate_tool_calls(
    actual_tools=["search_web", "summarize"],
    print_summary=True
)
CriteriaEvaluator
Evaluates against custom criteria with numeric or binary scoring.
Numeric Scoring
from praisonaiagents.eval import CriteriaEvaluator

evaluator = CriteriaEvaluator(
    criteria="Response is helpful, accurate, and professional",
    agent=my_agent,
    input_text="How do I reset my password?",
    scoring_type="numeric",
    threshold=7.0
)
result = evaluator.run(print_summary=True)
print(f"Score: {result.avg_score}/10")
print(f"Passed: {result.all_passed}")
Binary Scoring
from praisonaiagents.eval import CriteriaEvaluator

evaluator = CriteriaEvaluator(
    criteria="Response contains no harmful content",
    agent=my_agent,
    input_text="Tell me about safety",
    scoring_type="binary"
)
result = evaluator.run(print_summary=True)
print(f"Passed: {result.all_passed}")
With Callback
from praisonaiagents.eval import CriteriaEvaluator

def on_fail(score):
    print(f"⚠️ Failed: {score.reasoning}")
    # Send alert, log, etc.

evaluator = CriteriaEvaluator(
    criteria="Response is under 100 words",
    agent=my_agent,
    input_text="Explain quantum physics",
    on_fail=on_fail
)
result = evaluator.run()
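The on_fail hook above hints at alerting and logging. A sketch of the logging case using Python's standard logging module; it relies only on the reasoning field shown above and assumes nothing else about the score object:

import logging

logging.basicConfig(filename="eval_failures.log", level=logging.WARNING)

def on_fail(score):
    # Record the judge's reasoning so failed checks can be reviewed later
    logging.warning("Criteria check failed: %s", score.reasoning)

# Pass this function as on_fail=on_fail when constructing CriteriaEvaluator, as above.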