Judge provides a simple, unified API for evaluating agent outputs using the LLM-as-judge pattern. It supports accuracy evaluation (scoring an output against an expected value), criteria-based evaluation (scoring against a natural-language rubric), and fully custom judges.
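Accuracy evaluation compares an output against an expected value and returns a 1-10 score plus a pass/fail flag: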
```python
from praisonaiagents.eval import Judge

result = Judge().run(output="4", expected="4")
print(f"Score: {result.score}/10")
print(f"Passed: {result.passed}")
```
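Criteria-based evaluation scores an output against a natural-language rubric, with no expected value needed: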
```python
from praisonaiagents.eval import Judge

judge = Judge(criteria="Response is helpful and accurate")
result = judge.run(output="Hello! How can I help you today?")
print(f"Score: {result.score}/10")
```
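Judge can also evaluate an agent end to end: given an agent and an input, `run()` executes the agent and scores its response against the expected answer: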
```python
from praisonaiagents import Agent
from praisonaiagents.eval import Judge

agent = Agent(instructions="You are a math tutor")
result = Judge().run(
    agent=agent,
    input_text="What is 2+2?",
    expected="4"
)
```
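`JudgeConfig` centralizes evaluation settings such as the model, sampling temperature, pass threshold, and default criteria: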
```python
from praisonaiagents.eval import Judge, JudgeConfig

config = JudgeConfig(
    model="gpt-4o",
    temperature=0.1,
    threshold=8.0,
    criteria="Response is accurate and well-formatted"
)
judge = Judge(config=config)
```
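A minimal usage sketch of the configured judge, assuming it accepts the same `run()` arguments as above and that `passed` reflects the score measured against `threshold` (both are assumptions, not confirmed API behavior):

```python
result = judge.run(output="The capital of France is Paris.", expected="Paris")

# Assumption: passed is True when score >= threshold (8.0 above)
print(f"Score: {result.score}/10, Passed: {result.passed}")
```

Custom judges take three steps: define a subclass, register it, then use it.

1. Define Judge

Subclass `Judge` and override `CRITERIA_PROMPT`; the `{output}` placeholder is filled with the text being evaluated: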
```python
from praisonaiagents.eval import Judge

class RecipeJudge(Judge):
    """Judge for evaluating recipe quality."""

    CRITERIA_PROMPT = """Evaluate this recipe:

CRITERIA: Recipe is complete with ingredients and steps

RECIPE:
{output}

Score 1-10 based on completeness and clarity.

SCORE: [1-10]
REASONING: [explanation]"""
```
2. Register Judge
```python
from praisonaiagents.eval import add_judge

add_judge("recipe", RecipeJudge)
```
3. Use Judge
```python
from praisonaiagents.eval import get_judge

RecipeJudge = get_judge("recipe")
judge = RecipeJudge()

recipe_text = "Pancakes: mix flour, eggs, and milk; fry until golden."  # placeholder input
result = judge.run(output=recipe_text)
```
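Built-in and registered judges can be listed and retrieved by name through the registry helpers: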
```python
from praisonaiagents.eval import add_judge, get_judge, list_judges

# List available judges
print(list_judges())  # ['accuracy', 'criteria']

# Get a judge
AccuracyJudge = get_judge("accuracy")
```
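The same `run()` API composes into a simple evaluation loop. A minimal sketch, using only the call shape and the `score`/`passed` result fields shown above (the test cases are made up):

```python
from praisonaiagents.eval import Judge

# Hypothetical test set; only the run() API shown above is used
test_cases = [
    {"output": "4", "expected": "4"},
    {"output": "Paris", "expected": "London"},
]

judge = Judge()
results = [judge.run(**case) for case in test_cases]
pass_rate = sum(r.passed for r in results) / len(results)
print(f"Pass rate: {pass_rate:.0%}")
```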