Evaluation Results
Aggregate, analyze, and visualize Agent evaluation results. Track trends, compare runs, and generate reports.

Collect Results
Copy
// Create a collector via the factory helper and record individual test results.
// (The original example imported `EvalResults` as well, but only the
// `createEvalResults` factory is actually used here.)
import { createEvalResults } from 'praisonai';

const results = createEvalResults();

// Record a passing test: score in [0, 1], duration in milliseconds.
results.add({
  name: 'accuracy-test-1',
  passed: true,
  score: 0.95,
  duration: 150,
  input: 'What is 2+2?',
  output: '4',
  expected: '4',
});

// Record a failing test; `error` carries the reason for the failure.
results.add({
  name: 'accuracy-test-2',
  passed: false,
  score: 0.6,
  duration: 200,
  input: 'Explain quantum computing',
  output: 'A type of computing...',
  expected: 'Uses quantum mechanics...',
  error: 'Incomplete explanation',
});
Aggregate Statistics
Copy
// Collect a few results, then print the aggregated statistics.
import { EvalResults } from 'praisonai';

const results = new EvalResults();

results.add({ name: 'test-1', passed: true, score: 0.9, duration: 100 });
results.add({ name: 'test-2', passed: true, score: 0.85, duration: 120 });
results.add({ name: 'test-3', passed: false, score: 0.5, duration: 200 });

// Pull the aggregate once and unpack the fields we report on.
const { totalTests, passedTests, failedTests, passRate, avgScore, avgDuration } =
  results.aggregate();

console.log('Evaluation Summary:');
console.log(` Total: ${totalTests}`);
console.log(` Passed: ${passedTests}`);
console.log(` Failed: ${failedTests}`);
console.log(` Pass Rate: ${(passRate * 100).toFixed(1)}%`);
console.log(` Avg Score: ${(avgScore * 100).toFixed(1)}%`);
console.log(` Avg Duration: ${avgDuration}ms`);
Categorize Results
Copy
// Tag recorded results with categories, then report stats per category.
import { EvalResults } from 'praisonai';

const results = new EvalResults();

// Each entry pairs the stored result (returned by `add`) with its category.
const tagged = [
  [results.add({ name: 'math-1', passed: true, score: 0.95, duration: 100 }), 'math'],
  [results.add({ name: 'math-2', passed: true, score: 0.9, duration: 110 }), 'math'],
  [results.add({ name: 'writing-1', passed: false, score: 0.6, duration: 200 }), 'writing'],
];
for (const [entry, category] of tagged) {
  results.categorize(entry.id, category);
}

// `aggregateByCategory` yields [category, stats] pairs.
for (const [category, stats] of results.aggregateByCategory()) {
  console.log(`\n${category}:`);
  console.log(` Pass Rate: ${(stats.passRate * 100).toFixed(1)}%`);
  console.log(` Avg Score: ${(stats.avgScore * 100).toFixed(1)}%`);
}
Track Trends
Copy
// Simulate a stream of results, then inspect pass-rate trends over time.
import { EvalResults } from 'praisonai';

const results = new EvalResults();

// 100 synthetic results with randomized outcome, score, and duration.
for (let idx = 0; idx < 100; idx += 1) {
  results.add({
    name: `test-${idx}`,
    passed: Math.random() > 0.2,
    score: 0.7 + Math.random() * 0.3,
    duration: 100 + Math.random() * 100,
  });
}

// Bucket results into fixed time windows (1 minute each) and print one
// trend point per window.
const windowMs = 60000;
console.log('Trend Analysis:');
for (const point of results.getTrend(windowMs)) {
  console.log(` ${new Date(point.timestamp).toISOString()}`);
  console.log(` Pass Rate: ${(point.passRate * 100).toFixed(1)}%`);
  console.log(` Tests: ${point.testCount}`);
}
Format Results
Copy
// Render collected results in human-readable formats.
import { EvalResults } from 'praisonai';

const results = new EvalResults();
// Add results...

// `formatTable` produces a markdown-style table, for example:
//
// | Name | Passed | Score | Duration |
// |------|--------|-------|----------|
// | test-1 | ✅ | 95.0% | 100ms |
// | test-2 | ❌ | 60.0% | 200ms |
console.log(results.formatTable());

// `formatSummary` produces a compact one-liner, for example:
// Tests: 10 | Pass: 8 | Fail: 2 | Rate: 80.0% | Avg Score: 85.0%
console.log(results.formatSummary());
Export and Import
Copy
import { EvalResults } from 'praisonai';
const results = new EvalResults();
// Add results...
// Export for storage
const exported = results.export();
const json = JSON.stringify(exported);
// Save to file
await fs.writeFile('eval-results.json', json);
// Later: Import results
const loaded = JSON.parse(await fs.readFile('eval-results.json', 'utf-8'));
const newResults = new EvalResults();
newResults.import(loaded);
Integration with EvalSuite
Copy
// Run an EvalSuite against a live Agent and collect every outcome
// into an EvalResults instance for later analysis.
import { Agent, EvalSuite, EvalResults } from 'praisonai';

const agent = new Agent({
  name: 'Test Agent',
  instructions: 'You are a helpful assistant.',
});

const suite = new EvalSuite();
const collector = new EvalResults();

// Each case pairs a prompt with the behavior we expect back.
const testCases = [
  { input: 'Hello', expected: 'greeting' },
  { input: 'Bye', expected: 'farewell' },
];

// Evaluate sequentially: chat with the agent, score the response,
// then record the scored result in the collector.
for (const { input, expected } of testCases) {
  const response = await agent.chat(input);
  const result = await suite.runAccuracy(input, {
    input,
    expectedOutput: expected,
    actualOutput: response,
  });

  collector.add({
    name: input,
    passed: result.passed,
    score: result.score,
    duration: result.duration || 0,
    input,
    output: response,
    expected,
  });
}

// Print a one-line summary of everything collected.
console.log(collector.formatSummary());
Related
- Evaluation - Evaluation framework
- Benchmarks - Performance benchmarking
- Observability - Monitoring

