Skip to main content

Documentation Index

Fetch the complete documentation index at: https://docs.praison.ai/llms.txt

Use this file to discover all available pages before exploring further.

Evaluation Results

Aggregate, analyze, and visualize Agent evaluation results. Track trends, compare runs, and generate reports.

Collect Results

import { EvalResults, createEvalResults } from 'praisonai';

// Build an empty collector through the factory helper.
const results = createEvalResults();

// A passing case: the recorded output equals the expected answer.
const passingCase = {
  name: 'accuracy-test-1',
  passed: true,
  score: 0.95,
  duration: 150,
  input: 'What is 2+2?',
  output: '4',
  expected: '4',
};

// A failing case: `error` carries a message describing the failure.
const failingCase = {
  name: 'accuracy-test-2',
  passed: false,
  score: 0.6,
  duration: 200,
  input: 'Explain quantum computing',
  output: 'A type of computing...',
  expected: 'Uses quantum mechanics...',
  error: 'Incomplete explanation',
};

// Add test results, passing case first.
for (const testCase of [passingCase, failingCase]) {
  results.add(testCase);
}

Aggregate Statistics

import { EvalResults } from 'praisonai';

const results = new EvalResults();

// Three sample runs: two that passed and one that failed.
const sampleRuns = [
  { name: 'test-1', passed: true, score: 0.9, duration: 100 },
  { name: 'test-2', passed: true, score: 0.85, duration: 120 },
  { name: 'test-3', passed: false, score: 0.5, duration: 200 },
];
for (const run of sampleRuns) {
  results.add(run);
}

// Get aggregated stats
const stats = results.aggregate();

// Rates and scores are scaled by 100 and printed with one decimal place.
const asPercent = (fraction) => `${(fraction * 100).toFixed(1)}%`;

// One entry per printed line, in output order.
const summaryLines = [
  'Evaluation Summary:',
  `  Total: ${stats.totalTests}`,
  `  Passed: ${stats.passedTests}`,
  `  Failed: ${stats.failedTests}`,
  `  Pass Rate: ${asPercent(stats.passRate)}`,
  `  Avg Score: ${asPercent(stats.avgScore)}`,
  `  Avg Duration: ${stats.avgDuration}ms`,
];
for (const line of summaryLines) {
  console.log(line);
}

Categorize Results

import { EvalResults } from 'praisonai';

const results = new EvalResults();

// Sample results paired with the category each one is filed under.
const categorizedSamples = [
  ['math', { name: 'math-1', passed: true, score: 0.95, duration: 100 }],
  ['math', { name: 'math-2', passed: true, score: 0.9, duration: 110 }],
  ['writing', { name: 'writing-1', passed: false, score: 0.6, duration: 200 }],
];

// First pass: record every result and keep the entry add() returns, because
// categorize() is called with that entry's id.
const recorded = categorizedSamples.map(([category, sample]) => [
  category,
  results.add(sample),
]);

// Second pass: file each recorded entry under its category.
for (const [category, entry] of recorded) {
  results.categorize(entry.id, category);
}

// Get stats by category
const byCategory = results.aggregateByCategory();

// Rates and scores are scaled by 100 and printed with one decimal place.
const asPercent = (fraction) => `${(fraction * 100).toFixed(1)}%`;

// Every item yielded by byCategory unpacks into a [category, stats] pair.
for (const categoryEntry of byCategory) {
  const [category, stats] = categoryEntry;
  console.log(`\n${category}:`);
  console.log(`  Pass Rate: ${asPercent(stats.passRate)}`);
  console.log(`  Avg Score: ${asPercent(stats.avgScore)}`);
}

Track Trends

import { EvalResults } from 'praisonai';

const results = new EvalResults();

// Add results over time (simulated): 100 randomized entries named test-0 … test-99.
for (const index of Array(100).keys()) {
  // The three draws are taken in the order passed → score → duration.
  const passed = Math.random() > 0.2;
  const score = 0.7 + Math.random() * 0.3;
  const duration = 100 + Math.random() * 100;
  results.add({ name: `test-${index}`, passed, score, duration });
}

// Get trend over time windows
const ONE_MINUTE_MS = 60000; // 1-minute windows
const trend = results.getTrend(ONE_MINUTE_MS);

console.log('Trend Analysis:');
for (const { timestamp, passRate, testCount } of trend) {
  const windowLabel = new Date(timestamp).toISOString();
  const passPercent = (passRate * 100).toFixed(1);
  console.log(`  ${windowLabel}`);
  console.log(`    Pass Rate: ${passPercent}%`);
  console.log(`    Tests: ${testCount}`);
}

Format Results

import { EvalResults } from 'praisonai';

const results = new EvalResults();
// Add results...

// Format as table, then print it.
const table = results.formatTable();
console.log(table);
/*
Example output:
| Name | Passed | Score | Duration |
|------|--------|-------|----------|
| test-1 | ✅ | 95.0% | 100ms |
| test-2 | ❌ | 60.0% | 200ms |
*/

// Format summary, then print it.
const summary = results.formatSummary();
console.log(summary);
// Example output:
// Tests: 10 | Pass: 8 | Fail: 2 | Rate: 80.0% | Avg Score: 85.0%

Export and Import

import { EvalResults } from 'praisonai';

// The promise-based fs API provides the awaited writeFile/readFile calls below;
// without this import the snippet fails with "fs is not defined".
import fs from 'node:fs/promises';

const results = new EvalResults();
// Add results...

// Export for storage: serialize whatever export() returns as JSON text.
const exported = results.export();
const json = JSON.stringify(exported);

// Save to file
await fs.writeFile('eval-results.json', json);

// Later: Import results
// Read the file back as UTF-8 text, parse it, and load it into a fresh collector.
const loaded = JSON.parse(await fs.readFile('eval-results.json', 'utf-8'));
const newResults = new EvalResults();
newResults.import(loaded);

Integration with EvalSuite

import { Agent, EvalSuite, EvalResults } from 'praisonai';

// Agent under evaluation.
const agent = new Agent({
  name: 'Test Agent',
  instructions: 'You are a helpful assistant.',
});

const suite = new EvalSuite();
const collector = new EvalResults();

// Run evaluation suite: each case pairs a prompt with its expected value.
const testCases = [
  { input: 'Hello', expected: 'greeting' },
  { input: 'Bye', expected: 'farewell' },
];

// Cases run one at a time: every iteration awaits the agent's reply and then
// the accuracy check before the next case starts.
for (const { input, expected } of testCases) {
  const response = await agent.chat(input);
  const evaluation = await suite.runAccuracy(input, {
    input,
    expectedOutput: expected,
    actualOutput: response,
  });
  const { passed, score, duration } = evaluation;

  // Collect result
  collector.add({
    name: input,
    passed,
    score,
    // Any falsy duration (including a missing one) is recorded as 0.
    duration: duration || 0,
    input,
    output: response,
    expected,
  });
}

// Analyze collected results
const summary = collector.formatSummary();
console.log(summary);