// run-eval.ts
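// CLI entry point for the evaluation harness. Subcommands: `run` executes
// evals against the configured LLM providers, `report` renders a saved summary
// to HTML, and `update-index` rebuilds the report index page.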
import { EvalRunner } from './runner.js';
import path from 'path';
import fs from 'fs/promises';
import { LLMProvider } from './types.js';
import OpenAI from 'openai';
import Anthropic from '@anthropic-ai/sdk';
import { GoogleGenAI } from '@google/genai';
import dotenv from 'dotenv';
import mustache from 'mustache';
// Configuration interface
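// (populated in main() from EVAL_JUDGE_PROVIDER / EVAL_JUDGE_MODEL)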
interface EvalConfig {
judgeProvider: string;
judgeModel: string;
}
// OpenAI provider implementation
class OpenAIProvider implements LLMProvider {
name = 'openai';
models = ['gpt-4o', 'gpt-4-turbo', 'gpt-3.5-turbo'];
// Track validation tokens (for judging)
private tokenCounts = { prompt: 0, completion: 0, total: 0 };
// Track tool usage tokens separately
private toolTokenCounts = { toolPrompt: 0, toolCompletion: 0, toolTotal: 0 };
// Flag to determine if a call is for validation or tool usage
private isToolCall = false;
private verbose = process.env.EVAL_VERBOSE === 'true';
private client: OpenAI;
constructor(private apiKey: string) {
this.client = new OpenAI({
apiKey: this.apiKey
});
}
// Set the context for token tracking
setToolCallContext(isToolCall: boolean) {
this.isToolCall = isToolCall;
}
async runPrompt(prompt: string, model: string): Promise<string> {
try {
// Determine if this is for tool usage or validation, then reset the flag
// right away so an error below cannot leak the context into the next call
const isForTool = this.isToolCall;
this.isToolCall = false;
if (this.verbose) {
console.log(`Running OpenAI prompt with model ${model} ${isForTool ? '(for tool usage)' : '(for validation)'}`);
}
// Different system prompts based on context
const systemPrompt = isForTool ?
'You are an assistant helping with data analysis. Use the tools available to analyze data and answer questions.' :
'You are an evaluation assistant that reviews tool responses and determines if they meet criteria. Format your response as SCORE: [0-1 number], PASSED: [true/false], REASONING: [your detailed explanation].';
// Real API call
const response = await this.client.chat.completions.create({
model,
messages: [
{ role: 'system', content: systemPrompt },
{ role: 'user', content: prompt }
],
temperature: 0.1
});
// Update appropriate token counter based on context
if (isForTool) {
this.toolTokenCounts.toolPrompt += response.usage?.prompt_tokens || 0;
this.toolTokenCounts.toolCompletion += response.usage?.completion_tokens || 0;
this.toolTokenCounts.toolTotal = this.toolTokenCounts.toolPrompt + this.toolTokenCounts.toolCompletion;
} else {
this.tokenCounts.prompt += response.usage?.prompt_tokens || 0;
this.tokenCounts.completion += response.usage?.completion_tokens || 0;
this.tokenCounts.total = this.tokenCounts.prompt + this.tokenCounts.completion;
}
const content = response.choices?.[0]?.message?.content;
if (!content) {
return 'Got empty response from OpenAI';
}
return content;
} catch (error) {
console.error('OpenAI API error:', error);
return `SCORE: 0\nPASSED: false\nREASONING: Error calling OpenAI API: ${error instanceof Error ? error.message : String(error)}`;
}
}
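// Returns merged counters; the validation keys (prompt/completion/total) and
// tool keys (toolPrompt/toolCompletion/toolTotal) never collide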
getTokenUsage() {
return {
...this.tokenCounts,
...this.toolTokenCounts
};
}
}
// Anthropic provider implementation
class AnthropicProvider implements LLMProvider {
name = 'anthropic';
models = ['claude-3-5-haiku-latest', 'claude-3-7-sonnet-latest', 'claude-3-opus-latest'];
// Track validation tokens (for judging)
private tokenCounts = { prompt: 0, completion: 0, total: 0 };
// Track tool usage tokens separately
private toolTokenCounts = { toolPrompt: 0, toolCompletion: 0, toolTotal: 0 };
// Flag to determine if a call is for validation or tool usage
private isToolCall = false;
private verbose = process.env.EVAL_VERBOSE === 'true';
private client: Anthropic;
constructor(private apiKey: string) {
this.client = new Anthropic({
apiKey: this.apiKey
});
}
// Set the context for token tracking
setToolCallContext(isToolCall: boolean) {
this.isToolCall = isToolCall;
}
async runPrompt(prompt: string, model: string): Promise<string> {
try {
// Determine if this is for tool usage or validation, then reset the flag
// right away so an error below cannot leak the context into the next call
const isForTool = this.isToolCall;
this.isToolCall = false;
if (this.verbose) {
console.log(`Running Anthropic prompt with model ${model} ${isForTool ? '(for tool usage)' : '(for validation)'}`);
}
// Different system prompts based on context
const systemPrompt = isForTool ?
'You are an assistant helping with data analysis. Use the tools available to analyze data and answer questions.' :
'You are an evaluation assistant that reviews tool responses and determines if they meet criteria. Format your response as SCORE: [0-1 number], PASSED: [true/false], REASONING: [your detailed explanation].';
// Real API call
const response = await this.client.messages.create({
model,
system: systemPrompt,
max_tokens: 1000,
messages: [
{ role: 'user', content: prompt }
],
temperature: 0.1
});
// Update appropriate token counter based on context
if (isForTool) {
this.toolTokenCounts.toolPrompt += response.usage?.input_tokens || 0;
this.toolTokenCounts.toolCompletion += response.usage?.output_tokens || 0;
this.toolTokenCounts.toolTotal = this.toolTokenCounts.toolPrompt + this.toolTokenCounts.toolCompletion;
} else {
this.tokenCounts.prompt += response.usage?.input_tokens || 0;
this.tokenCounts.completion += response.usage?.output_tokens || 0;
this.tokenCounts.total = this.tokenCounts.prompt + this.tokenCounts.completion;
}
const block = response.content?.[0];
if (block?.type === 'text') {
return block.text;
}
return `Got ${block?.type || 'unknown'} response, expected text`;
} catch (error) {
console.error('Anthropic API error:', error);
return `SCORE: 0\nPASSED: false\nREASONING: Error calling Anthropic API: ${error instanceof Error ? error.message : String(error)}`;
}
}
getTokenUsage() {
return {
...this.tokenCounts,
...this.toolTokenCounts
};
}
}
// Google Gemini provider implementation
class GeminiProvider implements LLMProvider {
name = 'gemini';
models = ['gemini-2.0-flash-001', 'gemini-2.0-pro-001', 'gemini-1.5-pro-latest', 'gemini-1.5-flash-latest'];
// Track validation tokens (for judging)
private tokenCounts = { prompt: 0, completion: 0, total: 0 };
// Track tool usage tokens separately
private toolTokenCounts = { toolPrompt: 0, toolCompletion: 0, toolTotal: 0 };
// Flag to determine if a call is for validation or tool usage
private isToolCall = false;
private verbose = process.env.EVAL_VERBOSE === 'true';
private client: GoogleGenAI;
constructor(private apiKey: string) {
this.client = new GoogleGenAI({apiKey: this.apiKey});
}
// Set the context for token tracking
setToolCallContext(isToolCall: boolean) {
this.isToolCall = isToolCall;
}
async runPrompt(prompt: string, model: string): Promise<string> {
try {
// Determine if this is for tool usage or validation, then reset the flag
// right away so an error below cannot leak the context into the next call
const isForTool = this.isToolCall;
this.isToolCall = false;
if (this.verbose) {
console.log(`Running Google Gemini prompt with model ${model} ${isForTool ? '(for tool usage)' : '(for validation)'}`);
}
// Different system prompts based on context
const systemPrompt = isForTool ?
'You are an assistant helping with data analysis. Use the tools available to analyze data and answer questions.' :
'You are an evaluation assistant that reviews tool responses and determines if they meet criteria. Format your response as SCORE: [0-1 number], PASSED: [true/false], REASONING: [your detailed explanation].';
// Combine system prompt and user prompt
const fullPrompt = systemPrompt + '\n\n' + prompt;
// Real API call - using the correct API structure
const response = await this.client.models.generateContent({
model: model,
contents: fullPrompt
});
// Prefer exact token counts from usageMetadata when the SDK returns them
// (available in recent @google/genai releases); otherwise fall back to a
// rough character-based estimate (~4 chars per token)
const outputText = response.text || '';
const usage = response.usageMetadata;
const estimatedInputTokens = usage?.promptTokenCount ?? Math.round(fullPrompt.length / 4);
const estimatedOutputTokens = usage?.candidatesTokenCount ?? Math.round(outputText.length / 4);
// Update appropriate token counter based on context
if (isForTool) {
this.toolTokenCounts.toolPrompt += estimatedInputTokens;
this.toolTokenCounts.toolCompletion += estimatedOutputTokens;
this.toolTokenCounts.toolTotal = this.toolTokenCounts.toolPrompt + this.toolTokenCounts.toolCompletion;
} else {
this.tokenCounts.prompt += estimatedInputTokens;
this.tokenCounts.completion += estimatedOutputTokens;
this.tokenCounts.total = this.tokenCounts.prompt + this.tokenCounts.completion;
}
return outputText;
} catch (error) {
console.error('Google Gemini API error:', error);
return `SCORE: 0\nPASSED: false\nREASONING: Error calling Google Gemini API: ${error instanceof Error ? error.message : String(error)}`;
}
}
getTokenUsage() {
return {
...this.tokenCounts,
...this.toolTokenCounts
};
}
}
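// Generate (or regenerate) reports/index.html with links to every report file,
// newest first. Falls back to an inline template when eval/templates/index.html
// is missing.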
async function generateReportIndex(reportsDir: string): Promise<void> {
// Ensure reports directory exists
await fs.mkdir(reportsDir, { recursive: true });
// Get all report files
const files = await fs.readdir(reportsDir);
const reportFiles = files.filter(file => file.startsWith('report-') && file.endsWith('.html'));
// Sort by date, newest first (the ISO-derived timestamps in the filenames sort lexicographically)
reportFiles.sort((a, b) => b.localeCompare(a));
// Prepare template data
const reports = reportFiles.map((file, index) => {
const isLatest = index === 0;
const dateMatch = file.match(/report-(.+)\.html/);
// Filenames embed an ISO timestamp with ':' and '.' replaced by '-';
// restore the time separators but keep the date hyphens intact
const dateStr = dateMatch && dateMatch[1]
? dateMatch[1].substring(0, 19).replace('T', ' ').replace(/(\d{2})-(\d{2})-(\d{2})$/, '$1:$2:$3')
: 'Unknown date';
return {
filename: file,
dateStr,
isLatest
};
});
// Load template
const templatePath = path.join(process.cwd(), 'eval', 'templates', 'index.html');
let template;
try {
template = await fs.readFile(templatePath, 'utf-8');
console.log(`Loaded index template from ${templatePath}`);
} catch (error) {
console.error(`Error loading template from ${templatePath}:`, error);
// Fall back to a basic template if the file doesn't exist
template = `<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Honeycomb MCP Evaluation Reports</title>
<style>
body { font-family: sans-serif; line-height: 1.6; margin: 0; padding: 20px; color: #333; }
.container { max-width: 800px; margin: 0 auto; }
h1 { color: #F5A623; border-bottom: 2px solid #F5A623; padding-bottom: 10px; }
ul { list-style-type: none; padding: 0; }
li { margin: 10px 0; padding: 10px; border-bottom: 1px solid #eee; }
a { color: #0066cc; text-decoration: none; }
a:hover { text-decoration: underline; }
.date { color: #666; font-size: 0.9em; }
.latest { background: #fffbf4; border-left: 3px solid #F5A623; padding-left: 15px; }
</style>
</head>
<body>
<div class="container">
<h1>Honeycomb MCP Evaluation Reports</h1>
<p>Select a report to view detailed evaluation results:</p>
<ul>
{{#reports}}
<li class="{{#isLatest}}latest{{/isLatest}}">
<a href="{{filename}}">{{#isLatest}}📊 Latest: {{/isLatest}}Report from {{dateStr}}</a>
{{#isLatest}}<small>(This is the most recent evaluation run)</small>{{/isLatest}}
</li>
{{/reports}}
</ul>
</div>
</body>
</html>`;
}
// Render template
const html = mustache.render(template, { reports });
const indexPath = path.join(reportsDir, 'index.html');
await fs.writeFile(indexPath, html, 'utf-8');
console.log(`Report index generated at: ${indexPath}`);
}
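// Render a saved JSON summary into an HTML report. Loads the Mustache template
// from eval/templates/report.html (with a minimal inline fallback) and flattens
// each result's token usage, tool calls, and agent scores into the view model
// the template expects.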
async function generateReport(summaryPath: string, outputPath: string): Promise<void> {
// Ensure reports directory exists
await fs.mkdir(path.dirname(outputPath), { recursive: true });
const summaryData = await fs.readFile(summaryPath, 'utf-8');
const summary = JSON.parse(summaryData);
// Load template
const templatePath = path.join(process.cwd(), 'eval', 'templates', 'report.html');
let template;
try {
template = await fs.readFile(templatePath, 'utf-8');
console.log(`Loaded report template from ${templatePath}`);
} catch (error) {
console.error(`Error loading template from ${templatePath}:`, error);
// Fall back to a basic template if the file doesn't exist
// Using minimal version - in a real implementation you'd have a complete fallback template
template = `<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Honeycomb MCP Evaluation Report</title>
<style>
body { font-family: sans-serif; line-height: 1.6; margin: 0; padding: 20px; color: #333; }
.container { max-width: 1200px; margin: 0 auto; }
h1 { color: #F5A623; border-bottom: 2px solid #F5A623; padding-bottom: 10px; }
</style>
</head>
<body>
<div class="container">
<h1>Honeycomb MCP Evaluation Report</h1>
<p>Generated on: {{timestamp}}</p>
<p>See template file for complete implementation.</p>
</div>
</body>
</html>`;
}
// Prepare template data
const view = {
timestamp: new Date(summary.timestamp).toLocaleString(),
totalTests: summary.totalTests,
passed: summary.passed,
failed: summary.failed,
successRate: (summary.successRate * 100).toFixed(1),
averageLatency: summary.averageLatency.toFixed(0),
averageToolCalls: summary.averageToolCalls ? summary.averageToolCalls.toFixed(1) : 'N/A',
averageToolTokens: summary.averageToolTokens ? summary.averageToolTokens.toFixed(0) : 'N/A',
judgeInfo: summary.metadata?.judge ? {
provider: summary.metadata.judge.provider,
model: summary.metadata.judge.model
} : null,
results: summary.results.map((result: any) => {
const isAgent = result.prompt.agentMode;
const isConversation = result.prompt.conversationMode;
const isMultiStep = result.prompt.steps && result.prompt.steps.length > 0;
const isSingle = !isAgent && !isConversation && !isMultiStep;
// Format token usage if available
const hasTokenUsage = result.metrics.tokenUsage?.total !== undefined;
// Format tool calls
const hasToolCalls = result.toolCalls && result.toolCalls.length > 0;
const toolCalls = hasToolCalls ? result.toolCalls.map((call: any, idx: number) => {
// Pretty-print array summaries as JSON; leave strings untouched
let formattedSummary = call.summary;
if (Array.isArray(formattedSummary)) {
formattedSummary = JSON.stringify(formattedSummary, null, 2);
}
const toolName = call.tool || (call.complete ? 'Final Summary' : 'Thinking');
return {
tool: toolName,
'tool.isThinking': toolName === 'Thinking',
'tool.isFinalSummary': toolName === 'Final Summary',
'tool.hasError': !!call.error || !!(call.response && call.response.error),
index: idx + 1,
step: call.step,
thought: call.thought,
plan: call.plan,
reasoning: call.reasoning,
summary: formattedSummary,
complete: call.complete,
error: call.error || (call.response?.error
? (typeof call.response.error === 'string'
? call.response.error
: (call.response.error.message || JSON.stringify(call.response.error, null, 2)))
: null),
parametersJson: JSON.stringify(call.parameters || {}, null, 2),
responseJson: JSON.stringify(call.response || {}, null, 2),
callLatency: call.latencyMs || 0
};
}) : [];
// Get agent scores if available
const agentScores = result.validation.agentScores;
return {
id: result.id,
provider: result.provider,
model: result.model,
modelSafe: result.model.replace(/[^a-zA-Z0-9-]/g, '_'),
isAgent,
isConversation,
isMultiStep,
isSingle,
toolCallCount: result.metrics.toolCallCount || 1,
passed: result.validation.passed,
score: result.validation.score !== undefined ? result.validation.score.toFixed(2) : 'N/A',
reasoning: result.validation.reasoning,
latency: result.metrics.latencyMs,
// Agent-specific metrics
goalAchievement: agentScores?.goalAchievement !== undefined ? agentScores.goalAchievement.toFixed(2) : 'N/A',
reasoningQuality: agentScores?.reasoningQuality !== undefined ? agentScores.reasoningQuality.toFixed(2) : 'N/A',
pathEfficiency: agentScores?.pathEfficiency !== undefined ? agentScores.pathEfficiency.toFixed(2) : 'N/A',
hasTokenUsage,
promptTokens: result.metrics.tokenUsage?.prompt || 0,
completionTokens: result.metrics.tokenUsage?.completion || 0,
totalTokens: result.metrics.tokenUsage?.total || 0,
toolPromptTokens: result.metrics.tokenUsage?.toolPrompt || 0,
toolCompletionTokens: result.metrics.tokenUsage?.toolCompletion || 0,
toolTotalTokens: result.metrics.tokenUsage?.toolTotal || 0,
hasToolCalls,
toolCallsLength: toolCalls.length,
toolCalls,
toolResponseJson: JSON.stringify(result.toolResponse, null, 2),
promptJson: JSON.stringify(result.prompt, null, 2)
};
})
};
// Render template
const html = mustache.render(template, view);
await fs.writeFile(outputPath, html, 'utf-8');
console.log(`Report generated at: ${outputPath}`);
}
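// CLI dispatcher. Behavior is controlled by these environment variables:
// - OPENAI_API_KEY / ANTHROPIC_API_KEY / GEMINI_API_KEY: enable the matching provider
// - EVAL_JUDGE_PROVIDER / EVAL_JUDGE_MODEL: provider and model used to judge results
// - EVAL_MODELS: JSON map of provider name to model(s) to evaluate
// - EVAL_CONCURRENCY: number of parallel test runs (default 2)
// - EVAL_VERBOSE: set to 'true' for per-call logging
// - MCP_SERVER_COMMAND or MCP_SERVER_URL: how to reach the MCP server under test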
async function main() {
const args = process.argv.slice(2);
const command = args[0];
const testFile = args[1]; // Optional: a specific test file to run
// Load environment variables from the root .env file.
// dotenv.config() reports a missing file via its return value rather than throwing.
const dotenvResult = dotenv.config({ path: path.resolve(process.cwd(), '.env') });
if (dotenvResult.error) {
console.log('No .env file found or error loading it; using existing environment variables');
} else {
console.log('Loaded environment variables from .env file');
}
if (command === 'run') {
// Load environment variables for API keys
const openaiApiKey = process.env.OPENAI_API_KEY;
const anthropicApiKey = process.env.ANTHROPIC_API_KEY;
const geminiApiKey = process.env.GEMINI_API_KEY;
// Initialize providers array
const providers: LLMProvider[] = [];
// Add providers based on available API keys
if (openaiApiKey) {
providers.push(new OpenAIProvider(openaiApiKey));
console.log('Added OpenAI provider with API key');
}
if (anthropicApiKey) {
providers.push(new AnthropicProvider(anthropicApiKey));
console.log('Added Anthropic provider with API key');
}
if (geminiApiKey) {
providers.push(new GeminiProvider(geminiApiKey));
console.log('Added Google Gemini provider with API key');
}
// Exit if no API keys are available
if (providers.length === 0) {
console.error('\nERROR: No valid API keys available.\n');
console.error('You must set at least one of these environment variables:');
console.error(' - OPENAI_API_KEY for OpenAI models');
console.error(' - ANTHROPIC_API_KEY for Anthropic models');
console.error(' - GEMINI_API_KEY for Google Gemini models\n');
console.error('For example: OPENAI_API_KEY=your_key pnpm run eval\n');
process.exit(1);
}
// Judge configuration
const config: EvalConfig = {
judgeProvider: process.env.EVAL_JUDGE_PROVIDER || 'anthropic',
judgeModel: process.env.EVAL_JUDGE_MODEL || 'claude-3-5-haiku-latest'
};
// Validate judge configuration
const judgeProvider = providers.find(p => p.name === config.judgeProvider);
if (!judgeProvider) {
console.error(`Specified judge provider "${config.judgeProvider}" not available. Check API keys and configuration.`);
process.exit(1);
}
// Check if the model exists for the provider
if (!judgeProvider.models.includes(config.judgeModel)) {
console.warn(`Warning: Judge model "${config.judgeModel}" not in known models for ${config.judgeProvider}.`);
console.warn(`Available models: ${judgeProvider.models.join(', ')}`);
console.warn('Continuing with the specified model, but it might not work.');
}
console.log(`Using ${config.judgeProvider}/${config.judgeModel} as the validation judge`);
// Select models to use (could be from config or args)
// Parse from JSON string in env var if available
// This can be either a string or an array of strings for each provider
let selectedModels = new Map([
['openai', ['gpt-4o']],
['anthropic', ['claude-3-5-haiku-latest']],
['gemini', ['gemini-2.0-flash-001']]
]);
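// EVAL_MODELS accepts a JSON object mapping a provider name to either a model
// string or an array of model strings, e.g.
// EVAL_MODELS='{"openai":["gpt-4o"],"anthropic":"claude-3-5-haiku-latest"}'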
if (process.env.EVAL_MODELS) {
try {
const modelConfig = JSON.parse(process.env.EVAL_MODELS);
// Convert the modelConfig to a Map with arrays of models
const modelMap = new Map();
for (const [provider, models] of Object.entries(modelConfig)) {
if (Array.isArray(models)) {
modelMap.set(provider, models);
} else {
modelMap.set(provider, [models]);
}
}
selectedModels = modelMap;
console.log('Using models from environment config:',
Object.fromEntries(selectedModels.entries()));
} catch (error) {
console.error('Error parsing EVAL_MODELS env var:', error);
}
}
// Get concurrency from env or default to 2
const concurrency = parseInt(process.env.EVAL_CONCURRENCY || '2', 10);
// Configuration for runner
const runnerConfig: any = {
promptsDir: path.resolve('eval/prompts'),
resultsDir: path.resolve('eval/results'),
providers,
selectedModels,
concurrency,
judge: {
provider: config.judgeProvider,
model: config.judgeModel
},
verbose: process.env.EVAL_VERBOSE === 'true',
testFile: testFile // Pass the specific test file to run, if specified
};
// For stdio-based MCP connection
if (process.env.MCP_SERVER_COMMAND) {
console.log(`Using MCP server command: ${process.env.MCP_SERVER_COMMAND}`);
runnerConfig.serverCommandLine = process.env.MCP_SERVER_COMMAND;
}
// For HTTP-based MCP connection
else if (process.env.MCP_SERVER_URL) {
console.log(`Using MCP server URL: ${process.env.MCP_SERVER_URL}`);
runnerConfig.serverUrl = process.env.MCP_SERVER_URL;
}
// Default for local development
else {
console.log('Using default node build/index.mjs command');
runnerConfig.serverCommandLine = 'node build/index.mjs';
}
const runner = new EvalRunner(runnerConfig);
console.log('Starting evaluation run...');
const summary = await runner.runAll();
// Use a single timestamp so the summary and report filenames match
const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-');
// Save summary
const summaryPath = path.resolve(`eval/results/summary-${runTimestamp}.json`);
await fs.writeFile(summaryPath, JSON.stringify(summary, null, 2), 'utf-8');
console.log(`Evaluation complete. Summary saved to ${summaryPath}`);
// Generate report
const reportPath = path.resolve(`eval/reports/report-${runTimestamp}.html`);
await generateReport(summaryPath, reportPath);
// Generate or update an index.html that lists all reports
await generateReportIndex(path.resolve('eval/reports'));
} else if (command === 'report' && args[1]) {
const summaryPath = args[1];
const reportTimestamp = new Date().toISOString().replace(/[:\.]/g, '-');
const reportPath = path.resolve(`eval/reports/report-${reportTimestamp}.html`);
await generateReport(summaryPath, reportPath);
// Update the index after generating a new report
await generateReportIndex(path.resolve('eval/reports'));
} else if (command === 'update-index') {
await generateReportIndex(path.resolve('eval/reports'));
} else {
console.log(`
Usage:
run-eval run [test-file] Run evaluations (optionally specify a single test file)
run-eval report [summary-path] Generate report from a summary file
run-eval update-index Update the reports index.html file
Examples:
run-eval run Run all tests
run-eval run simple-test.json Run a specific test file
`);
}
}
main().catch((error) => {
console.error(error);
process.exit(1);
});