#!/usr/bin/env node
/**
* Azure Chaos Studio Integration for Azure AI MCP Server
*
* This script orchestrates chaos engineering experiments using Azure Chaos Studio
* to test the resilience and reliability of the Azure AI MCP Server.
*/
import { pathToFileURL } from 'node:url';

import { DefaultAzureCredential } from '@azure/identity';
import { ChaosManagementClient } from '@azure/arm-chaos';
import axios from 'axios';
import winston from 'winston';
/**
 * Parse a minutes setting taken from an environment variable.
 *
 * @param {string | undefined} rawValue - Raw environment value, possibly unset.
 * @param {number} fallbackMinutes - Value used when rawValue is unset, empty,
 *   not numeric, or negative.
 * @returns {number} A non-negative integer number of minutes.
 */
function parseMinutes(rawValue, fallbackMinutes) {
  // Explicit radix 10 so values such as "08" are never read in another base.
  const parsed = Number.parseInt(rawValue ?? '', 10);
  // NaN or a negative value would yield an invalid "PT…M" duration and
  // zero-length monitoring/recovery loops, so fall back instead.
  return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallbackMinutes;
}

// Configuration, read once from the environment at module load.
const config = {
  subscriptionId: process.env.AZURE_SUBSCRIPTION_ID,
  resourceGroupName: process.env.AZURE_RESOURCE_GROUP_NAME || 'rg-azure-ai-mcp-server-prod',
  // Read from the environment but not referenced by the code visible in this file.
  chaosStudioEndpoint: process.env.AZURE_CHAOS_STUDIO_ENDPOINT,
  containerAppName: process.env.CONTAINER_APP_NAME || 'ca-azure-ai-mcp-server-prod',
  // Base URL; health probes request `${healthCheckUrl}/health`.
  healthCheckUrl: process.env.HEALTH_CHECK_URL,
  // How long each fault is injected and monitored, in minutes.
  testDurationMinutes: parseMinutes(process.env.CHAOS_TEST_DURATION, 5),
  // How long to wait for the health check to pass after a fault, in minutes.
  recoveryTimeoutMinutes: parseMinutes(process.env.CHAOS_RECOVERY_TIMEOUT, 10),
};
// Shared logger: every record is timestamped, serialized as JSON, and sent to
// both the console and the chaos-tests.log file.
const logger = winston.createLogger({
  format: winston.format.combine(winston.format.timestamp(), winston.format.json()),
  transports: [
    new winston.transports.Console(),
    new winston.transports.File({ filename: 'chaos-tests.log' }),
  ],
  level: 'info',
});
/**
 * Build the `steps` array for an experiment that runs a single continuous
 * fault against the shared `selector1` selector.
 *
 * @param {string} faultUrn - Fault identifier placed in the action's `name`.
 * @param {Array<{key: string, value: string}>} parameters - Fault parameters.
 * @param {number} durationMinutes - Fault duration in whole minutes.
 * @returns {Array<object>} One step holding one branch with one action.
 */
function buildSingleActionSteps(faultUrn, parameters, durationMinutes) {
  return [
    {
      name: 'Step1',
      branches: [
        {
          name: 'Branch1',
          actions: [
            {
              type: 'continuous',
              name: faultUrn,
              duration: `PT${durationMinutes}M`,
              parameters,
              selectorId: 'selector1',
            },
          ],
        },
      ],
    },
  ];
}

/**
 * Orchestrates a fixed suite of chaos experiments against the configured
 * container app: for each one it checks health, starts the fault, monitors the
 * service while the fault runs, cancels the fault, and waits for recovery.
 * Results accumulate in `testResults`.
 */
class ChaosTestRunner {
  constructor() {
    this.credential = new DefaultAzureCredential();
    this.chaosClient = new ChaosManagementClient(this.credential, config.subscriptionId);
    this.testResults = [];
  }

  /**
   * Run all chaos engineering experiments sequentially.
   *
   * A failing experiment is recorded with status 'failed' and does not abort
   * the suite.
   *
   * @returns {Promise<object>} The report produced by generateReport().
   */
  async runAllTests() {
    logger.info('Starting chaos engineering test suite');
    const experiments = [
      {
        name: 'container-app-stop',
        description: 'Stop container app instances to test auto-recovery',
        type: 'service-disruption',
      },
      {
        name: 'network-latency',
        description: 'Introduce network latency to test timeout handling',
        type: 'network-chaos',
      },
      {
        name: 'cpu-pressure',
        description: 'Apply CPU pressure to test performance under load',
        type: 'resource-exhaustion',
      },
      {
        name: 'memory-pressure',
        description: 'Apply memory pressure to test memory management',
        type: 'resource-exhaustion',
      },
      // NOTE(review): createExperimentDefinition() has no case for
      // 'dependency-chaos', so this experiment is always recorded as failed
      // with "Unknown experiment type" until a definition is added.
      {
        name: 'dependency-failure',
        description: 'Simulate Azure service failures',
        type: 'dependency-chaos',
      },
    ];
    const cooldownMs = 30000;

    for (const [index, experiment] of experiments.entries()) {
      try {
        logger.info(`Running experiment: ${experiment.name}`);
        const result = await this.runExperiment(experiment);
        this.testResults.push(result);
      } catch (error) {
        logger.error(`Experiment ${experiment.name} failed:`, error);
        this.testResults.push({
          name: experiment.name,
          status: 'failed',
          error: error.message,
        });
      }
      // Wait between experiments only: also after a failure, so the next
      // experiment does not start against a system that is still settling,
      // but not after the last one, where the wait is pure delay.
      if (index < experiments.length - 1) {
        await this.sleep(cooldownMs);
      }
    }
    return this.generateReport();
  }

  /**
   * Run a specific chaos experiment end to end.
   *
   * @param {{name: string, description: string, type: string}} experiment
   * @returns {Promise<object>} Result record; `status` is 'passed' when the
   *   health check succeeded again within the recovery timeout. `duration` is
   *   the total wall-clock time in ms, `recoveryDuration` the part spent
   *   waiting for recovery.
   * @throws {Error} If the system is unhealthy beforehand, or if starting or
   *   monitoring the experiment throws.
   */
  async runExperiment(experiment) {
    const startTime = Date.now();

    // Pre-experiment health check
    const preHealthy = await this.checkHealth();
    if (!preHealthy) {
      throw new Error('System not healthy before experiment');
    }

    // Start experiment
    logger.info(`Starting experiment: ${experiment.name}`);
    const experimentId = await this.startChaosExperiment(experiment);

    let monitoringResults;
    try {
      // Monitor during experiment
      monitoringResults = await this.monitorExperiment(experiment, experimentId);
    } finally {
      // Always cancel a started experiment, even if monitoring throws, so a
      // fault is never left running. stopChaosExperiment() logs its own
      // failures instead of throwing, so it cannot mask a monitoring error.
      await this.stopChaosExperiment(experimentId);
    }

    // Wait for recovery, timed separately from the fault-injection window.
    const recoveryStart = Date.now();
    const recovered = await this.waitForRecovery();
    const endTime = Date.now();

    return {
      name: experiment.name,
      description: experiment.description,
      type: experiment.type,
      status: recovered ? 'passed' : 'failed',
      duration: endTime - startTime,
      recoveryDuration: endTime - recoveryStart,
      preHealthy: preHealthy,
      recovered: recovered,
      monitoring: monitoringResults,
      timestamp: new Date().toISOString(),
    };
  }

  /**
   * Create and start a chaos experiment using Azure Chaos Studio.
   *
   * @param {{name: string, type: string}} experiment
   * @returns {Promise<string>} The generated experiment name, used as its id.
   * @throws {Error} If the type is unknown, or if creating or starting the
   *   experiment fails (client errors are logged, then rethrown).
   */
  async startChaosExperiment(experiment) {
    const experimentDefinition = this.createExperimentDefinition(experiment);
    try {
      // The timestamp suffix gives every run its own experiment resource.
      const experimentName = `chaos-${experiment.name}-${Date.now()}`;
      await this.chaosClient.experiments.createOrUpdate(
        config.resourceGroupName,
        experimentName,
        experimentDefinition
      );
      // Start the experiment
      await this.chaosClient.experiments.start(
        config.resourceGroupName,
        experimentName
      );
      logger.info(`Chaos experiment ${experimentName} started`);
      return experimentName;
    } catch (error) {
      logger.error('Failed to start chaos experiment:', error);
      throw error;
    }
  }

  /**
   * Create the experiment definition for the given experiment type.
   *
   * Every definition targets the configured container app through one
   * selector and runs one continuous fault for `config.testDurationMinutes`.
   * NOTE(review): the fault URNs are kept exactly as written; they have not
   * been checked against the Chaos Studio fault library.
   *
   * @param {{name: string, type: string}} experiment
   * @returns {object} Definition passed to `experiments.createOrUpdate`.
   * @throws {Error} If `experiment.type` is not a handled type.
   */
  createExperimentDefinition(experiment) {
    const baseDefinition = {
      location: 'East US',
      identity: {
        type: 'SystemAssigned',
      },
      selectors: [
        {
          type: 'List',
          id: 'selector1',
          targets: [
            {
              type: 'ChaosTarget',
              id: `/subscriptions/${config.subscriptionId}/resourceGroups/${config.resourceGroupName}/providers/Microsoft.App/containerApps/${config.containerAppName}/providers/Microsoft.Chaos/targets/Microsoft-ContainerApp`,
            },
          ],
        },
      ],
    };
    const withFault = (faultUrn, parameters) => ({
      ...baseDefinition,
      steps: buildSingleActionSteps(faultUrn, parameters, config.testDurationMinutes),
    });

    switch (experiment.type) {
      case 'service-disruption':
        return withFault('urn:csci:microsoft:containerApp:stop/1.0', [
          { key: 'abruptShutdown', value: 'false' },
        ]);
      case 'network-chaos':
        return withFault('urn:csci:microsoft:containerApp:networkLatency/1.0', [
          { key: 'latencyMs', value: '1000' },
          { key: 'jitterMs', value: '200' },
        ]);
      case 'resource-exhaustion':
        // CPU and memory pressure share a type; the experiment name decides.
        return withFault(
          experiment.name.includes('cpu')
            ? 'urn:csci:microsoft:containerApp:cpuPressure/1.0'
            : 'urn:csci:microsoft:containerApp:memoryPressure/1.0',
          [{ key: 'pressureLevel', value: '80' }]
        );
      default:
        throw new Error(`Unknown experiment type: ${experiment.type}`);
    }
  }

  /**
   * Monitor system behavior while an experiment runs.
   *
   * Samples health, response time, and error rate every 30 seconds for
   * `config.testDurationMinutes`. A failing sample is logged and skipped.
   *
   * @param {object} experiment - Not read by this method.
   * @param {string} experimentId - Not read by this method.
   * @returns {Promise<object>} Collected samples; `resourceUtilization` is
   *   never populated here.
   */
  async monitorExperiment(experiment, experimentId) {
    const monitoringResults = {
      healthChecks: [],
      responseTime: [],
      errorRate: [],
      resourceUtilization: [],
    };
    const monitoringDuration = config.testDurationMinutes * 60 * 1000;
    const checkInterval = 30000; // 30 seconds
    const startTime = Date.now();

    while (Date.now() - startTime < monitoringDuration) {
      try {
        // Health check
        const healthResult = await this.checkHealth();
        monitoringResults.healthChecks.push({
          timestamp: new Date().toISOString(),
          healthy: healthResult,
        });
        // Response time check
        const responseTime = await this.measureResponseTime();
        monitoringResults.responseTime.push({
          timestamp: new Date().toISOString(),
          responseTime: responseTime,
        });
        // Error rate check (would need to query Application Insights)
        const errorRate = await this.getErrorRate();
        monitoringResults.errorRate.push({
          timestamp: new Date().toISOString(),
          errorRate: errorRate,
        });
        logger.info(`Monitoring - Health: ${healthResult}, Response Time: ${responseTime}ms, Error Rate: ${errorRate}%`);
      } catch (error) {
        // Interpolate the reason: a string passed as a separate argument is
        // not rendered by the JSON format configured for this logger.
        logger.warn(`Monitoring check failed: ${error.message}`);
      }
      await this.sleep(checkInterval);
    }
    return monitoringResults;
  }

  /**
   * Cancel a chaos experiment. Best effort: a failure is logged, not thrown.
   *
   * @param {string} experimentId - Name returned by startChaosExperiment().
   * @returns {Promise<void>}
   */
  async stopChaosExperiment(experimentId) {
    try {
      await this.chaosClient.experiments.cancel(
        config.resourceGroupName,
        experimentId
      );
      logger.info(`Chaos experiment ${experimentId} stopped`);
    } catch (error) {
      logger.error('Failed to stop chaos experiment:', error);
    }
  }

  /**
   * Check system health by requesting `${config.healthCheckUrl}/health`.
   *
   * @returns {Promise<boolean>} true only for an HTTP 200 response; any error
   *   thrown by the request is logged and reported as unhealthy.
   */
  async checkHealth() {
    try {
      const response = await axios.get(`${config.healthCheckUrl}/health`, {
        timeout: 10000,
      });
      return response.status === 200;
    } catch (error) {
      // Interpolated for the same reason as in monitorExperiment().
      logger.warn(`Health check failed: ${error.message}`);
      return false;
    }
  }

  /**
   * Measure how long a request to the health endpoint takes.
   *
   * @returns {Promise<number>} Elapsed milliseconds, or -1 if the request failed.
   */
  async measureResponseTime() {
    const startTime = Date.now();
    try {
      await axios.get(`${config.healthCheckUrl}/health`, {
        timeout: 30000,
      });
      return Date.now() - startTime;
    } catch (error) {
      return -1; // Indicate failure
    }
  }

  /**
   * Get error rate from monitoring.
   *
   * Placeholder: returns a random simulated value rather than a measurement.
   *
   * @returns {Promise<number>} A value in the range [0, 5).
   */
  async getErrorRate() {
    // In a real implementation, this would query Application Insights
    // For now, return a simulated value
    return Math.random() * 5; // 0-5% error rate
  }

  /**
   * Wait for system recovery after an experiment, polling the health check
   * every 10 seconds for up to `config.recoveryTimeoutMinutes`.
   *
   * @returns {Promise<boolean>} true once a health check passes, false on timeout.
   */
  async waitForRecovery() {
    const maxWaitTime = config.recoveryTimeoutMinutes * 60 * 1000;
    const checkInterval = 10000; // 10 seconds
    const startTime = Date.now();

    while (Date.now() - startTime < maxWaitTime) {
      const healthy = await this.checkHealth();
      if (healthy) {
        logger.info('System recovered successfully');
        return true;
      }
      logger.info('Waiting for system recovery...');
      await this.sleep(checkInterval);
    }
    logger.error('System failed to recover within timeout');
    return false;
  }

  /**
   * Generate and log the test report for everything in `testResults`.
   *
   * @returns {{summary: object, results: Array<object>, recommendations: string[]}}
   */
  generateReport() {
    const report = {
      summary: {
        totalTests: this.testResults.length,
        passed: this.testResults.filter((r) => r.status === 'passed').length,
        failed: this.testResults.filter((r) => r.status === 'failed').length,
        timestamp: new Date().toISOString(),
      },
      results: this.testResults,
      recommendations: this.generateRecommendations(),
    };
    logger.info('Chaos engineering test report:', report);
    return report;
  }

  /**
   * Generate recommendations based on test results.
   *
   * @returns {string[]} Zero or more recommendation messages.
   */
  generateRecommendations() {
    const recommendations = [];
    const slowRecoveryThresholdMs = 300000; // 5 minutes

    const hasFailures = this.testResults.some((r) => r.status === 'failed');
    if (hasFailures) {
      recommendations.push('Review failed experiments and improve system resilience');
    }

    // Judge recovery by the time spent recovering. The total duration also
    // contains the fault-injection window, which on its own reaches the
    // threshold with the default settings. Results without a recoveryDuration
    // fall back to the total duration.
    const hasSlowRecovery = this.testResults.some(
      (r) => (r.recoveryDuration ?? r.duration) > slowRecoveryThresholdMs
    );
    if (hasSlowRecovery) {
      recommendations.push('Optimize recovery time for better RTO');
    }

    if (this.testResults.length === 0) {
      recommendations.push('No tests executed - verify chaos studio configuration');
    }
    return recommendations;
  }

  /**
   * Utility function for delays.
   *
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>} Resolves after the delay.
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
/**
 * Main execution: run the full chaos suite, print the report to stdout, and
 * signal the outcome through the process exit code (1 if any experiment
 * failed or the run itself threw, otherwise 0).
 *
 * @returns {Promise<void>} Never rejects; errors are logged and mapped to
 *   exit code 1.
 */
async function main() {
  try {
    const runner = new ChaosTestRunner();
    const report = await runner.runAllTests();
    console.log('\n=== CHAOS ENGINEERING TEST REPORT ===');
    console.log(JSON.stringify(report, null, 2));
    // Set exitCode instead of calling process.exit(): exiting immediately can
    // cut off log and stdout writes that are still pending, whereas exitCode
    // lets the process end on its own once they have been flushed.
    process.exitCode = report.summary.failed > 0 ? 1 : 0;
  } catch (error) {
    logger.error('Chaos test execution failed:', error);
    process.exitCode = 1;
  }
}
// Run only when this file is the process entry point, not when it is imported.
// pathToFileURL() produces the same encoding as import.meta.url, so the check
// also holds for Windows paths and for paths with spaces or other characters
// that are percent-encoded in file URLs. argv[1] is checked first because it
// can be absent when no script path was given.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main();
}
export { ChaosTestRunner };