AnyCrawl MCP Server

by any4ai
mcp-server.ts (25 kB)
#!/usr/bin/env node
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
import {
    CallToolRequestSchema,
    ListToolsRequestSchema,
    Tool,
    CallToolResult,
    TextContent,
    ErrorCode,
    McpError,
} from '@modelcontextprotocol/sdk/types.js';
import { AnyCrawlClient } from '@anycrawl/js-sdk';
import { logger } from './logger.js';
import { z } from 'zod';
import {
    ScrapeToolSchema,
    CrawlToolSchema,
    SearchToolSchema,
    CrawlStatusToolSchema,
    CrawlResultsToolSchema,
    CancelCrawlToolSchema,
} from './schemas.js';

export class AnyCrawlMCPServer {
    private server: Server;
    private client: AnyCrawlClient;

    constructor(apiKey: string, baseUrl?: string) {
        logger.info('Initializing AnyCrawl MCP Server');
        this.server = new Server(
            {
                name: 'anycrawl-mcp-server',
                version: '0.0.3',
            },
            {
                capabilities: {
                    tools: {},
                },
            }
        );
        this.client = new AnyCrawlClient(apiKey, baseUrl);
        this.setupToolHandlers();
        logger.info('AnyCrawl MCP Server initialized successfully');
    }

    // Get tool definitions for FastMCP integration
    public getToolDefinitions(): Array<{ name: string; description: string; parameters: any }> {
        return this.getToolDefinitionsInternal().map(tool => ({
            name: tool.name,
            description: tool.description,
            parameters: tool.inputSchema
        }));
    }

    // Internal method to get tool definitions with zod schemas
    private getToolDefinitionsInternal(): Array<{ name: string; description: string; inputSchema: any }> {
        return [
            {
                name: 'anycrawl_scrape',
                description: `Scrape a single URL and extract content in selected formats.

Best for: One known page (articles, docs, product pages).
Not recommended for: Multi-page coverage (use anycrawl_crawl) or open-ended discovery (use anycrawl_search).

RECOMMENDED: Use 'playwright' engine for best results with dynamic content and modern websites.

Usage (parameters):
- url: HTTP/HTTPS URL to scrape (string, required)
- engine: 'playwright' | 'cheerio' | 'puppeteer' (required, default: 'playwright')
- proxy: Proxy URL (string, optional)
- formats: Output formats ['markdown'|'html'|'text'|'screenshot'|'screenshot@fullPage'|'rawHtml'|'json'] (optional)
- timeout: Request timeout in ms (number, optional)
- retry: Enable auto-retry on failure (boolean, optional)
- wait_for: Wait in ms for dynamic pages (number, optional)
- include_tags: HTML tags to include (string[], optional)
- exclude_tags: HTML tags to exclude (string[], optional)
- json_options: { schema?, user_prompt?, schema_name?, schema_description? } (optional)
- extract_source: 'html' | 'markdown' (optional)

Returns: { url, status, jobId?, title?, html?, markdown?, metadata?, timestamp? }

Examples:
- Recommended: { "url": "https://example.com", "engine": "playwright" }
- With JSON extraction: { "url": "https://news.ycombinator.com", "engine": "playwright", "formats": ["markdown"], "json_options": { "user_prompt": "Extract titles", "schema_name": "Articles" } }
- With JSON schema extraction: { "url": "https://example.com/article", "engine": "playwright", "json_options": { "schema_name": "Article", "schema_description": "Extract article metadata and content", "schema": { "type": "object", "properties": { "title": { "type": "string" }, "author": { "type": "string" }, "date": { "type": "string" }, "content": { "type": "string" } }, "required": ["title", "content"] } } }`,
                inputSchema: ScrapeToolSchema
            },
            {
                name: 'anycrawl_crawl',
                description: `Crawl an entire website with configurable depth and limits.

Best for: Multi-page coverage, site mapping, content discovery.
Not recommended for: Single pages (use anycrawl_scrape) or open-ended discovery (use anycrawl_search).

RECOMMENDED: Use 'playwright' engine for best results with dynamic content and modern websites.

Usage (parameters):
- url: Starting URL to crawl (string, required)
- engine: 'playwright' | 'cheerio' | 'puppeteer' (required, default: 'playwright')
- max_depth: Maximum crawl depth (number, optional, default: 10)
- limit: Maximum pages to crawl (number, optional, default: 100)
- strategy: Crawl strategy 'all' | 'same-domain' | 'same-hostname' | 'same-origin' (optional, default: 'same-domain')
- include_paths: Path patterns to include (string[], optional)
- exclude_paths: Path patterns to exclude (string[], optional)
- retry: Enable auto-retry on failure (boolean, optional)
- poll_seconds: Polling interval for job status (number, optional)
- poll_interval_ms: Polling interval in milliseconds (number, optional)
- timeout_ms: Job timeout in milliseconds (number, optional)
- scrape_options: Nested scrape options for each page (object, optional)

Returns: { job_id, status, message } for async jobs

Examples:
- Recommended: { "url": "https://example.com", "engine": "playwright", "limit": 50 }
- Deep crawl: { "url": "https://docs.example.com", "engine": "playwright", "max_depth": 5, "limit": 200 }
- Filtered crawl: { "url": "https://blog.example.com", "engine": "playwright", "include_paths": ["/posts/*"], "exclude_paths": ["/admin/*"] }`,
                inputSchema: CrawlToolSchema
            },
            {
                name: 'anycrawl_search',
                description: `Search the web and optionally scrape results.

Best for: Open-ended discovery, finding relevant content.
Not recommended for: Known URLs (use anycrawl_scrape) or comprehensive site coverage (use anycrawl_crawl).

RECOMMENDED: Use limit=5 for balanced performance and cost. Use 'playwright' engine for scraping results.

Usage (parameters):
- query: Search query string (string, required)
- engine: Search engine 'google' (optional, default: 'google')
- limit: Number of results to return (number, optional, default: 5)
- offset: Number of results to skip (number, optional, default: 0)
- pages: Number of search result pages to process (number, optional)
- lang: Language code (string, optional)
- country: Country code (string, optional)
- safeSearch: Safe search level 0-2 (number, optional)
- scrape_options: Options for scraping search results (object, optional)

Returns: Array of search results with optional scraped content

Examples:
- Recommended: { "query": "artificial intelligence news", "limit": 5 }
- With scraping: { "query": "TypeScript tutorials", "limit": 5, "scrape_options": { "formats": ["markdown"], "engine": "playwright" } }
- Localized search: { "query": "machine learning", "lang": "es", "country": "ES", "limit": 5 }`,
                inputSchema: SearchToolSchema
            },
            {
                name: 'anycrawl_crawl_status',
                description: `Get the status of a crawl job.

Check progress, completion status, and statistics for an ongoing or completed crawl job.

Usage (parameters):
- job_id: The crawl job ID (string, required)

Returns: { job_id, status, start_time, expires_at, credits_used, total, completed, failed }

Examples:
- Check status: { "job_id": "crawl_12345" }`,
                inputSchema: CrawlStatusToolSchema
            },
            {
                name: 'anycrawl_crawl_results',
                description: `Get the results of a completed crawl job.

Retrieve the scraped content and metadata from a completed crawl job.

Usage (parameters):
- job_id: The crawl job ID (string, required)
- skip: Number of results to skip (number, optional, default: 0)

Returns: { status, total, completed, creditsUsed, next?, data[] }

Examples:
- Get all results: { "job_id": "crawl_12345" }
- Paginated results: { "job_id": "crawl_12345", "skip": 50 }`,
                inputSchema: CrawlResultsToolSchema
            },
            {
                name: 'anycrawl_cancel_crawl',
                description: `Cancel a running crawl job.

Stop an ongoing crawl job and prevent further processing.

Usage (parameters):
- job_id: The crawl job ID to cancel (string, required)

Returns: { success: boolean, message: string }

Examples:
- Cancel job: { "job_id": "crawl_12345" }`,
                inputSchema: CancelCrawlToolSchema
            }
        ];
    }

    // Helper for tests to simulate tool calls without MCP transport
    public async handleToolCall(request: { name: string; arguments: any }): Promise<any> {
        const { name, arguments: args } = request as any;

        // Validate arguments
        if (!args || typeof args !== 'object') {
            logger.error(`Invalid arguments for tool ${name}:`, { args });
            return {
                content: [{ type: 'text', text: `Invalid arguments: expected object, got ${typeof args}` }],
                isError: true,
            };
        }

        try {
            logger.info(`Tool called: ${name}`, { tool: name, parameters: args, timestamp: new Date().toISOString() });
            switch (name) {
                case 'anycrawl_scrape': {
                    const r = await this.handleScrape(args);
                    logger.info(`Tool ${name} completed successfully`, { tool: name, timestamp: new Date().toISOString() });
                    return r;
                }
                case 'anycrawl_crawl': {
                    const r = await this.handleCrawl(args);
                    logger.info(`Tool ${name} completed successfully`, { tool: name, timestamp: new Date().toISOString() });
                    return r;
                }
                case 'anycrawl_crawl_status': {
                    const r = await this.handleCrawlStatus(args);
                    logger.info(`Tool ${name} completed successfully`, { tool: name, timestamp: new Date().toISOString() });
                    return r;
                }
                case 'anycrawl_crawl_results': {
                    const r = await this.handleCrawlResults(args);
                    logger.info(`Tool ${name} completed successfully`, { tool: name, timestamp: new Date().toISOString() });
                    return r;
                }
                case 'anycrawl_cancel_crawl': {
                    const r = await this.handleCancelCrawl(args);
                    logger.info(`Tool ${name} completed successfully`, { tool: name, timestamp: new Date().toISOString() });
                    return r;
                }
                case 'anycrawl_search': {
                    const r = await this.handleSearch(args);
                    logger.info(`Tool ${name} completed successfully`, { tool: name, timestamp: new Date().toISOString() });
                    return r;
                }
                default:
                    return {
                        content: [{ type: 'text', text: `Unknown tool: ${name}` as string }],
                        isError: true,
                    } as any;
            }
        } catch (error) {
            const message = error instanceof Error ? error.message : 'Unknown error occurred';
            logger.error(`Tool ${name} failed:`, { tool: name, error: message, parameters: args, timestamp: new Date().toISOString() });
            return {
                content: [{ type: 'text', text: `Tool execution failed: ${message}` as string }],
                isError: true,
            } as any;
        }
    }

    private setupToolHandlers(): void {
        this.server.setRequestHandler(ListToolsRequestSchema, async () => {
            return { tools: this.getToolDefinitionsInternal() };
        });

        this.server.setRequestHandler(CallToolRequestSchema, async (request: any) => {
            const { name, arguments: args } = request.params;
            logger.info(`Tool called: ${name}`, { tool: name, parameters: args, timestamp: new Date().toISOString() });
            try {
                let result: CallToolResult;
                switch (name) {
                    case 'anycrawl_scrape':
                        result = await this.handleScrape(args);
                        break;
                    case 'anycrawl_crawl':
                        result = await this.handleCrawl(args);
                        break;
                    case 'anycrawl_crawl_status':
                        result = await this.handleCrawlStatus(args);
                        break;
                    case 'anycrawl_crawl_results':
                        result = await this.handleCrawlResults(args);
                        break;
                    case 'anycrawl_cancel_crawl':
                        result = await this.handleCancelCrawl(args);
                        break;
                    case 'anycrawl_search':
                        result = await this.handleSearch(args);
                        break;
                    default:
                        logger.warn(`Unknown tool requested: ${name}`);
                        throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${name}`);
                }
                logger.info(`Tool ${name} completed successfully`, { tool: name, timestamp: new Date().toISOString() });
                return result;
            } catch (error) {
                const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
                logger.error(`Tool ${name} failed:`, { tool: name, error: errorMessage, parameters: args, timestamp: new Date().toISOString() });
                if (error instanceof McpError) {
                    throw error;
                }
                throw new McpError(ErrorCode.InternalError, `Tool execution failed: ${errorMessage}`);
            }
        });
    }

    private async handleScrape(args: any): Promise<CallToolResult> {
        logger.info(`Starting scrape for URL: ${args.url}`);
        const validatedArgs = ScrapeToolSchema.parse(args);
        const scrapeArgs: any = {
            url: validatedArgs.url,
            engine: validatedArgs.engine,
        };
        if ('proxy' in args) scrapeArgs.proxy = validatedArgs.proxy;
        if ('formats' in args) scrapeArgs.formats = validatedArgs.formats;
        if ('timeout' in args) scrapeArgs.timeout = validatedArgs.timeout;
        if ('retry' in args) scrapeArgs.retry = validatedArgs.retry;
        if ('wait_for' in args) scrapeArgs.wait_for = validatedArgs.wait_for;
        if ('include_tags' in args) scrapeArgs.include_tags = validatedArgs.include_tags;
        if ('exclude_tags' in args) scrapeArgs.exclude_tags = validatedArgs.exclude_tags;
        if ('json_options' in args) scrapeArgs.json_options = validatedArgs.json_options;
        if ('extract_source' in args) scrapeArgs.extract_source = validatedArgs.extract_source;

        const result = await this.client.scrape(scrapeArgs);
        if (!result || typeof (result as any).status !== 'string') {
            logger.error('Unexpected scrape result payload:', result as any);
            throw new McpError(ErrorCode.InternalError, 'Unexpected scrape result from AnyCrawl API');
        }
        if (result.status === 'failed') {
            logger.warn(`Scraping failed for ${result.url}: ${result.error}`);
            return {
                content: [
                    {
                        type: 'text',
                        text: `Scraping failed for ${result.url}: ${result.error}`,
                    },
                ],
                isError: true,
            };
        }
        logger.info(`Scraping completed successfully for ${result.url}`);
        const response = {
            url: result.url,
            status: result.status,
            jobId: result.jobId,
            title: result.title,
            html: result.html,
            markdown: result.markdown,
            metadata: result.metadata,
            timestamp: result.timestamp,
        };
        const ret = {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify(response, null, 2),
                } as TextContent,
            ],
        };
        logger.debug('Tool anycrawl_scrape completed successfully');
        return ret;
    }

    private async handleCrawl(args: any): Promise<CallToolResult> {
        logger.info(`Starting crawl for URL: ${args.url}`);
        const validatedArgs = CrawlToolSchema.parse(args);

        // Top-level crawl params (NOT part of ScrapeOptionsSchema)
        const crawlArgs: any = {
            url: validatedArgs.url,
        };
        if ('retry' in args) crawlArgs.retry = validatedArgs.retry;
        if ('exclude_paths' in args) crawlArgs.exclude_paths = validatedArgs.exclude_paths;
        if ('include_paths' in args) crawlArgs.include_paths = validatedArgs.include_paths;
        if ('max_depth' in args) crawlArgs.max_depth = validatedArgs.max_depth;
        if ('strategy' in args) crawlArgs.strategy = validatedArgs.strategy;
        if ('limit' in args) crawlArgs.limit = validatedArgs.limit;
        if ('engine' in args) crawlArgs.engine = validatedArgs.engine;

        // ScrapeOptionsSchema fields must be nested under scrape_options
        const scrapeOptions: any = {};
        if ('proxy' in args) scrapeOptions.proxy = validatedArgs.proxy;
        if ('formats' in args) scrapeOptions.formats = validatedArgs.formats;
        if ('timeout' in args) scrapeOptions.timeout = validatedArgs.timeout;
        if ('wait_for' in args) scrapeOptions.wait_for = validatedArgs.wait_for;
        if ('include_tags' in args) scrapeOptions.include_tags = validatedArgs.include_tags;
        if ('exclude_tags' in args) scrapeOptions.exclude_tags = validatedArgs.exclude_tags;
        if ('json_options' in args) scrapeOptions.json_options = validatedArgs.json_options;
        if ('extract_source' in args) scrapeOptions.extract_source = validatedArgs.extract_source;

        // Use SDK's aggregated crawl which handles creation and polling internally
        const pollSeconds: number = (args as any).poll_seconds ??
            (((args as any).poll_interval_ms ? Math.max(1, Math.round((args as any).poll_interval_ms / 1000)) : 3));
        const timeoutMs: number = (args as any).timeout_ms ?? 60000;
        const aggregated = await (this.client as any).crawl(crawlArgs, pollSeconds, timeoutMs);
        const ret = {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify(aggregated, null, 2),
                } as TextContent,
            ],
        };
        logger.debug('Tool anycrawl_crawl completed successfully');
        return ret;
    }

    private async handleCrawlStatus(args: any): Promise<CallToolResult> {
        const validatedArgs = CrawlStatusToolSchema.parse(args);
        const result = await this.client.getCrawlStatus(validatedArgs.job_id);
        const response = {
            job_id: result.job_id,
            status: result.status,
            start_time: result.start_time,
            expires_at: result.expires_at,
            credits_used: result.credits_used,
            total: result.total,
            completed: result.completed,
            failed: result.failed,
        };
        const ret3 = {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify(response, null, 2),
                } as TextContent,
            ],
        };
        logger.debug('Tool anycrawl_crawl_status completed successfully');
        return ret3;
    }

    private async handleCrawlResults(args: any): Promise<CallToolResult> {
        const validatedArgs = CrawlResultsToolSchema.parse(args);
        const result = await this.client.getCrawlResults(validatedArgs.job_id, validatedArgs.skip);
        const response = {
            status: result.status,
            total: result.total,
            completed: result.completed,
            creditsUsed: result.creditsUsed,
            next: result.next,
            data: result.data,
        };
        const ret4 = {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify(response, null, 2),
                } as TextContent,
            ],
        };
        logger.debug('Tool anycrawl_crawl_results completed successfully');
        return ret4;
    }

    private async handleCancelCrawl(args: any): Promise<CallToolResult> {
        const validatedArgs = CancelCrawlToolSchema.parse(args);
        const result = await this.client.cancelCrawl(validatedArgs.job_id);
        const ret5 = {
            content: [
                {
                    type: 'text',
                    text: `Crawl job cancelled successfully!\nJob ID: ${result.job_id}\nStatus: ${result.status}`,
                } as TextContent,
            ],
        };
        logger.debug('Tool anycrawl_cancel_crawl completed successfully');
        return ret5;
    }

    private async handleSearch(args: any): Promise<CallToolResult> {
        const validatedArgs = SearchToolSchema.parse(args);

        // Top-level search params
        const searchArgs: any = {
            query: validatedArgs.query,
        };
        if ('engine' in args) searchArgs.engine = validatedArgs.engine; // search engine (e.g., google)
        if ('limit' in args) searchArgs.limit = validatedArgs.limit;
        if ('offset' in args) searchArgs.offset = validatedArgs.offset;
        if ('pages' in args) searchArgs.pages = validatedArgs.pages;
        if ('lang' in args) searchArgs.lang = validatedArgs.lang;
        if ('country' in args) searchArgs.country = validatedArgs.country;
        if ('safeSearch' in args) searchArgs.safeSearch = validatedArgs.safeSearch;

        // Build nested scrape_options from SearchScrapeOptions fields
        const scrapeOptions: any = {};
        if ('scrape_engine' in args) scrapeOptions.engine = (validatedArgs as any).scrape_engine;
        if ('proxy' in args) scrapeOptions.proxy = validatedArgs.proxy as any;
        if ('formats' in args) scrapeOptions.formats = validatedArgs.formats as any;
        if ('timeout' in args) scrapeOptions.timeout = validatedArgs.timeout as any;
        if ('wait_for' in args) scrapeOptions.wait_for = validatedArgs.wait_for as any;
        if ('include_tags' in args) scrapeOptions.include_tags = validatedArgs.include_tags as any;
        if ('exclude_tags' in args) scrapeOptions.exclude_tags = validatedArgs.exclude_tags as any;
        if ('json_options' in args) scrapeOptions.json_options = validatedArgs.json_options as any;
        if ('extract_source' in args) scrapeOptions.extract_source = (validatedArgs as any).extract_source;
        if (Object.keys(scrapeOptions).length > 0) {
            searchArgs.scrape_options = scrapeOptions;
        }

        const results = await this.client.search(searchArgs);
        const ret6 = {
            content: [
                {
                    type: 'text',
                    text: JSON.stringify(results, null, 2),
                } as TextContent,
            ],
        };
        logger.debug('Tool anycrawl_search completed successfully');
        return ret6;
    }

    async run(): Promise<void> {
        const transport = new StdioServerTransport();
        await this.server.connect(transport);
        logger.info('AnyCrawl MCP Server running on stdio');
    }

    public async connectTransport(transport: StdioServerTransport | StreamableHTTPServerTransport | SSEServerTransport): Promise<void> {
        await this.server.connect(transport);
    }
}
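The class above only defines the server; a separate entry point still has to construct it and start a transport. Below is a minimal launch sketch, not part of the file above: the module name and the ANYCRAWL_API_KEY / ANYCRAWL_BASE_URL environment variables are assumptions for illustration, while the constructor signature and run() (which connects a StdioServerTransport) come from the class itself.

// run-stdio.ts — hypothetical entry point; file name and env var names are assumptions.
import { AnyCrawlMCPServer } from './mcp-server.js';

const apiKey = process.env.ANYCRAWL_API_KEY;   // assumed env var
const baseUrl = process.env.ANYCRAWL_BASE_URL; // assumed env var, optional

if (!apiKey) {
    console.error('Missing ANYCRAWL_API_KEY');
    process.exit(1);
}

// Construct the server with the AnyCrawl API key and start it on stdio.
const server = new AnyCrawlMCPServer(apiKey, baseUrl);
server.run().catch((err) => {
    console.error('Server failed to start:', err);
    process.exit(1);
});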
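The public handleToolCall helper also makes it possible to exercise a tool without any MCP transport, as the comment in the source notes. A quick test-style sketch, assuming a valid API key and using the anycrawl_scrape arguments documented in the tool description:

// Hypothetical direct invocation of handleToolCall (e.g., from a test script).
import { AnyCrawlMCPServer } from './mcp-server.js';

async function main() {
    const server = new AnyCrawlMCPServer(process.env.ANYCRAWL_API_KEY ?? ''); // assumed env var

    // Arguments mirror the recommended anycrawl_scrape example above.
    const result = await server.handleToolCall({
        name: 'anycrawl_scrape',
        arguments: { url: 'https://example.com', engine: 'playwright', formats: ['markdown'] },
    });

    // Both success and failure return a single text content item; isError flags failures.
    console.log(result.isError ? 'Scrape failed' : result.content[0].text);
}

main().catch(console.error);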
