scrape.py (2.54 kB)
import asyncio
import json
import re
from typing import Any, List

import mcp.types as types
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig


async def scrape_url(url: str) -> List[Any]:
    """Scrape a webpage using crawl4ai, keeping the implementation simple.

    Args:
        url: The URL to scrape

    Returns:
        A list containing a TextContent object with the result as JSON
    """
    try:
        # Simple validation for domains/subdomains with an optional http(s) scheme
        url_pattern = re.compile(
            r"^(?:https?://)?(?:[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?\.)+[A-Za-z]{2,}(?:/[^/\s]*)*$"
        )
        if not url_pattern.match(url):
            return [
                types.TextContent(
                    type="text",
                    text=json.dumps(
                        {
                            "success": False,
                            "url": url,
                            "error": "Invalid URL format",
                        }
                    ),
                )
            ]

        # Add https:// if the scheme is missing
        if not url.startswith("http://") and not url.startswith("https://"):
            url = f"https://{url}"

        # Use default configurations with minimal customization
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            ignore_https_errors=True,
            verbose=False,
            extra_args=[
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-dev-shm-usage",
            ],
        )
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            verbose=False,
            page_timeout=30 * 1000,  # 30 s, converted to milliseconds
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await asyncio.wait_for(
                crawler.arun(
                    url=url,
                    config=run_config,
                ),
                timeout=30,
            )

        # Create the response in the requested format
        return [
            types.TextContent(
                type="text", text=json.dumps({"markdown": result.markdown})
            )
        ]
    except Exception as e:
        return [
            types.TextContent(
                type="text",
                text=json.dumps({"success": False, "url": url, "error": str(e)}),
            )
        ]
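
For context, here is a minimal sketch of how scrape_url could be driven on its own, assuming it runs in the same module and that crawl4ai plus a Playwright Chromium browser are installed. The entry point and the sample URL are illustrative, not part of the file. Note the response shape: the single TextContent item carries a JSON string in its .text field, and only error payloads include a "success" key.

import asyncio
import json

async def main() -> None:
    # Hypothetical driver for scrape_url (defined above).
    contents = await scrape_url("example.com")
    payload = json.loads(contents[0].text)
    if payload.get("success") is False:
        # Error payloads carry "success", "url", and "error" keys.
        print(f"Scrape failed for {payload['url']}: {payload['error']}")
    else:
        # Success payloads carry only the "markdown" key.
        print(payload["markdown"][:500])

if __name__ == "__main__":
    asyncio.run(main())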

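Because scrape_url returns mcp.types content objects, it is presumably exposed as a tool by an MCP server elsewhere in this repository. That server module is not shown on this page, so the following stdio wiring, using the low-level Server API from the official MCP Python SDK, is only a sketch of how the function might be registered; the tool name "scrape", its input schema, and the "scrape" module import are assumptions.

import asyncio

import mcp.types as types
from mcp.server import Server
from mcp.server.stdio import stdio_server

from scrape import scrape_url  # assumes the file above is importable as scrape.py

server = Server("crawl4ai-mcp")  # server name chosen for illustration

@server.list_tools()
async def list_tools() -> list[types.Tool]:
    # Advertise one tool; name, description, and schema are illustrative.
    return [
        types.Tool(
            name="scrape",
            description="Scrape a webpage and return its content as markdown",
            inputSchema={
                "type": "object",
                "properties": {"url": {"type": "string"}},
                "required": ["url"],
            },
        )
    ]

@server.call_tool()
async def call_tool(name: str, arguments: dict) -> list[types.TextContent]:
    # scrape_url already returns MCP TextContent items, so delegate directly.
    if name == "scrape":
        return await scrape_url(arguments["url"])
    raise ValueError(f"Unknown tool: {name}")

async def main() -> None:
    # Serve over stdio, the transport MCP clients typically launch servers with.
    async with stdio_server() as (read_stream, write_stream):
        await server.run(
            read_stream, write_stream, server.create_initialization_options()
        )

if __name__ == "__main__":
    asyncio.run(main())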
