import asyncio
import re
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse
import typer
from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import (
ContentTypeFilter,
FilterChain,
URLPatternFilter,
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# Default generator; the custom LinkRemovingMarkdownGenerator is imported below
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from typing_extensions import Annotated
# Import from our modules
from .markdown import (
LinkRemovingMarkdownGenerator,  # Custom generator that strips links/nav elements
)
from .config import (
DEFAULT_CACHE_MODE,
DEFAULT_CONTENT_TYPES,
DEFAULT_EXCLUDE_PATTERNS,
DEFAULT_INCLUDE_PATTERNS,
DEFAULT_KEYWORDS,
DEFAULT_KEYWORD_WEIGHT,
DEFAULT_MAX_DEPTH,
# DEFAULT_OUTPUT_FILENAME, # Unused
DEFAULT_OUTPUT_TITLE,
)
from .crawler import run_crawl
from .utils import err_console
# --- Typer Application ---
app = typer.Typer(
help="A CLI tool to crawl websites and generate Markdown documentation.",
# Optional: disable Typer's own completion commands if not needed
add_completion=False,
)
@app.command()
def main(
url: Annotated[str, typer.Argument(help="The starting URL for the crawl.")],
output_file: Annotated[
Optional[Path], # Allow None
typer.Option(
"--output",
"-o",
help=(
"Path to save the merged Markdown output. If omitted, "
"generates filename from URL in ./storage/."
),
),
] = None, # Default to None to detect if user provided it
output_title: Annotated[
str,
typer.Option("--title", help="Title for the output Markdown file."),
] = DEFAULT_OUTPUT_TITLE,
max_depth: Annotated[
int,
typer.Option(
"--max-depth",
"-d",
help="Maximum crawl depth (must be between 1 and 5).",
min=1,
max=5,
# Typer rejects values outside the min/max range
),
] = DEFAULT_MAX_DEPTH,
include_external: Annotated[
bool,
typer.Option(
"--include-external/--exclude-external",
help="Follow external links during crawl.",
),
] = False,
include_patterns: Annotated[
Optional[List[str]], # Make optional to allow empty list from CLI
typer.Option(
"--include-pattern",
help=(
"URL pattern to include (can be used multiple times). "
"Default: common doc patterns."
),
),
] = None, # Default to None, handle logic below
exclude_patterns: Annotated[
Optional[List[str]], # Make optional to allow empty list from CLI
typer.Option(
"--exclude-pattern",
help=(
"URL pattern to exclude (can be used multiple times). "
"Default: fragment identifiers."
),
),
] = None, # Default to None, handle logic below
content_types: Annotated[
Optional[List[str]], # Make optional to allow empty list from CLI
typer.Option(
"--content-type",
help=(
"Allowed content type (can be used multiple times). "
"Default: text/html."
),
),
] = None, # Default to None, handle logic below
keywords: Annotated[
Optional[List[str]], # Make optional to allow empty list from CLI
typer.Option(
"--keyword",
"-k",
help=(
"Keyword for relevance scoring (can be used multiple times). "
"Default: common doc keywords."
),
),
] = None, # Default to None, handle logic below
keyword_weight: Annotated[
float,
typer.Option("--keyword-weight", help="Weight for keyword relevance scorer."),
] = DEFAULT_KEYWORD_WEIGHT,
remove_links_flag: Annotated[
bool,
typer.Option(
"--remove-links/--keep-links",
help=(
"Remove nav links, headers, footers from HTML before "
"markdown conversion."
),
),
] = True,
ignore_images: Annotated[
bool,
typer.Option(
"--ignore-images/--include-images",
help="Ignore images during markdown conversion.",
),
] = True,
verbose: Annotated[
bool,
typer.Option("--verbose", "-v", help="Enable verbose output during crawl."),
] = False,
stream: Annotated[
bool,
typer.Option("--stream/--no-stream", help="Process results as they arrive."),
] = True,
# Accept cache_mode as string, parse manually later
cache_mode_str: Annotated[
str,
typer.Option(
"--cache-mode",
help=(
f"Cache mode to use. Choices: "
f"{[e.name.lower() for e in CacheMode]}. Case-insensitive."
),
),
] = DEFAULT_CACHE_MODE.name, # Default to the name of the enum member
exclude_markdown_external_links: Annotated[
bool,
typer.Option(
"--exclude-markdown-external-links/" "--include-markdown-external-links",
help="Exclude external links from the final markdown output.",
),
] = True,
only_text: Annotated[
bool,
typer.Option(
"--only-text/--keep-markup",
help="Extract only text content, ignoring markup, for markdown.",
),
] = True,
wait_for: Annotated[
Optional[str],
typer.Option(
"--wait-for",
help=(
"Time in seconds or CSS selector to wait for before capturing content. "
"For JavaScript-rendered pages, use a number (e.g., 5) or a CSS selector "
"(e.g., 'css:.content-loaded'). Prefix with 'js:' for JavaScript conditions."
),
),
] = None,
js_code: Annotated[
Optional[str],
typer.Option(
"--js-code",
help=(
"JavaScript code to execute on the page before capturing content. "
"Useful for interacting with the page or making it render dynamic content."
),
),
] = None,
page_load_timeout: Annotated[
int,
typer.Option(
"--page-load-timeout",
help="Maximum time in seconds to wait for page to load completely.",
),
] = 30,
wait_for_js_render: Annotated[
bool,
typer.Option(
"--wait-for-js-render/--no-wait-for-js-render",
help="Wait for JavaScript-rendered content using a special script for SPAs.",
),
] = False,
):
"""
Crawls a website starting from the given URL and generates a merged
Markdown file.
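
Example (placeholder URL):

    python -m crawler_cli.main https://docs.example.com -d 2 -k api -o storage/docs.md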
"""
# --- Handle Optional List Defaults ---
# If the user did not pass an option at all, its value is None and we fall
# back to the corresponding default list from config.
final_include_patterns = (
include_patterns if include_patterns is not None else DEFAULT_INCLUDE_PATTERNS
)
final_exclude_patterns = (
exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
)
final_content_types = (
content_types if content_types is not None else DEFAULT_CONTENT_TYPES
)
final_keywords = keywords if keywords is not None else DEFAULT_KEYWORDS
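# Note: repeated options accumulate into a list, e.g. (illustrative patterns)
# --include-pattern "*docs*" --include-pattern "*guide*" -> ["*docs*", "*guide*"]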
# --- Determine Output File Path ---
if output_file is None:
# User did not specify --output, generate filename from URL
storage_dir = Path("storage")
storage_dir.mkdir(parents=True, exist_ok=True) # Ensure storage dir exists
try:
parsed_url = urlparse(url)
domain = parsed_url.netloc
# Sanitize domain for use in filename
# Strip a leading "www." and replace filename-unsafe characters with underscores
sanitized_domain = re.sub(r"^www\.", "", domain) # Remove www.
sanitized_domain = re.sub(
r"[^\w.-]+", "_", sanitized_domain
) # Replace invalid chars
# Strip leading/trailing dots and underscores
sanitized_domain = sanitized_domain.strip("._")
# Handle cases where sanitization results in empty string
if not sanitized_domain:
sanitized_domain = "default_crawl_output"
output_filename = f"{sanitized_domain}.md"
output_file = storage_dir / output_filename
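# e.g. url "https://www.example.com/docs/" -> domain "www.example.com"
# -> sanitized "example.com" -> output file "storage/example.com.md"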
print("No output file specified. Using generated path:\n" f"{output_file}")
except Exception as e:
err_console.print(
f"[bold red]Error generating output filename from URL '{url}':"
f"[/bold red] {e}"
)
raise typer.Exit(code=1)
# else: output_file remains the Path provided by the user
# --- Configure Filters ---
filters = []
if final_exclude_patterns:
filters.append(URLPatternFilter(patterns=final_exclude_patterns, reverse=True))
if final_include_patterns:
filters.append(URLPatternFilter(patterns=final_include_patterns))
if final_content_types:
filters.append(ContentTypeFilter(allowed_types=final_content_types))
filter_chain = FilterChain(filters)
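# A URL must pass every filter in the chain; the reverse=True pattern filter
# above is intended to reject URLs matching the exclude patterns.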
# --- Configure Scorer ---
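# The scorer rates candidate URLs by keyword relevance, scaled by --keyword-weight.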
scorer = KeywordRelevanceScorer(keywords=final_keywords, weight=keyword_weight)
# --- Configure Crawling Strategy ---
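# Best-first crawling visits the highest-scoring discovered URLs first,
# bounded by max_depth.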
strategy = BestFirstCrawlingStrategy(
max_depth=max_depth,
include_external=include_external,
filter_chain=filter_chain,
url_scorer=scorer,
)
# --- Configure Markdown Generator ---
# Choose the generator based on the flag
if remove_links_flag:
# Use our custom generator that removes links/nav
markdown_strategy = LinkRemovingMarkdownGenerator(
options={"ignore_images": ignore_images}
)
if verbose:
print("Using LinkRemovingMarkdownGenerator.")
else:
# Use the default generator
markdown_strategy = DefaultMarkdownGenerator(
options={"ignore_images": ignore_images}
)
if verbose:
print("Using DefaultMarkdownGenerator (keeping links/nav).")
# --- Configure Browser ---
browser_config = BrowserConfig(
verbose=verbose
# Add other browser configs here if needed later
)
# --- Configure Run ---
# Pass the selected markdown strategy here
# Determine the final wait_for value
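# If --wait-for-js-render is set and no explicit --wait-for was given,
# fall back to waiting 5 seconds before capturing content.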
final_wait_for = wait_for if wait_for else ("5" if wait_for_js_render else None)
merged_run_config = CrawlerRunConfig(
deep_crawl_strategy=strategy,
# Correct argument name is markdown_generator
markdown_generator=markdown_strategy,
verbose=verbose,
stream=stream,
# cache_mode is set below after parsing --cache-mode
exclude_external_links=exclude_markdown_external_links,
only_text=only_text,
# Add wait_for parameter using our determined value
wait_for=final_wait_for,
# Add js_code parameter if provided
js_code=js_code if js_code else (
# If wait_for_js_render is enabled and no custom js_code is provided,
# use this SPA-friendly script to ensure content loads
"""
// Scroll through the page to trigger lazy loading
function scrollToBottom() {
window.scrollTo(0, document.body.scrollHeight);
}
// Scroll a few times with delay to ensure content loads
scrollToBottom();
setTimeout(scrollToBottom, 1000);
setTimeout(scrollToBottom, 2000);
// Try to find and click any "show more" or expand buttons
setTimeout(() => {
const buttons = Array.from(document.querySelectorAll('button, a, [role="button"]'))
.filter(el => {
const text = el.textContent.toLowerCase();
return text.includes('show') ||
text.includes('more') ||
text.includes('expand') ||
text.includes('load');
});
buttons.forEach(button => button.click());
}, 3000);
""" if wait_for_js_render else None
),
# Add page_timeout (milliseconds)
page_timeout=page_load_timeout * 1000, # Convert to milliseconds
)
# --- Manually Parse Cache Mode ---
try:
# Find the matching enum member case-insensitively
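# e.g. "--cache-mode bypass" (any case) should resolve to CacheMode.BYPASS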
parsed_cache_mode = next(
mode for mode in CacheMode if mode.name.lower() == cache_mode_str.lower()
)
merged_run_config.cache_mode = parsed_cache_mode
except StopIteration:
valid_modes = [e.name for e in CacheMode]
err_console.print(
f"[bold red]Invalid cache mode:[/bold red] '{cache_mode_str}'. "
f"Valid choices (case-insensitive): {valid_modes}"
)
raise typer.Exit(code=1)
# --- Run the Crawl ---
# Use asyncio.run to execute the async function from the sync Typer command
try:
asyncio.run(
run_crawl(
start_url=url,
output_file=output_file,
output_title=output_title,
browser_config=browser_config,
run_config=merged_run_config,
verbose=verbose,
)
)
except typer.Exit:
# Catch typer.Exit to prevent asyncio errors on controlled exits
pass
except Exception as e:
# Catch other potential errors during setup or run_crawl call
err_console.print(f"[bold red]An unexpected error occurred:[/bold red] {e}")
raise typer.Exit(code=1)
# --- Entry Point ---
# This allows running the CLI directly using `python -m crawler_cli.main`
if __name__ == "__main__":
app()