Documentation Crawler & MCP Server

by alizdavoodi
main.py (14.3 kB)
import asyncio
import re
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse

import typer
from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import (
    ContentTypeFilter,
    FilterChain,
    URLPatternFilter,
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

# Import DefaultMarkdownGenerator AND our custom one
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from typing_extensions import Annotated

# Import from our modules
from .markdown import (
    LinkRemovingMarkdownGenerator,
)  # Import the custom generator  # noqa: E501
from .config import (
    DEFAULT_CACHE_MODE,
    DEFAULT_CONTENT_TYPES,
    DEFAULT_EXCLUDE_PATTERNS,
    DEFAULT_INCLUDE_PATTERNS,
    DEFAULT_KEYWORDS,
    DEFAULT_KEYWORD_WEIGHT,
    DEFAULT_MAX_DEPTH,
    # DEFAULT_OUTPUT_FILENAME,  # Unused
    DEFAULT_OUTPUT_TITLE,
)
from .crawler import run_crawl
from .utils import err_console

# --- Typer Application ---
app = typer.Typer(
    help="A CLI tool to crawl websites and generate Markdown documentation.",
    # Optional: disable Typer's own completion commands if not needed
    add_completion=False,
)


@app.command()
def main(
    url: Annotated[str, typer.Argument(help="The starting URL for the crawl.")],
    output_file: Annotated[
        Optional[Path],  # Allow None
        typer.Option(
            "--output",
            "-o",
            help=(
                "Path to save the merged Markdown output. If omitted, "
                "generates filename from URL in ./storage/."
            ),
        ),
    ] = None,  # Default to None to detect if user provided it
    output_title: Annotated[
        str,
        typer.Option("--title", help="Title for the output Markdown file."),
    ] = DEFAULT_OUTPUT_TITLE,
    max_depth: Annotated[
        int,
        typer.Option(
            "--max-depth",
            "-d",
            help="Maximum crawl depth (must be between 1 and 5).",
            min=1,
            max=5,  # Typer will now error if value is outside the min/max range
        ),
    ] = DEFAULT_MAX_DEPTH,
    include_external: Annotated[
        bool,
        typer.Option(
            "--include-external/--exclude-external",
            help="Follow external links during crawl.",
        ),
    ] = False,
    include_patterns: Annotated[
        Optional[List[str]],  # Make optional to allow empty list from CLI
        typer.Option(
            "--include-pattern",
            help=(
                "URL pattern to include (can be used multiple times). "
                "Default: common doc patterns."
            ),
        ),
    ] = None,  # Default to None, handle logic below
    exclude_patterns: Annotated[
        Optional[List[str]],  # Make optional to allow empty list from CLI
        typer.Option(
            "--exclude-pattern",
            help=(
                "URL pattern to exclude (can be used multiple times). "
                "Default: fragment identifiers."
            ),
        ),
    ] = None,  # Default to None, handle logic below
    content_types: Annotated[
        Optional[List[str]],  # Make optional to allow empty list from CLI
        typer.Option(
            "--content-type",
            help=(
                "Allowed content type (can be used multiple times). "
                "Default: text/html."
            ),
        ),
    ] = None,  # Default to None, handle logic below
    keywords: Annotated[
        Optional[List[str]],  # Make optional to allow empty list from CLI
        typer.Option(
            "--keyword",
            "-k",
            help=(
                "Keyword for relevance scoring (can be used multiple times). "
                "Default: common doc keywords."
            ),
        ),
    ] = None,  # Default to None, handle logic below
    keyword_weight: Annotated[
        float,
        typer.Option("--keyword-weight", help="Weight for keyword relevance scorer."),
    ] = DEFAULT_KEYWORD_WEIGHT,
    remove_links_flag: Annotated[
        bool,
        typer.Option(
            "--remove-links/--keep-links",
            help=(
                "Remove nav links, headers, footers from HTML before "
                "markdown conversion."
            ),
        ),
    ] = True,
    ignore_images: Annotated[
        bool,
        typer.Option(
            "--ignore-images/--include-images",
            help="Ignore images during markdown conversion.",
        ),
    ] = True,
    verbose: Annotated[
        bool,
        typer.Option("--verbose", "-v", help="Enable verbose output during crawl."),
    ] = False,
    stream: Annotated[
        bool,
        typer.Option("--stream/--no-stream", help="Process results as they arrive."),
    ] = True,
    # Accept cache_mode as string, parse manually later
    cache_mode_str: Annotated[
        str,
        typer.Option(
            "--cache-mode",
            help=(
                f"Cache mode to use. Choices: "
                f"{[e.name.lower() for e in CacheMode]}. Case-insensitive."
            ),
        ),
    ] = DEFAULT_CACHE_MODE.name,  # Default to the name of the enum member
    exclude_markdown_external_links: Annotated[
        bool,
        typer.Option(
            "--exclude-markdown-external-links/"
            "--include-markdown-external-links",
            help="Exclude external links from the final markdown output.",
        ),
    ] = True,
    only_text: Annotated[
        bool,
        typer.Option(
            "--only-text/--keep-markup",
            help="Extract only text content, ignoring markup, for markdown.",
        ),
    ] = True,
    wait_for: Annotated[
        Optional[str],
        typer.Option(
            "--wait-for",
            help=(
                "Time in seconds or CSS selector to wait for before capturing content. "
                "For JavaScript-rendered pages, use a number (e.g., 5) or a CSS selector "
                "(e.g., 'css:.content-loaded'). Prefix with 'js:' for JavaScript conditions."
            ),
        ),
    ] = None,
    js_code: Annotated[
        Optional[str],
        typer.Option(
            "--js-code",
            help=(
                "JavaScript code to execute on the page before capturing content. "
                "Useful for interacting with the page or making it render dynamic content."
            ),
        ),
    ] = None,
    page_load_timeout: Annotated[
        int,
        typer.Option(
            "--page-load-timeout",
            help="Maximum time in seconds to wait for page to load completely.",
        ),
    ] = 30,
    wait_for_js_render: Annotated[
        bool,
        typer.Option(
            "--wait-for-js-render/--no-wait-for-js-render",
            help="Wait for JavaScript-rendered content using a special script for SPAs.",
        ),
    ] = False,
):
    """
    Crawls a website starting from the given URL and generates a merged
    Markdown file.
    """
    # --- Handle Optional List Defaults ---
    # If the user didn't provide the option, use the default list.
    # If they provided the option but no values, it will be an empty list.
    final_include_patterns = (
        include_patterns if include_patterns is not None else DEFAULT_INCLUDE_PATTERNS
    )
    final_exclude_patterns = (
        exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
    )
    final_content_types = (
        content_types if content_types is not None else DEFAULT_CONTENT_TYPES
    )
    final_keywords = keywords if keywords is not None else DEFAULT_KEYWORDS

    # --- Determine Output File Path ---
    if output_file is None:
        # User did not specify --output, generate filename from URL
        storage_dir = Path("storage")
        storage_dir.mkdir(parents=True, exist_ok=True)  # Ensure storage dir exists
        try:
            parsed_url = urlparse(url)
            domain = parsed_url.netloc
            # Sanitize domain for use in filename
            # Remove http/https, www., and replace non-alphanumeric with underscore  # noqa: E501
            sanitized_domain = re.sub(r"^www\.", "", domain)  # Remove www.
            sanitized_domain = re.sub(
                r"[^\w.-]+", "_", sanitized_domain
            )  # Replace invalid chars
            # Remove trailing dots or underscores
            sanitized_domain = sanitized_domain.strip("._")
            # Handle cases where sanitization results in empty string
            if not sanitized_domain:
                sanitized_domain = "default_crawl_output"
            output_filename = f"{sanitized_domain}.md"
            output_file = storage_dir / output_filename
            print(
                "No output file specified. Using generated path:\n"
                f"{output_file}"
            )
        except Exception as e:
            err_console.print(
                f"[bold red]Error generating output filename from URL '{url}':"
                f"[/bold red] {e}"
            )
            raise typer.Exit(code=1)
    # else: output_file remains the Path provided by the user

    # --- Configure Filters ---
    filters = []
    if final_exclude_patterns:
        filters.append(URLPatternFilter(patterns=final_exclude_patterns, reverse=True))
    if final_include_patterns:
        filters.append(URLPatternFilter(patterns=final_include_patterns))
    if final_content_types:
        filters.append(ContentTypeFilter(allowed_types=final_content_types))
    filter_chain = FilterChain(filters)

    # --- Configure Scorer ---
    scorer = KeywordRelevanceScorer(keywords=final_keywords, weight=keyword_weight)

    # --- Configure Crawling Strategy ---
    strategy = BestFirstCrawlingStrategy(
        max_depth=max_depth,
        include_external=include_external,
        filter_chain=filter_chain,
        url_scorer=scorer,
    )

    # --- Configure Markdown Generator ---
    # Choose the generator based on the flag
    if remove_links_flag:
        # Use our custom generator that removes links/nav
        markdown_strategy = LinkRemovingMarkdownGenerator(
            options={"ignore_images": ignore_images}
        )
        if verbose:
            print("Using LinkRemovingMarkdownGenerator.")
    else:
        # Use the default generator
        markdown_strategy = DefaultMarkdownGenerator(
            options={"ignore_images": ignore_images}
        )
        if verbose:
            print("Using DefaultMarkdownGenerator (keeping links/nav).")

    # --- Configure Browser ---
    browser_config = BrowserConfig(
        verbose=verbose
        # Add other browser configs here if needed later
    )

    # --- Configure Run ---
    # Pass the selected markdown strategy here
    # Determine the final wait_for value
    final_wait_for = wait_for if wait_for else ("5" if wait_for_js_render else None)

    merged_run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        # Correct argument name is markdown_generator
        markdown_generator=markdown_strategy,
        verbose=verbose,
        stream=stream,
        # cache_mode=cache_mode,  # Set below
        exclude_external_links=exclude_markdown_external_links,
        only_text=only_text,
        # Add wait_for parameter using our determined value
        wait_for=final_wait_for,
        # Add js_code parameter if provided
        js_code=js_code
        if js_code
        else (
            # If wait_for_js_render is enabled and no custom js_code is provided,
            # use this SPA-friendly script to ensure content loads
            """
            // Scroll through the page to trigger lazy loading
            function scrollToBottom() {
                window.scrollTo(0, document.body.scrollHeight);
            }

            // Scroll a few times with delay to ensure content loads
            scrollToBottom();
            setTimeout(scrollToBottom, 1000);
            setTimeout(scrollToBottom, 2000);

            // Try to find and click any "show more" or expand buttons
            setTimeout(() => {
                const buttons = Array.from(document.querySelectorAll('button, a, [role="button"]'))
                    .filter(el => {
                        const text = el.textContent.toLowerCase();
                        return text.includes('show') || text.includes('more') ||
                            text.includes('expand') || text.includes('load');
                    });
                buttons.forEach(button => button.click());
            }, 3000);
            """
            if wait_for_js_render
            else None
        ),
        # Add page_timeout (milliseconds)
        page_timeout=page_load_timeout * 1000,  # Convert to milliseconds
    )

    # --- Manually Parse Cache Mode ---
    try:
        # Find the matching enum member case-insensitively
        parsed_cache_mode = next(
            mode for mode in CacheMode if mode.name.lower() == cache_mode_str.lower()
        )
        merged_run_config.cache_mode = parsed_cache_mode
    except StopIteration:
        valid_modes = [e.name for e in CacheMode]
        err_console.print(
            f"[bold red]Invalid cache mode:[/bold red] '{cache_mode_str}'. "
            f"Valid choices (case-insensitive): {valid_modes}"
        )
        raise typer.Exit(code=1)

    # --- Run the Crawl ---
    # Use asyncio.run to execute the async function from the sync Typer command
    try:
        asyncio.run(
            run_crawl(
                start_url=url,
                output_file=output_file,
                output_title=output_title,
                browser_config=browser_config,
                run_config=merged_run_config,
                verbose=verbose,
            )
        )
    except typer.Exit:
        # Catch typer.Exit to prevent asyncio errors on controlled exits
        pass
    except Exception as e:
        # Catch other potential errors during setup or run_crawl call
        err_console.print(f"[bold red]An unexpected error occurred:[/bold red] {e}")
        raise typer.Exit(code=1)


# --- Entry Point ---
# This allows running the CLI directly using `python -m crawler_cli.main`
if __name__ == "__main__":
    app()

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/alizdavoodi/MCPDocSearch'
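
The same endpoint can be queried programmatically. Below is a minimal Python sketch using only the standard library; it assumes the endpoint returns JSON, and since the response schema is not documented here, it only pretty-prints the result.

# Minimal sketch (assumption: the endpoint returns a JSON document).
import json
import urllib.request

url = "https://glama.ai/api/mcp/v1/servers/alizdavoodi/MCPDocSearch"
with urllib.request.urlopen(url) as response:
    server_info = json.load(response)

# The response fields are not documented here, so just pretty-print everything.
print(json.dumps(server_info, indent=2))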

If you have feedback or need assistance with the MCP directory API, please join our Discord server.