base_parser.py (7.93 kB)
"""Base parser interface for PrestaShop documentation.""" import re from abc import ABC, abstractmethod from pathlib import Path from typing import Dict, List, Optional, Tuple import yaml class BaseParser(ABC): """Base class for all documentation parsers. Each parser is responsible for: 1. Identifying if it can parse a specific file 2. Extracting structured data from the file 3. Providing metadata for indexing """ def __init__(self): """Initialize the parser.""" self.name = self.__class__.__name__ @abstractmethod def can_parse(self, file_path: Path, frontmatter: Dict) -> bool: """Check if this parser can handle the given file. Args: file_path: Path to the markdown file frontmatter: Extracted YAML frontmatter (empty dict if none) Returns: True if this parser can handle the file, False otherwise """ pass @abstractmethod def parse(self, file_path: Path) -> Optional[Dict]: """Parse the file and return structured data. Args: file_path: Path to the markdown file Returns: Dictionary with parsed data, or None if parsing fails Expected structure: { "name": str, # Document name/identifier "title": str, # Display title "category": str, # Main category (admin-api, basics, etc.) "subcategory": str, # Subdirectory path "doc_type": str, # Type: hook, reference, tutorial, guide, etc. "path": str, # Relative path from docs root "content": str, # Full markdown content "metadata": dict, # Parser-specific metadata "version": str, # PrestaShop version (if specified) } """ pass def extract_frontmatter(self, content: str) -> Tuple[Dict, str]: """Extract YAML frontmatter and markdown content. Args: content: Raw file content Returns: Tuple of (frontmatter dict, markdown content) """ if not content.startswith("---"): return {}, content parts = content.split("---", 2) if len(parts) < 3: return {}, content try: frontmatter = yaml.safe_load(parts[1]) or {} except yaml.YAMLError as e: print(f"YAML parsing error: {e}") return {}, content markdown = parts[2].strip() return frontmatter, markdown def extract_code_examples(self, markdown: str) -> List[str]: """Extract code blocks from markdown content. Args: markdown: Markdown content Returns: List of code examples """ # Match code blocks with triple backticks pattern = r"```(?:\w+)?\n(.*?)```" matches = re.findall(pattern, markdown, re.DOTALL) return [match.strip() for match in matches if match.strip()] def get_category_from_path(self, file_path: Path, docs_root: Path) -> str: """Extract category from file path. Args: file_path: Path to the file docs_root: Root documentation directory Returns: Category name (top-level folder) """ try: relative_path = file_path.relative_to(docs_root) return relative_path.parts[0] if relative_path.parts else "unknown" except ValueError: return "unknown" def get_subcategory_from_path(self, file_path: Path, docs_root: Path) -> str: """Extract subcategory (subdirectory path) from file path. Args: file_path: Path to the file docs_root: Root documentation directory Returns: Subcategory path (e.g., "installation/advanced") """ try: relative_path = file_path.relative_to(docs_root) if len(relative_path.parts) > 2: # Join all parts except first (category) and last (filename) return "/".join(relative_path.parts[1:-1]) return "" except ValueError: return "" def extract_version(self, frontmatter: Dict, content: str) -> Optional[str]: """Extract PrestaShop version from frontmatter or content. 
Args: frontmatter: YAML frontmatter content: Markdown content Returns: Version string (e.g., "9.0", "8.1") or None """ # Check frontmatter if "version" in frontmatter: return str(frontmatter["version"]) # Check for version in content (e.g., "PrestaShop 9.0") version_pattern = r"PrestaShop\s+(\d+\.\d+)" match = re.search(version_pattern, content) if match: return match.group(1) return None def clean_title(self, title: str) -> str: """Clean and normalize title. Args: title: Raw title Returns: Cleaned title """ if not title: return "Untitled" # Remove markdown formatting title = re.sub(r'[*_`]', '', title) # Remove extra whitespace title = " ".join(title.split()) return title def should_skip_file(self, file_path: Path) -> bool: """Check if file should be skipped based on patterns. Args: file_path: Path to check Returns: True if file should be skipped """ path_str = str(file_path) # Skip patterns skip_patterns = [ "/img/", "/images/", "/_partials/", "/.github/", "/node_modules/", ] for pattern in skip_patterns: if pattern in path_str: return True # Skip non-markdown files if file_path.suffix != ".md": return True return False class ParserRegistry: """Registry to manage multiple parsers and select the appropriate one.""" def __init__(self): """Initialize the parser registry.""" self.parsers: List[BaseParser] = [] def register(self, parser: BaseParser): """Register a parser. Args: parser: Parser instance to register """ self.parsers.append(parser) def get_parser(self, file_path: Path) -> Optional[BaseParser]: """Get the appropriate parser for a file. Args: file_path: Path to the file Returns: Parser instance or None if no parser can handle the file """ if not file_path.exists(): return None try: content = file_path.read_text(encoding="utf-8") except Exception as e: print(f"Error reading {file_path}: {e}") return None # Extract frontmatter for parser selection if content.startswith("---"): parts = content.split("---", 2) if len(parts) >= 3: try: frontmatter = yaml.safe_load(parts[1]) or {} except yaml.YAMLError: frontmatter = {} else: frontmatter = {} else: frontmatter = {} # Try each parser in order for parser in self.parsers: if parser.can_parse(file_path, frontmatter): return parser return None def parse_file(self, file_path: Path) -> Optional[Dict]: """Parse a file using the appropriate parser. Args: file_path: Path to the file Returns: Parsed data dict or None """ parser = self.get_parser(file_path) if parser: return parser.parse(file_path) return None
