"""Base parser interface for PrestaShop documentation."""
import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import yaml
class BaseParser(ABC):
    """Base class for all documentation parsers.

    Each parser is responsible for:
    1. Identifying if it can parse a specific file
    2. Extracting structured data from the file
    3. Providing metadata for indexing

    Subclasses must implement ``can_parse`` and ``parse``. The remaining
    methods are shared helpers for frontmatter, code blocks, path-derived
    categories, versions and titles.
    """

    def __init__(self):
        """Initialize the parser."""
        # Parser name is the concrete subclass's class name.
        self.name = self.__class__.__name__

    @abstractmethod
    def can_parse(self, file_path: Path, frontmatter: Dict) -> bool:
        """Check if this parser can handle the given file.

        Args:
            file_path: Path to the markdown file
            frontmatter: Extracted YAML frontmatter (empty dict if none)

        Returns:
            True if this parser can handle the file, False otherwise
        """

    @abstractmethod
    def parse(self, file_path: Path) -> Optional[Dict]:
        """Parse the file and return structured data.

        Args:
            file_path: Path to the markdown file

        Returns:
            Dictionary with parsed data, or None if parsing fails

        Expected structure:
            {
                "name": str,         # Document name/identifier
                "title": str,        # Display title
                "category": str,     # Main category (admin-api, basics, etc.)
                "subcategory": str,  # Subdirectory path
                "doc_type": str,     # Type: hook, reference, tutorial, guide, etc.
                "path": str,         # Relative path from docs root
                "content": str,      # Full markdown content
                "metadata": dict,    # Parser-specific metadata
                "version": str,      # PrestaShop version (if specified)
            }
        """

    def extract_frontmatter(self, content: str) -> Tuple[Dict, str]:
        """Extract YAML frontmatter and markdown content.

        The frontmatter is the text between a leading ``---`` marker and the
        next ``---`` marker. When there is no leading marker, no closing
        marker, the YAML is invalid, or the YAML is not a mapping, the input
        is returned untouched together with an empty dict.

        Args:
            content: Raw file content

        Returns:
            Tuple of (frontmatter dict, markdown content). The markdown is
            stripped of surrounding whitespace only when frontmatter was
            successfully split off.
        """
        if not content.startswith("---"):
            return {}, content

        # Pieces: text before the opening marker (empty), YAML block, body.
        # maxsplit=2 keeps any later "---" (e.g. horizontal rules) in the body.
        parts = content.split("---", 2)
        if len(parts) < 3:
            return {}, content

        try:
            frontmatter = yaml.safe_load(parts[1]) or {}
        except yaml.YAMLError as e:
            print(f"YAML parsing error: {e}")
            return {}, content

        # safe_load can return any YAML value (str, list, number, ...). Only a
        # mapping is real frontmatter; anything else is most likely a leading
        # horizontal rule followed by prose, so leave the document intact.
        if not isinstance(frontmatter, dict):
            return {}, content

        return frontmatter, parts[2].strip()

    def extract_code_examples(self, markdown: str) -> List[str]:
        """Extract code blocks from markdown content.

        Args:
            markdown: Markdown content

        Returns:
            List of non-empty, whitespace-stripped bodies of triple-backtick
            fenced code blocks, in document order
        """
        # The info string after the opening fence may be any text without a
        # newline or backtick ("c++", "shell-session", "php {linenos}", or a
        # trailing "\r"). Restricting it to \w+ made such openers fail to
        # match, which paired the *closing* fence with the next opener and
        # returned the prose in between as code.
        pattern = r"```[^\n`]*\n(.*?)```"
        matches = re.findall(pattern, markdown, re.DOTALL)
        return [match.strip() for match in matches if match.strip()]

    def get_category_from_path(self, file_path: Path, docs_root: Path) -> str:
        """Extract category from file path.

        Args:
            file_path: Path to the file
            docs_root: Root documentation directory

        Returns:
            First path component of ``file_path`` relative to ``docs_root``
            (normally the top-level folder; for a file located directly in
            ``docs_root`` this is the file name itself). ``"unknown"`` when
            the file is not under ``docs_root`` or equals it.
        """
        try:
            relative_path = file_path.relative_to(docs_root)
        except ValueError:
            # file_path is not located under docs_root.
            return "unknown"
        return relative_path.parts[0] if relative_path.parts else "unknown"

    def get_subcategory_from_path(self, file_path: Path, docs_root: Path) -> str:
        """Extract subcategory (subdirectory path) from file path.

        Args:
            file_path: Path to the file
            docs_root: Root documentation directory

        Returns:
            Subcategory path (e.g., "installation/advanced"), or an empty
            string when there is no directory between the category and the
            file, or the file is not under ``docs_root``
        """
        try:
            relative_path = file_path.relative_to(docs_root)
        except ValueError:
            return ""
        # Everything between the first part (category) and the last part
        # (filename); the slice is empty for paths with two parts or fewer.
        return "/".join(relative_path.parts[1:-1])

    def extract_version(self, frontmatter: Dict, content: str) -> Optional[str]:
        """Extract PrestaShop version from frontmatter or content.

        Args:
            frontmatter: YAML frontmatter
            content: Markdown content

        Returns:
            Version string (e.g., "9.0", "8.1") or None
        """
        # Frontmatter wins, but an empty "version:" key loads as None and must
        # not be reported as the literal string "None".
        declared = frontmatter.get("version")
        if declared is not None:
            declared_text = str(declared).strip()
            if declared_text:
                return declared_text

        # Fall back to the first mention in the text (e.g., "PrestaShop 9.0").
        match = re.search(r"PrestaShop\s+(\d+\.\d+)", content)
        return match.group(1) if match else None

    def clean_title(self, title: str) -> str:
        """Clean and normalize title.

        Args:
            title: Raw title

        Returns:
            Title with ``*``, ``_`` and backtick characters removed and
            whitespace runs collapsed to single spaces; ``"Untitled"`` when
            the title is empty or nothing remains after cleaning
        """
        if not title:
            return "Untitled"

        # YAML may hand over a non-string title (e.g. ``title: 404``).
        text = re.sub(r"[*_`]", "", str(title))
        text = " ".join(text.split())
        # A title made only of markup characters cleans down to nothing.
        return text or "Untitled"

    def should_skip_file(self, file_path: Path) -> bool:
        """Check if file should be skipped based on patterns.

        Args:
            file_path: Path to check

        Returns:
            True if the file lives inside an asset/tooling directory or is
            not a ``.md`` file
        """
        skipped_directories = {
            "img",
            "images",
            "_partials",
            ".github",
            "node_modules",
        }
        # Compare directory components rather than "/name/" substrings of the
        # string form: that form uses backslashes on Windows and has no
        # leading separator for a relative path such as "img/logo.md".
        if not skipped_directories.isdisjoint(file_path.parent.parts):
            return True

        # Skip non-markdown files
        return file_path.suffix != ".md"
class ParserRegistry:
    """Registry to manage multiple parsers and select the appropriate one.

    Parsers are consulted in registration order; the first one whose
    ``can_parse`` returns a truthy value wins.
    """

    def __init__(self):
        """Initialize the parser registry."""
        self.parsers: List["BaseParser"] = []

    def register(self, parser: "BaseParser"):
        """Register a parser.

        Args:
            parser: Parser instance to register. Parsers registered earlier
                take precedence over later ones.
        """
        self.parsers.append(parser)

    @staticmethod
    def _load_frontmatter(content: str) -> Dict:
        """Return the YAML frontmatter mapping of ``content``, or ``{}``."""
        if not content.startswith("---"):
            return {}

        # Pieces: text before the opening marker (empty), YAML block, body.
        parts = content.split("---", 2)
        if len(parts) < 3:
            return {}

        try:
            loaded = yaml.safe_load(parts[1])
        except yaml.YAMLError:
            # Malformed YAML is deliberately not reported here: selection
            # simply proceeds as if the file had no frontmatter.
            return {}

        # safe_load may yield None, a string, a list, ... Parsers are promised
        # a dict, so anything that is not a mapping counts as "no frontmatter".
        return loaded if isinstance(loaded, dict) else {}

    def get_parser(self, file_path: Path) -> Optional["BaseParser"]:
        """Get the appropriate parser for a file.

        Args:
            file_path: Path to the file

        Returns:
            Parser instance or None if the file does not exist, cannot be
            read as UTF-8 text, or no parser can handle the file
        """
        if not file_path.exists():
            return None

        try:
            content = file_path.read_text(encoding="utf-8")
        except (OSError, ValueError) as e:
            # OSError: unreadable path (directory, permissions, vanished file).
            # ValueError: covers UnicodeDecodeError for non-UTF-8 content.
            print(f"Error reading {file_path}: {e}")
            return None

        # Extract frontmatter for parser selection
        frontmatter = self._load_frontmatter(content)

        # Try each parser in order
        for parser in self.parsers:
            if parser.can_parse(file_path, frontmatter):
                return parser
        return None

    def parse_file(self, file_path: Path) -> Optional[Dict]:
        """Parse a file using the appropriate parser.

        Args:
            file_path: Path to the file

        Returns:
            Parsed data dict, or None when no parser was selected or the
            selected parser returned None
        """
        parser = self.get_parser(file_path)
        if parser:
            return parser.parse(file_path)
        return None