Skip to main content
Glama
repository_manager.py•21.2 kB
"""Git repository management for skills repositories.""" import logging import re import shutil from collections.abc import Callable from datetime import UTC, datetime from pathlib import Path from typing import TypedDict from urllib.parse import urlparse import git from git import RemoteProgress from mcp_skills.models.repository import Repository from mcp_skills.services.metadata_store import MetadataStore logger = logging.getLogger(__name__) class RepoConfig(TypedDict): """Type definition for repository configuration.""" url: str priority: int license: str class CloneProgress(RemoteProgress): """GitPython progress handler for repository cloning and updates. Translates GitPython's RemoteProgress callbacks into a simpler callback interface suitable for CLI progress bars. Args: callback: Function called with (current, total, message) during git operations """ def __init__(self, callback: Callable[[int, int, str], None]) -> None: """Initialize progress handler with callback function.""" super().__init__() self.callback = callback def update( self, _op_code: int, cur_count: int | float, max_count: int | float | None = None, message: str = "", ) -> None: """Called by GitPython during clone/pull operations. Args: op_code: Operation code (not used, but required by GitPython) cur_count: Current progress count max_count: Total count (None for indeterminate progress) message: Progress message from git """ if max_count and self.callback: self.callback(int(cur_count), int(max_count), message or "") class RepositoryManager: """Manage git-based skills repositories. Handles cloning, updating, and tracking multiple skill repositories. Supports prioritization for resolving conflicts between repositories. 
""" # Default repositories to clone on setup DEFAULT_REPOS: list[RepoConfig] = [ { "url": "https://github.com/anthropics/skills.git", "priority": 100, "license": "Apache-2.0", }, { "url": "https://github.com/obra/superpowers.git", "priority": 90, "license": "MIT", }, { "url": "https://github.com/ComposioHQ/awesome-claude-skills.git", "priority": 85, "license": "Apache-2.0", }, { "url": "https://github.com/Prat011/awesome-llm-skills.git", "priority": 85, "license": "Apache-2.0", }, { "url": "https://github.com/bobmatnyc/claude-mpm-skills.git", "priority": 80, "license": "MIT", }, ] def __init__(self, base_dir: Path | None = None) -> None: """Initialize repository manager. Args: base_dir: Base directory for storing repositories. Defaults to ~/.mcp-skillset/repos/ Migration Note: - Automatically migrates from JSON to SQLite on first use - JSON file backed up as repos.json.backup after successful migration - SQLite database provides O(1) indexed lookups vs O(n) JSON scans """ self.base_dir = base_dir or Path.home() / ".mcp-skillset" / "repos" self.base_dir.mkdir(parents=True, exist_ok=True) self.metadata_file = self.base_dir.parent / "repos.json" # Initialize SQLite metadata store db_path = self.base_dir.parent / "metadata.db" self.metadata_store = MetadataStore(db_path=db_path) # Auto-migrate from JSON if needed if self.metadata_file.exists() and not self.metadata_store.has_data(): count = self.metadata_store.migrate_from_json(self.metadata_file) if count > 0: logger.info(f"Migrated {count} repositories from JSON to SQLite") # Backup JSON file after successful migration backup_path = self.metadata_file.with_suffix(".json.backup") shutil.move(str(self.metadata_file), str(backup_path)) logger.info(f"JSON metadata backed up to {backup_path}") def add_repository( self, url: str, priority: int = 0, license: str = "Unknown" ) -> Repository: """Clone new repository. 
Args: url: Git repository URL priority: Priority for skill selection (0-100) license: Repository license (default: "Unknown") Returns: Repository metadata object Raises: ValueError: If URL is invalid or repository already exists Design Decision: Git Clone Strategy Rationale: Using GitPython's clone_from() for simplicity and Python integration. Direct subprocess calls would require manual error handling and platform-specific git binary management. GitPython provides consistent cross-platform behavior. Trade-offs: - Simplicity: GitPython handles git binary detection and error wrapping - Performance: Slightly slower than subprocess (~5-10% overhead for small repos) - Dependency: Requires GitPython library, but already in project dependencies Error Handling: - InvalidGitRepositoryError: URL is not a valid git repository - GitCommandError: Clone operation failed (network, permissions, etc.) - ValueError: Invalid priority range or duplicate repository """ # 1. Validate URL if not self._is_valid_git_url(url): raise ValueError(f"Invalid git URL: {url}") # 2. Validate priority range if not 0 <= priority <= 100: raise ValueError(f"Priority must be between 0-100, got {priority}") # 3. Generate repository ID from URL repo_id = self._generate_repo_id(url) # 4. Check if already exists existing = self.get_repository(repo_id) if existing: raise ValueError( f"Repository already exists: {repo_id} at {existing.local_path}" ) # 5. Clone repository using GitPython local_path = self.base_dir / repo_id logger.info(f"Cloning repository {url} to {local_path}") try: git.Repo.clone_from(url, local_path, depth=1) except git.exc.GitCommandError as e: raise ValueError(f"Failed to clone repository {url}: {e}") from e # 6. Scan for skills skill_count = self._count_skills(local_path) logger.info(f"Found {skill_count} skills in {repo_id}") # 7. 
Create Repository object repository = Repository( id=repo_id, url=url, local_path=local_path, priority=priority, last_updated=datetime.now(UTC), skill_count=skill_count, license=license, ) # 8. Store metadata in SQLite self.metadata_store.add_repository(repository) return repository def add_repository_with_progress( self, url: str, priority: int = 0, license: str = "Unknown", progress_callback: Callable[[int, int, str], None] | None = None, ) -> Repository: """Clone new repository with progress tracking. Args: url: Git repository URL priority: Priority for skill selection (0-100) license: Repository license (default: "Unknown") progress_callback: Called with (current, total, message) during clone Returns: Repository metadata object Raises: ValueError: If URL is invalid or repository already exists Design Decision: Progress Callback Pattern Rationale: Using optional callback parameter preserves backward compatibility while enabling rich progress displays in CLI. Callback pattern is simpler than event-based systems and avoids coupling service layer to UI libraries. Trade-offs: - Simplicity: Direct callback is easy to understand and test - Coupling: Caller controls UI but must handle progress updates - Flexibility: Works with any UI framework (Rich, tqdm, etc.) Error Handling: - InvalidGitRepositoryError: URL is not a valid git repository - GitCommandError: Clone operation failed (network, permissions, etc.) - ValueError: Invalid priority range or duplicate repository """ # 1. Validate URL if not self._is_valid_git_url(url): raise ValueError(f"Invalid git URL: {url}") # 2. Validate priority range if not 0 <= priority <= 100: raise ValueError(f"Priority must be between 0-100, got {priority}") # 3. Generate repository ID from URL repo_id = self._generate_repo_id(url) # 4. Check if already exists existing = self.get_repository(repo_id) if existing: raise ValueError( f"Repository already exists: {repo_id} at {existing.local_path}" ) # 5. 
Clone repository with progress tracking local_path = self.base_dir / repo_id logger.info(f"Cloning repository {url} to {local_path}") try: if progress_callback: progress_handler = CloneProgress(progress_callback) git.Repo.clone_from(url, local_path, depth=1, progress=progress_handler) else: git.Repo.clone_from(url, local_path, depth=1) except git.exc.GitCommandError as e: raise ValueError(f"Failed to clone repository {url}: {e}") from e # 6. Scan for skills skill_count = self._count_skills(local_path) logger.info(f"Found {skill_count} skills in {repo_id}") # 7. Create Repository object repository = Repository( id=repo_id, url=url, local_path=local_path, priority=priority, last_updated=datetime.now(UTC), skill_count=skill_count, license=license, ) # 8. Store metadata in SQLite self.metadata_store.add_repository(repository) return repository def update_repository(self, repo_id: str) -> Repository: """Pull latest changes from repository. Args: repo_id: Repository identifier Returns: Updated repository metadata Raises: ValueError: If repository not found Error Handling: - ValueError: Repository not found in metadata - GitCommandError: Pull operation failed (network, conflicts, etc.) - InvalidGitRepositoryError: Local clone is corrupted Recovery Strategy: - Pull failures are propagated to caller for explicit handling - Consider re-cloning if local repository is corrupted - No automatic conflict resolution (user must handle manually) """ # 1. Find repository by ID repository = self.get_repository(repo_id) if not repository: raise ValueError(f"Repository not found: {repo_id}") # 2. Git pull latest changes logger.info(f"Updating repository {repo_id} from {repository.url}") try: repo = git.Repo(repository.local_path) origin = repo.remotes.origin origin.pull() except git.exc.InvalidGitRepositoryError as e: raise ValueError( f"Local repository is corrupted: {repository.local_path}. 
" f"Consider removing and re-cloning: {e}" ) from e except git.exc.GitCommandError as e: raise ValueError(f"Failed to update repository {repo_id}: {e}") from e # 3. Rescan for new/updated skills skill_count = self._count_skills(repository.local_path) logger.info(f"Rescanned {repo_id}: {skill_count} skills found") # 4. Update metadata repository.last_updated = datetime.now(UTC) repository.skill_count = skill_count # 5. Save updated metadata to SQLite self.metadata_store.update_repository(repository) return repository def update_repository_with_progress( self, repo_id: str, progress_callback: Callable[[int, int, str], None] | None = None, ) -> Repository: """Pull latest changes from repository with progress tracking. Args: repo_id: Repository identifier progress_callback: Called with (current, total, message) during pull Returns: Updated repository metadata Raises: ValueError: If repository not found Error Handling: - ValueError: Repository not found in metadata - GitCommandError: Pull operation failed (network, conflicts, etc.) - InvalidGitRepositoryError: Local clone is corrupted Recovery Strategy: - Pull failures are propagated to caller for explicit handling - Consider re-cloning if local repository is corrupted - No automatic conflict resolution (user must handle manually) """ # 1. Find repository by ID repository = self.get_repository(repo_id) if not repository: raise ValueError(f"Repository not found: {repo_id}") # 2. Git pull latest changes with progress tracking logger.info(f"Updating repository {repo_id} from {repository.url}") try: repo = git.Repo(repository.local_path) origin = repo.remotes.origin if progress_callback: progress_handler = CloneProgress(progress_callback) origin.pull(progress=progress_handler) else: origin.pull() except git.exc.InvalidGitRepositoryError as e: raise ValueError( f"Local repository is corrupted: {repository.local_path}. 
" f"Consider removing and re-cloning: {e}" ) from e except git.exc.GitCommandError as e: raise ValueError(f"Failed to update repository {repo_id}: {e}") from e # 3. Rescan for new/updated skills skill_count = self._count_skills(repository.local_path) logger.info(f"Rescanned {repo_id}: {skill_count} skills found") # 4. Update metadata repository.last_updated = datetime.now(UTC) repository.skill_count = skill_count # 5. Save updated metadata to SQLite self.metadata_store.update_repository(repository) return repository def list_repositories(self) -> list[Repository]: """List all configured repositories. Returns: List of Repository objects sorted by priority (highest first) Performance Note: - Time Complexity: O(n log n) due to ORDER BY in SQL - Space Complexity: O(n) for loading all repositories - Uses idx_repos_priority index for optimized sorting SQLite automatically uses the priority index for efficient sorting without requiring full table scan. """ return self.metadata_store.list_repositories() def remove_repository(self, repo_id: str) -> None: """Remove repository and its skills. Args: repo_id: Repository identifier to remove Raises: ValueError: If repository not found Error Handling: - ValueError: Repository not found in metadata - OSError: File deletion failed (permissions, locked files) Data Consistency: - Metadata is removed atomically with temp file strategy - If directory deletion fails after metadata removal, directory is orphaned - Future enhancement: Two-phase commit for atomic operation Failure Recovery: - Orphaned directories can be manually deleted from base_dir - Re-running remove will fail (metadata already gone) but directory remains - Consider: Mark as deleted in metadata, then cleanup in background """ # 1. Find repository by ID repository = self.get_repository(repo_id) if not repository: raise ValueError(f"Repository not found: {repo_id}") logger.info(f"Removing repository {repo_id} from {repository.local_path}") # 2. 
Delete local clone try: if repository.local_path.exists(): shutil.rmtree(repository.local_path) logger.info(f"Deleted local clone at {repository.local_path}") except OSError as e: logger.error(f"Failed to delete repository directory: {e}") raise ValueError( f"Failed to delete repository directory {repository.local_path}: {e}" ) from e # 3. Remove from metadata storage (SQLite with CASCADE deletes skills) # Note: Skill index removal will be handled by SkillManager (Task 4) # and ChromaDB integration (Task 5) in later phases self.metadata_store.delete_repository(repo_id) def get_repository(self, repo_id: str) -> Repository | None: """Get repository by ID. Args: repo_id: Repository identifier Returns: Repository object or None if not found Performance: - Time Complexity: O(1) via SQLite primary key index - Direct lookup without table scan """ return self.metadata_store.get_repository(repo_id) # Private helper methods def _is_valid_git_url(self, url: str) -> bool: """Validate git repository URL format. Args: url: URL to validate Returns: True if URL appears to be a valid git repository URL Supported Formats: - HTTPS: https://github.com/user/repo.git - SSH: git@github.com:user/repo.git - Git protocol: git://github.com/user/repo.git Note: This is basic format validation, not network reachability check. Actual repository validity is tested during clone operation. """ if not url: return False # HTTPS URLs if url.startswith("https://") or url.startswith("http://"): try: parsed = urlparse(url) # Must have scheme, netloc, and path return bool(parsed.scheme and parsed.netloc and parsed.path) except Exception: return False # SSH URLs (git@host:path/to/repo.git) if url.startswith("git@"): # Basic validation: must contain colon separator return ":" in url # Git protocol URLs return bool(url.startswith("git://")) def _generate_repo_id(self, url: str) -> str: """Generate repository ID from URL. 
Args: url: Git repository URL Returns: Repository ID in format "owner/repo" or "hostname/owner/repo" Examples: "https://github.com/anthropics/skills.git" -> "anthropics/skills" "git@github.com:obra/superpowers.git" -> "obra/superpowers" "https://gitlab.com/group/subgroup/project.git" -> "group/subgroup/project" Design Decision: ID Format Rationale: Use path-based IDs that preserve repository identity across different clone URLs (HTTPS vs SSH). This allows identifying duplicates when users add same repo with different URL formats. Trade-offs: - Uniqueness: Path-based IDs work for GitHub/GitLab style URLs - Collisions: Rare, but possible for self-hosted repos with same path - Readability: IDs are human-readable and match repo names """ # Remove .git suffix if present clean_url = url.rstrip("/") if clean_url.endswith(".git"): clean_url = clean_url[:-4] # Handle SSH URLs (git@host:path) if url.startswith("git@") and ":" in clean_url: # Extract path after colon path = clean_url.split(":", 1)[1] return path.strip("/") # Handle HTTPS/HTTP/Git URLs try: parsed = urlparse(clean_url) # Extract path without leading slash path = parsed.path.lstrip("/") return path except Exception: # Fallback: use sanitized URL as ID return re.sub(r"[^a-zA-Z0-9_-]", "_", clean_url) def _count_skills(self, repo_path: Path) -> int: """Count SKILL.md files in repository. Args: repo_path: Path to repository root Returns: Number of skill files found Performance: - Time Complexity: O(n) where n = total files in repo - Optimization: Could cache results and only rescan changed files Future Enhancement: - Use watchdog for incremental updates - Store skill metadata during scan for faster access """ skill_files = list(repo_path.rglob("SKILL.md")) return len(skill_files)

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/bobmatnyc/mcp-skills'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.