project_initializer.py
""" Project initialization component for automatically detecting project type and configuring settings """ import logging import os import json from pathlib import Path from typing import Dict, List, Optional, Tuple, Any, Set import fnmatch logger = logging.getLogger("files-db-mcp.project_initializer") # Project type detection patterns PROJECT_TYPE_PATTERNS = { "python": ["pyproject.toml", "setup.py", "requirements.txt", "__init__.py"], "javascript": ["package.json", "package-lock.json", "yarn.lock", "node_modules"], "typescript": ["tsconfig.json", "tsc.config", "index.ts", "package.json"], "go": ["go.mod", "go.sum", "main.go", ".go"], "rust": ["Cargo.toml", "Cargo.lock", "src/main.rs", "src/lib.rs"], "java": ["pom.xml", "build.gradle", "settings.gradle", ".java"], "c_cpp": ["CMakeLists.txt", "Makefile", ".cpp", ".hpp", ".c", ".h"], "csharp": [".csproj", ".sln", "Program.cs", "NuGet.Config"], "php": ["composer.json", "composer.lock", "artisan", ".php"], "ruby": ["Gemfile", "Rakefile", ".ruby-version", ".rb"], } # Default embedding models by project type DEFAULT_EMBEDDING_MODELS = { "python": "jinaai/jina-embeddings-v2-base-code", "javascript": "jinaai/jina-embeddings-v2-base-code", "typescript": "jinaai/jina-embeddings-v2-base-code", "go": "Salesforce/SFR-Embedding-2_R", "rust": "Salesforce/SFR-Embedding-2_R", "java": "Alibaba-NLP/gte-large-en-v1.5", "c_cpp": "Salesforce/SFR-Embedding-2_R", "csharp": "Alibaba-NLP/gte-large-en-v1.5", "php": "BAAI/bge-large-en-v1.5", "ruby": "BAAI/bge-large-en-v1.5", # Fallback for unknown project types "default": "sentence-transformers/all-MiniLM-L6-v2", } # Default model configurations by project type DEFAULT_MODEL_CONFIGS = { "python": { "device": "auto", "normalize_embeddings": True, "quantization": "int8", "prompt_template": "Code: {text}" }, "javascript": { "device": "auto", "normalize_embeddings": True, "quantization": "int8", "prompt_template": "Code: {text}" }, "typescript": { "device": "auto", "normalize_embeddings": True, "quantization": "int8", "prompt_template": "Code: {text}" }, "go": { "device": "auto", "normalize_embeddings": True, "prompt_template": "Code: {text}" }, "rust": { "device": "auto", "normalize_embeddings": True, "prompt_template": "Code: {text}" }, "java": { "device": "auto", "normalize_embeddings": True, "quantization": "int8", }, "c_cpp": { "device": "auto", "normalize_embeddings": True, "prompt_template": "Code: {text}" }, "csharp": { "device": "auto", "normalize_embeddings": True, "quantization": "int8", }, "php": { "device": "auto", "normalize_embeddings": True, "quantization": "int8", }, "ruby": { "device": "auto", "normalize_embeddings": True, "quantization": "int8", }, # Fallback for unknown project types "default": { "device": "auto", "normalize_embeddings": True, "quantization": "int8", }, } # Default ignore patterns by project type (in addition to global ones) DEFAULT_IGNORE_PATTERNS = { "python": ["venv/", "env/", "__pycache__/", "*.pyc", "*.pyo", "*.egg-info/", "dist/", "build/"], "javascript": ["node_modules/", "dist/", "build/", "coverage/", ".next/", ".cache/"], "typescript": ["node_modules/", "dist/", "build/", "coverage/", ".next/", ".cache/"], "go": ["vendor/", "bin/", "*.test"], "rust": ["target/", "debug/", "release/", "Cargo.lock"], "java": ["target/", "build/", "*.class", "*.jar"], "c_cpp": ["build/", "*.o", "*.a", "*.so", "*.exe", "Debug/", "Release/"], "csharp": ["bin/", "obj/", "packages/", "*.suo", "*.user", ".vs/"], "php": ["vendor/", "var/", "public/bundles/"], "ruby": ["vendor/", "tmp/", "log/"], # 
# Default ignore patterns by project type (in addition to global ones)
DEFAULT_IGNORE_PATTERNS = {
    "python": ["venv/", "env/", "__pycache__/", "*.pyc", "*.pyo", "*.egg-info/", "dist/", "build/"],
    "javascript": ["node_modules/", "dist/", "build/", "coverage/", ".next/", ".cache/"],
    "typescript": ["node_modules/", "dist/", "build/", "coverage/", ".next/", ".cache/"],
    "go": ["vendor/", "bin/", "*.test"],
    "rust": ["target/", "debug/", "release/", "Cargo.lock"],
    "java": ["target/", "build/", "*.class", "*.jar"],
    "c_cpp": ["build/", "*.o", "*.a", "*.so", "*.exe", "Debug/", "Release/"],
    "csharp": ["bin/", "obj/", "packages/", "*.suo", "*.user", ".vs/"],
    "php": ["vendor/", "var/", "public/bundles/"],
    "ruby": ["vendor/", "tmp/", "log/"],
    # Fallback for unknown project types
    "default": [],
}


class ProjectInitializer:
    """
    Project initialization with auto-detection of project type and smart defaults
    """

    def __init__(self, project_path: str, data_dir: str):
        """
        Initialize project detector

        Args:
            project_path: Path to the project directory
            data_dir: Directory to store configuration and state
        """
        self.project_path = Path(project_path)
        self.data_dir = Path(data_dir)
        self.config_file = self.data_dir / "config.json"

        # Ensure data directory exists
        os.makedirs(self.data_dir, exist_ok=True)

        # Default settings
        self.detected_project_types = []
        self.primary_project_type = "default"
        self.custom_ignore_patterns = []
        self.embedding_model = DEFAULT_EMBEDDING_MODELS["default"]
        self.model_config = DEFAULT_MODEL_CONFIGS["default"].copy()

    def detect_project_types(self) -> List[str]:
        """
        Detect project types based on file patterns

        Returns:
            List of detected project types
        """
        detected_types = []
        logger.info(f"Detecting project types in: {self.project_path}")

        # Check for VCS directories
        vcs_dirs = [".git", ".svn", ".hg"]
        has_vcs = any(os.path.isdir(self.project_path / vcs_dir) for vcs_dir in vcs_dirs)
        if has_vcs:
            logger.info("Detected version control system in project")

        # Scan root directory first for project markers
        root_files = [f.name for f in self.project_path.glob("*") if f.is_file()]

        # Score project types based on pattern matches
        type_scores = {project_type: 0 for project_type in PROJECT_TYPE_PATTERNS.keys()}

        # Check root files against patterns
        for project_type, patterns in PROJECT_TYPE_PATTERNS.items():
            for pattern in patterns:
                # Direct file match (highest score)
                if any(f == pattern for f in root_files):
                    type_scores[project_type] += 10
                    continue
                # Check for pattern match in root
                for file in root_files:
                    if fnmatch.fnmatch(file, pattern):
                        type_scores[project_type] += 8

        # Walk directory for deeper pattern matches (limited depth for performance)
        max_depth = 3
        current_depth = 0
        for root, dirs, files in os.walk(self.project_path):
            # Skip ignored directories
            if any(ignored in root for ignored in [".git", "node_modules", "__pycache__", "venv"]):
                continue

            # Calculate current depth
            rel_path = os.path.relpath(root, self.project_path)
            current_depth = 0 if rel_path == "." else rel_path.count(os.sep) + 1

            # Skip if we've gone too deep
            if current_depth > max_depth:
                continue

            # Check all files in this directory against patterns
            for file in files:
                file_path = os.path.join(root, file)
                rel_file_path = os.path.relpath(file_path, self.project_path)

                for project_type, patterns in PROJECT_TYPE_PATTERNS.items():
                    for pattern in patterns:
                        # Direct file name match
                        if file == pattern:
                            # Score decreases with depth
                            type_scores[project_type] += max(5, 10 - current_depth * 2)
                            continue
                        # Extension match
                        if pattern.startswith(".") and file.endswith(pattern):
                            # Score extension matches based on frequency and depth
                            type_scores[project_type] += max(1, 3 - current_depth)
                            continue
                        # Pattern match
                        if fnmatch.fnmatch(file, pattern):
                            type_scores[project_type] += max(1, 3 - current_depth)

        # Sort project types by score and filter those with score > 0
        sorted_types = sorted(
            [(project_type, score) for project_type, score in type_scores.items() if score > 0],
            key=lambda x: x[1],
            reverse=True,
        )

        # Log detection results
        logger.info(f"Project type detection scores: {dict(sorted_types)}")

        # Return detected types (all with score > 0)
        detected_types = [project_type for project_type, score in sorted_types]
        if not detected_types:
            logger.warning("No specific project type detected, using default configuration")
            detected_types = ["default"]

        self.detected_project_types = detected_types
        self.primary_project_type = detected_types[0] if detected_types else "default"
        return detected_types
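    # Worked example (illustrative, added for clarity): during the walk, a
    # direct marker match such as pyproject.toml scores max(5, 10 - depth * 2),
    # i.e. 10 at the root and 6 two levels deep; an extension match like ".rb"
    # at depth 1 scores max(1, 3 - 1) = 2 for "ruby". Root-level markers also
    # earn an extra 10 from the initial root scan above.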
    def detect_ignore_patterns(self) -> List[str]:
        """
        Detect project-specific ignore patterns from .gitignore, etc.

        Returns:
            List of additional ignore patterns
        """
        ignore_patterns = []

        # Check .gitignore
        gitignore_path = self.project_path / ".gitignore"
        if gitignore_path.exists():
            logger.info(f"Found .gitignore at {gitignore_path}")
            try:
                with open(gitignore_path, "r") as f:
                    for line in f:
                        line = line.strip()
                        # Skip comments and empty lines
                        if line and not line.startswith("#"):
                            # Convert .gitignore patterns to glob patterns
                            # Remove negation as it's handled differently
                            if line.startswith("!"):
                                continue
                            # Handle directory markers (trailing slash)
                            if line.endswith("/"):
                                line = f"{line}**"
                            ignore_patterns.append(line)
                logger.info(f"Added {len(ignore_patterns)} patterns from .gitignore")
            except Exception as e:
                logger.warning(f"Error reading .gitignore: {e!s}")

        # Check .dockerignore
        dockerignore_path = self.project_path / ".dockerignore"
        if dockerignore_path.exists():
            logger.info(f"Found .dockerignore at {dockerignore_path}")
            try:
                with open(dockerignore_path, "r") as f:
                    for line in f:
                        line = line.strip()
                        # Skip comments and empty lines
                        if line and not line.startswith("#"):
                            # Skip negation patterns
                            if line.startswith("!"):
                                continue
                            ignore_patterns.append(line)
                logger.info("Added patterns from .dockerignore")
            except Exception as e:
                logger.warning(f"Error reading .dockerignore: {e!s}")

        # Add default ignore patterns for the detected project types
        for project_type in self.detected_project_types:
            if project_type in DEFAULT_IGNORE_PATTERNS:
                type_patterns = DEFAULT_IGNORE_PATTERNS[project_type]
                ignore_patterns.extend(type_patterns)
                logger.info(f"Added {len(type_patterns)} default ignore patterns for {project_type}")

        # Deduplicate patterns
        self.custom_ignore_patterns = list(set(ignore_patterns))
        return self.custom_ignore_patterns
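    # Worked example (illustrative, added for clarity): a .gitignore line
    # "build/" becomes the glob pattern "build/**", while a negation line such
    # as "!build/keep.txt" is skipped rather than translated.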
    def select_embedding_model(self) -> Tuple[str, Dict[str, Any]]:
        """
        Select appropriate embedding model based on project type

        Returns:
            Tuple of (model_name, model_config)
        """
        # First check environment variable for FAST_STARTUP mode
        fast_startup = os.environ.get("FAST_STARTUP", "false").lower() == "true"

        # Check if embedding model is explicitly set in environment
        env_model = os.environ.get("EMBEDDING_MODEL")
        if env_model:
            logger.info(f"Using embedding model from environment: {env_model}")
            self.embedding_model = env_model
            # Keep default config but allow overrides from config file
            if self.primary_project_type in DEFAULT_MODEL_CONFIGS:
                self.model_config = DEFAULT_MODEL_CONFIGS[self.primary_project_type].copy()
            else:
                self.model_config = DEFAULT_MODEL_CONFIGS["default"].copy()
        # Check if we have a custom config file
        elif self.config_file.exists():
            try:
                with open(self.config_file, "r") as f:
                    config = json.load(f)
                    if "embedding_model" in config:
                        self.embedding_model = config["embedding_model"]
                        logger.info(f"Using custom embedding model from config: {self.embedding_model}")
                    if "model_config" in config:
                        self.model_config.update(config["model_config"])
                        logger.info("Using custom model configuration from config")
                    return self.embedding_model, self.model_config
            except Exception as e:
                logger.warning(f"Error reading config file: {e!s}")
        # If fast startup is requested, use the smallest model
        elif fast_startup:
            logger.info("FAST_STARTUP is enabled, using lightweight embedding model")
            self.embedding_model = "sentence-transformers/all-MiniLM-L6-v2"  # Smallest model, ~90MB
            self.model_config = DEFAULT_MODEL_CONFIGS["default"].copy()
        # Otherwise use default model for the primary project type
        elif self.primary_project_type in DEFAULT_EMBEDDING_MODELS:
            self.embedding_model = DEFAULT_EMBEDDING_MODELS[self.primary_project_type]
            self.model_config = DEFAULT_MODEL_CONFIGS[self.primary_project_type].copy()
        else:
            self.embedding_model = DEFAULT_EMBEDDING_MODELS["default"]
            self.model_config = DEFAULT_MODEL_CONFIGS["default"].copy()

        # Set device to auto-detect by default
        if self.model_config.get("device") == "auto":
            try:
                # Try to detect GPU
                import torch

                if torch.cuda.is_available():
                    self.model_config["device"] = "cuda"
                    logger.info("CUDA detected, using GPU for embeddings")
                elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                    self.model_config["device"] = "mps"
                    logger.info("MPS detected, using Apple Silicon GPU for embeddings")
                else:
                    self.model_config["device"] = "cpu"
                    logger.info("No GPU detected, using CPU for embeddings")
            except ImportError:
                # If torch is not available, default to CPU
                self.model_config["device"] = "cpu"
                logger.info("PyTorch not available, defaulting to CPU for embeddings")

        logger.info(f"Selected embedding model: {self.embedding_model}")
        logger.info(f"Model configuration: {self.model_config}")
        return self.embedding_model, self.model_config

    def generate_config_file(self):
        """Generate configuration file with defaults"""
        # Only generate if it doesn't already exist
        if not self.config_file.exists():
            try:
                config = {
                    "project_type": self.primary_project_type,
                    "detected_project_types": self.detected_project_types,
                    "embedding_model": self.embedding_model,
                    "model_config": self.model_config,
                    "custom_ignore_patterns": self.custom_ignore_patterns,
                    "auto_generated": True,
                    "version": "0.1.0",
                }
                with open(self.config_file, "w") as f:
                    json.dump(config, f, indent=2)
                logger.info(f"Generated configuration file: {self.config_file}")
            except Exception as e:
                logger.error(f"Error generating configuration file: {e!s}")

    def load_config(self) -> Dict[str, Any]:
        """Load configuration from file"""
        if self.config_file.exists():
            try:
                with open(self.config_file, "r") as f:
                    config = json.load(f)
                    logger.info(f"Loaded configuration from {self.config_file}")
                    return config
            except Exception as e:
                logger.error(f"Error loading configuration file: {e!s}")
        return {}
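    # Illustrative shape of the config.json written by generate_config_file()
    # (the model name and patterns below are examples only):
    # {
    #   "project_type": "python",
    #   "detected_project_types": ["python"],
    #   "embedding_model": "jinaai/jina-embeddings-v2-base-code",
    #   "model_config": {"device": "cpu", "normalize_embeddings": true, "quantization": "int8"},
    #   "custom_ignore_patterns": ["venv/**", "*.pyc"],
    #   "auto_generated": true,
    #   "version": "0.1.0"
    # }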
    def get_ignore_patterns(self) -> List[str]:
        """Get combined ignore patterns"""
        global_ignore_patterns = [
            ".git",
            "node_modules",
            "__pycache__",
            "*.pyc",
            "*.pyo",
            ".DS_Store",
            ".idea",
            ".vscode",
        ]
        # Combine global patterns with custom patterns
        return list(set(global_ignore_patterns + self.custom_ignore_patterns))

    def initialize_project(self) -> Dict[str, Any]:
        """
        Initialize project with auto-detection and configuration

        Returns:
            Dictionary with project configuration
        """
        logger.info(f"Initializing project at: {self.project_path}")

        # Step 1: Detect project types
        project_types = self.detect_project_types()
        logger.info(f"Detected project types: {project_types}")

        # Step 2: Detect ignore patterns
        ignore_patterns = self.detect_ignore_patterns()
        logger.info(f"Detected {len(ignore_patterns)} ignore patterns")

        # Step 3: Select embedding model
        embedding_model, model_config = self.select_embedding_model()

        # Step 4: Generate config file
        self.generate_config_file()

        # Return the configuration
        return {
            "project_types": project_types,
            "primary_project_type": self.primary_project_type,
            "embedding_model": embedding_model,
            "model_config": model_config,
            "ignore_patterns": self.get_ignore_patterns(),
        }
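
# Minimal usage sketch (illustrative addition, not part of the original
# module): running the file directly initializes the current directory as a
# project and prints the resulting configuration. The data_dir value is a
# placeholder path chosen for this example.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    initializer = ProjectInitializer(project_path=".", data_dir="./.files-db-mcp-data")
    result = initializer.initialize_project()
    print(json.dumps(result, indent=2))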
