dynamic_tokenizer.rs
//! Dynamic tokenizer - "Learning each project's language!" - Omni
//! Automatically discovers and tokenizes common patterns in any codebase.

use crate::scanner::FileNode;
use std::collections::HashMap;

/// Dynamic tokenizer that learns project-specific patterns
pub struct DynamicTokenizer {
    /// Path component frequencies
    path_components: HashMap<String, usize>,
    /// File name frequencies
    file_names: HashMap<String, usize>,
    /// Extension frequencies
    extensions: HashMap<String, usize>,
    /// Common prefixes/suffixes
    prefixes: HashMap<String, usize>,
    suffixes: HashMap<String, usize>,
    /// Generated token mappings
    tokens: HashMap<String, String>,
}

impl Default for DynamicTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl DynamicTokenizer {
    pub fn new() -> Self {
        Self {
            path_components: HashMap::new(),
            file_names: HashMap::new(),
            extensions: HashMap::new(),
            prefixes: HashMap::new(),
            suffixes: HashMap::new(),
            tokens: HashMap::new(),
        }
    }

    /// Analyze nodes to learn patterns
    pub fn analyze(&mut self, nodes: &[FileNode]) {
        for node in nodes {
            // Analyze path components
            let path_str = node.path.to_string_lossy();
            for component in path_str.split('/').filter(|c| !c.is_empty()) {
                *self
                    .path_components
                    .entry(component.to_string())
                    .or_insert(0) += 1;
            }

            // Analyze file name
            if let Some(file_name) = node.path.file_name() {
                let name = file_name.to_string_lossy().to_string();
                *self.file_names.entry(name.clone()).or_insert(0) += 1;

                // Extract common patterns
                self.analyze_name_patterns(&name);
            }

            // Analyze extension
            if let Some(ext) = node.path.extension() {
                let ext_str = ext.to_string_lossy().to_string();
                *self.extensions.entry(ext_str).or_insert(0) += 1;
            }
        }

        // Generate optimal tokens
        self.generate_tokens();
    }

    /// Analyze a file name for common patterns
    fn analyze_name_patterns(&mut self, name: &str) {
        // Common prefixes
        let prefix_patterns = ["test_", "Test", "_", "mock_", "stub_", "fake_"];
        for prefix in &prefix_patterns {
            if name.starts_with(prefix) {
                *self.prefixes.entry(prefix.to_string()).or_insert(0) += 1;
            }
        }

        // Common suffixes
        let suffix_patterns = [
            "_test", "Test", "Spec", "_spec", ".test", ".spec", "Controller",
            "Service", "Repository", "Model", "View", "Component", "Module", "Config",
        ];
        for suffix in &suffix_patterns {
            if name.contains(suffix) {
                *self.suffixes.entry(suffix.to_string()).or_insert(0) += 1;
            }
        }

        // Camel/snake case components
        if name.contains('_') {
            // Snake case - split and analyze
            for part in name.split('_') {
                if part.len() > 2 {
                    *self.path_components.entry(part.to_string()).or_insert(0) += 1;
                }
            }
        } else if name.chars().any(|c| c.is_uppercase()) && name.chars().any(|c| c.is_lowercase()) {
            // CamelCase - split and analyze
            for part in split_camel_case(name) {
                if part.len() > 2 {
                    *self.path_components.entry(part).or_insert(0) += 1;
                }
            }
        }
    }

    /// Generate optimal token assignments
    fn generate_tokens(&mut self) {
        let mut token_id = 0x80; // Start from 128

        // Collect all patterns with their frequencies
        let mut all_patterns: Vec<(String, usize)> = Vec::new();

        for (pattern, count) in &self.path_components {
            // Only tokenize if it appears more than twice
            if *count > 2 {
                all_patterns.push((pattern.clone(), *count));
            }
        }

        for (pattern, count) in &self.file_names {
            if *count > 2 {
                all_patterns.push((pattern.clone(), *count));
            }
        }

        for (pattern, count) in &self.extensions {
            // Extensions need higher frequency
            if *count > 5 {
                all_patterns.push((format!(".{}", pattern), *count));
            }
        }

        // Sort by frequency (descending), then pattern length (descending), then
        // alphabetically so token assignment is deterministic across runs.
        all_patterns.sort_by(|a, b| {
            b.1.cmp(&a.1)
                .then_with(|| b.0.len().cmp(&a.0.len()))
                .then_with(|| a.0.cmp(&b.0))
        });

        // Assign tokens to the most frequent patterns (max 127: 0x80..=0xFE)
        for (pattern, _count) in all_patterns.iter().take(127) {
            self.tokens
                .insert(pattern.clone(), format!("{:02X}", token_id));
            token_id += 1;
            if token_id > 0xFE {
                // Reserve 0xFF
                break;
            }
        }
    }

    /// Compress a path using learned tokens
    pub fn compress_path(&self, path: &str) -> String {
        let mut compressed = path.to_string();

        // Apply tokens from longest to shortest to avoid substring issues
        let mut tokens_by_length: Vec<(&String, &String)> = self.tokens.iter().collect();
        tokens_by_length.sort_by(|a, b| b.0.len().cmp(&a.0.len()));

        for (pattern, token) in tokens_by_length {
            compressed = compressed.replace(pattern, &format!("{{{}}}", token));
        }

        compressed
    }

    /// Get the token dictionary header
    pub fn get_token_header(&self) -> String {
        let mut header = String::from("TOKENS:\n");

        // Sort tokens by ID for consistent output
        let mut sorted_tokens: Vec<(&String, &String)> = self.tokens.iter().collect();
        sorted_tokens.sort_by(|a, b| a.1.cmp(b.1));

        for (pattern, token) in sorted_tokens {
            header.push_str(&format!(" {}={}\n", token, pattern));
        }

        header
    }

    /// Get compression statistics
    pub fn get_stats(&self) -> TokenizerStats {
        let total_pattern_bytes: usize = self.tokens.keys().map(|k| k.len()).sum();
        let total_token_bytes = self.tokens.len() * 4; // "{XX}" replacement is four bytes

        TokenizerStats {
            patterns_found: self.path_components.len()
                + self.file_names.len()
                + self.extensions.len(),
            tokens_generated: self.tokens.len(),
            estimated_savings: total_pattern_bytes.saturating_sub(total_token_bytes),
        }
    }
}

#[derive(Debug)]
pub struct TokenizerStats {
    pub patterns_found: usize,
    pub tokens_generated: usize,
    pub estimated_savings: usize,
}

/// Split a CamelCase name into lowercased components
fn split_camel_case(s: &str) -> Vec<String> {
    let mut result = Vec::new();
    let mut current = String::new();

    for (i, ch) in s.chars().enumerate() {
        if i > 0 && ch.is_uppercase() && !current.is_empty() {
            result.push(current.clone());
            current.clear();
        }
        current.extend(ch.to_lowercase());
    }

    if !current.is_empty() {
        result.push(current);
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_camel_case_split() {
        assert_eq!(
            split_camel_case("UserController"),
            vec!["user", "controller"]
        );
        assert_eq!(
            split_camel_case("HTTPSConnection"),
            vec!["h", "t", "t", "p", "s", "connection"]
        );
    }

    #[test]
    fn test_pattern_detection() {
        let mut tokenizer = DynamicTokenizer::new();

        // Simulate a typical web project
        let patterns = vec![
            "src/components/UserList.tsx",
            "src/components/UserDetail.tsx",
            "src/components/UserForm.tsx",
            "src/services/UserService.ts",
            "src/services/AuthService.ts",
            "src/services/ApiService.ts", // Added to make services appear 3 times
            "src/controllers/UserController.ts",
            "src/controllers/AuthController.ts",
            "src/controllers/ApiController.ts", // Added to make controllers appear 3 times
            "tests/unit/UserService.test.ts",
            "tests/unit/AuthService.test.ts",
            "tests/integration/ApiService.test.ts", // Added to make tests appear 3 times
        ];

        for pattern in patterns {
            tokenizer.analyze_name_patterns(pattern);
            for component in pattern.split('/') {
                *tokenizer
                    .path_components
                    .entry(component.to_string())
                    .or_insert(0) += 1;
            }
        }

        tokenizer.generate_tokens();

        // Should tokenize frequent patterns (those appearing more than twice)
        assert!(tokenizer.tokens.contains_key("src"));
        assert!(tokenizer.tokens.contains_key("components"));
        assert!(tokenizer.tokens.contains_key("services"));
        assert!(tokenizer.tokens.contains_key("tests"));
        assert!(tokenizer.tokens.contains_key("controllers"));
    }

    #[test]
    fn test_compression() {
        let mut tokenizer = DynamicTokenizer::new();

        // Add patterns
        for _ in 0..10 {
            *tokenizer
                .path_components
                .entry("src".to_string())
                .or_insert(0) += 1;
            *tokenizer
                .path_components
                .entry("components".to_string())
                .or_insert(0) += 1;
        }

        tokenizer.generate_tokens();

        // Test compression
        let original = "src/components/Button.tsx";
        let compressed = tokenizer.compress_path(original);

        // Should be shorter
        assert!(compressed.len() < original.len());

        // Should contain token markers
        assert!(compressed.contains("{"));
        assert!(compressed.contains("}"));
    }

    #[test]
    fn test_token_assignment_order() {
        let mut tokenizer = DynamicTokenizer::new();

        // Add patterns with different frequencies
        *tokenizer
            .path_components
            .entry("very_frequent".to_string())
            .or_insert(0) = 100;
        *tokenizer
            .path_components
            .entry("less_frequent".to_string())
            .or_insert(0) = 50;
        *tokenizer
            .path_components
            .entry("rare".to_string())
            .or_insert(0) = 3;
        *tokenizer
            .path_components
            .entry("too_rare".to_string())
            .or_insert(0) = 1; // Won't be tokenized

        tokenizer.generate_tokens();

        // Most frequent should get lower token IDs
        let very_frequent_token = tokenizer.tokens.get("very_frequent").unwrap();
        let less_frequent_token = tokenizer.tokens.get("less_frequent").unwrap();
        assert!(very_frequent_token < less_frequent_token);

        // Too rare shouldn't be tokenized
        assert!(!tokenizer.tokens.contains_key("too_rare"));
    }
}
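For context, here is a minimal sketch of how calling code might drive the public API (analyze, compress_path, get_token_header, get_stats) end to end. The FileNode construction is an assumption: the real type lives in crate::scanner and its fields aren't shown in this file, so the struct-literal shape (a public path: PathBuf field) is hypothetical.

use std::path::PathBuf;

// Hypothetical driver. Assumes `FileNode { path: PathBuf }` is constructible;
// crate::scanner's actual definition is not shown in this file.
fn demo() {
    let nodes: Vec<FileNode> = [
        "src/components/UserList.tsx",
        "src/components/UserForm.tsx",
        "src/components/UserDetail.tsx",
    ]
    .iter()
    .map(|p| FileNode { path: PathBuf::from(p) })
    .collect();

    let mut tokenizer = DynamicTokenizer::new();
    // Learns "src", "components", and "user" (each appears 3 times, above the
    // > 2 threshold); ".tsx" stays untokenized since extensions need > 5 hits.
    tokenizer.analyze(&nodes);

    // Emit the dictionary, then the compressed paths.
    print!("{}", tokenizer.get_token_header());
    for node in &nodes {
        println!("{}", tokenizer.compress_path(&node.path.to_string_lossy()));
    }

    let stats = tokenizer.get_stats();
    println!(
        "{} tokens generated, ~{} bytes saved on unique patterns",
        stats.tokens_generated, stats.estimated_savings
    );
}

For the three paths above, the dictionary would come out along the lines of 80=components, 81=user, 82=src (at equal frequency, longer patterns win the tie and get lower IDs), and each path compresses to something like {82}/{80}/UserList.tsx; "User" survives untouched because replacement is case-sensitive against the lowercased learned component.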
