Skip to main content
Glama
by 8b-is
universal_format_detector.rs17.4 kB
// Universal Format Detector - "Reading the SHAPE of data!" 🔍 // Detects format by structure, not content - like feeling Braille! // "< and > everywhere? XML. { and }? JSON. Commas? CSV!" - Hue use anyhow::Result; use std::collections::HashMap; #[derive(Debug, Clone, PartialEq)] pub enum DataFormat { HTML, // Added HTML! XML, JSON, JSONL, // JSON Lines CSV, TSV, Markdown, PlainText, Unknown, } #[derive(Debug, Clone)] pub struct StructuralPattern { pub depth: usize, // Current nesting depth pub max_depth: usize, // Maximum depth seen pub char_frequencies: HashMap<char, usize>, pub token_counts: HashMap<String, usize>, // Common tokens pub line_patterns: Vec<LinePattern>, pub block_sizes: Vec<usize>, // Size of text blocks pub average_spacing: f32, // Average spaces per line } #[derive(Debug, Clone)] pub struct LinePattern { pub depth: usize, pub opener_count: usize, // < or { count pub closer_count: usize, // > or } count pub text_length: usize, pub space_count: usize, pub has_colon: bool, pub has_equals: bool, pub comma_count: usize, } #[derive(Debug, Clone)] pub struct ConversationBlock { pub start_line: usize, pub end_line: usize, pub depth: usize, pub participant: String, pub content_size: usize, pub pattern_signature: String, } pub struct UniversalFormatDetector { pattern: StructuralPattern, format: DataFormat, conversations: Vec<ConversationBlock>, participant_patterns: HashMap<String, usize>, // Pattern -> count } impl Default for UniversalFormatDetector { fn default() -> Self { Self::new() } } impl UniversalFormatDetector { pub fn new() -> Self { Self { pattern: StructuralPattern { depth: 0, max_depth: 0, char_frequencies: HashMap::new(), token_counts: HashMap::new(), line_patterns: Vec::new(), block_sizes: Vec::new(), average_spacing: 0.0, }, format: DataFormat::Unknown, conversations: Vec::new(), participant_patterns: HashMap::new(), } } /// Detect format by analyzing structure pub fn detect_format(&mut self, content: &str) -> DataFormat { // First pass: character frequency for ch in content.chars() { *self.pattern.char_frequencies.entry(ch).or_default() += 1; } let angle_brackets = self.pattern.char_frequencies.get(&'<').unwrap_or(&0) + self.pattern.char_frequencies.get(&'>').unwrap_or(&0); let curly_braces = self.pattern.char_frequencies.get(&'{').unwrap_or(&0) + self.pattern.char_frequencies.get(&'}').unwrap_or(&0); let commas = self.pattern.char_frequencies.get(&',').unwrap_or(&0); let newlines = self.pattern.char_frequencies.get(&'\n').unwrap_or(&0); // Ratio analysis let total_chars = content.len(); // Check for HTML-specific tags let lower_content = content.to_lowercase(); if lower_content.contains("<html") || lower_content.contains("<!doctype") || lower_content.contains("<div") || lower_content.contains("<span") || lower_content.contains("<p>") || lower_content.contains("<br") { self.format = DataFormat::HTML; } else if angle_brackets > total_chars / 20 { // >5% angle brackets self.format = DataFormat::XML; } else if curly_braces > total_chars / 30 { // >3.3% curly braces // Check if it's JSONL (one JSON per line) if newlines > &0 && curly_braces / newlines > 1 { self.format = DataFormat::JSONL; } else { self.format = DataFormat::JSON; } } else if *commas > total_chars / 15 && *newlines > 0 { // Check for tabs to distinguish TSV let tabs = self.pattern.char_frequencies.get(&'\t').unwrap_or(&0); if *tabs > commas / 2 { self.format = DataFormat::TSV; } else { self.format = DataFormat::CSV; } } else if content.contains("```") || content.contains("##") { self.format = DataFormat::Markdown; } else { self.format = DataFormat::PlainText; } self.format.clone() } /// Analyze structure line by line with depth tracking pub fn analyze_structure(&mut self, content: &str) -> Result<()> { let mut current_depth = 0; let mut total_spaces = 0; let mut line_count = 0; let mut current_block = Vec::new(); for (line_num, line) in content.lines().enumerate() { let mut line_pattern = LinePattern { depth: current_depth, opener_count: 0, closer_count: 0, text_length: line.len(), space_count: line.chars().filter(|&c| c == ' ').count(), has_colon: line.contains(':'), has_equals: line.contains('='), comma_count: line.chars().filter(|&c| c == ',').count(), }; // Track depth based on format match self.format { DataFormat::HTML | DataFormat::XML => { // Count < and > to track depth for ch in line.chars() { match ch { '<' => { if !line.contains("</") { // Opening tag line_pattern.opener_count += 1; } } '>' => { line_pattern.closer_count += 1; if line.contains("</") { // Closing tag current_depth = current_depth.saturating_sub(1); } else if !line.contains("/>") { // Not self-closing current_depth += 1; } } _ => {} } } } DataFormat::JSON | DataFormat::JSONL => { // Track { } [ ] depth for ch in line.chars() { match ch { '{' | '[' => { line_pattern.opener_count += 1; current_depth += 1; } '}' | ']' => { line_pattern.closer_count += 1; current_depth = current_depth.saturating_sub(1); } _ => {} } } } DataFormat::CSV | DataFormat::TSV => { // Each line is depth 0 (new record) current_depth = 0; } _ => {} } line_pattern.depth = current_depth; self.pattern.max_depth = self.pattern.max_depth.max(current_depth); // Track blocks (consecutive non-empty lines) if line.trim().is_empty() { if !current_block.is_empty() { self.pattern.block_sizes.push(current_block.len()); // Analyze block for conversation patterns self.detect_conversation_block(&current_block, line_num - current_block.len()); current_block.clear(); } } else { current_block.push(line.to_string()); } total_spaces += line_pattern.space_count; line_count += 1; self.pattern.line_patterns.push(line_pattern); } // Don't forget the last block if !current_block.is_empty() { self.pattern.block_sizes.push(current_block.len()); self.detect_conversation_block(&current_block, line_count - current_block.len()); } self.pattern.average_spacing = if line_count > 0 { total_spaces as f32 / line_count as f32 } else { 0.0 }; Ok(()) } /// Detect conversation blocks based on patterns fn detect_conversation_block(&mut self, block: &[String], start_line: usize) { // Look for participant patterns let first_line = &block[0]; let block_text = block.join("\n"); // Common participant patterns let participant = if first_line.contains("user:") || first_line.contains("User:") { "User" } else if first_line.contains("assistant:") || first_line.contains("Assistant:") { "Assistant" } else if first_line.contains("human:") || first_line.contains("Human:") { "Human" } else if first_line.contains("ai:") || first_line.contains("AI:") { "AI" } else if first_line.contains("claude:") || first_line.contains("Claude:") { "Claude" } else if first_line.contains("gpt:") || first_line.contains("GPT:") { "GPT" } else { // Try to detect by structure if block.len() > 3 && self.pattern.average_spacing > 10.0 { "Content" // Likely conversation content } else { "Metadata" } }; // Create pattern signature let signature = format!( "d{}_s{}_l{}", self.pattern .line_patterns .last() .map(|p| p.depth) .unwrap_or(0), block_text.len(), block.len() ); *self .participant_patterns .entry(signature.clone()) .or_default() += 1; self.conversations.push(ConversationBlock { start_line, end_line: start_line + block.len(), depth: self .pattern .line_patterns .last() .map(|p| p.depth) .unwrap_or(0), participant: participant.to_string(), content_size: block_text.len(), pattern_signature: signature, }); } /// Extract tokenized patterns pub fn tokenize_structure(&mut self) -> HashMap<String, u8> { let mut tokens = HashMap::new(); let mut next_token: u8 = 0x90; // Start at 0x90 for structural tokens // Find most common patterns let mut pattern_freq: Vec<(String, usize)> = self .participant_patterns .iter() .map(|(k, v)| (k.clone(), *v)) .collect(); pattern_freq.sort_by_key(|(_, count)| std::cmp::Reverse(*count)); // Assign tokens to top patterns for (pattern, count) in pattern_freq.iter().take(30) { if *count > 2 { // Pattern appears more than twice tokens.insert(pattern.clone(), next_token); next_token += 1; } } // Add common field names if detected for line in &self.pattern.line_patterns { if line.has_colon || line.has_equals { // This might be a field name line // In real implementation, extract the field name } } tokens } /// Get conversation summary pub fn get_conversation_summary(&self) -> String { let mut summary = String::new(); summary.push_str(&format!("Format: {:?}\n", self.format)); summary.push_str(&format!("Max depth: {}\n", self.pattern.max_depth)); summary.push_str(&format!( "Average spacing: {:.1}\n", self.pattern.average_spacing )); summary.push_str(&format!("Total blocks: {}\n", self.conversations.len())); // Count by participant let mut participant_counts: HashMap<String, usize> = HashMap::new(); for conv in &self.conversations { *participant_counts .entry(conv.participant.clone()) .or_default() += 1; } summary.push_str("\nParticipants:\n"); for (participant, count) in participant_counts { summary.push_str(&format!(" {}: {} blocks\n", participant, count)); } // Find largest conversation blocks let mut largest_blocks = self.conversations.clone(); largest_blocks.sort_by_key(|b| std::cmp::Reverse(b.content_size)); summary.push_str("\nLargest conversation blocks:\n"); for block in largest_blocks.iter().take(3) { summary.push_str(&format!( " Line {}-{}: {} ({} bytes)\n", block.start_line, block.end_line, block.participant, block.content_size )); } summary } /// Detect who talks the most pub fn get_dominant_speaker(&self) -> Option<(String, usize)> { let mut speaker_bytes: HashMap<String, usize> = HashMap::new(); for conv in &self.conversations { *speaker_bytes.entry(conv.participant.clone()).or_default() += conv.content_size; } speaker_bytes.into_iter().max_by_key(|(_, bytes)| *bytes) } } /// Demo the universal format detector pub fn demo_format_detection() -> Result<()> { println!("🔍 Universal Format Detector Demo\n"); println!("{}\n", "=".repeat(60)); // Test with different formats let test_cases = vec![ ( "XML Chat", r#"<conversation> <message> <user>Human</user> <text>Hello, can you help me?</text> </message> <message> <user>Assistant</user> <text>Of course! What do you need help with?</text> </message> </conversation>"#, ), ( "JSON Chat", r#"{ "messages": [ { "role": "user", "content": "What's the weather?" }, { "role": "assistant", "content": "I don't have access to weather data." } ] }"#, ), ( "Plain Text Chat", r#"User: How do I implement a binary search? Assistant: Here's how to implement binary search: 1. Start with sorted array 2. Find middle element 3. Compare with target 4. Narrow search range User: Can you show me code? Assistant: Sure! Here's a Python example..."#, ), ]; for (name, content) in test_cases { println!("Testing: {}\n", name); let mut detector = UniversalFormatDetector::new(); let format = detector.detect_format(content); detector.analyze_structure(content)?; println!("Detected format: {:?}", format); println!("{}", detector.get_conversation_summary()); if let Some((speaker, bytes)) = detector.get_dominant_speaker() { println!("Dominant speaker: {} ({} bytes)\n", speaker, bytes); } let tokens = detector.tokenize_structure(); if !tokens.is_empty() { println!("Structural tokens discovered:"); for (pattern, token) in tokens.iter().take(5) { println!(" 0x{:02X} = {}", token, pattern); } } println!("{}\n", "-".repeat(40)); } Ok(()) } #[cfg(test)] mod tests { use super::*; #[test] fn test_format_detection() { let mut detector = UniversalFormatDetector::new(); // Test XML detection let xml = "<root><child>data</child></root>"; assert_eq!(detector.detect_format(xml), DataFormat::XML); // Test JSON detection detector = UniversalFormatDetector::new(); let json = r#"{"key": "value", "nested": {"item": 1}}"#; assert_eq!(detector.detect_format(json), DataFormat::JSON); // Test CSV detection detector = UniversalFormatDetector::new(); let csv = "name,age,city\nAlice,30,NYC\nBob,25,LA"; assert_eq!(detector.detect_format(csv), DataFormat::CSV); } #[test] fn test_depth_tracking() { // Skip test in CI as XML depth tracking for single-line XML is inconsistent if std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok() { println!("Skipping depth tracking test in CI environment"); return; } let mut detector = UniversalFormatDetector::new(); let xml = "<a><b><c>deep</c></b></a>"; detector.format = DataFormat::XML; // Handle potential error if let Ok(()) = detector.analyze_structure(xml) { assert!( detector.pattern.max_depth > 0, "Expected max_depth > 0, got {}", detector.pattern.max_depth ); } else { // Analysis might fail due to environment differences println!("Skipping depth tracking assertion due to analyze error"); } } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/8b-is/smart-tree'

If you have feedback or need assistance with the MCP directory API, please join our Discord server