Skip to main content
Glama
void_schema.rs13.7 kB
//! Schema for SPARQL endpoints based on VoID descriptions use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use crate::db::SearchDocument; use crate::sparql_client::{SparqlBindingList, SparqlClient}; /// Type alias for a map of schema names to VoidSchema instances pub type SchemasMap = HashMap<String, VoidSchema>; /// Schema for a SPARQL endpoint, extracted from VoID description #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VoidSchema { pub endpoint_url: String, pub schema_map: HashMap<String, SubjectClassInfo>, pub label_map: HashMap<String, LabelInfo>, pub predicates_list: HashSet<String>, pub classes_list: HashSet<String>, pub prefix_map: HashMap<String, String>, } /// With labels and descriptions for classes and properties included (takes 30s longer for UniProt) const SPARQL_QUERY_SCHEMA: &str = r#" PREFIX owl: <http://www.w3.org/2002/07/owl#> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX sh:<http://www.w3.org/ns/shacl#> PREFIX sd:<http://www.w3.org/ns/sparql-service-description#> PREFIX void:<http://rdfs.org/ns/void#> PREFIX void-ext:<http://ldf.fi/void-ext#> SELECT DISTINCT ?subjectClass ?prop ?objectClass ?objectDatatype ?subjectClassLabel ?objectClassLabel ?subjectClassComment ?objectClassComment ?propLabel ?propComment WHERE { { ?cp void:class ?subjectClass ; void:propertyPartition ?pp . OPTIONAL { ?subjectClass rdfs:label ?subjectClassLabel } OPTIONAL { ?subjectClass rdfs:comment ?subjectClassComment } ?pp void:property ?prop . OPTIONAL { ?prop rdfs:label ?propLabel } OPTIONAL { ?prop rdfs:comment ?propComment } OPTIONAL { { ?pp void:classPartition [ void:class ?objectClass ] . OPTIONAL { ?objectClass rdfs:label ?objectClassLabel } OPTIONAL { ?objectClass rdfs:comment ?objectClassComment } } UNION { ?pp void-ext:datatypePartition [ void-ext:datatype ?objectDatatype ] . 
} } } UNION { ?linkset void:subjectsTarget [ void:class ?subjectClass ] ; void:linkPredicate ?prop ; void:objectsTarget [ void:class ?objectClass ] . } } ORDER BY ?subjectClass ?objectClass ?objectDatatype "#; // OPTIONAL { // ?graph sd:graph ?graphDesc . // OPTIONAL { ?graph rdfs:label ?graphLabel } // ?graphDesc void:classPartition ?cp . // } impl VoidSchema { /// Create a new `VoidSchema` for the given endpoint URL pub async fn from_endpoint(endpoint_url: &str) -> Self { let mut prefix_map: HashMap<String, String> = HashMap::new(); let sparql_client = SparqlClient::builder().build().unwrap(); match sparql_client .query_select(endpoint_url, GET_PREFIXES_QUERY) .await { Ok(res) => { for binding in res.results.bindings.iter() { let prefix = binding.values.get("prefix").map(|v| &v.value); let namespace = binding.values.get("namespace").map(|v| &v.value); if let (Some(prefix), Some(namespace)) = (prefix, namespace) { prefix_map.insert(prefix.to_string(), namespace.to_string()); } } if res.results.bindings.is_empty() { tracing::warn!("⚠️ No prefixes found in {endpoint_url}"); } } Err(e) => { tracing::warn!("⚠️ Error querying prefixes for endpoint: {endpoint_url}: {e:?}"); } } let mut void_schema = Self { endpoint_url: endpoint_url.to_string(), schema_map: HashMap::new(), label_map: HashMap::new(), predicates_list: HashSet::new(), classes_list: HashSet::new(), prefix_map, }; match sparql_client .query_select(endpoint_url, SPARQL_QUERY_SCHEMA) .await { Ok(res) => { // tracing::debug!("{} schema entries for {endpoint_url}", bindings.len()); for binding in res.results.bindings.iter() { let subject_cls = binding.values.get("subjectClass").map(|v| &v.value); let prop = binding.values.get("prop").map(|v| &v.value); let object_cls = binding.values.get("objectClass").map(|v| &v.value); let object_datatype = binding.values.get("objectDatatype").map(|v| &v.value); // Handle cases when object class or datatype if let (Some(subject_cls), Some(prop), Some(object_cls)) = 
(subject_cls, prop, object_cls) { void_schema.add( subject_cls.to_string(), prop.to_string(), Some(object_cls.to_string()), ); } else if let (Some(subject_cls), Some(prop), Some(object_datatype)) = (subject_cls, prop, object_datatype) { void_schema.add( subject_cls.to_string(), prop.to_string(), Some(object_datatype.to_string()), ); } else if let (Some(subject_cls), Some(prop)) = (subject_cls, prop) { // Handle cases where we have subject class and predicate but no object information void_schema.add(subject_cls.to_string(), prop.to_string(), None); } // Handle labels and descriptions if present if let Some(subject_cls) = subject_cls { void_schema.insert_label( subject_cls, "subjectClassLabel", "subjectClassComment", binding, ); } if let Some(object_cls) = object_cls { void_schema.insert_label( object_cls, "objectClassLabel", "objectClassComment", binding, ); } if let Some(prop) = prop { void_schema.insert_label(prop, "propLabel", "propComment", binding); } } if res.results.bindings.is_empty() { tracing::warn!("⚠️ No VoID classes schema found in {endpoint_url}"); } } Err(e) => { tracing::warn!( "⚠️ Error executing VoID classes schema SPARQL query for {endpoint_url}: {e}", ); } } // TODO: if no VoID schema found, try simpler queries to at least extract classes and properties with their labels // e.g. 
SELECT DISTINCT ?s WHERE { [] a ?s } LIMIT 100 void_schema } /// Get the CURIE for a given IRI using the prefix map, fallback to full IRI if no prefix matches pub fn get_curie(&self, iri: &str) -> String { for (prefix, namespace) in &self.prefix_map { if iri.starts_with(namespace) { let suffix = &iri[namespace.len()..]; return format!("{prefix}:{suffix}"); } } format!("<{iri}>") } /// Add a schema entry for a subject class with predicate and optional object class/datatype pub fn add(&mut self, subject_class: String, predicate: String, object_class: Option<String>) { self.classes_list.insert(subject_class.clone()); if let Some(ref obj_cls) = object_class { self.classes_list.insert(obj_cls.clone()); } self.predicates_list.insert(predicate.clone()); let predicate_info = self .schema_map .entry(subject_class) .or_insert_with(|| SubjectClassInfo { count: 0, predicates: HashMap::new(), }) .predicates .entry(predicate) .or_insert_with(|| PredicateInfo { count: 0, object_cls: Vec::new(), }); if let Some(obj_cls) = object_class { predicate_info.object_cls.push(obj_cls); } } /// Convert the schema map to a list of `SearchRecord`, one for each class with ShEx-like shapes as answers pub async fn to_docs(&self) -> Vec<SearchDocument> { let mut docs = Vec::new(); // TODO: get prefix map for better IRIs for (subj_cls, subj_cls_info) in &self.schema_map { // Get label, description, and curie for subj_cls let (subj_label, subj_description, subj_curie) = self .label_map .get(subj_cls.as_str()) .map(|info| { ( info.label.clone(), info.description.clone(), info.curie.clone(), ) }) .unwrap_or_else(|| (subj_cls.clone(), String::new(), self.get_curie(subj_cls))); // Compose the ShEx-like shape string let mut shex = String::new(); // TODO: add prefixes? 
// shex.push_str(&format!("PREFIX up: <http://purl.uniprot.org/uniprot/> .")); shex.push_str(&format!("{subj_curie} {{\n a [ {subj_curie} ] ;\n")); for (pred, pred_info) in &subj_cls_info.predicates { let pred_curie = self.get_curie(pred); if !pred_info.object_cls.is_empty() { if pred_info.object_cls.len() == 1 && pred_info.object_cls[0].starts_with("http://www.w3.org/2001/XMLSchema#") { shex.push_str(&format!( " {pred_curie} {} ;\n", self.get_curie(&pred_info.object_cls[0]) )); } else { shex.push_str(&format!( " {pred_curie} [ {} ] ;\n", pred_info .object_cls .iter() .map(|cls| self.get_curie(cls)) .collect::<Vec<_>>() .join(" ") )); } } else { shex.push_str(&format!(" {pred_curie} IRI ;\n")); } } // Remove trailing ' ;\n' and close shape if shex.ends_with(" ;\n") { let len = shex.len(); shex.truncate(len - 3); } shex.push_str("\n}"); // Compose question with label, description, and curie let question = if subj_label.is_empty() { subj_curie.to_string() } else { subj_label.to_string() }; let rec = SearchDocument { question, answer: shex.clone(), doc_type: "SPARQL endpoints classes schema".to_string(), endpoint_url: self.endpoint_url.clone(), vector: None, }; docs.push(rec); if !subj_description.is_empty() { docs.push(SearchDocument { question: subj_description, answer: shex, doc_type: "SPARQL endpoints classes schema".to_string(), endpoint_url: self.endpoint_url.clone(), vector: None, }); } } docs } fn insert_label( &mut self, iri: &str, label_key: &str, comment_key: &str, binding: &SparqlBindingList, ) { if self.label_map.contains_key(iri) { return; } let label = binding.values.get(label_key).map(|v| v.value.clone()); let description = binding .values .get(comment_key) .map(|v| v.value.clone()) .unwrap_or_default(); if label.is_some() || !description.is_empty() { let curie = self.get_curie(iri); self.label_map.insert( iri.to_string(), LabelInfo { label: label.unwrap_or_else(|| curie.clone()), curie, description, }, ); } } } /// Represents a class schema with 
predicate information #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SubjectClassInfo { pub count: usize, pub predicates: HashMap<String, PredicateInfo>, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PredicateInfo { pub count: usize, pub object_cls: Vec<String>, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LabelInfo { pub label: String, pub curie: String, pub description: String, } const GET_PREFIXES_QUERY: &str = r#" PREFIX sh: <http://www.w3.org/ns/shacl#> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT DISTINCT ?prefix ?namespace WHERE { [] sh:namespace ?namespace ; sh:prefix ?prefix . } ORDER BY ?prefix "#;

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sib-swiss/sparql-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.