void_schema.rs•13.7 kB
//! Schema for SPARQL endpoints based on VoID descriptions
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use crate::db::SearchDocument;
use crate::sparql_client::{SparqlBindingList, SparqlClient};
/// Map from a schema name to the [`VoidSchema`] stored under that name.
///
/// NOTE(review): the key is presumably an endpoint identifier chosen by the caller;
/// nothing in this file constrains it, so confirm at the call sites.
pub type SchemasMap = HashMap<String, VoidSchema>;
/// Schema for a SPARQL endpoint, extracted from its VoID description.
///
/// Built by `VoidSchema::from_endpoint`, which fills the maps below from the results of
/// two SPARQL queries (prefix declarations and VoID class/property partitions).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoidSchema {
/// URL of the SPARQL endpoint this schema was extracted from.
pub endpoint_url: String,
/// Subject class IRI -> predicates recorded for that class.
pub schema_map: HashMap<String, SubjectClassInfo>,
/// Class or property IRI -> label, CURIE and description found for it.
pub label_map: HashMap<String, LabelInfo>,
/// Every predicate IRI passed to `add`.
pub predicates_list: HashSet<String>,
/// Every subject IRI and every object IRI passed to `add`. `from_endpoint` also passes
/// datatype IRIs as objects, so this set can contain datatypes as well as classes.
pub classes_list: HashSet<String>,
/// Prefix -> namespace IRI, used by `get_curie` to shorten IRIs.
pub prefix_map: HashMap<String, String>,
}
/// SPARQL query that extracts the classes schema from an endpoint's VoID description.
///
/// Each result row holds a subject class, a property and, when available, either the
/// object class or the object datatype, taken from VoID class/property partitions or
/// from linksets. Rows from partitions also carry the optional `rdfs:label` and
/// `rdfs:comment` of the subject class, property and object class.
///
/// Including labels and comments makes the query slower (the original author measured
/// about 30s extra on UniProt).
const SPARQL_QUERY_SCHEMA: &str = r#"
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sh:<http://www.w3.org/ns/shacl#>
PREFIX sd:<http://www.w3.org/ns/sparql-service-description#>
PREFIX void:<http://rdfs.org/ns/void#>
PREFIX void-ext:<http://ldf.fi/void-ext#>
SELECT DISTINCT ?subjectClass ?prop ?objectClass ?objectDatatype
?subjectClassLabel ?objectClassLabel ?subjectClassComment ?objectClassComment ?propLabel ?propComment
WHERE {
{
?cp void:class ?subjectClass ;
void:propertyPartition ?pp .
OPTIONAL { ?subjectClass rdfs:label ?subjectClassLabel }
OPTIONAL { ?subjectClass rdfs:comment ?subjectClassComment }
?pp void:property ?prop .
OPTIONAL { ?prop rdfs:label ?propLabel }
OPTIONAL { ?prop rdfs:comment ?propComment }
OPTIONAL {
{
?pp void:classPartition [ void:class ?objectClass ] .
OPTIONAL { ?objectClass rdfs:label ?objectClassLabel }
OPTIONAL { ?objectClass rdfs:comment ?objectClassComment }
} UNION {
?pp void-ext:datatypePartition [ void-ext:datatype ?objectDatatype ] .
}
}
} UNION {
?linkset void:subjectsTarget [ void:class ?subjectClass ] ;
void:linkPredicate ?prop ;
void:objectsTarget [ void:class ?objectClass ] .
}
} ORDER BY ?subjectClass ?objectClass ?objectDatatype
"#;
// OPTIONAL {
// ?graph sd:graph ?graphDesc .
// OPTIONAL { ?graph rdfs:label ?graphLabel }
// ?graphDesc void:classPartition ?cp .
// }
impl VoidSchema {
/// Build a `VoidSchema` by querying the SPARQL endpoint at `endpoint_url`.
///
/// Two queries are sent in sequence: `GET_PREFIXES_QUERY` fills `prefix_map`, then
/// `SPARQL_QUERY_SCHEMA` fills the schema and label maps. A failing or empty query is
/// only logged as a warning, so the returned schema may be partially or entirely empty.
///
/// # Panics
///
/// Panics if building the `SparqlClient` fails, because the build result is unwrapped.
pub async fn from_endpoint(endpoint_url: &str) -> Self {
    let sparql_client = SparqlClient::builder().build().unwrap();

    // Step 1: collect the prefix declarations exposed by the endpoint.
    let mut prefix_map: HashMap<String, String> = HashMap::new();
    match sparql_client
        .query_select(endpoint_url, GET_PREFIXES_QUERY)
        .await
    {
        Err(e) => {
            tracing::warn!("⚠️ Error querying prefixes for endpoint: {endpoint_url}: {e:?}");
        }
        Ok(res) => {
            if res.results.bindings.is_empty() {
                tracing::warn!("⚠️ No prefixes found in {endpoint_url}");
            }
            for row in res.results.bindings.iter() {
                // Rows missing either variable are skipped.
                if let (Some(prefix), Some(namespace)) =
                    (row.values.get("prefix"), row.values.get("namespace"))
                {
                    prefix_map.insert(prefix.value.to_string(), namespace.value.to_string());
                }
            }
        }
    }

    let mut void_schema = Self {
        endpoint_url: endpoint_url.to_string(),
        schema_map: HashMap::new(),
        label_map: HashMap::new(),
        predicates_list: HashSet::new(),
        classes_list: HashSet::new(),
        prefix_map,
    };

    // Step 2: collect the class/predicate/object rows of the VoID description.
    match sparql_client
        .query_select(endpoint_url, SPARQL_QUERY_SCHEMA)
        .await
    {
        Err(e) => {
            tracing::warn!(
                "⚠️ Error executing VoID classes schema SPARQL query for {endpoint_url}: {e}",
            );
        }
        Ok(res) => {
            if res.results.bindings.is_empty() {
                tracing::warn!("⚠️ No VoID classes schema found in {endpoint_url}");
            }
            for row in res.results.bindings.iter() {
                let subject = row.values.get("subjectClass").map(|v| &v.value);
                let predicate = row.values.get("prop").map(|v| &v.value);
                let object_class = row.values.get("objectClass").map(|v| &v.value);
                let object_datatype = row.values.get("objectDatatype").map(|v| &v.value);

                // A row needs at least a subject class and a predicate to be recorded.
                // The object is the class when bound, else the datatype, else nothing.
                if let (Some(subject), Some(predicate)) = (subject, predicate) {
                    let object = object_class
                        .or(object_datatype)
                        .map(|iri| iri.to_string());
                    void_schema.add(subject.to_string(), predicate.to_string(), object);
                }

                // Record labels and descriptions for whichever IRIs this row binds,
                // in the order subject class, object class, property.
                let labelled = [
                    (subject, "subjectClassLabel", "subjectClassComment"),
                    (object_class, "objectClassLabel", "objectClassComment"),
                    (predicate, "propLabel", "propComment"),
                ];
                for (iri, label_key, comment_key) in labelled {
                    if let Some(iri) = iri {
                        void_schema.insert_label(iri, label_key, comment_key, row);
                    }
                }
            }
        }
    }

    // TODO: if no VoID schema found, try simpler queries to at least extract classes and properties with their labels
    // e.g. SELECT DISTINCT ?s WHERE { [] a ?s } LIMIT 100
    void_schema
}
/// Shorten `iri` to a CURIE (`prefix:suffix`) using `prefix_map`.
///
/// When several namespaces are a prefix of `iri`, the longest one wins; if two prefixes
/// map to the same namespace, the lexicographically smallest prefix is used. This keeps
/// the result independent of `HashMap` iteration order. Empty namespaces are ignored
/// because they would match every IRI.
///
/// Returns the full IRI wrapped in angle brackets (`<iri>`) when no namespace matches.
pub fn get_curie(&self, iri: &str) -> String {
    // Best candidate so far as (prefix, suffix). A shorter suffix means a longer namespace.
    let mut best: Option<(&str, &str)> = None;
    for (prefix, namespace) in &self.prefix_map {
        if namespace.is_empty() {
            continue;
        }
        if let Some(suffix) = iri.strip_prefix(namespace.as_str()) {
            let is_better = match best {
                None => true,
                Some((best_prefix, best_suffix)) => {
                    suffix.len() < best_suffix.len()
                        || (suffix.len() == best_suffix.len() && prefix.as_str() < best_prefix)
                }
            };
            if is_better {
                best = Some((prefix.as_str(), suffix));
            }
        }
    }
    match best {
        Some((prefix, suffix)) => format!("{prefix}:{suffix}"),
        None => format!("<{iri}>"),
    }
}
/// Record that `subject_class` uses `predicate`, optionally pointing at `object_class`.
///
/// `object_class` may be a class IRI or a datatype IRI. The subject and the object (when
/// given) are added to `classes_list`, the predicate to `predicates_list`, and the
/// predicate entry under the subject in `schema_map` is created on first use.
///
/// An object already recorded for the same subject/predicate pair is not added again.
/// The same triple can be passed more than once, for instance when the schema query
/// returns several rows that differ only in their label or comment columns.
pub fn add(&mut self, subject_class: String, predicate: String, object_class: Option<String>) {
    self.classes_list.insert(subject_class.clone());
    if let Some(obj_cls) = &object_class {
        self.classes_list.insert(obj_cls.clone());
    }
    self.predicates_list.insert(predicate.clone());

    let predicate_info = self
        .schema_map
        .entry(subject_class)
        .or_insert_with(|| SubjectClassInfo {
            count: 0,
            predicates: HashMap::new(),
        })
        .predicates
        .entry(predicate)
        .or_insert_with(|| PredicateInfo {
            count: 0,
            object_cls: Vec::new(),
        });

    if let Some(obj_cls) = object_class {
        // Keep first-seen order but skip repeats; object lists are short, so a linear
        // scan is fine.
        if !predicate_info.object_cls.contains(&obj_cls) {
            predicate_info.object_cls.push(obj_cls);
        }
    }
}
/// Convert the schema map into a list of `SearchDocument`s whose answers are ShEx-like shapes.
///
/// Every subject class yields one document whose question is the class label (or the
/// class CURIE when the label is empty). When a non-empty description is known for the
/// class, a second document with the description as question is added. Both documents
/// carry the same shape as answer.
///
/// Classes and predicates are emitted in sorted IRI order so the generated text is
/// reproducible across runs, which plain `HashMap` iteration does not guarantee.
pub async fn to_docs(&self) -> Vec<SearchDocument> {
    const DOC_TYPE: &str = "SPARQL endpoints classes schema";
    const XSD_NAMESPACE: &str = "http://www.w3.org/2001/XMLSchema#";

    let mut classes: Vec<_> = self.schema_map.iter().collect();
    classes.sort_unstable_by(|a, b| a.0.cmp(b.0));

    let mut docs = Vec::with_capacity(classes.len());
    for (subj_cls, subj_cls_info) in classes {
        // Label, description and CURIE of the subject class; fall back to the raw IRI as
        // label when nothing was recorded for it.
        let (subj_label, subj_description, subj_curie) =
            match self.label_map.get(subj_cls.as_str()) {
                Some(info) => (
                    info.label.clone(),
                    info.description.clone(),
                    info.curie.clone(),
                ),
                None => (subj_cls.clone(), String::new(), self.get_curie(subj_cls)),
            };

        let mut predicates: Vec<_> = subj_cls_info.predicates.iter().collect();
        predicates.sort_unstable_by(|a, b| a.0.cmp(b.0));

        // One constraint per line; lines are joined with " ;\n" so the last one carries
        // no trailing separator.
        // TODO: consider emitting PREFIX declarations in front of the shape.
        let mut constraints = Vec::with_capacity(predicates.len() + 1);
        constraints.push(format!(" a [ {subj_curie} ]"));
        for (pred, pred_info) in predicates {
            let pred_curie = self.get_curie(pred);
            let constraint = match pred_info.object_cls.as_slice() {
                // No object information recorded for this predicate.
                [] => format!(" {pred_curie} IRI"),
                // A single XSD datatype is written bare, without a value set.
                [datatype] if datatype.starts_with(XSD_NAMESPACE) => {
                    format!(" {pred_curie} {}", self.get_curie(datatype))
                }
                objects => format!(
                    " {pred_curie} [ {} ]",
                    objects
                        .iter()
                        .map(|cls| self.get_curie(cls))
                        .collect::<Vec<_>>()
                        .join(" ")
                ),
            };
            constraints.push(constraint);
        }
        let shex = format!("{subj_curie} {{\n{}\n}}", constraints.join(" ;\n"));

        let question = if subj_label.is_empty() {
            subj_curie
        } else {
            subj_label
        };
        docs.push(SearchDocument {
            question,
            answer: shex.clone(),
            doc_type: DOC_TYPE.to_string(),
            endpoint_url: self.endpoint_url.clone(),
            vector: None,
        });
        if !subj_description.is_empty() {
            docs.push(SearchDocument {
                question: subj_description,
                answer: shex,
                doc_type: DOC_TYPE.to_string(),
                endpoint_url: self.endpoint_url.clone(),
                vector: None,
            });
        }
    }
    docs
}
/// Store the label and description of `iri` found in `binding`, unless `iri` already has
/// an entry in `label_map` (the first entry wins).
///
/// `label_key` and `comment_key` name the result variables holding the label and the
/// comment. Nothing is stored when the row has no label and no non-empty comment. A
/// missing label falls back to the CURIE of `iri`.
fn insert_label(
    &mut self,
    iri: &str,
    label_key: &str,
    comment_key: &str,
    binding: &SparqlBindingList,
) {
    if self.label_map.contains_key(iri) {
        return;
    }

    let label = binding.values.get(label_key).map(|v| v.value.clone());
    let description = match binding.values.get(comment_key) {
        Some(comment) => comment.value.clone(),
        None => String::new(),
    };
    if label.is_none() && description.is_empty() {
        return;
    }

    let curie = self.get_curie(iri);
    let label = match label {
        Some(text) => text,
        None => curie.clone(),
    };
    self.label_map.insert(
        iri.to_string(),
        LabelInfo {
            label,
            curie,
            description,
        },
    );
}
}
/// Schema information recorded for one subject class: the predicates used with it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubjectClassInfo {
/// Initialised to 0 by `VoidSchema::add`; no code in this file updates it afterwards.
pub count: usize,
/// Predicate IRI -> object information recorded for that predicate on this class.
pub predicates: HashMap<String, PredicateInfo>,
}
/// Object information recorded for one predicate of a subject class.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PredicateInfo {
/// Initialised to 0 by `VoidSchema::add`; no code in this file updates it afterwards.
pub count: usize,
/// IRIs of the object classes or datatypes passed to `VoidSchema::add` for this
/// predicate. Empty when no object information was available.
pub object_cls: Vec<String>,
}
/// Human-readable information attached to a class or property IRI.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LabelInfo {
/// Label of the IRI; `VoidSchema::insert_label` stores the CURIE here when the query
/// row had no label.
pub label: String,
/// Compact form of the IRI as produced by `VoidSchema::get_curie`.
pub curie: String,
/// Comment text of the IRI; empty when the query row had none.
pub description: String,
}
/// SPARQL query listing the prefix declarations published by an endpoint.
///
/// Selects every `?prefix` / `?namespace` pair attached to a node through `sh:prefix`
/// and `sh:namespace`, ordered by prefix. `VoidSchema::from_endpoint` uses the result to
/// fill `prefix_map`.
const GET_PREFIXES_QUERY: &str = r#"
PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?prefix ?namespace
WHERE {
[] sh:namespace ?namespace ;
sh:prefix ?prefix .
} ORDER BY ?prefix
"#;