# oai_pmh_client.py
"""
OAI-PMH client for Riksarkivet.
"""
import logging
from typing import Dict, Optional, Union, List

import requests
from lxml import etree

from ..config import OAI_BASE_URL, NAMESPACES
logger = logging.getLogger(__name__)


class OAIPMHError(Exception):
    """Raised when a repository response contains an OAI-PMH ``<error>`` element.

    Subclasses ``Exception`` so existing ``except Exception`` handlers keep
    working; the string form is ``"OAI-PMH Error [<code>]: <message>"``.
    """

    def __init__(self, code: str, message: str):
        super().__init__(f"OAI-PMH Error [{code}]: {message}")
        self.code = code
        self.message = message


class OAIPMHClient:
    """Client for interacting with OAI-PMH repositories.

    All requests go through one ``requests.Session`` that carries a fixed
    ``User-Agent`` header and are sent to ``base_url``.
    """

    # Substrings that mark an ``extref`` link as the one stored under "nad_link".
    _NAD_HOSTS = ("sok.riksarkivet.se", "sok-acc.riksarkivet.se")

    def __init__(self, base_url: str = OAI_BASE_URL, timeout: float = 30.0):
        """Create a client.

        Args:
            base_url: Endpoint every OAI-PMH request is sent to.
            timeout: Seconds passed to ``requests`` as the per-request
                timeout, so a stalled server cannot block the caller forever.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "Transcribed-Search-Browser/1.0"})

    def get_record(
        self, identifier: str, metadata_prefix: str = "oai_ape_ead"
    ) -> Dict[str, Union[str, List, Dict]]:
        """Get a specific record with full metadata.

        Args:
            identifier: Value sent as the ``identifier`` request argument.
            metadata_prefix: Value sent as the ``metadataPrefix`` argument.

        Returns:
            A dict with ``"identifier"``, ``"datestamp"`` (empty string when
            the header element is missing or empty) and ``"metadata_format"``.
            When ``metadata_prefix`` is ``"oai_ape_ead"`` the keys produced by
            ``_extract_ead_metadata`` are merged in.

        Raises:
            requests.RequestException: On transport failure, timeout or a
                non-success HTTP status.
            OAIPMHError: If the response carries an ``<error>`` element.
            IndexError: If the response has no ``<record>`` element or the
                record has no ``<header>`` element.
        """
        params = {
            "verb": "GetRecord",
            "identifier": identifier,
            "metadataPrefix": metadata_prefix,
        }
        root = self._make_request(params)

        # IndexError is kept as the exception type (it is what indexing the
        # empty result list used to raise) but now says what was missing.
        records = root.xpath("//oai:record", namespaces=NAMESPACES)
        if not records:
            raise IndexError("OAI-PMH response contains no <record> element")
        record = records[0]

        headers = record.xpath("oai:header", namespaces=NAMESPACES)
        if not headers:
            raise IndexError("OAI-PMH record contains no <header> element")
        header = headers[0]

        result: Dict[str, Union[str, List, Dict]] = {
            "identifier": self._get_text(header, "oai:identifier") or "",
            "datestamp": self._get_text(header, "oai:datestamp") or "",
            "metadata_format": metadata_prefix,
        }
        if metadata_prefix == "oai_ape_ead":
            result.update(self._extract_ead_metadata(record))
        return result

    def extract_pid(self, identifier: str) -> Optional[str]:
        """Extract PID from a record for IIIF access.

        The PID is taken to be the last non-empty ``/``-separated segment of
        the record's ``"nad_link"``.

        Returns:
            The PID, or ``None`` when the record has no usable ``"nad_link"``
            or the record could not be fetched. This lookup is best-effort:
            failures are logged, never raised.
        """
        try:
            record = self.get_record(identifier, "oai_ape_ead")
        except Exception as exc:  # deliberate best-effort boundary
            logger.warning("Could not fetch record %r for PID extraction: %s", identifier, exc)
            return None

        nad_link = record.get("nad_link")
        if not isinstance(nad_link, str) or not nad_link:
            return None
        # Strip trailing slashes first so ".../abc/" yields "abc", not "".
        pid = nad_link.rstrip("/").rsplit("/", 1)[-1]
        return pid or None

    def _make_request(self, params: Dict[str, str]) -> etree.Element:
        """Make an OAI-PMH request and return parsed XML.

        Args:
            params: Query parameters for the GET request to ``base_url``.

        Returns:
            The root element of the parsed response body.

        Raises:
            requests.RequestException: On transport failure, timeout or a
                non-success HTTP status.
            OAIPMHError: If the response contains an ``<error>`` element; the
                code defaults to ``"unknown"`` and the message to
                ``"Unknown error"`` when absent.
        """
        response = self.session.get(self.base_url, params=params, timeout=self.timeout)
        response.raise_for_status()

        # The body comes from a remote server: do not expand entities.
        parser = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
        root = etree.fromstring(response.content, parser)

        errors = root.xpath("//oai:error", namespaces=NAMESPACES)
        if errors:
            first = errors[0]
            raise OAIPMHError(first.get("code", "unknown"), first.text or "Unknown error")
        return root

    def _get_text(self, element, xpath: str) -> Optional[str]:
        """Safely extract text from an XML element.

        Evaluates ``xpath`` relative to ``element`` and returns the ``.text``
        of the first match, or ``None`` when nothing matches or the text is
        empty.
        """
        result = element.xpath(xpath, namespaces=NAMESPACES)
        return result[0].text if result and result[0].text else None

    def _extract_ead_metadata(self, record) -> Dict[str, Union[str, List, Dict]]:
        """Extract metadata from EAD format.

        Looks at the first ``ead:ead`` descendant of ``record``.

        Returns:
            A dict that may contain ``"title"`` (first ``unittitle`` text),
            ``"date"`` (first ``unitdate`` text) and ``"nad_link"`` (first
            ``extref`` href containing one of ``_NAD_HOSTS``). Keys are
            omitted when the value is not found; the dict is empty when there
            is no ``ead:ead`` element.
        """
        metadata: Dict[str, Union[str, List, Dict]] = {}
        ead_nodes = record.xpath(".//ead:ead", namespaces=NAMESPACES)
        if not ead_nodes:
            return metadata
        ead = ead_nodes[0]

        title = self._get_text(ead, ".//ead:unittitle")
        if title:
            metadata["title"] = title

        date = self._get_text(ead, ".//ead:unitdate")
        if date:
            metadata["date"] = date

        hrefs = ead.xpath(".//ead:extref/@xlink:href", namespaces=NAMESPACES)
        nad_link = next(
            (href for href in hrefs if any(host in href for host in self._NAD_HOSTS)),
            None,
        )
        if nad_link is not None:
            # XPath attribute results are lxml "smart strings" that reference
            # their parent element; store a plain str so the returned dict
            # does not keep the whole parsed document alive.
            metadata["nad_link"] = str(nad_link)
        return metadata