links.py (7.1 kB)
from __future__ import annotations

from typing import Any, Dict, List

import pandas as pd

from .datasets import DatasetLoadError, LoadedDataset, load_dataset
from .taxonomy import guess_genus

NORDIC_COCONUT_DATASET = "coconut_csv-09-2025_FI_NO_plants.csv"
LAJI_DATASET = "laji2_fi.txt"
GBIF_DATASET = "gbif_plants_FI_NO_merged.tsv"


def _normalize_name(value: Any) -> str:
    """Lower-case and strip a name so the datasets can be joined case-insensitively."""
    if value is None:
        return ""
    return str(value).strip().lower()


def _to_int(value: Any) -> int | None:
    """Coerce a value to int, returning None for NaN/None/unparseable input."""
    try:
        if pd.isna(value):
            return None
    except TypeError:
        # pd.isna raises on some non-scalar inputs; treat only None as missing there.
        if value is None:
            return None
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None


def _load_dataset(dataset_name: str) -> LoadedDataset:
    return load_dataset(dataset_name)


def link_compound_to_nordic_plants(compound: str) -> Dict[str, Any]:
    """Return Nordic plant observations for a compound across COCONUT→Laji→GBIF."""
    if not compound or not compound.strip():
        raise ValueError("compound must be a non-empty string")

    normalized = _normalize_name(compound)

    # 1) COCONUT: find the compound and collect the organisms it occurs in.
    coconut_dataset = _load_dataset(NORDIC_COCONUT_DATASET)
    coconut_frame = coconut_dataset.frame
    if "name" not in coconut_frame.columns or "organisms" not in coconut_frame.columns:
        raise DatasetLoadError(
            f"Dataset '{NORDIC_COCONUT_DATASET}' is missing required columns."
        )

    coconut_frame = coconut_frame.copy()
    coconut_frame["__name_norm"] = (
        coconut_frame["name"].astype(str).str.strip().str.lower()
    )
    matches = coconut_frame[coconut_frame["__name_norm"] == normalized]

    # Map normalized organism name -> original spelling ("|"-separated in COCONUT).
    organism_map: Dict[str, str] = {}
    for raw_organisms in matches["organisms"].dropna():
        for token in str(raw_organisms).split("|"):
            token = token.strip()
            if token:
                organism_map.setdefault(token.lower(), token)

    organism_list = sorted(organism_map.values(), key=lambda v: v.lower())

    if matches.empty:
        return {
            "compound": compound,
            "organisms": organism_list,
            "table": [],
            "missing_in_laji": [],
            "missing_in_gbif": [],
        }

    observed_norms = set(organism_map.keys())

    # 2) Laji.fi: Finnish observation counts and taxon pages for those organisms.
    laji_dataset = _load_dataset(LAJI_DATASET)
    laji_frame = laji_dataset.frame
    required_laji = {"Scientific name", "Observation count from Finland", "Identifier"}
    if not required_laji.issubset(laji_frame.columns):
        missing = ", ".join(sorted(required_laji - set(laji_frame.columns)))
        raise DatasetLoadError(f"Dataset '{LAJI_DATASET}' is missing columns: {missing}")

    laji_subset = laji_frame[list(required_laji)].copy()
    laji_subset["__name_norm"] = (
        laji_subset["Scientific name"].astype(str).str.strip().str.lower()
    )
    laji_subset = laji_subset[laji_subset["__name_norm"].isin(observed_norms)]
    laji_subset = laji_subset.drop_duplicates("__name_norm", keep="first")
    laji_subset["obs_FI_laji"] = laji_subset["Observation count from Finland"].apply(_to_int)
    laji_subset["url_laji"] = laji_subset["Identifier"].apply(
        lambda identifier: f"https://laji.fi/taxon/{identifier}/occurrence"
        if isinstance(identifier, str) and identifier
        else None
    )
    laji_subset.rename(columns={"Scientific name": "organism_laji"}, inplace=True)

    # 3) GBIF: Finnish/Norwegian observation counts plus the 60N/66N subsets.
    gbif_dataset = _load_dataset(GBIF_DATASET)
    gbif_frame = gbif_dataset.frame
    required_gbif = {
        "canonicalName",
        "speciesKey",
        "obs_FI",
        "obs_NO",
        "count_FI_60N",
        "count_NO_60N",
        "count_FI_66N",
        "count_NO_66N",
    }
    if not required_gbif.issubset(gbif_frame.columns):
        missing = ", ".join(sorted(required_gbif - set(gbif_frame.columns)))
        raise DatasetLoadError(f"Dataset '{GBIF_DATASET}' is missing columns: {missing}")

    gbif_subset = gbif_frame[list(required_gbif)].copy()
    gbif_subset["__name_norm"] = (
        gbif_subset["canonicalName"].astype(str).str.strip().str.lower()
    )
    gbif_subset = gbif_subset[gbif_subset["__name_norm"].isin(observed_norms)]
    gbif_subset = gbif_subset.drop_duplicates("__name_norm", keep="first")
    gbif_subset.rename(
        columns={
            "canonicalName": "organism_gbif",
            "obs_FI": "obs_FI_gbif",
            "obs_NO": "obs_NO_gbif",
            "count_FI_60N": "obs_FI_60N",
            "count_NO_60N": "obs_NO_60N",
            "count_FI_66N": "obs_66N_FI",
            "count_NO_66N": "obs_66N_NO",
        },
        inplace=True,
    )

    def _gbif_url(key: Any) -> str | None:
        numeric = _to_int(key)
        if numeric is None:
            return None
        return f"https://www.gbif.org/species/{numeric}"

    gbif_subset["url_gbif"] = gbif_subset["speciesKey"].apply(_gbif_url)
    count_columns = [
        "obs_FI_gbif",
        "obs_NO_gbif",
        "obs_FI_60N",
        "obs_NO_60N",
        "obs_66N_FI",
        "obs_66N_NO",
    ]
    for col in count_columns:
        gbif_subset[col] = gbif_subset[col].apply(_to_int)

    # 4) Join the two sources on the normalized organism name; an outer join
    # keeps organisms that only one of the databases knows about.
    combined = pd.merge(
        laji_subset[["__name_norm", "organism_laji", "obs_FI_laji", "url_laji"]],
        gbif_subset[
            [
                "__name_norm",
                "organism_gbif",
                "obs_FI_gbif",
                "obs_NO_gbif",
                "obs_FI_60N",
                "obs_NO_60N",
                "obs_66N_FI",
                "obs_66N_NO",
                "url_gbif",
            ]
        ],
        on="__name_norm",
        how="outer",
    )

    def _organism_name(row: pd.Series) -> str:
        # After the outer join the absent side holds NaN, which is truthy, so a
        # bare ``or`` chain would leak NaN through; accept only real strings.
        for column in ("organism_laji", "organism_gbif"):
            value = row.get(column)
            if isinstance(value, str) and value:
                return value
        norm = row.get("__name_norm", "")
        return organism_map.get(norm, norm)

    def _opt_str(value: Any) -> str | None:
        # Normalize merge-produced NaN to None in the output payload.
        return value if isinstance(value, str) else None

    table: List[Dict[str, Any]] = []
    for _, row in combined.iterrows():
        organism_name = _organism_name(row)
        table.append(
            {
                "organism": organism_name,
                "genus": guess_genus(organism_name),
                "obs_FI_laji": _to_int(row.get("obs_FI_laji")),
                "obs_FI_gbif": _to_int(row.get("obs_FI_gbif")),
                "obs_NO_gbif": _to_int(row.get("obs_NO_gbif")),
                "obs_FI_60N": _to_int(row.get("obs_FI_60N")),
                "obs_NO_60N": _to_int(row.get("obs_NO_60N")),
                "obs_66N_FI": _to_int(row.get("obs_66N_FI")),
                "obs_66N_NO": _to_int(row.get("obs_66N_NO")),
                "url_laji": _opt_str(row.get("url_laji")),
                "url_gbif": _opt_str(row.get("url_gbif")),
            }
        )

    # Organisms COCONUT lists for the compound but that each database lacks.
    laji_norms = set(laji_subset["__name_norm"])
    gbif_norms = set(gbif_subset["__name_norm"])
    missing_in_laji = sorted(organism_map[key] for key in observed_norms - laji_norms)
    missing_in_gbif = sorted(organism_map[key] for key in observed_norms - gbif_norms)

    return {
        "compound": matches.iloc[0]["name"],
        "organisms": organism_list,
        "table": table,
        "missing_in_laji": missing_in_laji,
        "missing_in_gbif": missing_in_gbif,
    }
