# links.py
from __future__ import annotations
from typing import Any, Dict, List
import pandas as pd
from .datasets import DatasetLoadError, LoadedDataset, load_dataset
from .taxonomy import guess_genus
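
# Filenames of the datasets consumed below, resolved via load_dataset().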
NORDIC_COCONUT_DATASET = "coconut_csv-09-2025_FI_NO_plants.csv"
LAJI_DATASET = "laji2_fi.txt"
GBIF_DATASET = "gbif_plants_FI_NO_merged.tsv"


def _normalize_name(value: Any) -> str:
    """Lower-case and strip a name for case-insensitive matching."""
    if value is None:
        return ""
    return str(value).strip().lower()


def _to_int(value: Any) -> int | None:
    """Coerce a value to int; return None for missing or unparsable input.

    Examples: "3.0" -> 3, 12 -> 12; None, NaN, and strings like "n/a" -> None.
    """
    try:
        if pd.isna(value):
            return None
    except TypeError:
        # pd.isna may not accept every type; fall back to a plain None check.
        if value is None:
            return None
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None


def _load_dataset(dataset_name: str) -> LoadedDataset:
    return load_dataset(dataset_name)


def link_compound_to_nordic_plants(compound: str) -> Dict[str, Any]:
    """Return Nordic plant observations for a compound across COCONUT→Laji→GBIF.

    Looks the compound up by name in the COCONUT extract, collects its source
    organisms, then joins those organisms against Laji.fi (Finnish counts) and
    GBIF (Finnish and Norwegian counts, with 60°N and 66°N breakdowns). The
    result lists the organisms, a per-organism observation table, and the
    organisms missing from each of the two observation datasets.
    """
if not compound or not compound.strip():
raise ValueError("compound must be a non-empty string")
normalized = _normalize_name(compound)
coconut_dataset = _load_dataset(NORDIC_COCONUT_DATASET)
coconut_frame = coconut_dataset.frame
if "name" not in coconut_frame.columns or "organisms" not in coconut_frame.columns:
raise DatasetLoadError(f"Dataset '{NORDIC_COCONUT_DATASET}' is missing required columns.")
coconut_frame = coconut_frame.copy()
coconut_frame["__name_norm"] = coconut_frame["name"].astype(str).str.strip().str.lower()
matches = coconut_frame[coconut_frame["__name_norm"] == normalized]
    # Collect the unique organisms listed for the compound, keyed by the
    # lower-cased name and keeping the first-seen casing for display.
    organism_map: Dict[str, str] = {}
    for raw_organisms in matches["organisms"].dropna():
        for token in str(raw_organisms).split("|"):
            token = token.strip()
            if token:
                organism_map.setdefault(token.lower(), token)
    organism_list = sorted(organism_map.values(), key=str.lower)
    if matches.empty:
return {
"compound": compound,
"organisms": organism_list,
"table": [],
"missing_in_laji": [],
"missing_in_gbif": [],
}
observed_norms = set(organism_map.keys())
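
    # Laji.fi: Finnish observation counts plus a taxon occurrence URL.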
laji_dataset = _load_dataset(LAJI_DATASET)
laji_frame = laji_dataset.frame
required_laji = {"Scientific name", "Observation count from Finland", "Identifier"}
if not required_laji.issubset(laji_frame.columns):
missing = ", ".join(sorted(required_laji - set(laji_frame.columns)))
raise DatasetLoadError(f"Dataset '{LAJI_DATASET}' is missing columns: {missing}")
laji_subset = laji_frame[list(required_laji)].copy()
laji_subset["__name_norm"] = laji_subset["Scientific name"].astype(str).str.strip().str.lower()
laji_subset = laji_subset[laji_subset["__name_norm"].isin(observed_norms)]
laji_subset = laji_subset.drop_duplicates("__name_norm", keep="first")
laji_subset["obs_FI_laji"] = laji_subset["Observation count from Finland"].apply(_to_int)
laji_subset["url_laji"] = laji_subset["Identifier"].apply(
lambda identifier: f"https://laji.fi/taxon/{identifier}/occurrence" if isinstance(identifier, str) and identifier else None
)
laji_subset.rename(columns={"Scientific name": "organism_laji"}, inplace=True)
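
    # GBIF: Finnish/Norwegian counts, including the 60°N and 66°N splits.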
gbif_dataset = _load_dataset(GBIF_DATASET)
gbif_frame = gbif_dataset.frame
required_gbif = {
"canonicalName",
"speciesKey",
"obs_FI",
"obs_NO",
"count_FI_60N",
"count_NO_60N",
"count_FI_66N",
"count_NO_66N",
}
if not required_gbif.issubset(gbif_frame.columns):
missing = ", ".join(sorted(required_gbif - set(gbif_frame.columns)))
raise DatasetLoadError(f"Dataset '{GBIF_DATASET}' is missing columns: {missing}")
gbif_subset = gbif_frame[list(required_gbif)].copy()
gbif_subset["__name_norm"] = gbif_subset["canonicalName"].astype(str).str.strip().str.lower()
gbif_subset = gbif_subset[gbif_subset["__name_norm"].isin(observed_norms)]
gbif_subset = gbif_subset.drop_duplicates("__name_norm", keep="first")
    gbif_subset.rename(
        columns={
            "canonicalName": "organism_gbif",
            "obs_FI": "obs_FI_gbif",
            "obs_NO": "obs_NO_gbif",
            "count_FI_60N": "obs_FI_60N",
            "count_NO_60N": "obs_NO_60N",
            "count_FI_66N": "obs_66N_FI",
            "count_NO_66N": "obs_66N_NO",
        },
        inplace=True,
    )
def _gbif_url(key: Any) -> str | None:
numeric = _to_int(key)
if numeric is None:
return None
return f"https://www.gbif.org/species/{numeric}"
gbif_subset["url_gbif"] = gbif_subset["speciesKey"].apply(_gbif_url)
for col in ["obs_FI_gbif", "obs_NO_gbif", "obs_FI_60N", "obs_NO_60N", "obs_66N_FI", "obs_66N_NO"]:
gbif_subset[col] = gbif_subset[col].apply(_to_int)
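
    # Outer-merge so an organism found in only one of Laji.fi / GBIF still gets
    # a row; the missing side's columns come back as NaN and are cleaned below.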
combined = pd.merge(
laji_subset[["__name_norm", "organism_laji", "obs_FI_laji", "url_laji"]],
gbif_subset[
[
"__name_norm",
"organism_gbif",
"obs_FI_gbif",
"obs_NO_gbif",
"obs_FI_60N",
"obs_NO_60N",
"obs_66N_FI",
"obs_66N_NO",
"url_gbif",
]
],
on="__name_norm",
how="outer",
)
    def _organism_name(row: pd.Series) -> str:
        # After the outer merge, the unmatched side holds NaN, which is truthy,
        # so an `or` chain could leak NaN; accept only non-empty strings.
        for candidate in (row.get("organism_laji"), row.get("organism_gbif")):
            if isinstance(candidate, str) and candidate:
                return candidate
        norm = row.get("__name_norm", "")
        return organism_map.get(norm, norm)
table: List[Dict[str, Any]] = []
for _, row in combined.iterrows():
organism_name = _organism_name(row)
table.append(
{
"organism": organism_name,
"genus": guess_genus(organism_name),
"obs_FI_laji": _to_int(row.get("obs_FI_laji")),
"obs_FI_gbif": _to_int(row.get("obs_FI_gbif")),
"obs_NO_gbif": _to_int(row.get("obs_NO_gbif")),
"obs_FI_60N": _to_int(row.get("obs_FI_60N")),
"obs_NO_60N": _to_int(row.get("obs_NO_60N")),
"obs_66N_FI": _to_int(row.get("obs_66N_FI")),
"obs_66N_NO": _to_int(row.get("obs_66N_NO")),
"url_laji": row.get("url_laji"),
"url_gbif": row.get("url_gbif"),
}
)
laji_norms = set(laji_subset["__name_norm"])
gbif_norms = set(gbif_subset["__name_norm"])
missing_in_laji = sorted(organism_map[key] for key in observed_norms - laji_norms)
missing_in_gbif = sorted(organism_map[key] for key in observed_norms - gbif_norms)
return {
"compound": matches.iloc[0]["name"],
"organisms": organism_list,
"table": table,
"missing_in_laji": missing_in_laji,
"missing_in_gbif": missing_in_gbif,
}
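

if __name__ == "__main__":
    # Minimal usage sketch, assuming the dataset files above are resolvable by
    # load_dataset(). "quercetin" is an illustrative compound name, not a
    # guaranteed entry in the COCONUT extract. Because this module uses
    # relative imports, run it as a module: python -m <package>.links
    import json

    try:
        result = link_compound_to_nordic_plants("quercetin")
    except (DatasetLoadError, ValueError) as exc:
        raise SystemExit(f"lookup failed: {exc}")
    print(json.dumps(result, indent=2, ensure_ascii=False))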