# inhibitors.py (6.43 kB)
from __future__ import annotations
from functools import lru_cache
from typing import Dict, List, Optional
import pandas as pd
from .datasets import DatasetLoadError, load_dataset
from .links import link_compound_to_nordic_plants
# Accepted (lower-cased) values for the ``known_status`` dataset column.
_KNOWN_STATUS_VALUES = ("known", "new")
# Accepted (lower-cased) values for the ``confidence`` dataset column.
_CONFIDENCE_VALUES = ("high", "medium", "low-medium", "low")
# Maps a user-facing scope keyword to the dataset file searched for occurrences.
_SCOPE_DATASETS = {
    "global": "coconut_csv-09-2025.csv",
    "nordic": "coconut_csv-09-2025_FI_NO_plants.csv",
}
@lru_cache(maxsize=1)
def _inhibitors_frame() -> pd.DataFrame:
    """Load the complex-I inhibitors dataset once and normalize its text columns.

    Returns a copy of the dataset's frame with the "compound", "known_status",
    and "confidence" columns (when present) coerced to stripped strings.
    The result is cached for the process lifetime.
    """
    loaded = load_dataset("all_mito_complex_I_inhibitors.txt")
    result = loaded.frame.copy()
    for name in ("compound", "known_status", "confidence"):
        if name not in result.columns:
            continue
        result[name] = result[name].astype(str).str.strip()
    return result
def list_inhibitors(
    known_status: Optional[str] = None,
    confidence: Optional[str] = None,
    limit: Optional[int] = 20,
) -> List[Dict[str, object]]:
    """Return inhibitor records, optionally filtered by status and confidence.

    Args:
        known_status: Case-insensitive filter; must be one of
            ``_KNOWN_STATUS_VALUES`` when provided.
        confidence: Case-insensitive filter; must be one of
            ``_CONFIDENCE_VALUES`` when provided.
        limit: Maximum number of records to return. ``None`` or a negative
            value disables truncation.

    Raises:
        ValueError: If a filter value is unrecognized, or ``limit`` cannot
            be coerced to an integer.
    """
    frame = _inhibitors_frame()
    if known_status:
        norm_status = known_status.strip().lower()
        if norm_status not in _KNOWN_STATUS_VALUES:
            raise ValueError(f"known_status must be one of {_KNOWN_STATUS_VALUES}")
        frame = frame[frame["known_status"].str.lower() == norm_status]
    if confidence:
        norm_confidence = confidence.strip().lower()
        if norm_confidence not in _CONFIDENCE_VALUES:
            raise ValueError(f"confidence must be one of {_CONFIDENCE_VALUES}")
        frame = frame[frame["confidence"].str.lower() == norm_confidence]
    if limit is not None:
        try:
            limit_value = int(limit)
        except (TypeError, ValueError) as exc:
            # int() raises TypeError for non-numeric objects (e.g. a list)
            # and ValueError for unparseable strings; surface both as the
            # documented ValueError.
            raise ValueError("limit must be an integer or null") from exc
        if limit_value >= 0:
            frame = frame.head(limit_value)
    columns = [col for col in ("compound", "known_status", "confidence", "pubmed_references", "pubmed_ids") if col in frame.columns]
    return frame.loc[:, columns].to_dict(orient="records")
def summarize_inhibitors() -> Dict[str, Dict[str, int]]:
    """Aggregate inhibitor counts: total, per known-status, and per confidence
    level within each of the "new" and "known" subsets."""
    frame = _inhibitors_frame()
    status_lower = frame["known_status"].str.lower()
    confidence_lower = frame["confidence"].str.lower()

    def _counts_for(mask: "pd.Series") -> Dict[str, int]:
        # Tally each confidence level within the masked subset.
        subset = confidence_lower[mask]
        return {level: int((subset == level).sum()) for level in _CONFIDENCE_VALUES}

    return {
        "total": int(len(frame)),
        "by_known_status": {
            status: int((status_lower == status).sum())
            for status in _KNOWN_STATUS_VALUES
        },
        "new_by_confidence": _counts_for(status_lower == "new"),
        "known_by_confidence": _counts_for(status_lower == "known"),
    }
def get_inhibitor_sources(compound: str) -> Dict[str, object]:
    """Return source details (PubMed IDs/URLs) for a specific inhibitor.

    Args:
        compound: Compound name, matched case-insensitively after stripping.

    Raises:
        ValueError: If ``compound`` is blank or not present in the dataset.
    """
    if not compound or not compound.strip():
        raise ValueError("compound must be a non-empty string")
    frame = _inhibitors_frame()
    lowered = compound.strip().lower()
    matching = frame[frame["compound"].str.lower() == lowered]
    if matching.empty:
        raise ValueError(f"Compound '{compound}' not found in inhibitors dataset.")
    record = matching.iloc[0].to_dict()

    # Missing cells arrive from pandas as NaN, which is truthy, so a plain
    # ``value or ""`` would yield the literal string "nan" and fabricate a
    # bogus PubMed ID. Guard with pd.isna before stringifying.
    raw_ids = record.get("pubmed_ids")
    if raw_ids is None or (isinstance(raw_ids, float) and pd.isna(raw_ids)):
        raw_ids = ""
    pubmed_ids = [pid.strip() for pid in str(raw_ids).split(";") if pid.strip()]
    pubmed_urls = [f"https://pubmed.ncbi.nlm.nih.gov/{pid}/" for pid in pubmed_ids]

    # int(NaN) raises ValueError; treat missing/unparseable counts as None.
    raw_refs = record.get("pubmed_references")
    if raw_refs is None or (isinstance(raw_refs, float) and pd.isna(raw_refs)):
        pubmed_references: Optional[int] = None
    else:
        try:
            pubmed_references = int(raw_refs)
        except (TypeError, ValueError):
            pubmed_references = None

    return {
        "compound": record.get("compound"),
        "known_status": record.get("known_status"),
        "confidence": record.get("confidence"),
        "pubmed_references": pubmed_references,
        "pubmed_ids": pubmed_ids,
        "pubmed_urls": pubmed_urls,
    }
def find_inhibitor_plants(compound: str, scope: str = "global") -> Dict[str, object]:
    """Map an inhibitor compound to organism occurrences in the selected scope.

    Args:
        compound: Inhibitor name, matched case-insensitively against both the
            inhibitors dataset and the scope dataset's "name" column.
        scope: One of the keys of ``_SCOPE_DATASETS`` ("global" or "nordic").

    Returns:
        A summary dict with the inhibitor metadata, the matched dataset rows,
        and the sorted union of organisms. For the "nordic" scope, also
        includes ``nordic_observations`` from the links module.

    Raises:
        ValueError: If ``compound`` is blank, the scope is unknown, or the
            compound is absent from the inhibitors dataset.
        DatasetLoadError: If the scope dataset lacks the required columns.
    """
    if not compound or not compound.strip():
        raise ValueError("compound must be a non-empty string")
    scope_key = scope.strip().lower() if scope else "global"
    if scope_key not in _SCOPE_DATASETS:
        valid_scopes = ", ".join(sorted(_SCOPE_DATASETS))
        raise ValueError(f"scope must be one of: {valid_scopes}")
    normalized = compound.strip().lower()
    inhibitors = _inhibitors_frame()
    inhibitor_match = inhibitors[inhibitors["compound"].str.lower() == normalized]
    if inhibitor_match.empty:
        raise ValueError(f"Compound '{compound}' not found in inhibitors dataset.")
    inhibitor_row = inhibitor_match.iloc[0].to_dict()
    dataset_name = _SCOPE_DATASETS[scope_key]
    dataset = load_dataset(dataset_name)
    frame = dataset.frame.copy()
    if "name" not in frame.columns or "organisms" not in frame.columns:
        raise DatasetLoadError(f"Dataset '{dataset_name}' is missing required columns for linking.")
    # Match case-insensitively via a separate lowered series instead of
    # overwriting frame["name"], so records keep the dataset's original
    # capitalization.
    name_lower = frame["name"].astype(str).str.lower()
    matches = frame[name_lower == normalized]
    organisms: List[str] = []
    records: List[Dict[str, object]] = []
    if not matches.empty:
        organism_set = set()
        for _, row in matches.iterrows():
            raw_org = row.get("organisms")
            # NaN cells are truthy floats; avoid producing a "nan" organism.
            if raw_org is None or (isinstance(raw_org, float) and pd.isna(raw_org)):
                raw_org = ""
            split_orgs = [org.strip() for org in str(raw_org).split("|") if org.strip()]
            organism_set.update(split_orgs)
            record_entry: Dict[str, object] = {
                "name": row.get("name"),
                "organisms": split_orgs,
            }
            identifier = row.get("identifier")
            # Exclude both missing columns (None) and NaN cells.
            if identifier is not None and not (isinstance(identifier, float) and pd.isna(identifier)):
                record_entry["identifier"] = identifier
            records.append(record_entry)
        organisms = sorted(organism_set)
    result = {
        "compound": inhibitor_row.get("compound", compound),
        "known_status": inhibitor_row.get("known_status"),
        "confidence": inhibitor_row.get("confidence"),
        "scope": scope_key,
        "dataset": dataset_name,
        "match_count": int(matches.shape[0]),
        "organism_count": len(organisms),
        "organisms": organisms,
        "records": records,
    }
    if scope_key == "nordic":
        result["nordic_observations"] = link_compound_to_nordic_plants(compound)
    return result