Source code for opencitations_client.models

"""Object models for components of OpenCitations."""

from __future__ import annotations

import datetime
from collections.abc import Iterable
from typing import Any, Literal, TypeAlias, TypeVar

from curies import Reference
from curies.utils import NoCURIEDelimiterError
from pydantic import BaseModel, Field, field_validator
from tqdm import tqdm

# Public names exported by this module (what ``from ... import *`` exposes).
__all__ = [
    "Citation",
    "CitationReturnType",
    "Person",
    "Publisher",
    "Venue",
    "Work",
    "process_citation",
    "process_work",
]


class Citation(BaseModel):
    """Wraps the results from a citation.

    ``reference`` identifies the citation record itself, while ``citing`` and
    ``cited`` hold the references for the two works taking part in it. The
    remaining fields are optional and default to ``None``.
    """

    reference: Reference
    citing: list[Reference] = Field(...)
    cited: list[Reference] = Field(...)
    creation: datetime.date | None = None
    timespan: datetime.timedelta | None = None
    journal_self_citation: bool | None = None
    author_self_citation: bool | None = None

    @field_validator("creation", mode="before")
    @classmethod
    def parse_dates(cls, v: Any) -> Any:
        """Expand a partial ISO date string into a full :class:`datetime.date`.

        Strings of the form ``YYYY`` and ``YYYY-MM`` are padded to the first
        month/day before parsing; ``YYYY-MM-DD`` is parsed directly. Values
        that are not strings, or strings of any other length, are returned
        untouched.

        :param v: The raw value given for the ``creation`` field.
        :returns: A parsed date, or ``v`` itself when it is not recognized.
        :raises ValueError: If a string of a recognized length is not a valid ISO date.
        """
        if not isinstance(v, str):
            return v
        # Suffix needed to turn each supported length into YYYY-MM-DD.
        padding_by_length = {
            4: "-01-01",  # YYYY
            7: "-01",  # YYYY-MM
            10: "",  # YYYY-MM-DD
        }
        padding = padding_by_length.get(len(v))
        if padding is None:
            return v
        return datetime.date.fromisoformat(v + padding)
class Person(BaseModel):
    """Represents a person, such as an author or editor, in OpenCitations."""

    # The person's name as it appears before the bracketed identifiers.
    name: str
    # The references (identifiers) attached to the person.
    references: list[Reference]
class Venue(BaseModel):
    """Represents a publication venue in OpenCitations."""

    # The venue's name as it appears before the bracketed identifiers.
    name: str
    # The references (identifiers) attached to the venue.
    references: list[Reference]
class Publisher(BaseModel):
    """Represents a publisher in OpenCitations."""

    # The publisher's name as it appears before the bracketed identifiers.
    name: str
    # The references (identifiers) attached to the publisher.
    references: list[Reference]
class Work(BaseModel):
    """A representation of metadata for a creative work.

    ``references`` holds every identifier known for the work; the
    :attr:`omid` and :attr:`pubmed` properties look up the entries with the
    ``omid`` and ``pmid`` prefixes, respectively.
    """

    references: list[Reference]
    title: str
    authors: list[Person]
    pub_date: datetime.date | None = None
    venue: Venue | None = None
    volume: str | None = None
    issue: str | None = None
    page: str | None = None
    publishers: list[Publisher] | None = None
    editors: list[Person] | None = None
    type: str

    @field_validator("pub_date", mode="before")
    @classmethod
    def parse_dates(cls, v: Any) -> Any:
        """Parse the publication date field.

        Falsy values (e.g., an empty string) become ``None``. Strings of the
        form ``YYYY`` and ``YYYY-MM`` are padded to the first month/day, and
        ``YYYY-MM-DD`` is parsed directly. Anything else is returned untouched.

        :param v: The raw value given for the ``pub_date`` field.
        :returns: ``None``, a parsed date, or ``v`` itself when it is not recognized.
        :raises ValueError: If a string of a recognized length is not a valid ISO date.
        """
        if not v:
            return None
        if isinstance(v, str):
            if len(v) == 4:  # YYYY
                return datetime.date.fromisoformat(v + "-01-01")
            if len(v) == 7:  # YYYY-MM
                return datetime.date.fromisoformat(v + "-01")
            if len(v) == 10:  # YYYY-MM-DD
                return datetime.date.fromisoformat(v)
        return v

    @property
    def omid(self) -> str:
        """Get the OMID for the document.

        :returns: The identifier of the first reference with the ``omid`` prefix.
        :raises ValueError: If none of the references has the ``omid`` prefix.
        """
        if rv := get_reference_with_prefix(self.references, "omid"):
            return rv.identifier
        # Report the references rather than ``self.omid``: reading the property
        # again here would recurse until a RecursionError is raised.
        raise ValueError(f"missing omid in references: {self.references}")

    @property
    def pubmed(self) -> str | None:
        """Get the PubMed identifier for the document, if it exists.

        :returns: The identifier of the first reference with the ``pmid``
            prefix, or ``None`` when there is no such reference.
        """
        if rv := get_reference_with_prefix(self.references, "pmid"):
            return rv.identifier
        return None
X = TypeVar("X", bound=BaseModel)


def process_citation(record: dict[str, Any]) -> Citation:
    """Process a citation record.

    The ``oci``, ``journal_sc``, ``author_sc``, ``citing``, and ``cited`` keys
    are read from the record and converted to the fields of
    :class:`Citation`.

    :param record: The raw citation record. It is modified in place: the raw
        keys are popped and the converted ones are added.
    :returns: The validated citation.
    :raises KeyError: If one of the required keys is missing.
    :raises ValueError: If a self-citation flag is neither ``"yes"`` nor ``"no"``.
    """
    record["reference"] = Reference(prefix="oci", identifier=record.pop("oci"))
    record["journal_self_citation"] = _bool(record.pop("journal_sc"))
    record["author_self_citation"] = _bool(record.pop("author_sc"))
    record["citing"] = _process_curies(record.pop("citing"))
    record["cited"] = _process_curies(record.pop("cited"))
    # Drop empty values so the model defaults apply, but keep an explicit
    # ``False``: a plain truthiness check would discard "no" self-citation
    # flags and make them indistinguishable from a missing value.
    return Citation.model_validate(
        {key: value for key, value in record.items() if value or value is False}
    )


def _process_curies(s: str) -> list[Reference]:
    """Parse a string of CURIEs separated by single spaces into references."""
    return [Reference.from_curie(curie) for curie in s.split(" ")]


def process_work(record: dict[str, Any]) -> Work:
    """Process a metadata record for a creative work.

    The ``id``, ``author``, ``venue``, ``publisher``, and ``editor`` keys are
    read from the record and converted to the fields of :class:`Work`. When a
    venue, publisher, or editor entry raises
    :class:`curies.utils.NoCURIEDelimiterError`, a message is written with
    :func:`tqdm.tqdm.write` and the corresponding field is left unset.

    :param record: The raw metadata record. It is modified in place: the raw
        keys are popped and the converted ones are added.
    :returns: The validated work.
    :raises KeyError: If one of the keys listed above is missing.
    """
    record["references"] = _process_curies(record.pop("id"))
    record["authors"] = _process_tagged_list(record.pop("author"), Person)
    if venue_raw := record.pop("venue"):
        try:
            record["venue"] = _process_tagged(venue_raw, Venue)
        except NoCURIEDelimiterError:
            tqdm.write(f"bad venue: {venue_raw}")
    if publisher_raw := record.pop("publisher"):
        try:
            record["publishers"] = _process_tagged_list(publisher_raw, Publisher)
        except NoCURIEDelimiterError:
            tqdm.write(f"bad publisher: {publisher_raw}")
    if editor_raw := record.pop("editor"):
        try:
            record["editors"] = _process_tagged_list(editor_raw, Person)
        except NoCURIEDelimiterError:
            tqdm.write(f"bad editor: {editor_raw}")
    return Work.model_validate(record)


def _process_tagged_list(s: str, cls: type[X]) -> list[X]:
    """Parse a semicolon-separated string of tagged entries, skipping blank ones."""
    if not s:
        return []
    return [_process_tagged(x, cls) for x in s.split(";") if x.strip()]


def _process_tagged(part: str, cls: type[X]) -> X:
    """Parse a single ``name [curie curie ...]`` entry into an instance of ``cls``.

    :param part: The raw entry; surrounding whitespace is ignored.
    :param cls: The model to construct with ``name`` and ``references``.
    :returns: An instance of ``cls``.
    :raises ValueError: If the entry does not end with a closing bracket.
    """
    part = part.strip()
    if not part.endswith("]"):
        raise ValueError(f"no brackets were given: {part}")
    # partition on the _last_ one because some names have brackets in them
    name, _, rest = part.rpartition("[")
    references = _process_curies(rest.rstrip("]"))
    return cls(name=name.strip(), references=references)


def _bool(s: Literal["yes", "no"]) -> bool:
    """Convert a ``"yes"``/``"no"`` string to a boolean.

    :raises ValueError: If the value is neither ``"yes"`` nor ``"no"``.
    """
    if s == "no":
        return False
    elif s == "yes":
        return True
    else:
        raise ValueError(f"invalid boolean value: {s}")


def get_reference_with_prefix(references: Iterable[Reference], prefix: str) -> Reference | None:
    """Get a reference with the given prefix.

    :param references: The references to search, in order.
    :param prefix: The prefix to look for (compared exactly).
    :returns: The first matching reference, or ``None`` if there is none.
    """
    for reference in references:
        if reference.prefix == prefix:
            return reference
    return None


#: Citation return type
CitationReturnType: TypeAlias = Literal["citation", "reference", "str"]

# Prefixes accepted by :func:`handle_input`.
CITATION_PREFIXES = {"doi", "pubmed", "omid"}


def handle_input(reference: str | Reference) -> Reference:
    """Clean up a reference.

    :param reference: A reference, or a CURIE string that is parsed into one.
    :returns: The reference, with the ``pubmed`` prefix rewritten to ``pmid``.
    :raises ValueError: If the prefix is not one of ``CITATION_PREFIXES``.
    """
    if isinstance(reference, str):
        reference = Reference.from_curie(reference)
    if reference.prefix not in CITATION_PREFIXES:
        raise ValueError(f"invalid prefix: {reference.prefix}, use one of {CITATION_PREFIXES}")
    if reference.prefix == "pubmed":
        # put it in the internal representation, which is non-standard
        return Reference(prefix="pmid", identifier=reference.identifier)
    return reference