diff --git a/easyDataverse/dataset.py b/easyDataverse/dataset.py
index 7402dfd..66a0ff8 100644
--- a/easyDataverse/dataset.py
+++ b/easyDataverse/dataset.py
@@ -6,11 +6,12 @@
 import nob
 import xmltodict
 import yaml
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator
 from dvuploader import File, add_directory

 from easyDataverse.base import DataverseBase
+from easyDataverse.datasettype import DatasetType
 from easyDataverse.license import CustomLicense, License
 from easyDataverse.uploader import update_dataset, upload_to_dataverse
 from easyDataverse.utils import YAMLDumper
@@ -54,9 +55,51 @@ class Dataset(BaseModel):
         description="The files of the dataset.",
     )

+    dataset_type: Optional[str] = Field(
+        default=None,
+        description="The type of the dataset.",
+    )
+
     API_TOKEN: Optional[str] = Field(None)
     DATAVERSE_URL: Optional[str] = Field(None)

+    # ! Validators
+    @field_validator("dataset_type", mode="after")
+    def _validate_dataset_type(
+        cls,
+        dataset_type: Optional[str],
+        info: ValidationInfo,
+    ) -> Optional[str]:
+        """Validates the dataset type against available types in the Dataverse installation.
+
+        This validator ensures that the provided dataset type is valid and available
+        in the target Dataverse installation. It fetches the available dataset types
+        from the Dataverse instance and validates the provided type against them.
+
+        Note:
+            If dataset_type is None, validation is skipped and None is returned.
+            The DATAVERSE_URL must be set in the model for validation to work.
+        """
+        if dataset_type is None:
+            return dataset_type
+        elif info.data["DATAVERSE_URL"] is None:
+            raise ValueError(
+                "No Dataverse URL has been provided. Please provide a Dataverse URL to validate the dataset type. "
+                "This error should not happen and is likely a bug in the code. "
+                "Please report this issue at https://github.com/gdcc/easyDataverse/issues"
+            )
+
+        available_types = DatasetType.from_instance(info.data["DATAVERSE_URL"])  # type: ignore
+        available_names = [ds_type.name for ds_type in available_types]
+
+        if dataset_type not in available_names:
+            raise ValueError(
+                f"Dataset type '{dataset_type}' is not available in the Dataverse installation. "
+                f"Please use 'list_dataset_types' to see which dataset types are available."
+            )
+
+        return dataset_type
+
     # ! Adders
     def add_metadatablock(self, metadatablock: DataverseBase) -> None:
         """Adds a metadatablock object to the dataset if it is of 'DataverseBase' type and has a metadatablock name"""
@@ -190,13 +233,24 @@ def dataverse_dict(self) -> dict:
         else:
             terms = {}

+        dataset_type = self._get_dataset_type()
+
         return {
+            "datasetType": dataset_type,
             "datasetVersion": {
                 "metadataBlocks": blocks,
                 **terms,
-            }
+            },
         }

+    def _get_dataset_type(self) -> str:
+        """Returns the dataset type of the dataset."""
+
+        if self.dataset_type is None:
+            return "dataset"
+
+        return self.dataset_type
+
     def dataverse_json(self, indent: int = 2) -> str:
         """Returns a JSON representation of the dataverse dataset."""
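
Note on the validator: Pydantic only re-runs field validators on attribute assignment when the model enables `validate_assignment`, which the new integration tests rely on (`dataset.dataset_type` is assigned after construction and is expected to raise a `ValidationError`). A minimal standalone sketch of the same pattern — `MiniDataset` and `AVAILABLE_TYPES` are illustrative stand-ins, not easyDataverse classes:

```python
from typing import Optional

from pydantic import BaseModel, ConfigDict, ValidationInfo, field_validator

AVAILABLE_TYPES = {"dataset", "software"}  # stand-in for DatasetType.from_instance()


class MiniDataset(BaseModel):
    # Without validate_assignment, `ds.dataset_type = "invalid"` would bypass
    # the validator after construction.
    model_config = ConfigDict(validate_assignment=True)

    DATAVERSE_URL: Optional[str] = None  # declared first so info.data contains it
    dataset_type: Optional[str] = None

    @field_validator("dataset_type", mode="after")
    def _validate_dataset_type(
        cls, value: Optional[str], info: ValidationInfo
    ) -> Optional[str]:
        if value is None:
            return value
        if info.data.get("DATAVERSE_URL") is None:
            raise ValueError("A Dataverse URL is required to validate the dataset type.")
        if value not in AVAILABLE_TYPES:
            raise ValueError(f"Dataset type '{value}' is not available.")
        return value


ds = MiniDataset(DATAVERSE_URL="https://demo.dataverse.org")
ds.dataset_type = "dataset"  # passes
# ds.dataset_type = "invalid"  # would raise pydantic.ValidationError
```
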
diff --git a/easyDataverse/datasettype.py b/easyDataverse/datasettype.py
new file mode 100644
index 0000000..0e1c4d8
--- /dev/null
+++ b/easyDataverse/datasettype.py
@@ -0,0 +1,64 @@
+from typing import List
+from urllib.parse import urljoin
+from pydantic import BaseModel, Field
+import httpx
+from pyDataverse.api import NativeApi
+
+
+class DatasetType(BaseModel):
+    """
+    Represents a dataset type in Dataverse.
+
+    A dataset type defines the structure and metadata requirements for datasets
+    in a Dataverse instance, including which metadata blocks are linked to it.
+    """
+
+    id: int = Field(..., description="The ID of the dataset type")
+    name: str = Field(..., description="The name of the dataset type")
+    linkedMetadataBlocks: list[str] = Field(
+        default_factory=list,
+        description="The metadata blocks linked to the dataset type",
+    )
+
+    @classmethod
+    def from_instance(cls, base_url: str) -> List["DatasetType"]:
+        """
+        Retrieve all dataset types from a Dataverse instance.
+
+        Args:
+            base_url: The base URL of the Dataverse instance
+
+        Returns:
+            A list of DatasetType objects representing all dataset types
+            available in the Dataverse instance
+
+        Raises:
+            httpx.HTTPStatusError: If the version request fails
+            ValueError: If the Dataverse instance is not at least version 6.4
+        """
+        native_api = NativeApi(base_url=base_url)
+
+        if cls._get_version(native_api) < (6, 4):
+            raise ValueError(
+                "Dataset types are only supported in Dataverse 6.4 and above"
+            )
+
+        url = urljoin(native_api.base_url, "api/datasets/datasetTypes")
+        response = httpx.get(url)
+
+        if not response.is_success:
+            # A failed request means the endpoint is unavailable; an instance without types returns a 200 with an empty list
+            return []
+
+        return [cls.model_validate(item) for item in response.json()["data"]]
+
+    @staticmethod
+    def _get_version(native_api: NativeApi) -> tuple[int, int]:
+        """
+        Get the major and minor version of the Dataverse instance.
+        """
+        response = native_api.get_info_version()
+        response.raise_for_status()
+        version = response.json()["data"]["version"]
+        major, minor, *_ = version.split(".")
+        return int(major), int(minor)
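
For orientation, `from_instance` validates the objects under the response's `data` key. A hedged sketch of the payload shape, inferred from the model fields and the integration test below — the second entry and its linked metadata block are purely illustrative:

```python
from easyDataverse.datasettype import DatasetType

payload = {
    "status": "OK",
    "data": [
        {"id": 1, "name": "dataset", "linkedMetadataBlocks": []},
        {"id": 2, "name": "software", "linkedMetadataBlocks": ["codeMeta20"]},
    ],
}

# Mirrors the list comprehension in from_instance; extra keys in a real
# response are ignored by Pydantic's default model config.
types = [DatasetType.model_validate(item) for item in payload["data"]]
assert [t.name for t in types] == ["dataset", "software"]
```
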
diff --git a/easyDataverse/dataverse.py b/easyDataverse/dataverse.py
index 8775f0a..6156b56 100644
--- a/easyDataverse/dataverse.py
+++ b/easyDataverse/dataverse.py
@@ -1,11 +1,13 @@
 import asyncio
 from copy import deepcopy
+from functools import cached_property
 import json
 from uuid import UUID
 from typing import Callable, Dict, List, Optional, Tuple, IO
 from urllib import parse

 import httpx
+from easyDataverse.datasettype import DatasetType
 from easyDataverse.license import CustomLicense, License
 from rich.panel import Panel
 from rich.progress import Progress, SpinnerColumn, TextColumn
@@ -112,6 +114,25 @@ def default_license(self) -> License:
         """The default license of the Dataverse installation."""
         return next(filter(lambda x: x.is_default, self.licenses.values()))

+    @computed_field(
+        description="The dataset types available in the Dataverse installation."
+    )
+    @cached_property
+    def dataset_types(self) -> Dict[str, DatasetType]:
+        """The dataset types available in the Dataverse installation."""
+        if self.native_api is None:
+            raise ValueError(
+                "Native API is not available. Please connect to a Dataverse installation first."
+            )
+
+        try:
+            return {
+                dataset_type.name: dataset_type
+                for dataset_type in DatasetType.from_instance(self.native_api.base_url)
+            }
+        except ValueError:
+            return {}
+
     def _connect(self) -> None:
         """Connects to a Dataverse installation and adds all metadatablocks as classes.
@@ -299,6 +320,17 @@ def list_licenses(self):

         print("\n")

+    def list_dataset_types(self):
+        """Lists the dataset types available in the Dataverse installation."""
+        rich.print("[bold]Dataset Types[/bold]")
+        for dataset_type in self.dataset_types.values():
+            if dataset_type.name == "dataset":
+                print(f"- {dataset_type.name} (default)")
+            else:
+                print(f"- {dataset_type.name}")
+
+        print("\n")
+
     # ! Dataset Handlers

     def create_dataset(self) -> Dataset:
@@ -308,7 +340,9 @@ def create_dataset(self) -> Dataset:
         Returns:
             Dataset: The newly created dataset.
         """
-        return self._dataset_gen()
+        dataset = self._dataset_gen()
+        dataset._dataverse = self
+        return dataset

     @classmethod
     def load_from_url(
@@ -409,6 +443,7 @@ def load_dataset(
             dataset.license = custom_license

         dataset.p_id = latest_version.datasetPersistentId  # type: ignore
+        dataset.dataset_type = remote_ds.data.get("datasetType", None)  # type: ignore

         blocks = latest_version.metadataBlocks  # type: ignore
         files = latest_version.files  # type: ignore
diff --git a/easyDataverse/uploader.py b/easyDataverse/uploader.py
index e35538b..4a98f57 100644
--- a/easyDataverse/uploader.py
+++ b/easyDataverse/uploader.py
@@ -34,7 +34,7 @@ def upload_to_dataverse(
         str: The resulting DOI of the dataset, if successful.
     """

-    api, _ = _initialize_pydataverse(DATAVERSE_URL, API_TOKEN)
+    api, _ = _initialize_pydataverse(DATAVERSE_URL, API_TOKEN)  # type: ignore
     ds = Dataset()
     ds.from_json(json_data)
@@ -50,7 +50,7 @@
     if p_id:
         create_params["pid"] = p_id

-    response = api.create_dataset(**create_params)
+    response = api.create_dataset(**create_params)  # type: ignore
     response.raise_for_status()

     # Get response data
@@ -58,13 +58,13 @@
     _uploadFiles(
         files=files,
-        p_id=p_id,
-        api=api,
+        p_id=p_id,  # type: ignore
+        api=api,  # type: ignore
         n_parallel=n_parallel,
     )  # type: ignore

     console = Console()
-    url = urljoin(DATAVERSE_URL, f"dataset.xhtml?persistentId={p_id}")
+    url = urljoin(DATAVERSE_URL, f"dataset.xhtml?persistentId={p_id}")  # type: ignore
     panel = Panel(
         f"🎉 {url}",
         title="Dataset URL",
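
Taken together, the `Dataverse` additions support the workflow below. A usage sketch with placeholder credentials; it assumes `Dataverse` is importable from the package root:

```python
from easyDataverse import Dataverse  # assumed top-level export

dataverse = Dataverse(
    server_url="https://demo.dataverse.org",           # placeholder instance
    api_token="9eb39a88-ab0d-415d-80c2-32cbafdb5f6f",  # placeholder UUID token
)

dataverse.list_dataset_types()        # prints e.g. "- dataset (default)"

dataset = dataverse.create_dataset()  # now also wires dataset._dataverse
dataset.dataset_type = "dataset"      # validated against the instance's types
```
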
diff --git a/tests/integration/test_dataset_creation.py b/tests/integration/test_dataset_creation.py
index 5beebf6..612631d 100644
--- a/tests/integration/test_dataset_creation.py
+++ b/tests/integration/test_dataset_creation.py
@@ -1,4 +1,5 @@
 import os
+from pydantic import ValidationError
 import pytest

 from easyDataverse.dataset import Dataset
@@ -95,6 +96,59 @@ def test_creation_and_upload(
             "File should be in the sub-directory"
         )

+    @pytest.mark.integration
+    def test_creation_and_upload_with_dataset_type(
+        self,
+        credentials,
+    ):
+        # Arrange
+        base_url, api_token = credentials
+        dataverse = Dataverse(
+            server_url=base_url,
+            api_token=api_token,
+        )
+
+        # Act
+        dataset = dataverse.create_dataset()
+
+        dataset.dataset_type = "dataset"
+        dataset.citation.title = "My dataset"
+        dataset.citation.subject = ["Other"]
+        dataset.citation.add_author(name="John Doe")
+        dataset.citation.add_ds_description(
+            value="This is a description of the dataset",
+            date="2024",
+        )
+        dataset.citation.add_dataset_contact(
+            name="John Doe",
+            email="john@doe.com",
+        )
+
+        pid = dataset.upload(dataverse_name="root")
+
+        # Re-fetch the dataset
+        dataset = dataverse.load_dataset(pid)
+
+        assert dataset.dataset_type == "dataset"
+
+    @pytest.mark.integration
+    def test_creation_invalid_dataset_type(
+        self,
+        credentials,
+    ):
+        # Arrange
+        base_url, api_token = credentials
+        dataverse = Dataverse(
+            server_url=base_url,
+            api_token=api_token,
+        )
+
+        # Act
+        dataset = dataverse.create_dataset()
+
+        with pytest.raises(ValidationError):
+            dataset.dataset_type = "invalid"
+
     @pytest.mark.integration
     def test_creation_other_license(
         self,
@@ -227,6 +281,7 @@ def test_tab_ingest_disabled(
     @staticmethod
     def sort_citation(dataset: Dataset):
         dv_dict = dataset.dataverse_dict()
+        del dv_dict["datasetType"]
         citation = dv_dict["datasetVersion"]["metadataBlocks"]["citation"]
         citation_fields = citation["fields"]
         dv_dict["datasetVersion"]["metadataBlocks"]["citation"]["fields"] = sorted(
diff --git a/tests/integration/test_dataset_download.py b/tests/integration/test_dataset_download.py
index b396dd6..77680c6 100644
--- a/tests/integration/test_dataset_download.py
+++ b/tests/integration/test_dataset_download.py
@@ -179,6 +179,9 @@ def test_dataset_download_with_file_and_filter_pattern(
     @staticmethod
     def sort_citation(dataset: Dict):
+        if "datasetType" in dataset:
+            del dataset["datasetType"]
+
         citation = dataset["datasetVersion"]["metadataBlocks"]["citation"]
         citation_fields = citation["fields"]
         dataset["datasetVersion"]["metadataBlocks"]["citation"]["fields"] = sorted(
diff --git a/tests/integration/test_dataset_update.py b/tests/integration/test_dataset_update.py
index 08854e6..4bd6952 100644
--- a/tests/integration/test_dataset_update.py
+++ b/tests/integration/test_dataset_update.py
@@ -36,7 +36,7 @@ def test_dataset_update(
         # Fetch the dataset and update the title
         dataset = dataverse.load_dataset(pid)
-        dataset.citation.title = "Title has changed"
+        dataset.citation.title = "Title has changed"  # type: ignore
         dataset.update()

         # Re-fetch the dataset
diff --git a/tests/integration/test_datasettype.py b/tests/integration/test_datasettype.py
new file mode 100644
index 0000000..51cfc56
--- /dev/null
+++ b/tests/integration/test_datasettype.py
@@ -0,0 +1,27 @@
+import pytest
+from easyDataverse.datasettype import DatasetType
+
+
+class TestDatasetType:
+    """Integration tests for DatasetType functionality."""
+
+    @pytest.mark.integration
+    def test_dataset_type_from_instance(self, credentials):
+        """
+        Test retrieving dataset types from a Dataverse instance.
+
+        This test verifies that we can successfully fetch dataset types
+        from a Dataverse installation and that the returned data matches
+        the expected structure.
+
+        Args:
+            credentials: Fixture providing base_url and api_token for testing
+        """
+        base_url, _ = credentials
+        dataset_types = DatasetType.from_instance(base_url)
+
+        assert len(dataset_types) > 0
+        expected_dataset_types = [
+            DatasetType(id=1, name="dataset", linkedMetadataBlocks=[]),
+        ]
+        assert dataset_types == expected_dataset_types
diff --git a/tests/unit/test_dataverse.py b/tests/unit/test_dataverse.py
index 6074a98..127d6e0 100644
--- a/tests/unit/test_dataverse.py
+++ b/tests/unit/test_dataverse.py
@@ -9,8 +9,8 @@ def test_invalid_url(self):
         """Test that an invalid URL raises a ValueError"""
         with pytest.raises(ValueError):
             Dataverse(
-                server_url="not a url",
-                api_token="9eb39a88-ab0d-415d-80c2-32cbafdb5f6f",
+                server_url="not a url",  # type: ignore
+                api_token="9eb39a88-ab0d-415d-80c2-32cbafdb5f6f",  # type: ignore
             )

     @pytest.mark.unit
@@ -18,8 +18,8 @@ def test_invalid_api_token(self):
         """Test that an invalid API token raises a ValueError"""
         with pytest.raises(ValueError):
             Dataverse(
-                server_url="http://localhost:8080",
-                api_token="not a uuid",
+                server_url="http://localhost:8080",  # type: ignore
+                api_token="not a uuid",  # type: ignore
             )

     @pytest.mark.unit
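
With these changes, every payload produced by `dataverse_dict` carries a top-level `datasetType` key next to `datasetVersion` — which is why the `sort_citation` helpers above strip it before comparing citations. A sketch of the resulting shape (metadata content elided):

```python
payload = {
    "datasetType": "dataset",  # from _get_dataset_type(); "dataset" when unset
    "datasetVersion": {
        "metadataBlocks": {},  # citation and other blocks go here
    },
}
```
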