58 changes: 56 additions & 2 deletions easyDataverse/dataset.py
@@ -6,11 +6,12 @@
import nob
import xmltodict
import yaml
from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator

from dvuploader import File, add_directory

from easyDataverse.base import DataverseBase
from easyDataverse.datasettype import DatasetType
from easyDataverse.license import CustomLicense, License
from easyDataverse.uploader import update_dataset, upload_to_dataverse
from easyDataverse.utils import YAMLDumper
@@ -54,9 +55,51 @@ class Dataset(BaseModel):
description="The files of the dataset.",
)

dataset_type: Optional[str] = Field(
default=None,
description="The type of the dataset.",
)

API_TOKEN: Optional[str] = Field(None)
DATAVERSE_URL: Optional[str] = Field(None)

# ! Validators
@field_validator("dataset_type", mode="after")
def _validate_dataset_type(
cls,
dataset_type: Optional[str],
info: ValidationInfo,
) -> Optional[str]:
"""Validates the dataset type against available types in the Dataverse installation.

This validator ensures that the provided dataset type is valid and available
in the target Dataverse installation. It fetches the available dataset types
from the Dataverse instance and validates the provided type against them.

Note:
If dataset_type is None, validation is skipped and None is returned.
The DATAVERSE_URL must be set in the model for validation to work.
"""
if dataset_type is None:
return dataset_type
    elif info.data.get("DATAVERSE_URL") is None:
        raise ValueError(
            "No Dataverse URL has been provided. Please provide a Dataverse URL to validate the dataset type. "
            "This error should not happen and is likely a bug in easyDataverse. "
            "Please report it at https://github.com/gdcc/easyDataverse/issues"
        )

    available_types = DatasetType.from_instance(info.data["DATAVERSE_URL"])  # type: ignore
    available_names = [ds_type.name for ds_type in available_types]

if dataset_type not in available_names:
raise ValueError(
f"Dataset type '{dataset_type}' is not available in the Dataverse installation. "
f"Please use 'list_dataset_types' to see which dataset types are available."
)

return dataset_type

# ! Adders
def add_metadatablock(self, metadatablock: DataverseBase) -> None:
"""Adds a metadatablock object to the dataset if it is of 'DataverseBase' type and has a metadatablock name"""
@@ -190,13 +233,24 @@ def dataverse_dict(self) -> dict:
else:
terms = {}

dataset_type = self._get_dataset_type()

return {
"datasetType": dataset_type,
"datasetVersion": {
"metadataBlocks": blocks,
**terms,
},
}

def _get_dataset_type(self) -> str:
"""Returns the dataset type of the dataset."""

if self.dataset_type is None:
return "dataset"

return self.dataset_type

def dataverse_json(self, indent: int = 2) -> str:
"""Returns a JSON representation of the dataverse dataset."""

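Not part of the diff — a minimal sketch of how the new field behaves. It assumes a reachable Dataverse 6.4+ installation at the placeholder URL and that the model validates on assignment, as the integration tests below imply:

from easyDataverse.dataverse import Dataverse

dataverse = Dataverse(server_url="https://demo.dataverse.org")  # placeholder URL
dataset = dataverse.create_dataset()

dataset.dataset_type = "dataset"  # validated against the installation's types
# dataset.dataset_type = "bogus"  # would raise pydantic.ValidationError

# The type flows into the upload payload; left unset, it defaults to "dataset":
payload = dataset.dataverse_dict()
assert payload["datasetType"] == "dataset"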
64 changes: 64 additions & 0 deletions easyDataverse/datasettype.py
@@ -0,0 +1,64 @@
from typing import List
from urllib.parse import urljoin
from pydantic import BaseModel, Field
import httpx
from pyDataverse.api import NativeApi


class DatasetType(BaseModel):
"""
Represents a dataset type in Dataverse.

A dataset type defines the structure and metadata requirements for datasets
in a Dataverse instance, including which metadata blocks are linked to it.
"""

id: int = Field(..., description="The ID of the dataset type")
name: str = Field(..., description="The name of the dataset type")
linkedMetadataBlocks: list[str] = Field(
default_factory=list,
description="The metadata blocks linked to the dataset type",
)

@classmethod
def from_instance(cls, base_url: str) -> List["DatasetType"]:
"""
Retrieve all dataset types from a Dataverse instance.

Args:
base_url: The base URL of the Dataverse instance

Returns:
    A list of DatasetType objects representing all dataset types
    available in the Dataverse instance; an empty list if the
    request to the dataset-types endpoint does not succeed

Raises:
    ValueError: If the Dataverse instance is not at least version 6.4
"""
native_api = NativeApi(base_url=base_url)

if cls._get_version(native_api) < (6, 4):
raise ValueError(
"Dataset types are only supported in Dataverse 6.4 and above"
)

url = urljoin(native_api.base_url, "api/datasets/datasetTypes")
response = httpx.get(url)

        if not response.is_success:
            # A failed request means the endpoint is unavailable; an installation
            # with no custom types still answers 200 with an empty "data" list.
            return []

return [cls.model_validate(item) for item in response.json()["data"]]

@staticmethod
def _get_version(native_api: NativeApi) -> tuple[int, int]:
"""
Get the version of the Dataverse instance.
"""
response = native_api.get_info_version()
response.raise_for_status()
version = response.json()["data"]["version"]
        # Use only the first two components so patch releases like "6.4.1" parse
        major, minor = version.split(".")[:2]
        return int(major), int(minor)
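For orientation, a short sketch of calling the new helper on its own (the URL is a placeholder):

from easyDataverse.datasettype import DatasetType

types = DatasetType.from_instance("https://demo.dataverse.org")  # placeholder URL
for ds_type in types:
    print(ds_type.id, ds_type.name, ds_type.linkedMetadataBlocks)
# Raises ValueError for installations older than Dataverse 6.4;
# returns [] when the request to the endpoint fails.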
37 changes: 36 additions & 1 deletion easyDataverse/dataverse.py
@@ -1,11 +1,13 @@
import asyncio
from copy import deepcopy
from functools import cached_property
import json
from uuid import UUID
from typing import Callable, Dict, List, Optional, Tuple, IO
from urllib import parse

import httpx
from easyDataverse.datasettype import DatasetType
from easyDataverse.license import CustomLicense, License
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
@@ -112,6 +114,25 @@ def default_license(self) -> License:
"""The default license of the Dataverse installation."""
return next(filter(lambda x: x.is_default, self.licenses.values()))

@computed_field(
description="The dataset types available in the Dataverse installation."
)
@cached_property
def dataset_types(self) -> Dict[str, DatasetType]:
"""The dataset types available in the Dataverse installation."""
if self.native_api is None:
raise ValueError(
"Native API is not available. Please connect to a Dataverse installation first."
)

try:
return {
dataset_type.name: dataset_type
for dataset_type in DatasetType.from_instance(self.native_api.base_url)
}
except ValueError:
return {}

def _connect(self) -> None:
"""Connects to a Dataverse installation and adds all metadtablocks as classes.

@@ -299,6 +320,17 @@ def list_licenses(self):

print("\n")

def list_dataset_types(self):
"""Lists the dataset types available in the Dataverse installation."""
rich.print("[bold]Dataset Types[/bold]")
for dataset_type in self.dataset_types.values():
if dataset_type.name == "dataset":
print(f"- {dataset_type.name} (default)")
else:
print(f"- {dataset_type.name}")

print("\n")

# ! Dataset Handlers

def create_dataset(self) -> Dataset:
@@ -308,7 +340,9 @@ def create_dataset(self) -> Dataset:
Returns:
Dataset: The newly created dataset.
"""
return self._dataset_gen()
dataset = self._dataset_gen()
dataset._dataverse = self
return dataset

@classmethod
def load_from_url(
@@ -409,6 +443,7 @@ def load_dataset(
dataset.license = custom_license

dataset.p_id = latest_version.datasetPersistentId # type: ignore
dataset.dataset_type = remote_ds.data.get("datasetType", None) # type: ignore
blocks = latest_version.metadataBlocks # type: ignore
files = latest_version.files # type: ignore

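Taken together, the new dataset_types computed field and list_dataset_types give programmatic and human-readable access. A hedged sketch — the server URL is a placeholder, api_token is omitted for brevity (the tests pass one), and the "software" type is hypothetical:

from easyDataverse.dataverse import Dataverse

dataverse = Dataverse(server_url="https://demo.dataverse.org")  # placeholder
dataverse.list_dataset_types()
# Dataset Types
# - dataset (default)

# Programmatic access via the cached computed field:
if "software" in dataverse.dataset_types:  # "software" is hypothetical here
    print(dataverse.dataset_types["software"].linkedMetadataBlocks)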
10 changes: 5 additions & 5 deletions easyDataverse/uploader.py
@@ -34,7 +34,7 @@ def upload_to_dataverse(
str: The resulting DOI of the dataset, if successful.
"""

api, _ = _initialize_pydataverse(DATAVERSE_URL, API_TOKEN) # type: ignore
ds = Dataset()
ds.from_json(json_data)

@@ -50,21 +50,21 @@ def upload_to_dataverse(
if p_id:
create_params["pid"] = p_id

response = api.create_dataset(**create_params) # type: ignore
response.raise_for_status()

# Get response data
p_id = response.json()["data"]["persistentId"]

_uploadFiles(
files=files,
p_id=p_id, # type: ignore
api=api, # type: ignore
n_parallel=n_parallel,
) # type: ignore

console = Console()
url = urljoin(DATAVERSE_URL, f"dataset.xhtml?persistentId={p_id}") # type: ignore
panel = Panel(
f"🎉 {url}",
title="Dataset URL",
55 changes: 55 additions & 0 deletions tests/integration/test_dataset_creation.py
@@ -1,4 +1,5 @@
import os
from pydantic import ValidationError
import pytest
from easyDataverse.dataset import Dataset

@@ -95,6 +96,59 @@ def test_creation_and_upload(
"File should be in the sub-directory"
)

@pytest.mark.integration
def test_creation_and_upload_with_dataset_type(
self,
credentials,
):
# Arrange
base_url, api_token = credentials
dataverse = Dataverse(
server_url=base_url,
api_token=api_token,
)

# Act
dataset = dataverse.create_dataset()

dataset.dataset_type = "dataset"
dataset.citation.title = "My dataset"
dataset.citation.subject = ["Other"]
dataset.citation.add_author(name="John Doe")
dataset.citation.add_ds_description(
value="This is a description of the dataset",
date="2024",
)
dataset.citation.add_dataset_contact(
name="John Doe",
email="[email protected]",
)

pid = dataset.upload(dataverse_name="root")

# Re-fetch the dataset
dataset = dataverse.load_dataset(pid)

assert dataset.dataset_type == "dataset"

@pytest.mark.integration
def test_creation_invalid_dataset_type(
self,
credentials,
):
# Arrange
base_url, api_token = credentials
dataverse = Dataverse(
server_url=base_url,
api_token=api_token,
)

# Act
dataset = dataverse.create_dataset()

with pytest.raises(ValidationError):
dataset.dataset_type = "invalid"

@pytest.mark.integration
def test_creation_other_license(
self,
@@ -227,6 +281,7 @@ def test_tab_ingest_disabled(
@staticmethod
def sort_citation(dataset: Dataset):
dv_dict = dataset.dataverse_dict()
del dv_dict["datasetType"]
citation = dv_dict["datasetVersion"]["metadataBlocks"]["citation"]
citation_fields = citation["fields"]
dv_dict["datasetVersion"]["metadataBlocks"]["citation"]["fields"] = sorted(
3 changes: 3 additions & 0 deletions tests/integration/test_dataset_download.py
@@ -179,6 +179,9 @@ def test_dataset_download_with_file_and_filter_pattern(

@staticmethod
def sort_citation(dataset: Dict):
if "datasetType" in dataset:
del dataset["datasetType"]

citation = dataset["datasetVersion"]["metadataBlocks"]["citation"]
citation_fields = citation["fields"]
dataset["datasetVersion"]["metadataBlocks"]["citation"]["fields"] = sorted(
2 changes: 1 addition & 1 deletion tests/integration/test_dataset_update.py
@@ -36,7 +36,7 @@ def test_dataset_update(

# Fetch the dataset and update the title
dataset = dataverse.load_dataset(pid)
dataset.citation.title = "Title has changed" # type: ignore
dataset.update()

# Re-fetch the dataset
27 changes: 27 additions & 0 deletions tests/integration/test_datasettype.py
@@ -0,0 +1,27 @@
import pytest
from easyDataverse.datasettype import DatasetType


class TestDatasetType:
"""Integration tests for DatasetType functionality."""

@pytest.mark.integration
def test_dataset_type_from_instance(self, credentials):
"""
Test retrieving dataset types from a Dataverse instance.

This test verifies that we can successfully fetch dataset types
from a Dataverse installation and that the returned data matches
the expected structure.

Args:
credentials: Fixture providing base_url and api_token for testing
"""
base_url, _ = credentials
dataset_types = DatasetType.from_instance(base_url)

assert len(dataset_types) > 0
expected_dataset_types = [
DatasetType(id=1, name="dataset", linkedMetadataBlocks=[]),
]
assert dataset_types == expected_dataset_types
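For reference, a sketch of the payload shape that from_instance parses from the /api/datasets/datasetTypes endpoint. The values are illustrative, not taken from the PR:

# Illustrative response body; each entry under "data" is fed to
# DatasetType.model_validate:
response_json = {
    "status": "OK",
    "data": [
        {"id": 1, "name": "dataset", "linkedMetadataBlocks": []},
    ],
}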