58 changes: 56 additions & 2 deletions easyDataverse/dataset.py
@@ -6,11 +6,12 @@
import nob
import xmltodict
import yaml
from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator

from dvuploader import File, add_directory

from easyDataverse.base import DataverseBase
from easyDataverse.datasettype import DatasetType
from easyDataverse.license import CustomLicense, License
from easyDataverse.uploader import update_dataset, upload_to_dataverse
from easyDataverse.utils import YAMLDumper
@@ -54,9 +55,51 @@ class Dataset(BaseModel):
description="The files of the dataset.",
)

dataset_type: Optional[str] = Field(
default=None,
description="The type of the dataset.",
)

API_TOKEN: Optional[str] = Field(None)
DATAVERSE_URL: Optional[str] = Field(None)

# ! Validators
@field_validator("dataset_type", mode="after")
def _validate_dataset_type(
cls,
dataset_type: Optional[str],
info: ValidationInfo,
) -> Optional[str]:
"""Validates the dataset type against available types in the Dataverse installation.

This validator ensures that the provided dataset type is valid and available
in the target Dataverse installation. It fetches the available dataset types
from the Dataverse instance and validates the provided type against them.

Note:
If dataset_type is None, validation is skipped and None is returned.
The DATAVERSE_URL must be set in the model for validation to work.
"""
if dataset_type is None:
return dataset_type
    elif info.data.get("DATAVERSE_URL") is None:
        raise ValueError(
            "No Dataverse URL has been provided. Please provide a Dataverse URL to validate the dataset type. "
            "This error should not happen and is likely a bug in easyDataverse. "
            "Please report it at https://github.com/gdcc/easyDataverse/issues"
        )

    available_types = DatasetType.from_instance(info.data["DATAVERSE_URL"])  # type: ignore
    available_names = [ds_type.name for ds_type in available_types]

if dataset_type not in available_names:
raise ValueError(
f"Dataset type '{dataset_type}' is not available in the Dataverse installation. "
f"Please use 'list_dataset_types' to see which dataset types are available."
)

return dataset_type

# ! Adders
def add_metadatablock(self, metadatablock: DataverseBase) -> None:
"""Adds a metadatablock object to the dataset if it is of 'DataverseBase' type and has a metadatablock name"""
@@ -190,13 +233,24 @@ def dataverse_dict(self) -> dict:
else:
terms = {}

dataset_type = self._get_dataset_type()

return {
"datasetType": dataset_type,
"datasetVersion": {
"metadataBlocks": blocks,
**terms,
},
}

def _get_dataset_type(self) -> str:
"""Returns the dataset type of the dataset."""

if self.dataset_type is None:
return "dataset"

return self.dataset_type

def dataverse_json(self, indent: int = 2) -> str:
"""Returns a JSON representation of the dataverse dataset."""

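Not part of the diff — a minimal sketch of how the new field behaves. It assumes a reachable Dataverse 6.4+ installation at the placeholder URL and that the model validates on assignment, as the integration tests below imply:

from easyDataverse.dataverse import Dataverse

dataverse = Dataverse(server_url="https://demo.dataverse.org")  # placeholder URL
dataset = dataverse.create_dataset()

dataset.dataset_type = "dataset"  # validated against the installation's types
# dataset.dataset_type = "bogus"  # would raise pydantic.ValidationError

# The type flows into the upload payload; left unset, it defaults to "dataset":
payload = dataset.dataverse_dict()
assert payload["datasetType"] == "dataset"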
64 changes: 64 additions & 0 deletions easyDataverse/datasettype.py
@@ -0,0 +1,64 @@
from typing import List
from urllib.parse import urljoin
from pydantic import BaseModel, Field
import httpx
from pyDataverse.api import NativeApi


class DatasetType(BaseModel):
"""
Represents a dataset type in Dataverse.

A dataset type defines the structure and metadata requirements for datasets
in a Dataverse instance, including which metadata blocks are linked to it.
"""

id: int = Field(..., description="The ID of the dataset type")
name: str = Field(..., description="The name of the dataset type")
linkedMetadataBlocks: list[str] = Field(
default_factory=list,
description="The metadata blocks linked to the dataset type",
)

@classmethod
def from_instance(cls, base_url: str) -> List["DatasetType"]:
"""
Retrieve all dataset types from a Dataverse instance.

Args:
base_url: The base URL of the Dataverse instance

Returns:
    A list of DatasetType objects representing all dataset types
    available in the Dataverse instance; an empty list if the
    request to the dataset-types endpoint does not succeed

Raises:
    ValueError: If the Dataverse instance is not at least version 6.4
"""
native_api = NativeApi(base_url=base_url)

if cls._get_version(native_api) < (6, 4):
raise ValueError(
"Dataset types are only supported in Dataverse 6.4 and above"
)

url = urljoin(native_api.base_url, "api/datasets/datasetTypes")
response = httpx.get(url)

        if not response.is_success:
            # A failed request means the endpoint is unavailable; an installation
            # with no custom types still answers 200 with an empty "data" list.
            return []

return [cls.model_validate(item) for item in response.json()["data"]]

@staticmethod
def _get_version(native_api: NativeApi) -> tuple[int, int]:
"""
Get the version of the Dataverse instance.
"""
response = native_api.get_info_version()
response.raise_for_status()
version = response.json()["data"]["version"]
        # Use only the first two components so patch releases like "6.4.1" parse
        major, minor = version.split(".")[:2]
        return int(major), int(minor)
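For orientation, a short sketch of calling the new helper on its own (the URL is a placeholder):

from easyDataverse.datasettype import DatasetType

types = DatasetType.from_instance("https://demo.dataverse.org")  # placeholder URL
for ds_type in types:
    print(ds_type.id, ds_type.name, ds_type.linkedMetadataBlocks)
# Raises ValueError for installations older than Dataverse 6.4;
# returns [] when the request to the endpoint fails.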
37 changes: 36 additions & 1 deletion easyDataverse/dataverse.py
@@ -1,11 +1,13 @@
import asyncio
from copy import deepcopy
from functools import cached_property
import json
from uuid import UUID
from typing import Callable, Dict, List, Optional, Tuple, IO
from urllib import parse

import httpx
from easyDataverse.datasettype import DatasetType
from easyDataverse.license import CustomLicense, License
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
@@ -112,6 +114,25 @@ def default_license(self) -> License:
"""The default license of the Dataverse installation."""
return next(filter(lambda x: x.is_default, self.licenses.values()))

@computed_field(
description="The dataset types available in the Dataverse installation."
)
@cached_property
def dataset_types(self) -> Dict[str, DatasetType]:
"""The dataset types available in the Dataverse installation."""
if self.native_api is None:
raise ValueError(
"Native API is not available. Please connect to a Dataverse installation first."
)

try:
return {
dataset_type.name: dataset_type
for dataset_type in DatasetType.from_instance(self.native_api.base_url)
}
except ValueError:
return {}

def _connect(self) -> None:
"""Connects to a Dataverse installation and adds all metadtablocks as classes.

@@ -299,6 +320,17 @@ def list_licenses(self):

print("\n")

def list_dataset_types(self):
"""Lists the dataset types available in the Dataverse installation."""
rich.print("[bold]Dataset Types[/bold]")
for dataset_type in self.dataset_types.values():
if dataset_type.name == "dataset":
print(f"- {dataset_type.name} (default)")
else:
print(f"- {dataset_type.name}")

print("\n")

# ! Dataset Handlers

def create_dataset(self) -> Dataset:
@@ -308,7 +340,9 @@ def create_dataset(self) -> Dataset:
Returns:
Dataset: The newly created dataset.
"""
return self._dataset_gen()
dataset = self._dataset_gen()
dataset._dataverse = self
return dataset

@classmethod
def load_from_url(
@@ -409,6 +443,7 @@ def load_dataset(
dataset.license = custom_license

dataset.p_id = latest_version.datasetPersistentId # type: ignore
dataset.dataset_type = remote_ds.data.get("datasetType", None) # type: ignore
blocks = latest_version.metadataBlocks # type: ignore
files = latest_version.files # type: ignore

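Taken together, the new dataset_types computed field and list_dataset_types give programmatic and human-readable access. A hedged sketch — the server URL is a placeholder, api_token is omitted for brevity (the tests pass one), and the "software" type is hypothetical:

from easyDataverse.dataverse import Dataverse

dataverse = Dataverse(server_url="https://demo.dataverse.org")  # placeholder
dataverse.list_dataset_types()
# Dataset Types
# - dataset (default)

# Programmatic access via the cached computed field:
if "software" in dataverse.dataset_types:  # "software" is hypothetical here
    print(dataverse.dataset_types["software"].linkedMetadataBlocks)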
10 changes: 5 additions & 5 deletions easyDataverse/uploader.py
@@ -34,7 +34,7 @@ def upload_to_dataverse(
str: The resulting DOI of the dataset, if successful.
"""

api, _ = _initialize_pydataverse(DATAVERSE_URL, API_TOKEN) # type: ignore
ds = Dataset()
ds.from_json(json_data)

@@ -50,21 +50,21 @@ def upload_to_dataverse(
if p_id:
create_params["pid"] = p_id

response = api.create_dataset(**create_params) # type: ignore
response.raise_for_status()

# Get response data
p_id = response.json()["data"]["persistentId"]

_uploadFiles(
files=files,
p_id=p_id, # type: ignore
api=api, # type: ignore
n_parallel=n_parallel,
) # type: ignore

console = Console()
url = urljoin(DATAVERSE_URL, f"dataset.xhtml?persistentId={p_id}") # type: ignore
panel = Panel(
f"🎉 {url}",
title="Dataset URL",
55 changes: 55 additions & 0 deletions tests/integration/test_dataset_creation.py
@@ -1,4 +1,5 @@
import os
from pydantic import ValidationError
import pytest
from easyDataverse.dataset import Dataset

@@ -95,6 +96,59 @@ def test_creation_and_upload(
"File should be in the sub-directory"
)

@pytest.mark.integration
def test_creation_and_upload_with_dataset_type(
self,
credentials,
):
# Arrange
base_url, api_token = credentials
dataverse = Dataverse(
server_url=base_url,
api_token=api_token,
)

# Act
dataset = dataverse.create_dataset()

dataset.dataset_type = "dataset"
dataset.citation.title = "My dataset"
dataset.citation.subject = ["Other"]
dataset.citation.add_author(name="John Doe")
dataset.citation.add_ds_description(
value="This is a description of the dataset",
date="2024",
)
dataset.citation.add_dataset_contact(
name="John Doe",
email="[email protected]",
)

pid = dataset.upload(dataverse_name="root")

# Re-fetch the dataset
dataset = dataverse.load_dataset(pid)

assert dataset.dataset_type == "dataset"

@pytest.mark.integration
def test_creation_invalid_dataset_type(
self,
credentials,
):
# Arrange
base_url, api_token = credentials
dataverse = Dataverse(
server_url=base_url,
api_token=api_token,
)

# Act
dataset = dataverse.create_dataset()

with pytest.raises(ValidationError):
dataset.dataset_type = "invalid"

@pytest.mark.integration
def test_creation_other_license(
self,
@@ -227,6 +281,7 @@ def test_tab_ingest_disabled(
@staticmethod
def sort_citation(dataset: Dataset):
dv_dict = dataset.dataverse_dict()
del dv_dict["datasetType"]
citation = dv_dict["datasetVersion"]["metadataBlocks"]["citation"]
citation_fields = citation["fields"]
dv_dict["datasetVersion"]["metadataBlocks"]["citation"]["fields"] = sorted(
3 changes: 3 additions & 0 deletions tests/integration/test_dataset_download.py
@@ -179,6 +179,9 @@ def test_dataset_download_with_file_and_filter_pattern(

@staticmethod
def sort_citation(dataset: Dict):
if "datasetType" in dataset:
del dataset["datasetType"]

citation = dataset["datasetVersion"]["metadataBlocks"]["citation"]
citation_fields = citation["fields"]
dataset["datasetVersion"]["metadataBlocks"]["citation"]["fields"] = sorted(
2 changes: 1 addition & 1 deletion tests/integration/test_dataset_update.py
@@ -36,7 +36,7 @@ def test_dataset_update(

# Fetch the dataset and update the title
dataset = dataverse.load_dataset(pid)
dataset.citation.title = "Title has changed" # type: ignore
dataset.update()

# Re-fetch the dataset
27 changes: 27 additions & 0 deletions tests/integration/test_datasettype.py
@@ -0,0 +1,27 @@
import pytest
from easyDataverse.datasettype import DatasetType


class TestDatasetType:
"""Integration tests for DatasetType functionality."""

@pytest.mark.integration
def test_dataset_type_from_instance(self, credentials):
"""
Test retrieving dataset types from a Dataverse instance.

This test verifies that we can successfully fetch dataset types
from a Dataverse installation and that the returned data matches
the expected structure.

Args:
credentials: Fixture providing base_url and api_token for testing
"""
base_url, _ = credentials
dataset_types = DatasetType.from_instance(base_url)

assert len(dataset_types) > 0
expected_dataset_types = [
DatasetType(id=1, name="dataset", linkedMetadataBlocks=[]),
]
assert dataset_types == expected_dataset_types
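For reference, a sketch of the payload shape that from_instance parses from the /api/datasets/datasetTypes endpoint. The values are illustrative, not taken from the PR:

# Illustrative response body; each entry under "data" is fed to
# DatasetType.model_validate:
response_json = {
    "status": "OK",
    "data": [
        {"id": 1, "name": "dataset", "linkedMetadataBlocks": []},
    ],
}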