Skip to content

Commit 184b29c

Browse files
committed
Merge branch 'remove_unit_metadata' of github.com:SWIFTSIM/swiftsimio into remove_unit_metadata
2 parents 235cec7 + bd062bb commit 184b29c

29 files changed

Lines changed: 1273 additions & 299 deletions
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
name: 'Start hdfstream'
2+
description: 'Start the hdfstream service with a mounted directory'
3+
inputs:
4+
data-dir:
5+
description: 'Directory with the files to serve'
6+
required: true
7+
default: './data/'
8+
virtual-prefix:
9+
description: 'Virtual directory prefix to use'
10+
required: true
11+
default: 'Data'
12+
image:
13+
description: 'Server container image to use'
14+
required: true
15+
default: ghcr.io/jchelly/hdfstream-api:0.0.6
16+
runs:
17+
using: "composite"
18+
steps:
19+
- name: Start hdfstream service with the data directory mounted
20+
shell: bash
21+
run: |
22+
docker run -d \
23+
--name hdfstream \
24+
-p 8080:8080 \
25+
-v ${{ inputs.data-dir }}:/opt/hdfstream/data:ro \
26+
-e HDFSTREAM_PREFIX=${{ inputs.virtual-prefix }} \
27+
${{ inputs.image }}
28+
- name: Wait until hdfstream service is responding
29+
shell: bash
30+
run: |
31+
for i in {1..30}; do
32+
status=$(docker inspect --format='{{.State.Health.Status}}' hdfstream)
33+
if [ "$status" = "healthy" ]; then
34+
echo "Service is ready"
35+
exit 0
36+
fi
37+
sleep 2
38+
done
39+
echo "Service did not become healthy in time"
40+
exit 1
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
name: "Stop hdfstream"
2+
description: "Stop and remove the hdfstream container"
3+
runs:
4+
using: "composite"
5+
steps:
6+
- name: Stop the hdfstream service
7+
shell: bash
8+
run: docker rm -f hdfstream

.github/workflows/lint_and_test.yml

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ jobs:
3636
strategy:
3737
matrix:
3838
python-version: ["3.10", "3.11", "3.12", "~3.13.0 <= 3.13.3 || ~3.13.5"] # exclude 3.13.4
39-
4039
steps:
4140
- name: Checkout swiftsimio
4241
uses: actions/checkout@v4
@@ -47,9 +46,21 @@ jobs:
4746
- name: Install dependencies
4847
run: |
4948
python -m pip install --upgrade pip
50-
pip install -e .[dev]
49+
pip install -e ".[dev,hdfstream]"
50+
- name: Preload test data files
51+
run: |
52+
python -c "from tests.conftest import preload_test_data; preload_test_data()"
53+
- name: Start hdfstream service
54+
uses: ./.github/actions/start-hdfstream
55+
with:
56+
data-dir: ./test_data
57+
virtual-prefix: test_data
58+
image: ghcr.io/jchelly/hdfstream-api:0.0.6
5159
- name: Test with pytest
5260
run: |
53-
pytest
61+
pytest --hdfstream-server=http://localhost:8080/hdfstream
5462
env:
5563
NUMBA_BOUNDSCHECK: 1
64+
- name: Stop hdfstream service
65+
uses: ./.github/actions/stop-hdfstream
66+
if: always()

docs/source/conf.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,4 +112,6 @@ def setup(app): # numpydoc ignore=GL08
112112
swiftgalaxy=("https://swiftgalaxy.readthedocs.io/en/stable/", None),
113113
velociraptor=("https://velociraptor-python.readthedocs.io/en/latest/", None),
114114
astropy=("https://docs.astropy.org/en/stable/", None),
115+
h5py=("https://docs.h5py.org/en/latest/", None),
116+
hdfstream=("https://hdfstream-python.readthedocs.io/en/latest/", None),
115117
)

docs/source/loading_data/index.rst

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
.. _loading-data:
2+
13
Loading Data
24
============
35

@@ -269,3 +271,95 @@ in SWIFT will be automatically read.
269271
"extra_test.hdf5",
270272
)
271273
274+
275+
Reading from an open file
276+
-------------------------
277+
278+
:mod:`swiftsimio` normally opens and closes the HDF5 snapshot file for
279+
each operation. This is convenient for interactive use and avoids
280+
leaving files open for long periods of time, but sometimes it might be
281+
desirable to minimize the number of file open and close operations.
282+
283+
It is possible to pass an open :obj:`h5py.File` object to
284+
:func:`swiftsimio.load` and :func:`swiftsimio.mask` in place of the
285+
filename. In this case swiftsimio will do all file access through the
286+
provided file object. This allows us to read multiple datasets while
287+
only opening and closing the file once. For example:
288+
289+
.. code-block:: python
290+
291+
import h5py
292+
import swiftsimio as sw
293+
294+
with h5py.File("cosmo_volume_example.hdf5","r") as snap_file:
295+
data = sw.load(snap_file)
296+
pos = data.dark_matter.coordinates
297+
vel = data.dark_matter.velocities
298+
ids = data.dark_matter.particle_ids
299+
300+
This would open the snapshot file, read the dark matter particle
301+
positions, velocities and IDs, then close the file.
302+
303+
304+
Reading from a remote file
305+
--------------------------
306+
307+
:mod:`swiftsimio` is able to read from snapshots hosted on a remote
308+
server using the `hdfstream
309+
<https://hdfstream-python.readthedocs.io/en/latest>`_ python
310+
module. This is useful if you're interested in accessing a small part
311+
of a larger snapshot: you can read a small region or a subset of
312+
particle properties without downloading the whole snapshot.
313+
314+
To open a remote snapshot, you can pass a :obj:`hdfstream.RemoteFile`
315+
object to :func:`swiftsimio.load` and :func:`swiftsimio.mask` in place
316+
of the filename. For example, you can open one of the SWIFT example
317+
snapshots with:
318+
319+
.. code-block:: python
320+
321+
import hdfstream
322+
from swiftsimio import load
323+
324+
snap_file = hdfstream.open("cosma", "Tests/SWIFT/IOExamples/ssio_ci_04_2025/EagleSingle.hdf5")
325+
data = load(snap_file)
326+
327+
Here, ``data`` will be a :obj:`swiftsimio.reader.SWIFTDataset`. It
328+
functions in the same way as described in the :ref:`loading-data`
329+
section above, except that instead of reading data from a local HDF5
330+
file, it requests data from the server.
331+
332+
Opening a snapshot like this only downloads a small amount of
333+
metadata. Accessing particle properties, such as coordinates, will
334+
trigger another download:
335+
336+
.. code-block:: python
337+
338+
pos = data.dark_matter.coordinates
339+
340+
This will download the dark matter particle coordinates and return an
341+
array with units and cosmological factors attached.
342+
343+
To read part of a remote snapshot, we can use swiftsimio's
344+
:ref:`masking` feature as we would with a local snapshot, but passing
345+
the remote file to :func:`swiftsimio.mask` and :func:`swiftsimio.load` in
346+
place of the filename.
347+
348+
.. code-block:: python
349+
350+
import hdfstream
351+
import swiftsimio as sw
352+
353+
snap_file = hdfstream.open("cosma", "Tests/SWIFT/IOExamples/ssio_ci_04_2025/EagleSingle.hdf5")
354+
355+
mask = sw.mask(snap_file)
356+
# The full metadata object is available from within the mask
357+
boxsize = mask.metadata.boxsize
358+
# load_region is a 3x2 list [[left, right], [bottom, top], [front, back]]
359+
load_region = [[0.0 * b, 0.5 * b] for b in boxsize]
360+
361+
# Constrain the mask
362+
mask.constrain_spatial(load_region)
363+
364+
# Now load the snapshot with this mask
365+
data = sw.load(snap_file, mask=mask)

docs/source/masking/index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
.. _masking:
2+
13
Masking
24
=======
35

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,11 @@ docs = [
6767
"recommonmark",
6868
"sphinx_design",
6969
]
70+
hdfstream = [
71+
"hdfstream >= 0.0.23",
72+
]
7073
all = [
71-
"swiftsimio[dev,docs]"
74+
"swiftsimio[dev,docs,hdfstream]"
7275
]
7376

7477
[tool.ruff]

swiftsimio/__init__.py

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66
"""
77

88
from pathlib import Path
9-
import h5py
109

1110
from .reader import SWIFTDataset
1211
from .snapshot_writer import SWIFTSnapshotWriter as Writer
1312
from .masks import SWIFTMask
1413
from .statistics import SWIFTStatisticsFile
14+
from ._file_utils import open_path_or_handle
1515
from .__version__ import __version__
1616
from .__cite__ import __cite__
1717

@@ -82,7 +82,7 @@ def validate_file(filename: str) -> bool:
8282
If the file is not a SWIFT data file.
8383
"""
8484
try:
85-
with h5py.File(filename, "r") as handle:
85+
with open_path_or_handle(filename) as handle:
8686
if handle["Code"].attrs["Code"] != b"SWIFT":
8787
raise KeyError
8888
except KeyError:
@@ -137,13 +137,11 @@ def mask(
137137
more expensive, ~bytes per particle instead of ~bytes per cell
138138
spatial_only=False version).
139139
"""
140-
if isinstance(filename, str):
141-
filename = Path(filename)
142-
with h5py.File(filename, "r") as handle:
143-
units = SWIFTUnits(filename, handle=handle)
144-
metadata = _metadata_discriminator(filename, units, handle=handle)
140+
with open_path_or_handle(filename) as handle:
141+
units = SWIFTUnits(handle.filename, handle=handle)
142+
metadata = _metadata_discriminator(handle.filename, units, handle=handle)
145143
mask = SWIFTMask(
146-
filename,
144+
handle.filename,
147145
metadata=metadata,
148146
spatial_only=spatial_only,
149147
safe_padding=safe_padding,
@@ -169,11 +167,8 @@ def load(filename: str | Path, mask: SWIFTMask | None = None) -> SWIFTDataset:
169167
SWIFTDataset
170168
Dataset object providing an interface to the data file.
171169
"""
172-
if isinstance(filename, str):
173-
filename = Path(filename)
174-
175-
with h5py.File(filename, "r") as handle:
176-
data = SWIFTDataset(filename, mask=mask, handle=handle)
170+
with open_path_or_handle(filename) as handle:
171+
data = SWIFTDataset(handle.filename, mask=mask, handle=handle)
177172

178173
return data
179174

swiftsimio/_file_utils.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
"""
2+
Define functions for handling file and dataset objects.
3+
4+
These are used to handle a few situations where we need to check exactly
5+
what type of file or dataset object we have.
6+
"""
7+
8+
from pathlib import Path
from typing import ContextManager

import h5py

from swiftsimio.optional_packages import HDFSTREAM_AVAILABLE, hdfstream

from ._handle_provider import HandleProvider
13+
14+
15+
def is_soft_link(obj: h5py.Group | h5py.Dataset | h5py.SoftLink) -> bool:
    """
    Report whether ``obj`` is a soft link object.

    Soft links are usually dereferenced automatically on access, so the
    link object itself has to be requested explicitly before it can be
    checked here::

        obj = group.get(key, getlink=True)

    where ``group`` is the group containing the object and ``key`` is the
    name of the object.

    Parameters
    ----------
    obj : Group, Dataset or SoftLink
        The object to check.

    Returns
    -------
    bool
        ``True`` if ``obj`` is a soft link.
    """
    # The hdfstream link type is only looked up when HDFSTREAM_AVAILABLE
    # is set, so the hdfstream name is never touched otherwise.
    link_types = (h5py.SoftLink,)
    if HDFSTREAM_AVAILABLE:
        link_types += (hdfstream.SoftLink,)
    return isinstance(obj, link_types)
42+
43+
44+
def is_dataset(obj: h5py.Group | h5py.Dataset | h5py.SoftLink) -> bool:
    """
    Report whether ``obj`` is a dataset object.

    Parameters
    ----------
    obj : Group, Dataset or SoftLink
        The object to check.

    Returns
    -------
    bool
        ``True`` if ``obj`` is a dataset.
    """
    # hdfstream.RemoteDataset is only referenced when HDFSTREAM_AVAILABLE
    # is set; otherwise only the h5py dataset type is accepted.
    dataset_types = (
        (h5py.Dataset, hdfstream.RemoteDataset)
        if HDFSTREAM_AVAILABLE
        else (h5py.Dataset,)
    )
    return isinstance(obj, dataset_types)
62+
63+
64+
def split_path_or_handle(
    obj: str | Path | h5py.File,
) -> tuple[Path, h5py.File | None]:
    """
    Given a filename or handle, return a ``(filename, handle)`` tuple.

    Parameters
    ----------
    obj : str, Path or h5py.File
        A path to a file or a file handle object. Anything that is not a
        ``str`` or ``Path`` is treated as a handle and must expose a
        ``filename`` attribute.

    Returns
    -------
    tuple[Path, h5py.File or None]
        The path of the file and the file handle. The handle is ``None``
        when ``obj`` was a path, because no open handle was supplied; when
        ``obj`` was a handle it is returned unchanged and the path is taken
        from its ``filename`` attribute.
    """
    if isinstance(obj, (str, Path)):
        # Only a location was supplied: there is no handle to pass on.
        return Path(obj), None
    return Path(obj.filename), obj
85+
86+
87+
def open_path_or_handle(obj: str | Path | h5py.File) -> ContextManager[h5py.File]:
    """
    Return a context manager for reading a file, given a path or handle.

    The input is split into a path and an optional handle, and the work is
    delegated to ``HandleProvider.open_file``. The result is meant to be
    used in a ``with`` statement::

        with open_path_or_handle(obj) as handle:
            ...

    Parameters
    ----------
    obj : str, Path or h5py.File
        A path to a file or a file handle object.

    Returns
    -------
    ContextManager[h5py.File]
        A context manager that yields the file handle when entered. Note
        that this is not the file handle itself.
    """
    filename, handle = split_path_or_handle(obj)
    return HandleProvider(filename, handle).open_file()

swiftsimio/_handle_provider.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Provide a mixin class for managing file handles."""
22

3+
from contextlib import contextmanager
4+
from typing import ContextManager
35
from pathlib import Path
46
import h5py
57

@@ -58,3 +60,23 @@ def _close_handle_if_manager(self) -> None:
5860
"""Close the file handle if this object is the manager of the handle."""
5961
if self.handle_manager:
6062
self._handle.close()
63+
64+
@contextmanager
def open_file(self) -> ContextManager[h5py.File]:
    """
    Return a context manager that can be used to read the file.

    If this object already holds an open handle, that handle is yielded
    as-is and nothing is closed on exit. Otherwise we assume a local HDF5
    file read with h5py: a temporary read-only handle is opened on
    ``self.filename`` and closed again when the ``with`` block ends.

    Returns
    -------
    ContextManager
        A context manager which can be used to read the file.
    """
    if not self._handle:
        # No usable handle: open one just for the duration of the block.
        with h5py.File(self.filename, "r") as temporary_handle:
            yield temporary_handle
        return
    yield self._handle

0 commit comments

Comments
 (0)