diff --git a/.github/dependabot.yml b/.github/dependabot.yml index ea9349e302..6c3cd5c0e0 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -11,6 +11,6 @@ updates: interval: "weekly" - package-ecosystem: docker - directory: /heat/core/tests + directory: /tests schedule: interval: "weekly" diff --git a/.github/rd-release-config.yml b/.github/rd-release-config.yml index a45fa74a14..59953506f1 100644 --- a/.github/rd-release-config.yml +++ b/.github/rd-release-config.yml @@ -116,130 +116,130 @@ autolabeler: - '/Support.+/' - label: 'classification' files: - - 'heat/classification/**/*' + - 'src/heat/classification/**/*' - label: 'cluster' files: - - 'heat/cluster/**/*' + - 'src/heat/cluster/**/*' - label: 'core' files: - - 'heat/core/**/*' + - 'src/heat/core/**/*' - label: 'datasets' files: - - 'heat/datasets/**/*' + - 'src/heat/datasets/**/*' - label: 'decomposition' files: - - 'heat/decomposition/**/*' + - 'src/heat/decomposition/**/*' - label: 'fft' files: - - 'heat/fft/**/*' + - 'src/heat/fft/**/*' - label: 'graph' files: - - 'heat/graph/**/*' + - 'src/heat/graph/**/*' - label: 'naive bayes' files: - - 'heat/naive_bayes/**/*' + - 'src/heat/naive_bayes/**/*' - label: 'nn' files: - - 'heat/nn/**/*' + - 'src/heat/nn/**/*' - label: 'optim' files: - - 'heat/optim/**/*' + - 'src/heat/optim/**/*' - label: 'preprocessing' files: - - 'heat/preprocessing/**/*' + - 'src/heat/preprocessing/**/*' - label: 'regression' files: - - 'heat/regression/**/*' + - 'src/heat/regression/**/*' - label: 'sparse' files: - - 'heat/sparse/**/*' + - 'src/heat/sparse/**/*' - label: 'spatial' files: - - 'heat/spatial/**/*' + - 'src/heat/spatial/**/*' - label: 'utils' files: - - 'heat/utils/**/*' + - 'src/heat/utils/**/*' - label: 'linalg' files: - - 'heat/core/linalg/**/*' + - 'src/heat/core/linalg/**/*' - label: 'arithmetics' files: - - 'heat/core/arithmetics.py' + - 'src/heat/core/arithmetics.py' - label: 'base' files: - - 'heat/core/base.py' + - 'src/heat/core/base.py' - label: 'communication' files: - - 'heat/core/communication.py' + - 'src/heat/core/communication.py' - label: 'complex_math' files: - - 'heat/core/complex_math.py' + - 'src/heat/core/complex_math.py' - label: 'constants' files: - - 'heat/core/constants.py' + - 'src/heat/core/constants.py' - label: 'devices' files: - - 'heat/core/devices.py' + - 'src/heat/core/devices.py' - label: 'dndarray' files: - - 'heat/core/dndarray.py' + - 'src/heat/core/dndarray.py' - label: 'exponential' files: - - 'heat/core/exponential.py' + - 'src/heat/core/exponential.py' - label: 'indexing' files: - - 'heat/core/indexing.py' + - 'src/heat/core/indexing.py' - label: 'io' files: - - 'heat/core/io.py' + - 'src/heat/core/io.py' - label: 'logical' files: - - 'heat/core/logical.py' + - 'src/heat/core/logical.py' - label: 'manipulations' files: - - 'heat/core/manipulations.py' + - 'src/heat/core/manipulations.py' - label: 'memory' files: - - 'heat/core/memory.py' + - 'src/heat/core/memory.py' - label: 'printing' files: - - 'heat/core/printing.py' + - 'src/heat/core/printing.py' - label: 'random' files: - - 'heat/core/random.py' + - 'src/heat/core/random.py' - label: 'relational' files: - - 'heat/core/relational.py' + - 'src/heat/core/relational.py' - label: 'rounding' files: - - 'heat/core/rounding.py' + - 'src/heat/core/rounding.py' - label: 'sanitation' files: - - 'heat/core/sanitation.py' + - 'src/heat/core/sanitation.py' - label: 'signal' files: - - 'heat/core/signal.py' + - 'src/heat/core/signal.py' - label: 'statistics' files: - - 
'heat/core/statistics.py' + - 'src/heat/core/statistics.py' - label: 'stride_tricks' files: - - 'heat/core/stride_tricks.py' + - 'src/heat/core/stride_tricks.py' - label: 'tiling' files: - - 'heat/core/tiling.py' + - 'src/heat/core/tiling.py' - label: 'trigonometrics' files: - - 'heat/core/trigonometrics.py' + - 'src/heat/core/trigonometrics.py' - label: 'types' files: - - 'heat/core/types.py' + - 'src/heat/core/types.py' - label: 'version' files: - - 'heat/core/version.py' + - 'src/heat/core/version.py' - label: 'vmap' files: - - 'heat/core/vmap.py' + - 'src/heat/core/vmap.py' change-template: '- #$NUMBER $TITLE (by @$AUTHOR)' category-template: '### $TITLE' diff --git a/.github/workflows/ReceivePR.yml b/.github/workflows/ReceivePR.yml index 8d8839a56d..074ccb90e3 100644 --- a/.github/workflows/ReceivePR.yml +++ b/.github/workflows/ReceivePR.yml @@ -4,7 +4,7 @@ on: pull_request: types: [opened, synchronize, reopened, ready_for_review] paths: - - 'heat/**' + - 'src/heat/**' jobs: build: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c9f7978ceb..f716aed70f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -68,5 +68,5 @@ jobs: pip install pytest pip install ${{ matrix.pytorch-version }} --extra-index-url https://download.pytorch.org/whl/cpu pip install ${{ matrix.install-options }} - mpirun -n 3 pytest heat/ - mpirun -n 4 pytest heat/ + mpirun -n 3 pytest + mpirun -n 4 pytest diff --git a/.github/workflows/release-prep.yml b/.github/workflows/release-prep.yml index 83304025a8..16453928d8 100644 --- a/.github/workflows/release-prep.yml +++ b/.github/workflows/release-prep.yml @@ -92,10 +92,10 @@ jobs: ## ----- END Workflow to update Dockerfile Images ------- # Write on to the version.py file - sed -i "s/major: int = \([0-9]\+\)/major: int = $MAJOR/g" heat/core/version.py - sed -i "s/minor: int = \([0-9]\+\)/minor: int = $MINOR/g" heat/core/version.py - sed -i "s/micro: int = \([0-9]\+\)/micro: int = $MICRO/g" heat/core/version.py - sed -i "s/extension: str = .*/extension: str = None/g" heat/core/version.py + sed -i "s/major: int = \([0-9]\+\)/major: int = $MAJOR/g" src/heat/core/version.py + sed -i "s/minor: int = \([0-9]\+\)/minor: int = $MINOR/g" src/heat/core/version.py + sed -i "s/micro: int = \([0-9]\+\)/micro: int = $MICRO/g" src/heat/core/version.py + sed -i "s/extension: str = .*/extension: str = None/g" src/heat/core/version.py { echo -e "# v${MAJOR}.${MINOR}.${MICRO} - ${{github.event.inputs.title}}\n${{ steps.release_drafter.outputs.body}}\n"; cat CHANGELOG.md; } > tmp.md mv tmp.md CHANGELOG.md @@ -105,7 +105,7 @@ jobs: git config --global user.name "Heat Release Bot" # Commit the changes - git add heat/core/version.py CHANGELOG.md + git add src/heat/core/version.py CHANGELOG.md git commit -m "Bump version to $VERSION" # Commit Dockerfile changes diff --git a/doc/source/conf.py b/doc/source/conf.py index c2da12b04f..81757c765d 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -23,7 +23,7 @@ import sys # sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath("../../heat")) +sys.path.insert(0, os.path.abspath("../../src/heat")) # -- General configuration ------------------------------------------------ @@ -49,7 +49,7 @@ # Document Python Code autoapi_type = "python" -autoapi_dirs = ["../../heat/"] +autoapi_dirs = ["../../src/heat/"] autoapi_ignore = ["*/operations.py", "*/tests/*"] autoapi_template_dir = "_templates/autoapi" @@ -117,7 +117,7 @@ def setup(sphinx): # built documents. 
# # The short X.Y version. -sys.path.insert(0, "../../heat/core") +sys.path.insert(0, "../../src/heat/core") import version as ht_version version = f"{ht_version.major}.{ht_version.minor}.{ht_version.micro}" diff --git a/heat/utils/data/tests/test_distributed_data.py b/heat/utils/data/tests/test_distributed_data.py deleted file mode 100644 index 2b59d35c36..0000000000 --- a/heat/utils/data/tests/test_distributed_data.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Optional -import heat as ht -from heat.utils.data.datatools import DistributedDataset, DistributedSampler -import torch -import unittest - - -class SeedEnviroment: - """ - Class to be used in a `with` Enviroment. - Changes the torch seed to the given and then resets it to the previous one when exiting. - """ - - def __init__(self, seed: Optional[int] = None): - self.seed = seed - - def __enter__(self): - self.state = torch.random.get_rng_state() - - if self.seed is not None: - torch.random.manual_seed(self.seed) - - def __exit__(self, *args, **kwargs): - torch.random.set_rng_state(self.state) - - -class TestDistbributedData(unittest.TestCase): - def test_dataset_and_sampler(self) -> bool: - - reference = ht.arange(100, dtype=torch.int32).reshape(20, 5) - - heat_array = ht.copy(reference).resplit_(0) - dset = DistributedDataset(heat_array) - dsampler = DistributedSampler(dset, shuffle=True) - dsampler._shuffle() - - # To test this, the resulting array should be balanced, have the same number of elements as the original one, and the sum of all the columns should be the same - # And the elements should not be equal to each other. - self.assertTrue(dset.dndarray.size == reference.size) - self.assertTrue(dset.dndarray.shape == reference.shape) - self.assertTrue(dset.dndarray.balanced) - - ref_col_sum = reference.sum(0) - col_sum = dset.dndarray.sum(0) - - self.assertTrue(ht.equal(col_sum, ref_col_sum)) - self.assertFalse(ht.equal(reference, dset.dndarray)) - - def test_batches(self) -> bool: - reference = ht.array( - [ - [10, 11, 12, 13, 14], - [20, 21, 22, 23, 24], - [15, 16, 17, 18, 19], - [0, 1, 2, 3, 4], - [5, 6, 7, 8, 9], - ], - split=0, - dtype=ht.int32, - ) - - with SeedEnviroment(): - arr = ht.arange(25, dtype=ht.int32, split=0).reshape(5, 5) - dset = DistributedDataset(arr) - dsampler = DistributedSampler(dset, shuffle=True, seed=42) - - dataloader = torch.utils.data.DataLoader( - dset, batch_size=1, shuffle=False, sampler=dsampler - ) - - for batch in dataloader: - found = False - for larray in reference.larray: - if not torch.isclose(batch, larray).all(): - continue - found = True - break - self.assertTrue(found) - - def test_dataset_exceptions(self) -> bool: - with self.assertRaises(TypeError): - DistributedDataset("") - with self.assertRaises(ValueError): - DistributedDataset(ht.zeros(2, split=1)) - - def test_data_sampler_exceptions(self) -> bool: - with self.assertRaises(TypeError): - DistributedSampler(ht.zeros(10)) - with self.assertRaises(TypeError): - DistributedSampler(DistributedDataset(ht.zeros(2, split=0)), shuffle="") - with self.assertRaises(TypeError): - DistributedSampler(DistributedDataset(ht.zeros(2, split=0)), shuffle=True, seed="") diff --git a/heat/utils/data/tests/test_matrixgallery.py b/heat/utils/data/tests/test_matrixgallery.py deleted file mode 100644 index 17390cb013..0000000000 --- a/heat/utils/data/tests/test_matrixgallery.py +++ /dev/null @@ -1,118 +0,0 @@ -import heat as ht -import unittest -import torch -from heat.core.tests.test_suites.basic_test import TestCase - - -class 
TestMatrixgallery(TestCase): - def __check_parter(self, parter): - self.assertEqual(parter.shape, (20, 20)) - # TODO: check for singular values of the parter matrix - - def __check_orthogonality(self, U): - U_orth_err = ( - ht.norm(U.T @ U - ht.eye(U.shape[1], dtype=U.dtype, split=U.T.split, device=U.device)) - / U.shape[1] ** 0.5 - ) - if U.dtype == ht.float64: - dtype_tol = 1e-12 - if U.dtype == ht.float32: - dtype_tol = 1e-6 - self.assertTrue(U_orth_err <= dtype_tol) - - def test_hermitian(self): - with self.assertRaises(ValueError): - ht.utils.data.matrixgallery.hermitian(10, 20) - with self.assertRaises(ValueError): - ht.utils.data.matrixgallery.hermitian(20, split=0, dtype=ht.int32) - - # test default: complex single precision, not positive definite - A = ht.utils.data.matrixgallery.hermitian(20, split=1) - A_err = ht.norm(A - A.T.conj().resplit_(A.split)) / ht.norm(A) - self.assertTrue(A_err <= 1e-6) - - for posdef in [True, False]: - if not self.is_mps: - # test complex double precision - A = ht.utils.data.matrixgallery.hermitian( - 20, dtype=ht.complex128, split=0, positive_definite=posdef - ) - A_err = ht.norm(A - A.T.conj().resplit_(A.split)) / ht.norm(A) - self.assertTrue(A.dtype == ht.complex128) - self.assertTrue(A_err <= 1e-12) - - # test real datatype - A = ht.utils.data.matrixgallery.hermitian( - 20, dtype=ht.float32, split=0, positive_definite=posdef - ) - A_err = ht.norm(A - A.T.conj().resplit_(A.split)) / ht.norm(A) - self.assertTrue(A_err <= 1e-6) - self.assertTrue(A.dtype == ht.float32) - - def test_parter(self): - parter = ht.utils.data.matrixgallery.parter(20) - self.__check_parter(parter) - - parters0 = ht.utils.data.matrixgallery.parter(20, split=0, comm=ht.MPI_WORLD) - self.__check_parter(parters0) - - parters1 = ht.utils.data.matrixgallery.parter(20, split=1, comm=ht.MPI_WORLD) - self.__check_parter(parters1) - - with self.assertRaises(ValueError): - ht.utils.data.matrixgallery.parter(20, split=2, comm=ht.MPI_WORLD) - - def test_random_orthogonal(self): - with self.assertRaises(RuntimeError): - ht.utils.data.matrixgallery.random_orthogonal(10, 20) - - Q = ht.utils.data.matrixgallery.random_orthogonal(20, 15) - # Q_orth_err = ht.norm( - # Q.T @ Q - # - ht.eye(Q.shape[1], dtype=Q.dtype, split=Q.T.split, device=Q.device) - # ) - # self.assertTrue(Q_orth_err <= 1e-6) - self.__check_orthogonality(Q) - - def test_random_known_singularvalues(self): - with self.assertRaises(RuntimeError): - ht.utils.data.matrixgallery.random_known_singularvalues(30, 20, "abc", split=1) - with self.assertRaises(RuntimeError): - ht.utils.data.matrixgallery.random_known_singularvalues(30, 20, ht.eye(20), split=1) - with self.assertRaises(RuntimeError): - ht.utils.data.matrixgallery.random_known_singularvalues(30, 20, ht.ones(50), split=1) - - svals_input = ht.ones(15) - A, SVD = ht.utils.data.matrixgallery.random_known_singularvalues( - 30, 20, svals_input, split=1 - ) - U = SVD[0] - S = SVD[1] - V = SVD[2] - if A.dtype == ht.float64: - dtype_tol = 1e-12 - if A.dtype == ht.float32: - dtype_tol = 1e-6 - self.__check_orthogonality(U) - self.__check_orthogonality(V) - self.assertTrue(ht.allclose(S, svals_input, rtol=dtype_tol)) - A_err = ht.norm(A - U @ ht.diag(S) @ V.T) / ht.norm(A) - self.assertTrue(A_err <= dtype_tol) - - def test_random_known_rank(self): - with self.assertRaises(RuntimeError): - ht.utils.data.matrixgallery.random_known_rank(30, 20, 25, split=1) - rkinput = 15 - A, SVD = ht.utils.data.matrixgallery.random_known_rank(30, 20, rkinput, split=1) - U = SVD[0] - S = SVD[1] - V = 
SVD[2] - if A.dtype == ht.float64: - dtype_tol = 1e-12 - if A.dtype == ht.float32: - dtype_tol = 1e-6 - self.__check_orthogonality(U) - self.__check_orthogonality(V) - self.assertTrue(S.shape[0] == rkinput) - A_err = ht.norm(A - U @ ht.diag(S) @ V.T) / ht.norm(A) - self.assertTrue(A_err <= dtype_tol) diff --git a/heat/utils/data/tests/test_partial_dataset.py b/heat/utils/data/tests/test_partial_dataset.py deleted file mode 100644 index 49b9f4d3b3..0000000000 --- a/heat/utils/data/tests/test_partial_dataset.py +++ /dev/null @@ -1,77 +0,0 @@ -import heat as ht -import torch -import unittest - - -@unittest.skipIf(torch.cuda.is_available() and torch.version.hip, "not supported for HIP") -class TestPartialDataset(unittest.TestCase): - @unittest.skipUnless(ht.supports_hdf5(), "Requires HDF5") - def test_partial_h5_dataset(self): - # load h5 data and get the total shape - full_data = ht.load("heat/datasets/iris.h5", dataset="data", split=None) - target_shape = full_data.shape - - class TestDataset(ht.utils.data.partial_dataset.PartialH5Dataset): - def __init__(self, file, comm, load, load_len, use_gpus=False): - super(TestDataset, self).__init__( - file, comm=comm, initial_load=load, load_length=load_len, use_gpu=use_gpus - ) - - def __getitem__(self, item): - return self.data[item] - - partial_dset = TestDataset("heat/datasets/iris.h5", full_data.comm, 30, 20) - dl = ht.utils.data.DataLoader(dataset=partial_dset, batch_size=7) - first_epoch = None - second_epoch = None - for epoch in range(2): - elems = 0 - last_batch = None - for batch in dl: - elems += batch.shape[0] - if last_batch is not None: - self.assertFalse(torch.allclose(last_batch, batch)) - self.assertEqual(batch.shape, (7, 4)) - last_batch = batch - if epoch == 0: - if first_epoch is None: - first_epoch = batch - else: - first_epoch = torch.cat((first_epoch, batch), dim=0) - else: - if second_epoch is None: - second_epoch = batch - else: - second_epoch = torch.cat((second_epoch, batch), dim=0) - self.assertTrue(elems >= (target_shape[0] - 7) // full_data.comm.size) - self.assertFalse(torch.allclose(first_epoch, second_epoch)) - - partial_dset = TestDataset("heat/datasets/iris.h5", full_data.comm, 30, 20, True) - dl = ht.utils.data.DataLoader( - dataset=partial_dset, - batch_size=7, - pin_memory=True if torch.cuda.is_available() else False, - ) - first_epoch = None - second_epoch = None - for epoch in range(2): - elems = 0 - last_batch = None - for batch in dl: - elems += batch.shape[0] - if last_batch is not None: - self.assertFalse(torch.allclose(last_batch, batch)) - self.assertEqual(batch.shape, (7, 4)) - last_batch = batch - if epoch == 0: - if first_epoch is None: - first_epoch = batch - else: - first_epoch = torch.cat((first_epoch, batch), dim=0) - else: - if second_epoch is None: - second_epoch = batch - else: - second_epoch = torch.cat((second_epoch, batch), dim=0) - self.assertTrue(elems >= (target_shape[0] - 7) // full_data.comm.size) - self.assertFalse(torch.allclose(first_epoch, second_epoch)) diff --git a/heat/utils/data/tests/test_spherical.py b/heat/utils/data/tests/test_spherical.py deleted file mode 100644 index 7850065969..0000000000 --- a/heat/utils/data/tests/test_spherical.py +++ /dev/null @@ -1,84 +0,0 @@ -import heat as ht -import unittest -import torch -from heat.core.tests.test_suites.basic_test import TestCase - - -class TestCreateClusters(TestCase): - def test_create_cluster(self): - n_samples = ht.MPI_WORLD.size * 10 + 3 - n_features = 3 - n_clusters = ht.MPI_WORLD.size - cluster_mean = 
torch.arange(n_clusters, dtype=torch.float32).repeat(n_features, 1).T - - # test case with uneven distribution of clusters over processes and variances given as vector - cluster_weight = torch.zeros(n_clusters) - cluster_weight[ht.MPI_WORLD.rank] += 0.5 - cluster_weight[0] += 0.5 - cluster_std = 0.01 * torch.ones(n_clusters) - data = ht.utils.data.spherical.create_clusters( - n_samples, n_features, n_clusters, cluster_mean, cluster_std, cluster_weight - ) - self.assertEqual(data.shape, (n_samples, n_features)) - self.assertEqual(data.dtype, ht.float32) - - # test case with even distribution of clusters over processes and variances given as matrix - cluster_weight = None - cluster_std = 0.01 * torch.rand(n_clusters, n_features, n_features) - cluster_std = torch.transpose(cluster_std, 1, 2) @ cluster_std - data = ht.utils.data.spherical.create_clusters( - n_samples, n_features, n_clusters, cluster_mean, cluster_std, cluster_weight - ) - self.assertEqual(data.shape, (n_samples, n_features)) - self.assertEqual(data.dtype, ht.float32) - - def test_if_errors_are_catched(self): - n_samples = ht.MPI_WORLD.size * 10 + 3 - n_features = 3 - n_clusters = ht.MPI_WORLD.size - cluster_mean = torch.arange(n_clusters, dtype=torch.float32).repeat(n_features, 1).T - cluster_std = 0.01 * torch.ones(n_clusters) - - with self.assertRaises(TypeError): - ht.utils.data.spherical.create_clusters( - n_samples, n_features, n_clusters, "abc", cluster_std - ) - with self.assertRaises(ValueError): - ht.utils.data.spherical.create_clusters( - n_samples, n_features, n_clusters, torch.zeros(2, 2), cluster_std - ) - with self.assertRaises(TypeError): - ht.utils.data.spherical.create_clusters( - n_samples, n_features, n_clusters, cluster_mean, "abc" - ) - with self.assertRaises(ValueError): - ht.utils.data.spherical.create_clusters( - n_samples, n_features, n_clusters, cluster_mean, torch.zeros(2, 2) - ) - with self.assertRaises(TypeError): - ht.utils.data.spherical.create_clusters( - n_samples, n_features, n_clusters, cluster_mean, cluster_std, "abc" - ) - with self.assertRaises(ValueError): - ht.utils.data.spherical.create_clusters( - n_samples, - n_features, - n_clusters, - cluster_mean, - cluster_std, - torch.ones( - n_clusters + 1, - ), - ) - with self.assertRaises(ValueError): - ht.utils.data.spherical.create_clusters( - n_samples, - n_features, - n_clusters, - cluster_mean, - cluster_std, - 2 - * torch.ones( - n_clusters, - ), - ) diff --git a/heat/utils/tests/__init__.py b/heat/utils/tests/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/pyproject.toml b/pyproject.toml index 5168e89f48..0bf4b877f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,3 @@ -[build-system] -requires = ["setuptools"] -build-backend = "setuptools.build_meta" - [project] name="heat" dynamic = ["version"] @@ -101,14 +97,17 @@ Repository = "https://github.com/helmholtz-analytics/heat" Issues = "https://github.com/helmholtz-analytics/heat/issues" Changelog = "https://github.com/helmholtz-analytics/heat/blob/main/CHANGELOG.md" +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + [tool.setuptools.packages.find] -where = ["."] +where = ["src"] include = ["heat", "heat.*"] exclude = ["*tests*", "*benchmarks*"] - [tool.setuptools.package-data] -datasets = ["*.csv", "*.h5", "*.nc"] +"heat.datasets" = ["*.csv", "*.h5", "*.nc"] heat = ["py.typed"] [tool.setuptools.dynamic] diff --git a/heat/__init__.py b/src/heat/__init__.py similarity index 93% rename from heat/__init__.py rename 
to src/heat/__init__.py index 84c4afc11b..c99086277f 100644 --- a/heat/__init__.py +++ b/src/heat/__init__.py @@ -9,6 +9,7 @@ from . import core from . import classification from . import cluster +from . import decomposition from . import fft from . import graph from . import naive_bayes diff --git a/heat/classification/__init__.py b/src/heat/classification/__init__.py similarity index 100% rename from heat/classification/__init__.py rename to src/heat/classification/__init__.py diff --git a/heat/classification/kneighborsclassifier.py b/src/heat/classification/kneighborsclassifier.py similarity index 100% rename from heat/classification/kneighborsclassifier.py rename to src/heat/classification/kneighborsclassifier.py diff --git a/heat/cli.py b/src/heat/cli.py similarity index 100% rename from heat/cli.py rename to src/heat/cli.py diff --git a/heat/cluster/__init__.py b/src/heat/cluster/__init__.py similarity index 100% rename from heat/cluster/__init__.py rename to src/heat/cluster/__init__.py diff --git a/heat/cluster/_kcluster.py b/src/heat/cluster/_kcluster.py similarity index 100% rename from heat/cluster/_kcluster.py rename to src/heat/cluster/_kcluster.py diff --git a/heat/cluster/batchparallelclustering.py b/src/heat/cluster/batchparallelclustering.py similarity index 100% rename from heat/cluster/batchparallelclustering.py rename to src/heat/cluster/batchparallelclustering.py diff --git a/heat/cluster/kmeans.py b/src/heat/cluster/kmeans.py similarity index 100% rename from heat/cluster/kmeans.py rename to src/heat/cluster/kmeans.py diff --git a/heat/cluster/kmedians.py b/src/heat/cluster/kmedians.py similarity index 100% rename from heat/cluster/kmedians.py rename to src/heat/cluster/kmedians.py diff --git a/heat/cluster/kmedoids.py b/src/heat/cluster/kmedoids.py similarity index 100% rename from heat/cluster/kmedoids.py rename to src/heat/cluster/kmedoids.py diff --git a/heat/cluster/spectral.py b/src/heat/cluster/spectral.py similarity index 100% rename from heat/cluster/spectral.py rename to src/heat/cluster/spectral.py diff --git a/heat/core/__init__.py b/src/heat/core/__init__.py similarity index 100% rename from heat/core/__init__.py rename to src/heat/core/__init__.py diff --git a/heat/core/_config.py b/src/heat/core/_config.py similarity index 100% rename from heat/core/_config.py rename to src/heat/core/_config.py diff --git a/heat/core/_operations.py b/src/heat/core/_operations.py similarity index 100% rename from heat/core/_operations.py rename to src/heat/core/_operations.py diff --git a/heat/core/arithmetics.py b/src/heat/core/arithmetics.py similarity index 100% rename from heat/core/arithmetics.py rename to src/heat/core/arithmetics.py diff --git a/heat/core/base.py b/src/heat/core/base.py similarity index 100% rename from heat/core/base.py rename to src/heat/core/base.py diff --git a/heat/core/communication.py b/src/heat/core/communication.py similarity index 100% rename from heat/core/communication.py rename to src/heat/core/communication.py diff --git a/heat/core/complex_math.py b/src/heat/core/complex_math.py similarity index 100% rename from heat/core/complex_math.py rename to src/heat/core/complex_math.py diff --git a/heat/core/constants.py b/src/heat/core/constants.py similarity index 100% rename from heat/core/constants.py rename to src/heat/core/constants.py diff --git a/heat/core/devices.py b/src/heat/core/devices.py similarity index 100% rename from heat/core/devices.py rename to src/heat/core/devices.py diff --git a/heat/core/dndarray.py 
b/src/heat/core/dndarray.py similarity index 100% rename from heat/core/dndarray.py rename to src/heat/core/dndarray.py diff --git a/heat/core/exponential.py b/src/heat/core/exponential.py similarity index 100% rename from heat/core/exponential.py rename to src/heat/core/exponential.py diff --git a/heat/core/factories.py b/src/heat/core/factories.py similarity index 100% rename from heat/core/factories.py rename to src/heat/core/factories.py diff --git a/heat/core/indexing.py b/src/heat/core/indexing.py similarity index 100% rename from heat/core/indexing.py rename to src/heat/core/indexing.py diff --git a/heat/core/io.py b/src/heat/core/io.py similarity index 100% rename from heat/core/io.py rename to src/heat/core/io.py diff --git a/heat/core/linalg/__init__.py b/src/heat/core/linalg/__init__.py similarity index 100% rename from heat/core/linalg/__init__.py rename to src/heat/core/linalg/__init__.py diff --git a/heat/core/linalg/basics.py b/src/heat/core/linalg/basics.py similarity index 100% rename from heat/core/linalg/basics.py rename to src/heat/core/linalg/basics.py diff --git a/heat/core/linalg/eigh.py b/src/heat/core/linalg/eigh.py similarity index 100% rename from heat/core/linalg/eigh.py rename to src/heat/core/linalg/eigh.py diff --git a/heat/core/linalg/polar.py b/src/heat/core/linalg/polar.py similarity index 100% rename from heat/core/linalg/polar.py rename to src/heat/core/linalg/polar.py diff --git a/heat/core/linalg/qr.py b/src/heat/core/linalg/qr.py similarity index 100% rename from heat/core/linalg/qr.py rename to src/heat/core/linalg/qr.py diff --git a/heat/core/linalg/solver.py b/src/heat/core/linalg/solver.py similarity index 100% rename from heat/core/linalg/solver.py rename to src/heat/core/linalg/solver.py diff --git a/heat/core/linalg/svd.py b/src/heat/core/linalg/svd.py similarity index 100% rename from heat/core/linalg/svd.py rename to src/heat/core/linalg/svd.py diff --git a/heat/core/linalg/svdtools.py b/src/heat/core/linalg/svdtools.py similarity index 100% rename from heat/core/linalg/svdtools.py rename to src/heat/core/linalg/svdtools.py diff --git a/heat/core/logical.py b/src/heat/core/logical.py similarity index 100% rename from heat/core/logical.py rename to src/heat/core/logical.py diff --git a/heat/core/manipulations.py b/src/heat/core/manipulations.py similarity index 100% rename from heat/core/manipulations.py rename to src/heat/core/manipulations.py diff --git a/heat/core/memory.py b/src/heat/core/memory.py similarity index 100% rename from heat/core/memory.py rename to src/heat/core/memory.py diff --git a/heat/core/printing.py b/src/heat/core/printing.py similarity index 100% rename from heat/core/printing.py rename to src/heat/core/printing.py diff --git a/heat/core/random.py b/src/heat/core/random.py similarity index 100% rename from heat/core/random.py rename to src/heat/core/random.py diff --git a/heat/core/relational.py b/src/heat/core/relational.py similarity index 100% rename from heat/core/relational.py rename to src/heat/core/relational.py diff --git a/heat/core/rounding.py b/src/heat/core/rounding.py similarity index 100% rename from heat/core/rounding.py rename to src/heat/core/rounding.py diff --git a/heat/core/sanitation.py b/src/heat/core/sanitation.py similarity index 100% rename from heat/core/sanitation.py rename to src/heat/core/sanitation.py diff --git a/heat/core/signal.py b/src/heat/core/signal.py similarity index 100% rename from heat/core/signal.py rename to src/heat/core/signal.py diff --git a/heat/core/statistics.py 
b/src/heat/core/statistics.py similarity index 100% rename from heat/core/statistics.py rename to src/heat/core/statistics.py diff --git a/heat/core/stride_tricks.py b/src/heat/core/stride_tricks.py similarity index 100% rename from heat/core/stride_tricks.py rename to src/heat/core/stride_tricks.py diff --git a/heat/core/tiling.py b/src/heat/core/tiling.py similarity index 100% rename from heat/core/tiling.py rename to src/heat/core/tiling.py diff --git a/heat/core/trigonometrics.py b/src/heat/core/trigonometrics.py similarity index 100% rename from heat/core/trigonometrics.py rename to src/heat/core/trigonometrics.py diff --git a/heat/core/types.py b/src/heat/core/types.py similarity index 100% rename from heat/core/types.py rename to src/heat/core/types.py diff --git a/heat/core/version.py b/src/heat/core/version.py similarity index 100% rename from heat/core/version.py rename to src/heat/core/version.py diff --git a/heat/core/vmap.py b/src/heat/core/vmap.py similarity index 100% rename from heat/core/vmap.py rename to src/heat/core/vmap.py diff --git a/heat/datasets/__init__.py b/src/heat/datasets/__init__.py similarity index 100% rename from heat/datasets/__init__.py rename to src/heat/datasets/__init__.py diff --git a/heat/datasets/diabetes.h5 b/src/heat/datasets/diabetes.h5 similarity index 100% rename from heat/datasets/diabetes.h5 rename to src/heat/datasets/diabetes.h5 diff --git a/heat/datasets/iris.csv b/src/heat/datasets/iris.csv similarity index 100% rename from heat/datasets/iris.csv rename to src/heat/datasets/iris.csv diff --git a/heat/datasets/iris.h5 b/src/heat/datasets/iris.h5 similarity index 100% rename from heat/datasets/iris.h5 rename to src/heat/datasets/iris.h5 diff --git a/heat/datasets/iris.nc b/src/heat/datasets/iris.nc similarity index 100% rename from heat/datasets/iris.nc rename to src/heat/datasets/iris.nc diff --git a/heat/datasets/iris_X_test.csv b/src/heat/datasets/iris_X_test.csv similarity index 100% rename from heat/datasets/iris_X_test.csv rename to src/heat/datasets/iris_X_test.csv diff --git a/heat/datasets/iris_X_train.csv b/src/heat/datasets/iris_X_train.csv similarity index 100% rename from heat/datasets/iris_X_train.csv rename to src/heat/datasets/iris_X_train.csv diff --git a/heat/datasets/iris_labels.csv b/src/heat/datasets/iris_labels.csv similarity index 100% rename from heat/datasets/iris_labels.csv rename to src/heat/datasets/iris_labels.csv diff --git a/heat/datasets/iris_y_pred_proba.csv b/src/heat/datasets/iris_y_pred_proba.csv similarity index 100% rename from heat/datasets/iris_y_pred_proba.csv rename to src/heat/datasets/iris_y_pred_proba.csv diff --git a/heat/datasets/iris_y_test.csv b/src/heat/datasets/iris_y_test.csv similarity index 100% rename from heat/datasets/iris_y_test.csv rename to src/heat/datasets/iris_y_test.csv diff --git a/heat/datasets/iris_y_train.csv b/src/heat/datasets/iris_y_train.csv similarity index 100% rename from heat/datasets/iris_y_train.csv rename to src/heat/datasets/iris_y_train.csv diff --git a/heat/decomposition/__init__.py b/src/heat/decomposition/__init__.py similarity index 100% rename from heat/decomposition/__init__.py rename to src/heat/decomposition/__init__.py diff --git a/heat/decomposition/dmd.py b/src/heat/decomposition/dmd.py similarity index 100% rename from heat/decomposition/dmd.py rename to src/heat/decomposition/dmd.py diff --git a/heat/decomposition/pca.py b/src/heat/decomposition/pca.py similarity index 100% rename from heat/decomposition/pca.py rename to 
src/heat/decomposition/pca.py diff --git a/heat/fft/__init__.py b/src/heat/fft/__init__.py similarity index 100% rename from heat/fft/__init__.py rename to src/heat/fft/__init__.py diff --git a/heat/fft/fft.py b/src/heat/fft/fft.py similarity index 100% rename from heat/fft/fft.py rename to src/heat/fft/fft.py diff --git a/heat/graph/__init__.py b/src/heat/graph/__init__.py similarity index 100% rename from heat/graph/__init__.py rename to src/heat/graph/__init__.py diff --git a/heat/graph/laplacian.py b/src/heat/graph/laplacian.py similarity index 100% rename from heat/graph/laplacian.py rename to src/heat/graph/laplacian.py diff --git a/heat/naive_bayes/.DS_Store b/src/heat/naive_bayes/.DS_Store similarity index 100% rename from heat/naive_bayes/.DS_Store rename to src/heat/naive_bayes/.DS_Store diff --git a/heat/naive_bayes/__init__.py b/src/heat/naive_bayes/__init__.py similarity index 100% rename from heat/naive_bayes/__init__.py rename to src/heat/naive_bayes/__init__.py diff --git a/heat/naive_bayes/gaussianNB.py b/src/heat/naive_bayes/gaussianNB.py similarity index 100% rename from heat/naive_bayes/gaussianNB.py rename to src/heat/naive_bayes/gaussianNB.py diff --git a/heat/nn/__init__.py b/src/heat/nn/__init__.py similarity index 100% rename from heat/nn/__init__.py rename to src/heat/nn/__init__.py diff --git a/heat/nn/data_parallel.py b/src/heat/nn/data_parallel.py similarity index 100% rename from heat/nn/data_parallel.py rename to src/heat/nn/data_parallel.py diff --git a/heat/nn/functional.py b/src/heat/nn/functional.py similarity index 100% rename from heat/nn/functional.py rename to src/heat/nn/functional.py diff --git a/heat/optim/__init__.py b/src/heat/optim/__init__.py similarity index 100% rename from heat/optim/__init__.py rename to src/heat/optim/__init__.py diff --git a/heat/optim/dp_optimizer.py b/src/heat/optim/dp_optimizer.py similarity index 100% rename from heat/optim/dp_optimizer.py rename to src/heat/optim/dp_optimizer.py diff --git a/heat/optim/lr_scheduler.py b/src/heat/optim/lr_scheduler.py similarity index 100% rename from heat/optim/lr_scheduler.py rename to src/heat/optim/lr_scheduler.py diff --git a/heat/optim/utils.py b/src/heat/optim/utils.py similarity index 100% rename from heat/optim/utils.py rename to src/heat/optim/utils.py diff --git a/heat/preprocessing/__init__.py b/src/heat/preprocessing/__init__.py similarity index 100% rename from heat/preprocessing/__init__.py rename to src/heat/preprocessing/__init__.py diff --git a/heat/preprocessing/preprocessing.py b/src/heat/preprocessing/preprocessing.py similarity index 100% rename from heat/preprocessing/preprocessing.py rename to src/heat/preprocessing/preprocessing.py diff --git a/heat/py.typed b/src/heat/py.typed similarity index 100% rename from heat/py.typed rename to src/heat/py.typed diff --git a/heat/regression/__init__.py b/src/heat/regression/__init__.py similarity index 100% rename from heat/regression/__init__.py rename to src/heat/regression/__init__.py diff --git a/heat/regression/lasso.py b/src/heat/regression/lasso.py similarity index 100% rename from heat/regression/lasso.py rename to src/heat/regression/lasso.py diff --git a/heat/sparse/__init__.py b/src/heat/sparse/__init__.py similarity index 100% rename from heat/sparse/__init__.py rename to src/heat/sparse/__init__.py diff --git a/heat/sparse/_operations.py b/src/heat/sparse/_operations.py similarity index 100% rename from heat/sparse/_operations.py rename to src/heat/sparse/_operations.py diff --git 
a/heat/sparse/arithmetics.py b/src/heat/sparse/arithmetics.py similarity index 100% rename from heat/sparse/arithmetics.py rename to src/heat/sparse/arithmetics.py diff --git a/heat/sparse/dcsx_matrix.py b/src/heat/sparse/dcsx_matrix.py similarity index 100% rename from heat/sparse/dcsx_matrix.py rename to src/heat/sparse/dcsx_matrix.py diff --git a/heat/sparse/factories.py b/src/heat/sparse/factories.py similarity index 100% rename from heat/sparse/factories.py rename to src/heat/sparse/factories.py diff --git a/heat/sparse/manipulations.py b/src/heat/sparse/manipulations.py similarity index 100% rename from heat/sparse/manipulations.py rename to src/heat/sparse/manipulations.py diff --git a/heat/spatial/__init__.py b/src/heat/spatial/__init__.py similarity index 100% rename from heat/spatial/__init__.py rename to src/heat/spatial/__init__.py diff --git a/heat/spatial/distance.py b/src/heat/spatial/distance.py similarity index 100% rename from heat/spatial/distance.py rename to src/heat/spatial/distance.py diff --git a/heat/utils/__init__.py b/src/heat/utils/__init__.py similarity index 100% rename from heat/utils/__init__.py rename to src/heat/utils/__init__.py diff --git a/heat/utils/data/__init__.py b/src/heat/utils/data/__init__.py similarity index 100% rename from heat/utils/data/__init__.py rename to src/heat/utils/data/__init__.py diff --git a/heat/utils/data/_utils.py b/src/heat/utils/data/_utils.py similarity index 100% rename from heat/utils/data/_utils.py rename to src/heat/utils/data/_utils.py diff --git a/heat/utils/data/datatools.py b/src/heat/utils/data/datatools.py similarity index 100% rename from heat/utils/data/datatools.py rename to src/heat/utils/data/datatools.py diff --git a/heat/utils/data/matrixgallery.py b/src/heat/utils/data/matrixgallery.py similarity index 100% rename from heat/utils/data/matrixgallery.py rename to src/heat/utils/data/matrixgallery.py diff --git a/heat/utils/data/mnist.py b/src/heat/utils/data/mnist.py similarity index 100% rename from heat/utils/data/mnist.py rename to src/heat/utils/data/mnist.py diff --git a/heat/utils/data/partial_dataset.py b/src/heat/utils/data/partial_dataset.py similarity index 100% rename from heat/utils/data/partial_dataset.py rename to src/heat/utils/data/partial_dataset.py diff --git a/heat/utils/data/spherical.py b/src/heat/utils/data/spherical.py similarity index 100% rename from heat/utils/data/spherical.py rename to src/heat/utils/data/spherical.py diff --git a/heat/utils/vision_transforms.py b/src/heat/utils/vision_transforms.py similarity index 100% rename from heat/utils/vision_transforms.py rename to src/heat/utils/vision_transforms.py diff --git a/heat/classification/tests/__init__.py b/tests/classification/__init__.py similarity index 100% rename from heat/classification/tests/__init__.py rename to tests/classification/__init__.py diff --git a/heat/classification/tests/test_knn.py b/tests/classification/test_knn.py similarity index 97% rename from heat/classification/tests/test_knn.py rename to tests/classification/test_knn.py index 752778062e..41a466ffa0 100644 --- a/heat/classification/tests/test_knn.py +++ b/tests/classification/test_knn.py @@ -2,7 +2,7 @@ import heat as ht from heat.classification.kneighborsclassifier import KNeighborsClassifier -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestKNN(TestCase): diff --git a/heat/cluster/tests/__init__.py b/tests/cluster/__init__.py similarity index 100% rename from 
heat/cluster/tests/__init__.py rename to tests/cluster/__init__.py diff --git a/heat/cluster/tests/test_batchparallelclustering.py b/tests/cluster/test_batchparallelclustering.py similarity index 98% rename from heat/cluster/tests/test_batchparallelclustering.py rename to tests/cluster/test_batchparallelclustering.py index 684d9d9247..9769ce1a99 100644 --- a/heat/cluster/tests/test_batchparallelclustering.py +++ b/tests/cluster/test_batchparallelclustering.py @@ -7,8 +7,8 @@ from heat.utils.data.spherical import create_spherical_dataset from mpi4py import MPI -from ...core.tests.test_suites.basic_test import TestCase -from ..batchparallelclustering import _kmex, _initialize_plus_plus, _BatchParallelKCluster +from tests.test_suites.basic_test import TestCase +from heat.cluster.batchparallelclustering import _kmex, _initialize_plus_plus, _BatchParallelKCluster # test BatchParallelKCluster base class and auxiliary functions diff --git a/heat/cluster/tests/test_kmeans.py b/tests/cluster/test_kmeans.py similarity index 92% rename from heat/cluster/tests/test_kmeans.py rename to tests/cluster/test_kmeans.py index 25eaf80518..afef8d11ad 100644 --- a/heat/cluster/tests/test_kmeans.py +++ b/tests/cluster/test_kmeans.py @@ -3,12 +3,17 @@ import numpy as np import torch import heat as ht -from heat.utils.data.spherical import create_spherical_dataset - -from ...core.tests.test_suites.basic_test import TestCase +from pathlib import Path +from heat.utils.data.spherical import create_spherical_dataset +from tests.test_suites.basic_test import TestCase class TestKMeans(TestCase): + @classmethod + def setUpClass(cls): + super(TestKMeans, cls).setUpClass() + cls.data_path = str(Path(ht.__file__).parent / "datasets" / "iris.csv") + def test_clusterer(self): kmeans = ht.cluster.KMeans() self.assertTrue(ht.is_estimator(kmeans)) @@ -31,7 +36,7 @@ def test_fit_iris_unsplit(self): oversampling=10 for split in [None, 0]: # get some test data - iris = ht.load("heat/datasets/iris.csv", sep=";", split=split) + iris = ht.load(self.data_path, sep=";", split=split) # fit the clusters k = 3 @@ -50,7 +55,7 @@ def test_fit_iris_unsplit(self): self.assertIsInstance(kmeans.cluster_centers_, ht.DNDarray) self.assertEqual(kmeans.cluster_centers_.shape, (k, iris.shape[1])) - iris = ht.load("heat/datasets/iris.csv", sep=";", split=0) + iris = ht.load(self.data_path, sep=";", split=0) # same test with init=batchparallel kmeans = ht.cluster.KMeans(n_clusters=k, init="batchparallel") kmeans.fit(iris, oversampling=oversampling) @@ -61,7 +66,7 @@ def test_fit_iris_unsplit(self): def test_exceptions(self): # get some test data - iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1) + iris_split = ht.load(self.data_path, sep=";", split=1) # build a clusterer k = 3 diff --git a/heat/cluster/tests/test_kmedians.py b/tests/cluster/test_kmedians.py similarity index 92% rename from heat/cluster/tests/test_kmedians.py rename to tests/cluster/test_kmedians.py index ee8b534e50..5a0f3873cb 100644 --- a/heat/cluster/tests/test_kmedians.py +++ b/tests/cluster/test_kmedians.py @@ -5,12 +5,18 @@ import numpy as np import torch +from pathlib import Path from heat.utils.data.spherical import create_spherical_dataset -from ...core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestKMedians(TestCase): + @classmethod + def setUpClass(cls): + super(TestKMedians, cls).setUpClass() + cls.data_path = str(Path(ht.__file__).parent / "datasets" / "iris.csv") + def test_clusterer(self): kmedian 
= ht.cluster.KMedians() self.assertTrue(ht.is_estimator(kmedian)) @@ -32,7 +38,7 @@ def test_get_and_set_params(self): def test_fit_iris_unsplit(self): split = 0 # get some test data - iris = ht.load("heat/datasets/iris.csv", sep=";", split=split) + iris = ht.load(self.data_path, sep=";", split=split) # fit the clusters k = 3 @@ -60,7 +66,7 @@ def test_fit_iris_unsplit(self): def test_exceptions(self): # get some test data - iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1) + iris_split = ht.load(self.data_path, sep=";", split=1) # build a clusterer k = 3 diff --git a/heat/cluster/tests/test_kmedoids.py b/tests/cluster/test_kmedoids.py similarity index 91% rename from heat/cluster/tests/test_kmedoids.py rename to tests/cluster/test_kmedoids.py index a1a261eca8..ef3bb1e21b 100644 --- a/heat/cluster/tests/test_kmedoids.py +++ b/tests/cluster/test_kmedoids.py @@ -1,11 +1,17 @@ import unittest import heat as ht +from pathlib import Path from heat.utils.data.spherical import create_spherical_dataset -from ...core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase -class TestKMeans(TestCase): +class TestKMedoids(TestCase): + @classmethod + def setUpClass(cls): + super(TestKMedoids, cls).setUpClass() + cls.data_path = str(Path(ht.__file__).parent / "datasets" / "iris.csv") + def test_clusterer(self): kmedoid = ht.cluster.KMedoids() self.assertTrue(ht.is_estimator(kmedoid)) @@ -26,7 +32,7 @@ def test_get_and_set_params(self): def test_fit_iris_unsplit(self): split = 0 # get some test data - iris = ht.load("heat/datasets/iris.csv", sep=";", split=split) + iris = ht.load(self.data_path, sep=";", split=split) # fit the clusters k = 3 kmedoid = ht.cluster.KMedoids(n_clusters=k, random_state=1) @@ -51,7 +57,7 @@ def test_fit_iris_unsplit(self): def test_exceptions(self): # get some test data - iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1) + iris_split = ht.load(self.data_path, sep=";", split=1) # build a clusterer k = 3 @@ -65,7 +71,7 @@ def test_exceptions(self): kmedoid = ht.cluster.KMedoids(n_clusters=k, init="random_number") kmedoid.fit(iris_split) - iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=0) + iris_split = ht.load(self.data_path, sep=";", split=0) with self.assertRaises(ValueError): kmedoid = ht.cluster.KMedoids(n_clusters=k, init="batchparallel") kmedoid.fit(iris_split) diff --git a/heat/cluster/tests/test_spectral.py b/tests/cluster/test_spectral.py similarity index 87% rename from heat/cluster/tests/test_spectral.py rename to tests/cluster/test_spectral.py index cd43433d9d..5f6c4177f0 100644 --- a/heat/cluster/tests/test_spectral.py +++ b/tests/cluster/test_spectral.py @@ -4,10 +4,16 @@ import heat as ht import torch -from ...core.tests.test_suites.basic_test import TestCase +from pathlib import Path +from tests.test_suites.basic_test import TestCase class TestSpectral(TestCase): + @classmethod + def setUpClass(cls): + super(TestSpectral, cls).setUpClass() + cls.data_path = str(Path(ht.__file__).parent / "datasets" / "iris.csv") + def test_clusterer(self): spectral = ht.cluster.Spectral() self.assertTrue(ht.is_estimator(spectral)) @@ -39,7 +45,7 @@ def test_fit_iris(self): # skip on MPS, matmul on ComplexFloat not supported as of PyTorch 2.5 if not self.is_mps: # get some test data - iris = ht.load("heat/datasets/iris.csv", sep=";", split=0) + iris = ht.load(self.data_path, sep=";", split=0) m = 10 # fit the clusters spectral = ht.cluster.Spectral( @@ -80,7 +86,7 @@ def test_fit_iris(self): with 
self.assertRaises(NotImplementedError): spectral = ht.cluster.Spectral(metric="ahalanobis", n_lanczos=m) - iris_split = ht.load("heat/datasets/iris.csv", sep=";", split=1) + iris_split = ht.load(self.data_path, sep=";", split=1) spectral = ht.cluster.Spectral(n_lanczos=20) with self.assertRaises(NotImplementedError): spectral.fit(iris_split) diff --git a/heat/core/tests/Dockerfile b/tests/core/Dockerfile similarity index 100% rename from heat/core/tests/Dockerfile rename to tests/core/Dockerfile diff --git a/heat/core/linalg/tests/__init__.py b/tests/core/__init__.py similarity index 100% rename from heat/core/linalg/tests/__init__.py rename to tests/core/__init__.py diff --git a/heat/core/tests/test_arithmetics.py b/tests/core/test_arithmetics.py similarity index 99% rename from heat/core/tests/test_arithmetics.py rename to tests/core/test_arithmetics.py index 8b8a8a902d..8b92dc53d9 100644 --- a/heat/core/tests/test_arithmetics.py +++ b/tests/core/test_arithmetics.py @@ -6,7 +6,7 @@ import numpy as np import torch -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestArithmetics(TestCase): diff --git a/heat/core/tests/test_communication.py b/tests/core/test_communication.py similarity index 99% rename from heat/core/tests/test_communication.py rename to tests/core/test_communication.py index 9ae4a95b70..ff37948f17 100644 --- a/heat/core/tests/test_communication.py +++ b/tests/core/test_communication.py @@ -6,7 +6,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase envar = os.getenv("HEAT_TEST_USE_DEVICE", "cpu") is_mps = envar == "gpu" and platform.machine() == "arm64" diff --git a/heat/core/tests/test_complex_math.py b/tests/core/test_complex_math.py similarity index 99% rename from heat/core/tests/test_complex_math.py rename to tests/core/test_complex_math.py index cc56088bce..4b679689ac 100644 --- a/heat/core/tests/test_complex_math.py +++ b/tests/core/test_complex_math.py @@ -3,7 +3,7 @@ import heat as ht import platform -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestComplex(TestCase): diff --git a/heat/core/tests/test_constants.py b/tests/core/test_constants.py similarity index 88% rename from heat/core/tests/test_constants.py rename to tests/core/test_constants.py index 31c725ae3f..5f4745a919 100644 --- a/heat/core/tests/test_constants.py +++ b/tests/core/test_constants.py @@ -1,7 +1,7 @@ import numpy as np import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestConstants(TestCase): diff --git a/heat/core/tests/test_devices.py b/tests/core/test_devices.py similarity index 98% rename from heat/core/tests/test_devices.py rename to tests/core/test_devices.py index e0ce2a758b..f9890f9182 100644 --- a/heat/core/tests/test_devices.py +++ b/tests/core/test_devices.py @@ -2,7 +2,7 @@ import unittest import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase envar = os.getenv("HEAT_TEST_USE_DEVICE", "cpu") diff --git a/heat/core/tests/test_dndarray.py b/tests/core/test_dndarray.py similarity index 99% rename from heat/core/tests/test_dndarray.py rename to tests/core/test_dndarray.py index c6123c1cf2..9652872222 100644 --- a/heat/core/tests/test_dndarray.py +++ b/tests/core/test_dndarray.py @@ -2,7 +2,8 @@ import torch import heat as ht -from .test_suites.basic_test import 
TestCase +from tests.test_suites.basic_test import TestCase +from pathlib import Path pytorch_major_version = int(torch.__version__.split(".")[0]) @@ -354,6 +355,8 @@ def test_astype(self): self.assertIs(as_float64, data) def test_balance_and_lshape_map(self): + data_path = str(Path(ht.__file__).parent / "datasets" / "iris.csv") + data = ht.zeros((70, 20), split=0) data = data[:50] data.lshape_map @@ -382,8 +385,8 @@ def test_balance_and_lshape_map(self): data = data[:, 40:70].balance() self.assertTrue(data.is_balanced()) - data = np.loadtxt("heat/datasets/iris.csv", delimiter=";") - htdata = ht.load("heat/datasets/iris.csv", sep=";", split=0) + data = np.loadtxt(data_path, delimiter=";") + htdata = ht.load(data_path, sep=";", split=0) self.assertTrue( ht.equal(htdata, ht.array(data.astype(np.float32), split=0, dtype=ht.float)) ) diff --git a/heat/core/tests/test_exponential.py b/tests/core/test_exponential.py similarity index 99% rename from heat/core/tests/test_exponential.py rename to tests/core/test_exponential.py index b26cfe789a..5a28b6f8b6 100644 --- a/heat/core/tests/test_exponential.py +++ b/tests/core/test_exponential.py @@ -2,7 +2,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestExponential(TestCase): diff --git a/heat/core/tests/test_factories.py b/tests/core/test_factories.py similarity index 99% rename from heat/core/tests/test_factories.py rename to tests/core/test_factories.py index fe17e897c4..253377cd92 100644 --- a/heat/core/tests/test_factories.py +++ b/tests/core/test_factories.py @@ -2,7 +2,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestFactories(TestCase): diff --git a/heat/core/tests/test_indexing.py b/tests/core/test_indexing.py similarity index 98% rename from heat/core/tests/test_indexing.py rename to tests/core/test_indexing.py index 4707aa28ab..1f190edbee 100644 --- a/heat/core/tests/test_indexing.py +++ b/tests/core/test_indexing.py @@ -1,5 +1,5 @@ import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestIndexing(TestCase): diff --git a/heat/core/tests/test_io.py b/tests/core/test_io.py similarity index 96% rename from heat/core/tests/test_io.py rename to tests/core/test_io.py index 0ec1bd044a..4dccba3d27 100644 --- a/heat/core/tests/test_io.py +++ b/tests/core/test_io.py @@ -11,25 +11,26 @@ import unittest import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestIO(TestCase): @classmethod def setUpClass(cls): super(TestIO, cls).setUpClass() + data_path = Path(ht.__file__).parent / "datasets" pwd = os.getcwd() - cls.HDF5_PATH = os.path.join(os.getcwd(), "heat/datasets/iris.h5") + cls.HDF5_PATH = str(data_path / "iris.h5") cls.HDF5_OUT_PATH = pwd + "/test.h5" cls.HDF5_DATASET = "data" - cls.NETCDF_PATH = os.path.join(os.getcwd(), "heat/datasets/iris.nc") + cls.NETCDF_PATH = str(data_path / "iris.nc") cls.NETCDF_OUT_PATH = pwd + "/test.nc" cls.NETCDF_VARIABLE = "data" cls.NETCDF_DIMENSION = "data" # load comparison data from csv - cls.CSV_PATH = os.path.join(os.getcwd(), "heat/datasets/iris.csv") + cls.CSV_PATH = str(data_path / "iris.csv") cls.CSV_OUT_PATH = pwd + "/test.csv" cls.IRIS = ( torch.from_numpy(np.loadtxt(cls.CSV_PATH, delimiter=";")) @@ -815,12 +816,12 @@ def test_load_npy_int(self): crea_array = [] for i in range(0, 
ht.MPI_WORLD.size * 5): x = np.random.randint(1000, size=(random.randint(0, 30), 6, 11)) - np.save(os.path.join(os.getcwd(), "heat/datasets", "int_data") + str(i), x) + np.save(os.path.join(os.getcwd(), "src/heat/datasets", "int_data") + str(i), x) crea_array.append(x) int_array = np.concatenate(crea_array) ht.MPI_WORLD.Barrier() load_array = ht.load_npy_from_path( - os.path.join(os.getcwd(), "heat/datasets"), dtype=ht.int32, split=0 + os.path.join(os.getcwd(), "src/heat/datasets"), dtype=ht.int32, split=0 ) load_array_npy = load_array.numpy() @@ -828,9 +829,9 @@ def test_load_npy_int(self): self.assertEqual(load_array.dtype, ht.int32) if ht.MPI_WORLD.rank == 0: self.assertTrue((load_array_npy == int_array).all) - for file in os.listdir(os.path.join(os.getcwd(), "heat/datasets")): + for file in os.listdir(os.path.join(os.getcwd(), "src/heat/datasets")): if fnmatch.fnmatch(file, "*.npy"): - os.remove(os.path.join(os.getcwd(), "heat/datasets", file)) + os.remove(os.path.join(os.getcwd(), "src/heat/datasets", file)) def test_load_npy_float(self): # testing for float arrays and split dimension other than 0 @@ -838,7 +839,7 @@ def test_load_npy_float(self): crea_array = [] for i in range(0, ht.MPI_WORLD.size * 5 + 1): x = np.random.rand(2, random.randint(1, 10), 11) - np.save(os.path.join(os.getcwd(), "heat/datasets", "float_data") + str(i), x) + np.save(os.path.join(os.getcwd(), "src/heat/datasets", "float_data") + str(i), x) crea_array.append(x) float_array = np.concatenate(crea_array, 1) ht.MPI_WORLD.Barrier() @@ -846,7 +847,7 @@ def test_load_npy_float(self): if not self.is_mps: # float64 not supported in MPS load_array = ht.load_npy_from_path( - os.path.join(os.getcwd(), "heat/datasets"), dtype=ht.float64, split=1 + os.path.join(os.getcwd(), "src/heat/datasets"), dtype=ht.float64, split=1 ) load_array_npy = load_array.numpy() self.assertIsInstance(load_array, ht.DNDarray) @@ -854,27 +855,27 @@ def test_load_npy_float(self): if ht.MPI_WORLD.rank == 0: self.assertTrue((load_array_npy == float_array).all) if ht.MPI_WORLD.rank == 0: - for file in os.listdir(os.path.join(os.getcwd(), "heat/datasets")): + for file in os.listdir(os.path.join(os.getcwd(), "src/heat/datasets")): if fnmatch.fnmatch(file, "*.npy"): - os.remove(os.path.join(os.getcwd(), "heat/datasets", file)) + os.remove(os.path.join(os.getcwd(), "src/heat/datasets", file)) def test_load_npy_exception(self): with self.assertRaises(TypeError): ht.load_npy_from_path(path=1, split=0) with self.assertRaises(TypeError): - ht.load_npy_from_path("heat/datasets", split="ABC") + ht.load_npy_from_path("src/heat/datasets", split="ABC") with self.assertRaises(ValueError): - ht.load_npy_from_path(path="heat", dtype=ht.int64, split=0) + ht.load_npy_from_path(path="src/heat", dtype=ht.int64, split=0) if ht.MPI_WORLD.size > 1: if ht.MPI_WORLD.rank == 0: x = np.random.rand(2, random.randint(1, 10), 11) - np.save(os.path.join(os.getcwd(), "heat/datasets", "float_data"), x) + np.save(os.path.join(os.getcwd(), "src/heat/datasets", "float_data"), x) ht.MPI_WORLD.Barrier() with self.assertRaises(RuntimeError): - ht.load_npy_from_path("heat/datasets", dtype=ht.int64, split=0) + ht.load_npy_from_path("src/heat/datasets", dtype=ht.int64, split=0) ht.MPI_WORLD.Barrier() if ht.MPI_WORLD.rank == 0: - os.remove(os.path.join(os.getcwd(), "heat/datasets", "float_data.npy")) + os.remove(os.path.join(os.getcwd(), "src/heat/datasets", "float_data.npy")) def test_load_multiple_csv(self): if not ht.io.supports_pandas(): @@ -882,7 +883,7 @@ def 
test_load_multiple_csv(self): import pandas as pd - csv_path = os.path.join(os.getcwd(), "heat/datasets/csv_tests") + csv_path = os.path.join(os.getcwd(), "src/heat/datasets/csv_tests") if ht.MPI_WORLD.rank == 0: nplist = [] npdroplist = [] @@ -935,26 +936,26 @@ def test_load_multiple_csv_exception(self): with self.assertRaises(TypeError): ht.load_csv_from_folder(path=1, split=0) with self.assertRaises(TypeError): - ht.load_csv_from_folder("heat/datasets", split="ABC") + ht.load_csv_from_folder("src/heat/datasets", split="ABC") with self.assertRaises(TypeError): - ht.load_csv_from_folder(path="heat/datasets", func=1) + ht.load_csv_from_folder(path="src/heat/datasets", func=1) with self.assertRaises(ValueError): ht.load_csv_from_folder(path="heat", dtype=ht.int64, split=0) if ht.MPI_WORLD.size > 1: if ht.MPI_WORLD.rank == 0: - os.mkdir(os.path.join(os.getcwd(), "heat/datasets/csv_tests")) + os.mkdir(os.path.join(os.getcwd(), "src/heat/datasets/csv_tests")) df = pd.DataFrame({"A": [0, 0, 0]}) # noqa F821 df.to_csv( - (os.path.join(os.getcwd(), "heat/datasets/csv_tests", "fail.csv")), + (os.path.join(os.getcwd(), "src/heat/datasets/csv_tests", "fail.csv")), index=False, ) ht.MPI_WORLD.Barrier() with self.assertRaises(RuntimeError): - ht.load_csv_from_folder("heat/datasets/csv_tests", dtype=ht.int64, split=0) + ht.load_csv_from_folder("src/heat/datasets/csv_tests", dtype=ht.int64, split=0) ht.MPI_WORLD.Barrier() if ht.MPI_WORLD.rank == 0: - shutil.rmtree(os.path.join(os.getcwd(), "heat/datasets/csv_tests")) + shutil.rmtree(os.path.join(os.getcwd(), "src/heat/datasets/csv_tests")) def test_load_zarr(self): if not ht.io.supports_zarr(): diff --git a/heat/core/tests/test_logical.py b/tests/core/test_logical.py similarity index 99% rename from heat/core/tests/test_logical.py rename to tests/core/test_logical.py index c2da61d64b..9ebdbd675d 100644 --- a/heat/core/tests/test_logical.py +++ b/tests/core/test_logical.py @@ -1,7 +1,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestLogical(TestCase): diff --git a/heat/core/tests/test_manipulations.py b/tests/core/test_manipulations.py similarity index 99% rename from heat/core/tests/test_manipulations.py rename to tests/core/test_manipulations.py index 30138730d1..2309620cb8 100644 --- a/heat/core/tests/test_manipulations.py +++ b/tests/core/test_manipulations.py @@ -2,7 +2,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestManipulations(TestCase): diff --git a/heat/core/tests/test_memory.py b/tests/core/test_memory.py similarity index 98% rename from heat/core/tests/test_memory.py rename to tests/core/test_memory.py index bdff40ac4b..9ca81ea39b 100644 --- a/heat/core/tests/test_memory.py +++ b/tests/core/test_memory.py @@ -1,7 +1,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestMemory(TestCase): diff --git a/heat/core/tests/test_operations.py b/tests/core/test_operations.py similarity index 98% rename from heat/core/tests/test_operations.py rename to tests/core/test_operations.py index 6f718a82e9..780e0a2559 100644 --- a/heat/core/tests/test_operations.py +++ b/tests/core/test_operations.py @@ -3,7 +3,7 @@ import heat as ht import numpy as np -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestOperations(TestCase): diff 
--git a/heat/core/tests/test_printing.py b/tests/core/test_printing.py similarity index 99% rename from heat/core/tests/test_printing.py rename to tests/core/test_printing.py index fd6e382e2a..ddf93546c2 100644 --- a/heat/core/tests/test_printing.py +++ b/tests/core/test_printing.py @@ -1,7 +1,7 @@ import math import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestPrinting(TestCase): diff --git a/heat/core/tests/test_random.py b/tests/core/test_random.py similarity index 99% rename from heat/core/tests/test_random.py rename to tests/core/test_random.py index f0bc9b1f92..842469dca7 100644 --- a/heat/core/tests/test_random.py +++ b/tests/core/test_random.py @@ -6,7 +6,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase envar = os.getenv("HEAT_TEST_USE_DEVICE", "cpu") is_mps = envar == "gpu" and platform.system() == "Darwin" diff --git a/heat/core/tests/test_relational.py b/tests/core/test_relational.py similarity index 99% rename from heat/core/tests/test_relational.py rename to tests/core/test_relational.py index f050c0d2d4..1f2fcc6f07 100644 --- a/heat/core/tests/test_relational.py +++ b/tests/core/test_relational.py @@ -1,5 +1,5 @@ import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestRelational(TestCase): diff --git a/heat/core/tests/test_rounding.py b/tests/core/test_rounding.py similarity index 99% rename from heat/core/tests/test_rounding.py rename to tests/core/test_rounding.py index 597cd044f9..dbd28522df 100644 --- a/heat/core/tests/test_rounding.py +++ b/tests/core/test_rounding.py @@ -4,7 +4,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestRounding(TestCase): diff --git a/heat/core/tests/test_sanitation.py b/tests/core/test_sanitation.py similarity index 98% rename from heat/core/tests/test_sanitation.py rename to tests/core/test_sanitation.py index fd08a1401f..ddb7827bf0 100644 --- a/heat/core/tests/test_sanitation.py +++ b/tests/core/test_sanitation.py @@ -2,7 +2,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestSanitation(TestCase): diff --git a/heat/core/tests/test_signal.py b/tests/core/test_signal.py similarity index 99% rename from heat/core/tests/test_signal.py rename to tests/core/test_signal.py index ad3ecea12a..31278e99ef 100644 --- a/heat/core/tests/test_signal.py +++ b/tests/core/test_signal.py @@ -4,7 +4,7 @@ import heat as ht from heat import manipulations -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestSignal(TestCase): diff --git a/heat/core/tests/test_statistics.py b/tests/core/test_statistics.py similarity index 98% rename from heat/core/tests/test_statistics.py rename to tests/core/test_statistics.py index 358c99e857..5e8930b841 100644 --- a/heat/core/tests/test_statistics.py +++ b/tests/core/test_statistics.py @@ -3,12 +3,18 @@ from itertools import combinations from scipy import stats as ss +from pathlib import Path import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestStatistics(TestCase): + @classmethod + def setUpClass(cls): + super(TestStatistics, cls).setUpClass() + cls.data_path = str(Path(ht.__file__).parent / "datasets" / 
"iris.csv") + def test_argmax(self): torch.manual_seed(1) data = ht.random.randn(3, 4, 5) @@ -417,11 +423,11 @@ def test_cov(self): actual = ht.array([[1, -1], [-1, 1]], split=0) self.assertTrue(ht.equal(cov, actual)) - data = np.loadtxt("heat/datasets/iris.csv", delimiter=";") + data = np.loadtxt(self.data_path, delimiter=";") np_cov = np.cov(data[:, 0], data[:, 1:3], rowvar=False).astype(np_dtype) # split = None tests - htdata = ht.load("heat/datasets/iris.csv", sep=";", split=None) + htdata = ht.load(self.data_path, sep=";", split=None) ht_cov = ht.cov(htdata[:, 0], htdata[:, 1:3], rowvar=False) comp = ht.array(np_cov, dtype=dtype) self.assertTrue(ht.allclose(comp - ht_cov, 0, atol=1e-4)) @@ -439,10 +445,10 @@ def test_cov(self): self.assertTrue(ht.allclose(ht.array(np_cov, dtype=dtype) - ht_cov, 0, atol=1e-4)) # split = 0 tests - data = np.loadtxt("heat/datasets/iris.csv", delimiter=";") + data = np.loadtxt(self.data_path, delimiter=";") np_cov = np.cov(data[:, 0], data[:, 1:3], rowvar=False).astype(np_dtype) - htdata = ht.load("heat/datasets/iris.csv", sep=";", split=0) + htdata = ht.load(self.data_path, sep=";", split=0) ht_cov = ht.cov(htdata[:, 0], htdata[:, 1:3], rowvar=False) comp = ht.array(np_cov, dtype=ht.float) self.assertTrue(ht.allclose(comp - ht_cov, 0, atol=1e-4)) @@ -461,18 +467,18 @@ def test_cov(self): if 1 < x.comm.size < 5: # split 1 tests - htdata = ht.load("heat/datasets/iris.csv", sep=";", split=1) + htdata = ht.load(self.data_path, sep=";", split=1) np_cov = np.cov(data, rowvar=False).astype(np_dtype) ht_cov = ht.cov(htdata, rowvar=False) self.assertTrue(ht.allclose(ht.array(np_cov, dtype=dtype), ht_cov, atol=1e-4)) np_cov = np.cov(data, data, rowvar=True).astype(np_dtype) - htdata = ht.load("heat/datasets/iris.csv", sep=";", split=0) + htdata = ht.load(self.data_path, sep=";", split=0) ht_cov = ht.cov(htdata, htdata, rowvar=True) self.assertTrue(ht.allclose(ht.array(np_cov, dtype=dtype), ht_cov, atol=1e-4)) - htdata = ht.load("heat/datasets/iris.csv", sep=";", split=0) + htdata = ht.load(self.data_path, sep=";", split=0) with self.assertRaises(RuntimeError): ht.cov(htdata[1:], rowvar=False) with self.assertRaises(RuntimeError): @@ -979,7 +985,7 @@ def test_mean(self): # values for the iris dataset mean measured by libreoffice calc ax0 = ht.array([5.84333333333333, 3.054, 3.75866666666667, 1.19866666666667]) for sp in [None, 0, 1]: - iris = ht.load("heat/datasets/iris.csv", sep=";", split=sp) + iris = ht.load(self.data_path, sep=";", split=sp) self.assertTrue(ht.allclose(ht.mean(iris), 3.46366666666667)) self.assertTrue(ht.allclose(ht.mean(iris, axis=0), ax0)) @@ -1589,5 +1595,5 @@ def test_var(self): # values for the iris dataset var measured by libreoffice calc for sp in [None, 0, 1]: - iris = ht.load("heat/datasets/iris.csv", sep=";", split=sp) + iris = ht.load(self.data_path, sep=";", split=sp) self.assertTrue(ht.allclose(ht.var(iris, bessel=True), 3.90318519755147)) diff --git a/heat/core/tests/test_stride_tricks.py b/tests/core/test_stride_tricks.py similarity index 98% rename from heat/core/tests/test_stride_tricks.py rename to tests/core/test_stride_tricks.py index 81d79eb11f..e03213bb67 100644 --- a/heat/core/tests/test_stride_tricks.py +++ b/tests/core/test_stride_tricks.py @@ -1,5 +1,5 @@ import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestStrideTricks(TestCase): diff --git a/heat/core/tests/test_tiling.py b/tests/core/test_tiling.py similarity index 99% rename from 
heat/core/tests/test_tiling.py rename to tests/core/test_tiling.py index b6e00c3161..24526551ce 100644 --- a/heat/core/tests/test_tiling.py +++ b/tests/core/test_tiling.py @@ -5,7 +5,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase envar = os.getenv("HEAT_TEST_USE_DEVICE", "cpu") is_mps = envar == "gpu" and platform.machine() == "arm64" diff --git a/heat/core/tests/test_trigonometrics.py b/tests/core/test_trigonometrics.py similarity index 99% rename from heat/core/tests/test_trigonometrics.py rename to tests/core/test_trigonometrics.py index 7e09472b86..706e74eee7 100644 --- a/heat/core/tests/test_trigonometrics.py +++ b/tests/core/test_trigonometrics.py @@ -2,7 +2,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestTrigonometrics(TestCase): diff --git a/heat/core/tests/test_types.py b/tests/core/test_types.py similarity index 99% rename from heat/core/tests/test_types.py rename to tests/core/test_types.py index 42e0124ef2..b27d0c16e7 100644 --- a/heat/core/tests/test_types.py +++ b/tests/core/test_types.py @@ -2,7 +2,7 @@ import torch import heat as ht -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestTypes(TestCase): diff --git a/heat/core/tests/test_vmap.py b/tests/core/test_vmap.py similarity index 99% rename from heat/core/tests/test_vmap.py rename to tests/core/test_vmap.py index 0f7ba62d2e..07699518a9 100644 --- a/heat/core/tests/test_vmap.py +++ b/tests/core/test_vmap.py @@ -2,7 +2,7 @@ import torch import os -from .test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestVmap(TestCase): diff --git a/heat/core/tests/__init__.py b/tests/decomposition/__init__.py similarity index 100% rename from heat/core/tests/__init__.py rename to tests/decomposition/__init__.py diff --git a/heat/decomposition/tests/test_dmd.py b/tests/decomposition/test_dmd.py similarity index 99% rename from heat/decomposition/tests/test_dmd.py rename to tests/decomposition/test_dmd.py index 38b3ec2b2b..a013906949 100644 --- a/heat/decomposition/tests/test_dmd.py +++ b/tests/decomposition/test_dmd.py @@ -5,7 +5,7 @@ import torch import heat as ht -from ...core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase # MPS does not support non-float matrix multiplication envar = os.getenv("HEAT_TEST_USE_DEVICE", "cpu") diff --git a/heat/decomposition/tests/test_pca.py b/tests/decomposition/test_pca.py similarity index 99% rename from heat/decomposition/tests/test_pca.py rename to tests/decomposition/test_pca.py index 41300186e6..2a20cc4d19 100644 --- a/heat/decomposition/tests/test_pca.py +++ b/tests/decomposition/test_pca.py @@ -4,7 +4,7 @@ import torch import heat as ht -from ...core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestPCA(TestCase): diff --git a/heat/core/tests/test_suites/__init__.py b/tests/fft/__init__.py similarity index 100% rename from heat/core/tests/test_suites/__init__.py rename to tests/fft/__init__.py diff --git a/heat/fft/tests/test_fft.py b/tests/fft/test_fft.py similarity index 99% rename from heat/fft/tests/test_fft.py rename to tests/fft/test_fft.py index b0ecdc68b0..7d86f114b2 100644 --- a/heat/fft/tests/test_fft.py +++ b/tests/fft/test_fft.py @@ -5,7 +5,7 @@ import os import heat as ht -from 
heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase torch_ihfftn = hasattr(torch.fft, "ihfftn") diff --git a/heat/graph/tests/test_laplacian.py b/tests/graph/test_laplacian.py similarity index 97% rename from heat/graph/tests/test_laplacian.py rename to tests/graph/test_laplacian.py index 1c21764861..cf5edd6fe3 100644 --- a/heat/graph/tests/test_laplacian.py +++ b/tests/graph/test_laplacian.py @@ -3,7 +3,7 @@ import heat as ht -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestLaplacian(TestCase): diff --git a/heat/decomposition/tests/__init__.py b/tests/linalg/__init__.py similarity index 100% rename from heat/decomposition/tests/__init__.py rename to tests/linalg/__init__.py diff --git a/heat/core/linalg/tests/test_basics.py b/tests/linalg/test_basics.py similarity index 99% rename from heat/core/linalg/tests/test_basics.py rename to tests/linalg/test_basics.py index 6e6ecf5374..8d913a3f32 100644 --- a/heat/core/linalg/tests/test_basics.py +++ b/tests/linalg/test_basics.py @@ -2,8 +2,8 @@ import torch import heat as ht -from ...tests.test_suites.basic_test import TestCase -from ..basics import _estimate_largest_singularvalue +from tests.test_suites.basic_test import TestCase +from heat.core.linalg.basics import _estimate_largest_singularvalue class TestLinalgBasics(TestCase): diff --git a/heat/core/linalg/tests/test_eigh.py b/tests/linalg/test_eigh.py similarity index 97% rename from heat/core/linalg/tests/test_eigh.py rename to tests/linalg/test_eigh.py index 5b6b5a0a78..18b6533790 100644 --- a/heat/core/linalg/tests/test_eigh.py +++ b/tests/linalg/test_eigh.py @@ -2,7 +2,7 @@ import unittest import numpy as np -from ...tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestEigh(TestCase): diff --git a/heat/core/linalg/tests/test_polar.py b/tests/linalg/test_polar.py similarity index 98% rename from heat/core/linalg/tests/test_polar.py rename to tests/linalg/test_polar.py index e1934d8c2b..f4fe9e8c7f 100644 --- a/heat/core/linalg/tests/test_polar.py +++ b/tests/linalg/test_polar.py @@ -3,7 +3,7 @@ import torch import numpy as np -from ...tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestZolopolar(TestCase): diff --git a/heat/core/linalg/tests/test_qr.py b/tests/linalg/test_qr.py similarity index 99% rename from heat/core/linalg/tests/test_qr.py rename to tests/linalg/test_qr.py index 0da27aea07..3c3b0a5bcc 100644 --- a/heat/core/linalg/tests/test_qr.py +++ b/tests/linalg/test_qr.py @@ -3,7 +3,7 @@ import torch import numpy as np -from ...tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestQR(TestCase): diff --git a/heat/core/linalg/tests/test_solver.py b/tests/linalg/test_solver.py similarity index 99% rename from heat/core/linalg/tests/test_solver.py rename to tests/linalg/test_solver.py index 944305b63e..ba5a64fa63 100644 --- a/heat/core/linalg/tests/test_solver.py +++ b/tests/linalg/test_solver.py @@ -4,7 +4,7 @@ import heat as ht import numpy as np -from ...tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestSolver(TestCase): diff --git a/heat/core/linalg/tests/test_svd.py b/tests/linalg/test_svd.py similarity index 99% rename from heat/core/linalg/tests/test_svd.py rename to tests/linalg/test_svd.py index 97dc5f5c77..2e79a9c37b 100644 --- 
a/heat/core/linalg/tests/test_svd.py +++ b/tests/linalg/test_svd.py @@ -3,7 +3,7 @@ import torch import numpy as np -from ...tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestTallSkinnySVD(TestCase): diff --git a/heat/core/linalg/tests/test_svdtools.py b/tests/linalg/test_svdtools.py similarity index 99% rename from heat/core/linalg/tests/test_svdtools.py rename to tests/linalg/test_svdtools.py index ea6cd9681e..7b2b1544e0 100644 --- a/heat/core/linalg/tests/test_svdtools.py +++ b/tests/linalg/test_svdtools.py @@ -5,7 +5,7 @@ import numpy as np from mpi4py import MPI -from ...tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestHSVD(TestCase): diff --git a/heat/fft/tests/__init__.py b/tests/naive_bayes/__init__.py similarity index 100% rename from heat/fft/tests/__init__.py rename to tests/naive_bayes/__init__.py diff --git a/heat/naive_bayes/tests/test_gaussiannb.py b/tests/naive_bayes/test_gaussiannb.py similarity index 92% rename from heat/naive_bayes/tests/test_gaussiannb.py rename to tests/naive_bayes/test_gaussiannb.py index 3918c6d4a0..28146cef71 100644 --- a/heat/naive_bayes/tests/test_gaussiannb.py +++ b/tests/naive_bayes/test_gaussiannb.py @@ -1,9 +1,10 @@ import os import numpy as np import torch +from pathlib import Path import heat as ht -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestGaussianNB(TestCase): @@ -28,11 +29,13 @@ def test_fit_iris(self): else: dtype = ht.float64 # load sklearn train/test sets and resulting probabilities - X_train = ht.load("heat/datasets/iris_X_train.csv", sep=";", dtype=dtype) - X_test = ht.load("heat/datasets/iris_X_test.csv", sep=";", dtype=dtype) - y_train = ht.load("heat/datasets/iris_y_train.csv", sep=";", dtype=ht.int64).squeeze() - y_test = ht.load("heat/datasets/iris_y_test.csv", sep=";", dtype=ht.int64).squeeze() - y_pred_proba_sklearn = ht.load("heat/datasets/iris_y_pred_proba.csv", sep=";", dtype=dtype) + data_path = Path(ht.__file__).parent / "datasets" + + X_train = ht.load(str(data_path / "iris_X_train.csv"), sep=";", dtype=dtype) + X_test = ht.load(str(data_path / "iris_X_test.csv"), sep=";", dtype=dtype) + y_train = ht.load(str(data_path / "iris_y_train.csv"), sep=";", dtype=ht.int64).squeeze() + y_test = ht.load(str(data_path / "iris_y_test.csv"), sep=";", dtype=ht.int64).squeeze() + y_pred_proba_sklearn = ht.load(str(data_path / "iris_y_pred_proba.csv"), sep=";", dtype=dtype) # test ht.GaussianNB from heat.naive_bayes import GaussianNB diff --git a/heat/nn/tests/__init__.py b/tests/nn/__init__.py similarity index 100% rename from heat/nn/tests/__init__.py rename to tests/nn/__init__.py diff --git a/heat/nn/tests/test_data_parallel.py b/tests/nn/test_data_parallel.py similarity index 100% rename from heat/nn/tests/test_data_parallel.py rename to tests/nn/test_data_parallel.py diff --git a/heat/nn/tests/test_nn.py b/tests/nn/test_nn.py similarity index 100% rename from heat/nn/tests/test_nn.py rename to tests/nn/test_nn.py diff --git a/heat/optim/tests/__init__.py b/tests/optim/__init__.py similarity index 100% rename from heat/optim/tests/__init__.py rename to tests/optim/__init__.py diff --git a/heat/optim/tests/test_dp_optimizer.py b/tests/optim/test_dp_optimizer.py similarity index 99% rename from heat/optim/tests/test_dp_optimizer.py rename to tests/optim/test_dp_optimizer.py index 42464b7142..ed4f4ddfb3 100644 --- 
a/heat/optim/tests/test_dp_optimizer.py +++ b/tests/optim/test_dp_optimizer.py @@ -4,7 +4,7 @@ import torch import unittest -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestDASO(TestCase): diff --git a/heat/optim/tests/test_optim.py b/tests/optim/test_optim.py similarity index 94% rename from heat/optim/tests/test_optim.py rename to tests/optim/test_optim.py index 0927c6a11b..c83504b8d4 100644 --- a/heat/optim/tests/test_optim.py +++ b/tests/optim/test_optim.py @@ -1,6 +1,6 @@ import heat as ht -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestOptim(TestCase): diff --git a/heat/optim/tests/test_utils.py b/tests/optim/test_utils.py similarity index 97% rename from heat/optim/tests/test_utils.py rename to tests/optim/test_utils.py index 6b5f774477..cca0fc6f69 100644 --- a/heat/optim/tests/test_utils.py +++ b/tests/optim/test_utils.py @@ -3,7 +3,7 @@ import os import torch -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestUtils(TestCase): diff --git a/heat/naive_bayes/tests/__init__.py b/tests/preprocessing/__init__.py similarity index 100% rename from heat/naive_bayes/tests/__init__.py rename to tests/preprocessing/__init__.py diff --git a/heat/preprocessing/tests/test_preprocessing.py b/tests/preprocessing/test_preprocessing.py similarity index 97% rename from heat/preprocessing/tests/test_preprocessing.py rename to tests/preprocessing/test_preprocessing.py index d145aa6bcb..eebe17c77e 100644 --- a/heat/preprocessing/tests/test_preprocessing.py +++ b/tests/preprocessing/test_preprocessing.py @@ -4,7 +4,7 @@ from mpi4py import MPI import os -from ...core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase atol_fit = 1e-5 atol_inv = 1e-4 diff --git a/heat/preprocessing/tests/__init__.py b/tests/regression/__init__.py similarity index 100% rename from heat/preprocessing/tests/__init__.py rename to tests/regression/__init__.py diff --git a/heat/regression/tests/test_lasso.py b/tests/regression/test_lasso.py similarity index 97% rename from heat/regression/tests/test_lasso.py rename to tests/regression/test_lasso.py index 8b2ed6908f..656f6e8f43 100644 --- a/heat/regression/tests/test_lasso.py +++ b/tests/regression/test_lasso.py @@ -1,7 +1,7 @@ import os import heat as ht -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestLasso(TestCase): diff --git a/heat/sparse/tests/__init__.py b/tests/sparse/__init__.py similarity index 100% rename from heat/sparse/tests/__init__.py rename to tests/sparse/__init__.py diff --git a/heat/sparse/tests/test_arithmetics_csr.py b/tests/sparse/test_arithmetics_csr.py similarity index 99% rename from heat/sparse/tests/test_arithmetics_csr.py rename to tests/sparse/test_arithmetics_csr.py index 38f23062a5..fd19e3f94c 100644 --- a/heat/sparse/tests/test_arithmetics_csr.py +++ b/tests/sparse/test_arithmetics_csr.py @@ -7,7 +7,7 @@ import platform import random -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase envar = os.getenv("HEAT_TEST_USE_DEVICE", "cpu") diff --git a/heat/sparse/tests/test_dcscmatrix.py b/tests/sparse/test_dcscmatrix.py similarity index 99% rename from heat/sparse/tests/test_dcscmatrix.py rename to tests/sparse/test_dcscmatrix.py index 22386d1444..569455fec3 
100644 --- a/heat/sparse/tests/test_dcscmatrix.py +++ b/tests/sparse/test_dcscmatrix.py @@ -4,7 +4,7 @@ import heat as ht import torch -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase from typing import Tuple diff --git a/heat/sparse/tests/test_dcsrmatrix.py b/tests/sparse/test_dcsrmatrix.py similarity index 99% rename from heat/sparse/tests/test_dcsrmatrix.py rename to tests/sparse/test_dcsrmatrix.py index 4f5b99df64..ef1df82c0b 100644 --- a/heat/sparse/tests/test_dcsrmatrix.py +++ b/tests/sparse/test_dcsrmatrix.py @@ -4,7 +4,7 @@ import heat as ht import torch -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase from typing import Tuple diff --git a/heat/sparse/tests/test_factories.py b/tests/sparse/test_factories.py similarity index 99% rename from heat/sparse/tests/test_factories.py rename to tests/sparse/test_factories.py index 84dd5e2b5d..848885b8c4 100644 --- a/heat/sparse/tests/test_factories.py +++ b/tests/sparse/test_factories.py @@ -5,7 +5,7 @@ import torch import scipy -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase envar = os.getenv("HEAT_TEST_USE_DEVICE", "cpu") is_mps = envar == "gpu" and platform.system() == "Darwin" diff --git a/heat/sparse/tests/test_manipulations.py b/tests/sparse/test_manipulations.py similarity index 99% rename from heat/sparse/tests/test_manipulations.py rename to tests/sparse/test_manipulations.py index 97b5ab5ca9..2d86cf69ea 100644 --- a/heat/sparse/tests/test_manipulations.py +++ b/tests/sparse/test_manipulations.py @@ -4,7 +4,7 @@ import heat as ht import torch -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase envar = os.getenv("HEAT_TEST_USE_DEVICE", "cpu") is_mps = envar == "gpu" and platform.system() == "Darwin" diff --git a/heat/regression/tests/__init__.py b/tests/spatial/__init__.py similarity index 100% rename from heat/regression/tests/__init__.py rename to tests/spatial/__init__.py diff --git a/heat/spatial/tests/test_distances.py b/tests/spatial/test_distances.py similarity index 99% rename from heat/spatial/tests/test_distances.py rename to tests/spatial/test_distances.py index d5769c2009..9fda8f1318 100644 --- a/heat/spatial/tests/test_distances.py +++ b/tests/spatial/test_distances.py @@ -7,7 +7,7 @@ import numpy as np import math -from heat.core.tests.test_suites.basic_test import TestCase +from tests.test_suites.basic_test import TestCase class TestDistances(TestCase): diff --git a/heat/tests/test_cli.py b/tests/test_cli.py similarity index 100% rename from heat/tests/test_cli.py rename to tests/test_cli.py diff --git a/heat/spatial/tests/__init__.py b/tests/test_suites/__init__.py similarity index 100% rename from heat/spatial/tests/__init__.py rename to tests/test_suites/__init__.py diff --git a/heat/core/tests/test_suites/basic_test.py b/tests/test_suites/basic_test.py similarity index 100% rename from heat/core/tests/test_suites/basic_test.py rename to tests/test_suites/basic_test.py diff --git a/heat/core/tests/test_suites/test_basic_test.py b/tests/test_suites/test_basic_test.py similarity index 100% rename from heat/core/tests/test_suites/test_basic_test.py rename to tests/test_suites/test_basic_test.py diff --git a/heat/utils/data/tests/__init__.py b/tests/utils/__init__.py similarity index 100% rename from heat/utils/data/tests/__init__.py rename to tests/utils/__init__.py diff 
--git a/tests/utils/data/__init__.py b/tests/utils/data/__init__.py new file mode 100644 index 0000000000..8b1c8633c5 --- /dev/null +++ b/tests/utils/data/__init__.py @@ -0,0 +1,9 @@ +""" +add data utility functions to the ht.utils.data namespace +""" + +from .datatools import * +from . import matrixgallery +from . import mnist +from .partial_dataset import * +from . import spherical diff --git a/tests/utils/data/_utils.py b/tests/utils/data/_utils.py new file mode 100644 index 0000000000..a20cd2fb09 --- /dev/null +++ b/tests/utils/data/_utils.py @@ -0,0 +1,280 @@ +""" +Data utilities module. +This file contains functions which may be useful for certain datatypes, but are not test in the heat framework +This file contains standalone utilities for data preparation which may be useful +The functions contained within are not tested, nor actively supported +""" + +import base64 +import numpy as np +import os +import struct + + +def dali_tfrecord2idx(train_dir, train_idx_dir, val_dir, val_idx_dir): + """ + WARNING: This function likely requires adjustments and it is by no means a final product !!! + this file contains standalone utilities for data preparation which may be useful + this function contained within are not tested, nor actively supported + + prepare TFRecords indexes for use with DALI. It will produce indexes for all files in the + given ``train_dir`` and ``val_dir`` directories + """ + for tv in [train_dir, val_dir]: + dir_list = os.listdir(tv) + out = train_idx_dir if tv == train_dir else val_idx_dir + for file in dir_list: + with open(file, "rb") as f, open(out + file, "w") as idx: + while True: + current = f.tell() + try: + # length + byte_len = f.read(8) + if len(byte_len) == 0: + break + # crc + f.read(4) + proto_len = struct.unpack("q", byte_len)[0] + # proto + f.read(proto_len) + # crc + f.read(4) + idx.write(str(current) + " " + str(f.tell() - current) + "\n") + except Exception: + print("Not a valid TFRecord file") + break + + +def merge_files_imagenet_tfrecord(folder_name, output_folder=None): + """ + WARNING: This function likely requires adjustments and it is by no means a final product !!! + this file contains standalone utilities for data preparation which may be useful + this function contained within are not tested, nor actively supported + + merge multiple preprocessed imagenet TFRecord files together, + result is one HDF5 file with all of the images stacked in the 0th dimension + + Parameters + ---------- + folder_name : str, optional* + folder location of the files to join, either filenames or folder_names must not be None + output_folder : str, optional + location to create the output files. Defaults to current directory + + Notes + ----- + Metadata for both the created files (`imagenet_merged.h5` and `imagenet_merged_validation.h5`): + + The datasets are the combination of all of the images in the Image-net 2012 dataset. + The data is split into training and validation. + + imagenet_merged.h5 -> training + imagenet_merged_validation.h5 -> validation + + both files have the same internal structure: + - file + * "images" : encoded ASCII string of the decoded RGB JPEG image. + - to decode: `torch.as_tensor(bytearray(base64.binascii.a2b_base64(string_repr.encode('ascii'))), dtype=torch.uint8)` + - note: the images must be reshaped using: `.reshape(file["metadata"]["image/height"], file["metadata"]["image/height"], 3)` + (3 is the number of channels, all images are RGB) + * "metadata" : the metadata for each image quotes are the titles for each column + 0. 
"image/height" + 1. "image/width" + 2. "image/channels" + 3. "image/class/label" + 4. "image/object/bbox/xmin" + 5. "image/object/bbox/xmax" + 6. "image/object/bbox/ymin" + 7. "image/object/bbox/ymax" + 8. "image/object/bbox/label" + * "file_info" : string information related to each image + 0. "image/format" + 1. "image/filename" + 2. "image/class/synset" + 3. "image/class/text" + + + The dataset was created using the preprocessed data from the script: + https://github.com/tensorflow/models/blob/master/research/inception/inception/data/download_and_preprocess_imagenet.sh + + """ + import h5py + import tensorflow as tf + + """ + labels: + image/encoded: string containing JPEG encoded image in RGB colorspace + image/height: integer, image height in pixels + image/width: integer, image width in pixels + image/colorspace: string, specifying the colorspace, always 'RGB' + image/channels: integer, specifying the number of channels, always 3 + image/format: string, specifying the format, always 'JPEG' + image/filename: string containing the basename of the image file + e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG' + image/class/label: integer specifying the index in a classification layer. + The label ranges from [1, 1000] where 0 is not used. + image/class/synset: string specifying the unique ID of the label, e.g. 'n01440764' + image/class/text: string specifying the human-readable version of the label + e.g. 'red fox, Vulpes vulpes' + image/object/bbox/xmin: list of integers specifying the 0+ human annotated bounding boxes + image/object/bbox/xmax: list of integers specifying the 0+ human annotated bounding boxes + image/object/bbox/ymin: list of integers specifying the 0+ human annotated bounding boxes + image/object/bbox/ymax: list of integers specifying the 0+ human annotated bounding boxes + image/object/bbox/label: integer specifying the index in a classification + layer. The label ranges from [1, 1000] where 0 is not used. 
Note this is + always identical to the image label.""" + # get the number of files from the contents of the folder + train_names = [folder_name + f for f in os.listdir(folder_name) if f.startswith("train")].sort() + val_names = [folder_name + f for f in os.listdir(folder_name) if f.startswith("val")].sort() + num_train = len(train_names) + num_val = len(val_names) + + def _find_output_name_and_stsp(num_names): + start = 0 + stop = num_names + 1 + output_name_lcl = output_folder + output_name_lcl += "imagenet_merged.h5" + return start, stop, output_name_lcl + + train_start, train_stop, output_name_lcl_train = _find_output_name_and_stsp(num_train) + val_start, val_stop, output_name_lcl_val = _find_output_name_and_stsp(num_val) + output_name_lcl_val = f"{output_name_lcl_val[:-3]}_validation.h5" + + # create the output files + train_lcl_file = h5py.File(output_name_lcl_train, "w") + dt = h5py.string_dtype(encoding="ascii") + train_lcl_file.create_dataset("images", (2502,), chunks=(1251,), maxshape=(None,), dtype=dt) + train_lcl_file.create_dataset("metadata", (2502, 9), chunks=(1251, 9), maxshape=(None, 9)) + train_lcl_file.create_dataset( + "file_info", (2502, 4), chunks=(1251, 4), maxshape=(None, 4), dtype="S10" + ) + + val_lcl_file = h5py.File(output_name_lcl_val, "w") + val_lcl_file.create_dataset("images", (50000,), chunks=True, maxshape=(None,), dtype=dt) + val_lcl_file.create_dataset("metadata", (50000, 9), chunks=True, maxshape=(None, 9)) + val_lcl_file.create_dataset( + "file_info", (50000, 4), chunks=True, maxshape=(None, 4), dtype="S10" + ) + + def __single_file_load(src): + # load a file and read it to a numpy array + dataset = tf.data.TFRecordDataset(filenames=[src]) + imgs = [] + img_meta = [[] for _ in range(9)] + file_arr = [[] for _ in range(4)] + for raw_example in iter(dataset): + parsed = tf.train.Example.FromString(raw_example.numpy()) + img_str = parsed.features.feature["image/encoded"].bytes_list.value[0] + img = tf.image.decode_jpeg(img_str, channels=3).numpy() + string_repr = base64.binascii.b2a_base64(img).decode("ascii") + imgs.append(string_repr) + # to decode: np.frombuffer(base64.binascii.a2b_base64(string_repr.encode('ascii'))) + img_meta[0].append( + tf.cast( + parsed.features.feature["image/height"].int64_list.value[0], tf.float32 + ).numpy() + ) + img_meta[1].append( + tf.cast( + parsed.features.feature["image/width"].int64_list.value[0], tf.float32 + ).numpy() + ) + img_meta[2].append( + tf.cast( + parsed.features.feature["image/channels"].int64_list.value[0], tf.float32 + ).numpy() + ) + img_meta[3].append(parsed.features.feature["image/class/label"].int64_list.value[0] - 1) + try: + bbxmin = parsed.features.feature["image/object/bbox/xmin"].float_list.value[0] + bbxmax = parsed.features.feature["image/object/bbox/xmax"].float_list.value[0] + bbymin = parsed.features.feature["image/object/bbox/ymin"].float_list.value[0] + bbymax = parsed.features.feature["image/object/bbox/ymax"].float_list.value[0] + bblabel = parsed.features.feature["image/object/bbox/label"].int64_list.value[0] - 1 + except IndexError: + bbxmin = 0.0 + bbxmax = img_meta[1][-1] + bbymin = 0.0 + bbymax = img_meta[0][-1] + bblabel = -2 + + img_meta[4].append(np.float(bbxmin)) + img_meta[5].append(np.float(bbxmax)) + img_meta[6].append(np.float(bbymin)) + img_meta[7].append(np.float(bbymax)) + img_meta[8].append(bblabel) + + file_arr[0].append(parsed.features.feature["image/format"].bytes_list.value[0]) + file_arr[1].append(parsed.features.feature["image/filename"].bytes_list.value[0]) + 
file_arr[2].append(parsed.features.feature["image/class/synset"].bytes_list.value[0]) + file_arr[3].append( + np.array(parsed.features.feature["image/class/text"].bytes_list.value[0]) + ) + # need to transpose because of the way that numpy understands nested lists + img_meta = np.array(img_meta, dtype=np.float64).T + file_arr = np.array(file_arr).T + return imgs, img_meta, file_arr + + def __write_datasets(img_outl, img_metal, file_arrl, past_sizel, file): + file["images"].resize((past_sizel + len(img_outl),)) + file["images"][past_sizel : len(img_outl) + past_sizel] = img_outl + file["metadata"].resize((past_sizel + img_metal.shape[0], 9)) + file["metadata"][past_sizel : img_metal.shape[0] + past_sizel] = img_metal + file["file_info"].resize((past_sizel + img_metal.shape[0], 4)) + file["file_info"][past_sizel : img_metal.shape[0] + past_sizel] = file_arrl + + def __load_multiple_files(train_names, train_start, train_stop, file): + loc_files = train_names[train_start:train_stop] + img_out, img_meta, file_arr = None, None, None + past_size, i = 0, 0 + for f in loc_files: # train + # print(f) + # this is where the data is created for + imgs, img_metaf, file_arrf = __single_file_load(f) + # create a larger ndarray with the results + if img_out is not None: + img_out.extend(imgs) + else: + img_out = imgs + img_meta = np.vstack((img_meta, img_metaf)) if img_meta is not None else img_metaf + file_arr = np.vstack((file_arr, file_arrf)) if file_arr is not None else file_arrf + # when 2 files are read, write to the output file + if i % 2 == 1: + print(past_size) + __write_datasets(img_out, img_meta, file_arr, past_size, file) + past_size += len(img_out) + img_out, img_meta, file_arr = None, None, None + del imgs, img_metaf, file_arrf + i += 1 + + if img_out is not None: + __write_datasets(img_out, img_meta, file_arr, past_size, file) + + __load_multiple_files(train_names, train_start, train_stop, train_lcl_file) + __load_multiple_files(val_names, val_start, val_stop, val_lcl_file) + + # add the label names to the datasets + img_list = [1, 2, 4, 7, 10, 11, 12, 13, 14] + file_list = [5, 6, 8, 9] + feature_list = [ + "image/encoded", + "image/height", + "image/width", + "image/colorspace", + "image/channels", + "image/format", + "image/filename", + "image/class/label", + "image/class/synset", + "image/class/text", + "image/object/bbox/xmin", + "image/object/bbox/xmax", + "image/object/bbox/ymin", + "image/object/bbox/ymax", + "image/object/bbox/label", + ] + + train_lcl_file["metadata"].attrs["column_names"] = [feature_list[im] for im in img_list] + train_lcl_file["file_info"].attrs["column_names"] = [feature_list[im] for im in file_list] + val_lcl_file["metadata"].attrs["column_names"] = [feature_list[im] for im in img_list] + val_lcl_file["file_info"].attrs["column_names"] = [feature_list[im] for im in file_list] diff --git a/tests/utils/data/datatools.py b/tests/utils/data/datatools.py new file mode 100644 index 0000000000..ccc79de888 --- /dev/null +++ b/tests/utils/data/datatools.py @@ -0,0 +1,799 @@ +""" +Function and classes useful for loading data into neural networks +""" + +import itertools +import random +import warnings +import mpi4py +import torch +import torch.distributed +from torch.utils import data as torch_data +from typing import Callable, List, Iterator, Literal, Union, Optional, Sized +from mpi4py import MPI +from functools import reduce + +import torch.utils +import torchvision + +from heat.dndarray import DNDarray +from heat.communication import GPU_AWARE_MPI, MPI_WORLD, 
MPICommunication +from heat.random import permutation +from . import partial_dataset + +__all__ = [ + "DataLoader", + "Dataset", + "dataset_shuffle", + "dataset_ishuffle", + "DistributedDataset", + "DistributedSampler", + "create_train_val_split", +] + + +class DataLoader: + r""" + The combines either a :func:`DNDarray ` or a torch `Dataset `_ + with a sampler. This provides an iterable over the local dataset and it will shuffle the data at the end of the + iterator. If a :func:`DNDarray ` is given, then a :func:`Dataset` will be created + internally. + + Currently, this only supports only map-style datasets with single-process loading. It uses the random + batch sampler. The rest of the ``DataLoader`` functionality mentioned in `torch.utils.data.dataloader `_ applies. + + Arguments: + dataset : :func:`Dataset`, torch `Dataset `_, :func:`heat.utils.data.partial_dataset.PartialH5Dataset` + A torch dataset from which the data will be returned by the created iterator + batch_size : int, optional + How many samples per batch to load\n + Default: 1 + num_workers : int, optional + How many subprocesses to use for data loading. 0 means that the data will be loaded in the main process.\n + Default: 0 + collate_fn : callable, optional + Merges a list of samples to form a mini-batch of torch.Tensor(s). Used when using batched loading from a + map-style dataset.\n + Default: None + pin_memory : bool, optional + If ``True``, the data loader will copy torch.Tensors into CUDA pinned memory before returning them. + If your data elements are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type, + see the example below. \n + Default: False + drop_last : bool, optional + Set to ``True`` to drop the last incomplete batch, if the dataset size is not divisible by + the batch size. If ``False`` and the size of dataset is not divisible by the batch size, then + the last batch will be smaller.\n + Default: ``False`` + timeout : int or float, optional + If positive, the timeout value for collecting a batch from workers. Should always be non-negative.\n + Default: 0 + worker_init_fn : callable, optional + If not ``None``, this will be called on each worker subprocess with the worker id + (an int in ``[0, num_workers - 1]``) as input, after seeding and before data loading.\n + default: None + + Attributes + ---------- + dataset : :func:`Dataset`, torch `Dataset `_, :func:`heat.utils.data.partial_dataset.PartialH5Dataset` + The dataset created from the local data + DataLoader : `torch.utils.data.dataloader `_ + The local DataLoader object. Used in the creation of the iterable and the length + _first_iter : bool + Flag indicating if the iterator created is the first one. 
If it is not, then the data will be shuffled before + the iterator is created + last_epoch : bool + Flag indicating last epoch + """ + + def __init__( + self, + dataset: Union[torch_data.Dataset, partial_dataset.PartialH5Dataset], + batch_size: int = 1, + num_workers: int = 0, + collate_fn: Callable = None, + pin_memory: bool = False, + drop_last: bool = False, + timeout: Union[int, float] = 0, + worker_init_fn: Callable = None, + ): # noqa: D107 + if not isinstance(dataset, (torch_data.Dataset, Dataset, partial_dataset.PartialH5Dataset)): + raise TypeError( + f"dataset must be a torch Dataset, heat Dataset, heat PartialH5Dataset, currently: {type(dataset)}" + ) + self.dataset = dataset + if hasattr(self.dataset, "ishuffle"): + self.ishuffle = self.dataset.ishuffle + if isinstance(self.dataset, partial_dataset.PartialH5Dataset): + drop_last = True + + self.DataLoader = torch_data.DataLoader( + dataset=self.dataset, + batch_size=batch_size, + shuffle=True, + batch_sampler=None, + num_workers=num_workers, + collate_fn=collate_fn, + drop_last=drop_last, + pin_memory=pin_memory, + timeout=timeout, + worker_init_fn=worker_init_fn, + ) + self._first_iter = True + self.last_epoch = False + + def __iter__(self) -> Iterator: + """ + Generate a new iterator of a type dependent on the type of dataset. + Returns a :class:`partial_dataset.PartialH5DataLoaderIter` if the dataset is a :class:`partial_dataset.PartialH5Dataset` + :func:`self._full_dataset_shuffle_iter` otherwise + """ + if isinstance(self.dataset, partial_dataset.PartialH5Dataset): + return partial_dataset.PartialH5DataLoaderIter(self) + if hasattr(self, "_full_dataset_shuffle_iter") and hasattr(self.dataset, "ishuffle"): + # if it is a normal heat dataset then this is defined + self._full_dataset_shuffle_iter() + return self.DataLoader.__iter__() + + def __len__(self) -> int: + """ + Get the length of the dataloader. Returns the number of batches. + """ + return self.DataLoader.__len__() + + def _full_dataset_shuffle_iter(self): + # logic for when to shuffle the data + if not self.ishuffle: + if self._first_iter: + self._first_iter = False + else: + # shuffle after the first epoch but before the iterator is generated + self.dataset.Shuffle() + else: + # start the shuffling for the next iteration + if not self.last_epoch: + self.dataset.Ishuffle() + + if self._first_iter: + self._first_iter = False + else: + dataset_irecv(self.dataset) + + +class Dataset(torch_data.Dataset): + r""" + An abstract class representing a given dataset. This inherits from torch.utils.data.Dataset. + + This class is a general example for what should be done to create a Dataset. When creating a dataset all of the + standard attributes should be set, the ``__getitem__``, ``__len__``, and ``shuffle`` functions must be defined. + + - ``__getitem__`` : how an item is given to the network + - ``__len__`` : the number of data elements to be given to the network in total + - ``Shuffle()`` : how the data should be shuffled between the processes. The function shown below is for a dataset composed of only data and without targets. The function :func:`dataset_shuffle` abstracts this. For this function only the dataset and a list of attributes to shuffle are given.\n + - ``Ishuffle()`` : A non-blocking version of ``Shuffle()``, this is handled in the abstract function :func:`dataset_ishuffle`. It works similarly to :func:`dataset_shuffle`. 
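# Editor's illustrative sketch, not part of this patch: a minimal labelled dataset following
# the pattern described above. The import path "tests.utils.data.datatools" and every name in
# this snippet are assumptions made for the example only.
import heat as ht
from tests.utils.data.datatools import DataLoader, Dataset, dataset_shuffle

class LabelledDataset(Dataset):
    def __init__(self, data, targets, ishuffle=False):
        super().__init__(data, ishuffle=ishuffle)
        self.httargets = targets                         # global targets (DNDarray)
        self.targets = targets.larray[self._cut_slice]   # local targets, trimmed like the data

    def __getitem__(self, index):
        return self.data[index], self.targets[index]

    def Shuffle(self):
        # shuffle data and targets consistently across all processes
        dataset_shuffle(self, attrs=[["data", "htdata"], ["targets", "httargets"]])

x = ht.random.randn(100, 4, split=0)
y = ht.zeros((100, 1), split=0)  # targets kept 2-D so the same cut slice applies
loader = DataLoader(LabelledDataset(x, y), batch_size=8)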
+ + As the amount of data across processes can be non-uniform, the dataset class will slice off the remaining elements + on whichever processes have more data than the others. This should only be 1 element. + The shuffle function will shuffle all of the data on the process. + + It is recommended that for ``DNDarray`` s, the split is either 0 or None + + Parameters + ---------- + array : DNDarray + DNDarray for which to great the dataset + transform : Callable + Transformation to call before a data item is returned + ishuffle : bool, optional + flag indicating whether to use non-blocking communications for shuffling the data between epochs + Note: if ``True``, the ``Ishuffle()`` function must be defined within the class\n + Default: False + + Attributes + ---------- + These are the required attributes. + + htdata : DNDarray + Full data + _cut_slice : slice + Slice to cut off the last element to get a uniform amount of data on each process + comm : MPICommunicator + Communication object used to send the data between processes + lcl_half : int + Half of the number of data elements on the process + data : torch.Tensor + The local data to be used in training + transforms : Callable + Transform to be called during the getitem function + ishuffle : bool + Flag indicating if non-blocking communications are used for shuffling the data between epochs + """ + + def __init__( + self, + array, + transforms: Optional[Union[List, Callable]] = None, + ishuffle: Optional[bool] = False, + test_set: Optional[bool] = False, + ): # noqa: D107 + self.htdata = array + self.comm = array.comm + self.test_set = test_set + # create a slice to create a uniform amount of data on each process + min_data_split = array.gshape[array.split] // array.comm.size + self.lcl_half = min_data_split // 2 + arb_slice = [slice(None)] * array.ndim + arb_slice[array.split] = slice(min_data_split) + self._cut_slice = tuple(arb_slice) + self.data = array._DNDarray__array[self._cut_slice] + if not isinstance(transforms, (list, tuple)) and transforms is not None: + transforms = [transforms] + self.transforms = transforms + self.ishuffle = ishuffle + + def __getitem__(self, index: Union[int, slice, tuple, list, torch.Tensor]) -> torch.Tensor: + """ + Basic form of __getitem__. As the dataset is often very specific to the dataset, + this should be overwritten by the user. In this form it only gets the raw items from the data. + """ + if self.transforms: + return self.transforms[0](self.data[index]) + return self.data[index] + + def __len__(self) -> int: + """ + Get the number of items in the dataset. This should be overwritten by custom datasets + """ + return self.data.shape[0] + + def Shuffle(self): + """ + Send half of the local data to the process ``self.comm.rank + 1`` if available, else wrap around. After + receiving the new data, shuffle the local tensor. + """ + if not self.test_set: + dataset_shuffle(dataset=self, attrs=[["data", "htdata"]]) + + def Ishuffle(self): + """ + Send half of the local data to the process ``self.comm.rank + 1`` if available, else wrap around. After + receiving the new data, shuffle the local tensor. + """ + if not self.test_set: + dataset_ishuffle(dataset=self, attrs=[["data", "htdata"]]) + + +class DistributedDataset(torch_data.Dataset): + """ + A DistributedDataset for usage in PyTorch. Saves the dndarray and the larray tensor. Uses the larray tensor + for the distribution and getting the items. Intented to be used with DistributedSampler. 
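# Editor's illustrative sketch, not part of this patch: wrapping a split-0 DNDarray for use
# with PyTorch; the import path is an assumption.
import heat as ht
from tests.utils.data.datatools import DistributedDataset

x = ht.random.randn(1000, 16, split=0)  # distributed along axis 0
dset = DistributedDataset(x)            # any other split raises a ValueError
item = dset[0]                          # torch.Tensor taken from the process-local larray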
+ """ + + def __init__(self, dndarray: DNDarray, transforms: torchvision.transforms.Compose = None): + if not isinstance(dndarray, DNDarray): + raise TypeError(f"Expected DNDarray but got {type(dndarray)}") + if dndarray.split != 0: + raise ValueError("DistributedDataset only works with a DNDarray split of 0") + + self.dndarray = dndarray + self.transforms = transforms + + def __len__(self) -> int: + return len(self.dndarray.larray) + + def __getitem__(self, index): + item = self.dndarray.larray[index] + if self.transforms is not None: + return self.transforms(item) + return item + + def __getitems__(self, indices): + if self.transforms is not None: + return tuple(self.transforms(self.dndarray.larray[index]) for index in indices) + return tuple(self.dndarray.larray[index] for index in indices) + + +class DistributedSampler(torch_data.Sampler): + """ + A DistributedSampler for usage in PyTorch with Heat Arrays. Uses the nature of the Heat DNDArray + to give the locally stored data on the larray. Shuffling is done by shuffling the indices. + The given Indices corrospond to the index of the larray tensor. + Works only with DNDarray that are split on axis 0 + """ + + def __init__( + self, + dataset: DistributedDataset, + shuffle: bool = False, + seed: Optional[int] = None, + shuffle_type: Literal["global"] | Literal["local"] = "global", + correction: bool = False, + ) -> None: + """ + Parameters + ---------- + dataset : DistributedDataset + Dataset to be shuffled + shuffle : bool, optional + If the underlying DNDarray should be shuffled, by default False + seed : int, optional + seed for shuffling, by default None + shuffle_type : Literal["global"] | Literal["local"], optional + Wether to shuffle process local or get new data using by shuffling globally across all processes, by default "global" + correction : bool, optional + If index correction is wanted after an global shuffle, by default False + """ + if not isinstance(dataset, DistributedDataset): + raise TypeError(f"Expected DistributedDataset for dataset not {type(dataset)}") + if not isinstance(shuffle, bool): + raise TypeError(f"Expected bool for shuffle not {type(shuffle)}") + if not isinstance(seed, int) and seed is not None: + raise TypeError(f"Expected int or None for seed not {type(shuffle)}") + if not isinstance(shuffle_type, str): + raise TypeError("Shuffle Type needs to be an string") + if not isinstance(correction, bool): + raise TypeError("Correction Parameter needs to be an bool") + + self.dataset = dataset + self.dndarray = dataset.dndarray + self.shuffle = shuffle + self.linked_sampler = None + self.correction = correction + self.set_shuffle_type(shuffle_type) + self.set_seed(seed) + + if self.dndarray.split != 0: + raise ValueError("DistributedSampler only works with a DNDarray split of 0") + + @staticmethod + def _in_slice(idx: int, a_slice: slice) -> bool: + """Check if the given index is inside the given slice + + Parameters + ---------- + idx : int + Index to check + a_slice : slice + Slice to check + + Returns + ------- + bool + Wether index is in slice + """ + if idx < a_slice.start or idx >= a_slice.stop: + return False + step = a_slice.step if a_slice.step else 1 + if (idx - a_slice.start) % step == 0: + return True + else: + return False + + def _shuffle(self) -> None: + """Shuffles the given dndarray at creation across processes.""" + if self.shuffle_type == "local": + rand_perm = torch.randperm(self.dndarray.larray.shape[0]) + self.dndarray.larray = self.dndarray.larray[rand_perm] + return + + if 
self.shuffle_type != "global": + raise ValueError("Shuffle type is not 'local' nor 'global'") + + # TODO: Find out which implementation is better + # self.dndarray = permutation(self.dndarray) + # self.dataset.dndarray = self.dndarray + self._alltoall_shuffle() + + def _alltoall_shuffle(self) -> None: + # Exchanges the data using Indexed data types and i iaj + dtype = self.dndarray.dtype.torch_type() + comm: MPICommunication = self.dndarray.comm + rank: int = comm.rank + world_size: int = comm.size + N: int = self.dndarray.gshape[0] + mpi_type: mpi4py.MPI.Datatype = comm._MPICommunication__mpi_type_mappings[dtype] + + if rank == 0: + indices = torch.randperm(N, dtype=torch.int64) + else: + indices = torch.empty(N, dtype=torch.int64) + mpi4py.MPI.COMM_WORLD.Bcast(indices, root=0) + + indice_buffers: List[List[int]] = [list() for _ in range(world_size)] + rank_slices: List[slice] = [ + comm.chunk((N,), split=0, rank=i)[-1][0] for i in range(world_size) + ] + + block_length: int = reduce(lambda a, b: a * b, self.dndarray.gshape[1:], 1) + local_slice: slice = rank_slices[rank] + local_displacement: int = self.dndarray.counts_displs()[1][rank] * block_length + + # Now figure out which rank needs to send what to each rank and what this rank will receive + for i, idx in enumerate(indices): + idx = idx.item() + for data_send_rank, tslice in enumerate(rank_slices): + if not self._in_slice(idx, tslice): + continue + break + for data_recv_rank, tslice in enumerate(rank_slices): + if not self._in_slice(i, tslice): + continue + break + if data_recv_rank == rank: + indice_buffers[rank].append(idx) + elif data_send_rank == rank: + indice_buffers[data_recv_rank].append(idx) + + # print("RECV BUFFER creating...", flush=True) + send_elems_dtype: List[mpi4py.MPI.Datatype] = list() + local_recv_buffer: torch.Tensor = torch.empty(self.dndarray.larray.shape, dtype=dtype) + + for current_rank in range(world_size): + if current_rank == rank: + send_indice = [ + idx for idx in indice_buffers[current_rank] if self._in_slice(idx, local_slice) + ] + else: + send_indice = indice_buffers[current_rank] + displacements = [ + mpi_type.Get_size() * (disp * block_length - local_displacement) + for disp in send_indice + ] + block_lengths = [block_length] * len(displacements) + send_type = mpi_type.Create_struct( + blocklengths=block_lengths, + displacements=displacements, + datatypes=[mpi_type] * len(displacements), + ) + send_type.Commit() + send_elems_dtype.append(send_type) + + recv_counts = torch.zeros(world_size, dtype=torch.int64) + for idx in indice_buffers[rank]: + for i, tslice in enumerate(rank_slices): + if not self._in_slice(idx, tslice): + continue + recv_counts[i] += 1 + break + + send_elems = self.dndarray.larray + send_elems = send_elems if GPU_AWARE_MPI else send_elems.cpu() + + recv_types: List[mpi4py.MPI.Datatype] = [] + + total_displ = 0 + + for i in range(world_size): + if recv_counts[i] == 0: + recv_type = mpi_type.Create_contiguous(0) + else: + types = [mpi_type.Create_contiguous(block_length) for _ in range(recv_counts[i])] + + displ = torch.zeros(len(types), dtype=torch.int64) + displ[1:] = torch.cumsum(torch.tensor([t.Get_size() for t in types])[:-1], 0) + displ += total_displ + + recv_type = mpi_type.Create_struct( + blocklengths=[1] * len(types), displacements=displ.tolist(), datatypes=types + ) + total_displ += sum([t.Get_size() for t in types]) + + recv_type.Commit() + recv_types.append(recv_type) + + mpi4py.MPI.COMM_WORLD.Alltoallw( + (send_elems, send_elems_dtype), + (local_recv_buffer, 
recv_types), + ) + + for elem in itertools.chain(recv_types, send_elems_dtype): + elem.Free() + + # As MPI indirectly sorts the data according to the rank we need + # to change that to represent the permutation + if self.correction: + + def get_from_rank(idx): + for i, rslice in enumerate(rank_slices): + if self._in_slice(idx, rslice): + return i + raise RuntimeError("IDX not found in slices") + + idx_to_rank_map = [get_from_rank(idx) for idx in indices[local_slice]] + + sort_idx = torch.argsort(torch.tensor(idx_to_rank_map), stable=True) + local_slices_sorted = indices[local_slice][sort_idx] + + reverse_index = {idx.item(): i for i, idx in enumerate(indices[local_slice])} + idxmap = {i: reverse_index[idx.item()] for i, idx in enumerate(local_slices_sorted)} + + for i, dest in idxmap.items(): + self.dndarray.larray[dest] = local_recv_buffer[i].to(self.dndarray.larray.device) + else: + self.dndarray.larray = local_recv_buffer.to(self.dndarray.larray.device) + + def set_shuffle_type(self, shuffle_type: Literal["global"] | Literal["local"]) -> None: + """Sets the Shuffle type for the Sampler. + + Parameters + ---------- + shuffle_type : Literal["global"] | Literal["local"] + - Local Shuffle means the shuffle of the larray only. + - Global Shuffle means the shuffle across all processes + + Raises + ------ + TypeError + Shuffle type needs to be a string + ValueError + Only Global/Local shuffle types exist + """ + if not isinstance(shuffle_type, str): + raise TypeError("Shuffle type needs to be an string") + if not (shuffle_type == "global" or shuffle_type == "local"): + raise ValueError("only 'global' or 'local' allowed as shuffle type") + + self.shuffle_type: Literal["global"] | Literal["local"] = shuffle_type + + if self.linked_sampler is not None: + self.linked_sampler.set_shuffle_type(shuffle_type) + + def set_seed(self, value: int | None) -> None: + """Sets the seed for the torch.randperm + + Parameters + ---------- + value : int + seed to set + """ + self._seed = value + if value is not None: + torch.manual_seed(value) + if self.shuffle: + self._shuffle() + + if self.linked_sampler is not None: + self.linked_sampler.set_seed(value) + + def link(self, sampler: "DistributedSampler") -> None: + """ + Links another DistributedSampler to this one, to automatically sets the seed/shuffle_type of this and the linked one, + rather than manually setting both seperately. Usefull when one Sampler contains training data and the + linked one the label data. + """ + if not isinstance(sampler, DistributedSampler): + raise TypeError(f"Sampler of type {type(sampler)} needs to be an DistributedSampler") + self.linked_sampler = sampler + + def unlink(self) -> None: + """ + Removes an established link. For more info view :link: function + """ + self.linked_sampler = None + + def __iter__(self) -> Iterator[int]: + if self.shuffle_type == "local": + self.indices = torch.randperm(len(self.dndarray.larray)).tolist() + else: + self.indices = list(range(len(self.dndarray.larray))) + return iter(self.indices) + + def __len__(self) -> int: + return len(self.dndarray.larray) + + +def create_train_val_split( + X: DNDarray, y: DNDarray, p: float = 0.95, seed: int | None = None +) -> tuple[DNDarray, DNDarray, DNDarray, DNDarray]: + """Shuffles the data and then creates the train val split. 
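# Editor's illustrative sketch, not part of this patch: pairing two samplers so that data and
# labels are shuffled identically; the import path and all names are assumptions.
import heat as ht
from torch.utils.data import DataLoader as TorchDataLoader
from tests.utils.data.datatools import DistributedDataset, DistributedSampler

X = ht.random.randn(256, 8, split=0)
y = ht.arange(256, split=0)

X_set, y_set = DistributedDataset(X), DistributedDataset(y)
# identical seeds keep rows and labels aligned after the global shuffle
X_sampler = DistributedSampler(X_set, shuffle=True, seed=42, shuffle_type="global")
y_sampler = DistributedSampler(y_set, shuffle=True, seed=42, shuffle_type="global")
X_sampler.link(y_sampler)  # later set_seed/set_shuffle_type calls also reach y_sampler

loader = TorchDataLoader(X_set, batch_size=32, sampler=X_sampler)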
+ + Parameters + ---------- + X : DNDarray + Training Data + y : DNDarray + Training Labels + p : float, optional + How much the training should contain, by default 0.95 + seed : int | None, optional + Random Seed to be used, by default None + + Returns + ------- + tuple[DNDarray, DNDarray, DNDarray, DNDarray] + returns tuple of (train_arr, train_labels_arr, val_arr, val_labels_arr) + """ + if seed is None: + seed = random.randint(-0x8000_0000_0000_0000, 0xFFFF_FFFF_FFFF_FFFF) + + for arr in [X, y]: + dset = DistributedDataset(arr) + _ = DistributedSampler(dset, shuffle=True, seed=seed) + + train_rows = int(X.lshape[0] * p) + val_rows = X.lshape[0] - train_rows + + perm = torch.randperm(X.lshape[0]) + + train_idx = perm[:train_rows] + val_idx = perm[-val_rows:] + + assert len(train_idx) + len(val_idx) == X.lshape[0] + + comm = MPI.COMM_WORLD + + total_train_rows = comm.allreduce(train_rows, MPI.SUM) + total_val_rows = comm.allreduce(val_rows, MPI.SUM) + + train_gshape = tuple([total_train_rows, *X.gshape[1:]]) + val_gshape = tuple([total_val_rows, *X.gshape[1:]]) + + train_arr = DNDarray( + X.larray[train_idx], + train_gshape, + X.dtype, + split=0, + device=X.device, + comm=X.comm, + balanced=True, + ) + val_arr = DNDarray( + X.larray[val_idx], val_gshape, X.dtype, split=0, device=X.device, comm=X.comm, balanced=True + ) + + train_labels_gshape = tuple([total_train_rows, *y.gshape[1:]]) + val_labels_gshape = tuple([total_val_rows, *y.gshape[1:]]) + + train_labels_arr = DNDarray( + y.larray[train_idx], + train_labels_gshape, + y.dtype, + split=0, + device=y.device, + comm=y.comm, + balanced=True, + ) + val_labels_arr = DNDarray( + y.larray[val_idx], + val_labels_gshape, + y.dtype, + split=0, + device=y.device, + comm=y.comm, + balanced=True, + ) + + return train_arr, train_labels_arr, val_arr, val_labels_arr + + +def dataset_shuffle(dataset: Union[Dataset, torch_data.Dataset], attrs: List[list]): + """ + Shuffle the given attributes of a dataset across multiple processes. This will send half of the data to rank + 1. + Once the new data is received, it will be shuffled into the existing data on the process. + This function will be called by the DataLoader automatically if ``dataset.ishuffle = False``. + attrs should have the form [[torch.Tensor, DNDarray], ... i.e. [['data', 'htdata`]] assume that all of the attrs have the same dim0 shape as the local data + + Parameters + ---------- + dataset : Dataset + the dataset to shuffle + attrs : List[List[str, str], ... ] + List of lists each of which contains 2 strings. The strings are the handles corresponding to the Dataset + attributes corresponding to the global data DNDarray and the local data of that array, i.e. [["data, "htdata"],] + would shuffle the htdata around and set the correct amount of data for the ``dataset.data`` attribute. For + multiple parameters multiple lists are required. I.e. [["data", "htdata"], ["targets", "httargets"]] + + Notes + ----- + ``dataset.comm`` must be defined for this function to work. + """ + # attrs -> [[torch.Tensor, DNDarray], ...] 
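+ # Illustrative call (the attribute names below match the MNISTDataset defined later in this diff; any dataset + # exposing the same torch-tensor / DNDarray attribute pairs is handled identically): + # dataset_shuffle(dataset, attrs=[["data", "htdata"], ["targets", "httargets"]])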
+ if attrs[0][1] is not None: + prm = torch.randperm(getattr(dataset, attrs[0][1])._DNDarray__array.shape[0]) + else: + prm = torch.randperm(getattr(dataset, attrs[0][0]).shape[0]) + comm = dataset.comm + for att in attrs: + ld = getattr(dataset, att[0]) + snd = ld[: dataset.lcl_half].clone() + snd_shape, snd_dtype, snd_dev = snd.shape, snd.dtype, snd.device + dest = comm.rank + 1 if comm.rank + 1 != comm.size else 0 + # send the top half of the data to the next process + send_wait = comm.Isend(snd, dest=dest) + del snd + new_data = torch.empty(snd_shape, dtype=snd_dtype, device=snd_dev) + src = comm.rank - 1 if comm.rank != 0 else comm.size - 1 + rcv_w = comm.Irecv(new_data, source=src) + send_wait.wait() + rcv_w.wait() + # set the DNDarray data + if att[1] is not None: + getattr(dataset, att[1])._DNDarray__array[: dataset.lcl_half] = new_data + # shuffle all of the data around + shuffled = getattr(dataset, att[1])._DNDarray__array[prm] + getattr(dataset, att[1])._DNDarray__array = shuffled + # set the torch data + setattr(dataset, att[0], shuffled[dataset._cut_slice]) + else: + getattr(dataset, att[0])[: dataset.lcl_half] = new_data + # shuffle all of the data around + shuffled = getattr(dataset, att[0])[prm] + setattr(dataset, att[0], shuffled[dataset._cut_slice]) + + +def dataset_ishuffle(dataset: Union[Dataset, torch_data.Dataset], attrs: List[list]): + """ + Shuffle the given attributes of a dataset across multiple processes, using non-blocking communications. + This will send half of the data to rank + 1. The data must be received by the :func:`dataset_irecv` function. + + This function will be called by the DataLoader automatically if ``dataset.ishuffle = True``. This is set either + during the definition of the class of its initialization by a given paramete. + + Parameters + ---------- + dataset : Dataset + the dataset to shuffle + attrs : List[List[str, str], ... ] + List of lists each of which contains 2 strings. The strings are the handles corresponding to the Dataset + attributes corresponding to the global data DNDarray and the local data of that array, i.e. [["htdata, "data"],] + would shuffle the htdata around and set the correct amount of data for the ``dataset.data`` attribute. For + multiple parameters multiple lists are required. I.e. [["htdata", "data"], ["httargets", "targets"]] + + Notes + ----- + ``dataset.comm`` must be defined for this function to work. + """ + # attrs should have the form [[heat array, sliced array], [...], ...] + # i.e. [['data', 'htdata']] + # assume that all of the attrs have the same dim0 shape as the local data + comm = dataset.comm + ret_list = [] + for att in attrs: + snd = getattr(dataset, att[0])[: dataset.lcl_half].clone() + snd_shape, snd_dtype, snd_dev = snd.shape, snd.dtype, snd.device + dest = comm.rank + 1 if comm.rank + 1 != comm.size else 0 + # send the top half of the data to the next process + send_wait = comm.Isend(snd, dest=dest, tag=99999) + new_data = torch.empty(snd_shape, dtype=snd_dtype, device=snd_dev) + src = comm.rank - 1 if comm.rank != 0 else comm.size - 1 + wait = comm.Irecv(new_data, source=src, tag=99999) + ret_list.append([att, wait, new_data]) + send_wait.wait() + del snd + setattr(dataset, "rcv_list", ret_list) + + +def dataset_irecv(dataset: Union[Dataset, torch_data.Dataset]): + """ + Receive the data sent by the :func:`dataset_ishuffle` function. 
This will wait for the data and then shuffle the + received data into the existing data on the process. + + This function will be called by the DataLoader automatically if ``dataset.ishuffle = True``. This is set either + during the definition of the class or at its initialization via the corresponding parameter. + + Parameters + ---------- + dataset : Dataset + the dataset to shuffle + + Notes + ----- + ``dataset.comm`` must be defined for this function to work. + """ + setattr(dataset, "shuffle_prm", torch.randperm(dataset.data.shape[0])) + rcv_list = getattr(dataset, "rcv_list") + prm = getattr(dataset, "shuffle_prm") + for rcv in rcv_list: + rcv[1].wait() + if rcv[0][1] is not None: + getattr(dataset, rcv[0][1])._DNDarray__array[: dataset.lcl_half] = rcv[2] + # shuffle all of the data around + shuffled = getattr(dataset, rcv[0][1])._DNDarray__array[prm] + getattr(dataset, rcv[0][1])._DNDarray__array = shuffled + # set the torch data + setattr(dataset, rcv[0][0], shuffled[dataset._cut_slice]) + else: + getattr(dataset, rcv[0][0])[: dataset.lcl_half] = rcv[2] + # shuffle all of the data around + shuffled = getattr(dataset, rcv[0][0])[prm] + setattr(dataset, rcv[0][0], shuffled[dataset._cut_slice]) diff --git a/tests/utils/data/matrixgallery.py b/tests/utils/data/matrixgallery.py new file mode 100644 index 0000000000..16c0ba191a --- /dev/null +++ b/tests/utils/data/matrixgallery.py @@ -0,0 +1,204 @@ +""" +Generate matrices for specific tests and functions +""" + +from heat import core +from heat.core.dndarray import DNDarray +from heat.core.communication import Communication +from heat.core.devices import Device +from heat.core.types import datatype, heat_type_is_complexfloating, heat_type_is_exact +from heat.core.random import randn, rand +from heat.core.linalg import qr, matmul +from heat.core.manipulations import diag, sort +from heat.core.exponential import log +from typing import Type, Union, Tuple, Callable + +__all__ = ["hermitian", "parter", "random_known_singularvalues", "random_known_rank"] + + +def hermitian( + n: int, + dtype: Type[datatype] = core.complex64, + split: Union[None, int] = None, + device: Union[None, str, Device] = None, + comm: Union[None, Communication] = None, + positive_definite: bool = False, +) -> DNDarray: + r""" + Generates a random Hermitian matrix of size `(n,n)`. A Hermitian matrix is a complex square matrix that is equal to its conjugate transpose; for real data-types this routine + returns a random symmetric matrix of size `(n,n)`. + + If `positive_definite=True`, the output is given by :math:`\frac{1}{n} R R^H` with :math:`R\in\mathbb{K}^{n\times n}` having entries distributed according to the standard normal distribution. + This corresponds to sampling a random matrix according to the so-called Wishart distribution; see, e.g., [2], and also [3] for additional information regarding the asymptotic distribution of + the singular values. The output matrix will be positive definite with probability 1. + + If `positive_definite=False`, the output is :math:`R+R^H` with :math:`R` generated as above. + + Parameters + ---------- + n : int + size of the resulting square matrix + dtype: Type[datatype], optional + The desired data-type for the array, defaults to ht.complex64; only floating-point data-types allowed. + For real data-types, i.e. float32 and float64, a matrix with real entries (i.e. a symmetric one) is returned. + split: None or int, optional + The axis along which the array content is split and distributed in memory.
+ device: None or str or Device, optional + Specifies the device the tensor shall be allocated on, defaults globally set default device. + comm : Communication, optional + Handle to the nodes holding distributed parts or copies of this array. + positive_definite : bool, optional + If True, the resulting matrix is positive definite, defaults to False. + + References + ---------- + [1] https://en.wikipedia.org/wiki/Hermitian_matrix + [2] https://en.wikipedia.org/wiki/Wishart_distribution + [3] https://en.wikipedia.org/wiki/Marchenko%E2%80%93Pastur_distribution + """ + if heat_type_is_complexfloating(dtype): + real_dtype = core.float32 if dtype is core.complex64 else core.float64 + matrix = randn(n, n, dtype=real_dtype, split=split, device=device, comm=comm) + 1j * randn( + n, n, dtype=real_dtype, split=split, device=device, comm=comm + ) + elif dtype in [core.float32, core.float64]: + matrix = randn(n, n, dtype=dtype, split=split, device=device, comm=comm) + else: + raise ValueError("dtype must be floating-point data-type but is ", dtype, ".") + if positive_definite: + return 1 / n * matrix @ core.conj(matrix).T + + return matrix + core.conj(matrix).T.resplit_(split) + + +def parter( + n: int, + split: Union[None, int] = None, + device: Union[None, str, Device] = None, + comm: Union[None, Communication] = None, + dtype: Type[datatype] = core.float32, +) -> DNDarray: + """ + Generates the Parter matrix, a Toeplitz matrix that has the interesting property of having its singular values cluster at + pi. The matrix has been named so by Cleve Moler in recognition of Seymour Parter's proof of this fact. + + Parameters + ---------- + n : int + size of the resulting square matrix + split: None or int, optional + The axis along which the array content is split and distributed in memory. + device: None or str or Device, optional + Specifies the device the tensor shall be allocated on, defaults globally set default device. + comm: None or Communication, optional + Handle to the nodes holding distributed tensor chunks. + dtype: Type[datatype], optional + The desired data-type for the array, defaults to ht.float64. + + References + ---------- + [1] https://blogs.mathworks.com/cleve/2019/06/24/bohemian-matrices-in-the-matlab-gallery/ + + [2] https://blogs.mathworks.com/cleve/2014/02/03/surprising-svd-square-waves-and-pi/ + + [3] Seymour V. Parter, On the distribution of the singular values of Toeplitz matrices, Linear Algebra and its + Applications 80, 1986, 115-130, http://www.sciencedirect.com/science/article/pii/0024379586902806 + """ + if split is None: + a = core.arange(n, dtype=dtype, device=device, comm=comm) + II = a.expand_dims(0) + JJ = a.expand_dims(1) + elif split == 0: + II = core.arange(n, dtype=dtype, device=device, comm=comm).expand_dims(0) + JJ = core.arange(n, dtype=dtype, split=split, device=device, comm=comm).expand_dims(1) + elif split == 1: + II = core.arange(n, dtype=dtype, split=0, device=device, comm=comm).expand_dims(0) + JJ = core.arange(n, dtype=dtype, device=device, comm=comm).expand_dims(1) + else: + raise ValueError(f"expected split value to be either {{None,0,1}}, but was {split}") + + return 1.0 / (II - JJ + 0.5) + + +def random_orthogonal( + m: int, + n: int, + split: Union[None, int] = None, + device: Union[None, str, Device] = None, + comm: Union[None, Communication] = None, + dtype: Type[datatype] = core.float32, +) -> DNDarray: + """Auxiliary routine: creates a random mxn matrix with orthonormal columns + Caveat: this is done by QR of mxn matrices with i.i.d. 
normal entries, so this does not produce the uniform distribution on the orthogonal matrices... + """ + if n > m: + raise RuntimeError("No orthogonal matrix of shape %d x %d possible." % (m, n)) + + # TODO: if QR does not make problems anymore, replace split=None by split=split + U = randn(m, n, split=None, dtype=dtype, comm=comm, device=device) + Q, _ = qr(U) + + return Q[:, :n].resplit_(split) + + +def random_known_singularvalues( + m: int, + n: int, + singular_values: DNDarray, + split: Union[None, int] = None, + device: Union[None, str, Device] = None, + comm: Union[None, Communication] = None, + dtype: Type[datatype] = core.float32, +) -> Tuple[DNDarray, Tuple[DNDarray]]: + """ + Creates an m x n matrix with singular values given by the entries of the input array singular_values. + Caveat: if the entries of `singular_values` are not sorted, the singular value decomposition of A (returned as second output) is so as well. + The singular vectors are chosen randomly using :func:`random_orthogonal`. + """ + if not isinstance(singular_values, DNDarray): + raise RuntimeError( + f"Argument singular_values needs to be a DNDarray but is {type(singular_values)}." + ) + if singular_values.ndim != 1: + raise RuntimeError( + f"Argument singular_values needs to be a 1D array, but dimension is {singular_values.ndim}." + ) + if singular_values.shape[0] > min(m, n): + raise RuntimeError( + f"Number of given singular values must not exceed matrix dimensions. Got {singular_values.shape[0]} singular values for matrix size ({m}, {n})." + ) + + r = singular_values.shape[0] + U = random_orthogonal(m, r, split=split, device=device, comm=comm, dtype=dtype) + V = random_orthogonal(n, r, split=split, device=device, comm=comm, dtype=dtype) + + A = matmul(U, matmul(diag(singular_values), V.T)) + + return A.resplit_(split), (U, singular_values, V) + + +def random_known_rank( + m: int, + n: int, + r: int, + quantile_function: Callable = lambda x: -log(x), + split: Union[None, int] = None, + device: Union[None, str, Device] = None, + comm: Union[None, Communication] = None, + dtype: Type[datatype] = core.float32, +) -> Tuple[DNDarray, Tuple[DNDarray]]: + """ + Creates a random m x n matrix with rank r. + This routine uses :func:`random_known_singularvalues` with r singular values randomly chosen + w.r.t. the distribution with quantile function given by the input quantile_function. Default yields exponential distibution with parameter lambda=1. + Unlike in :func:`random_known_singularvalues`, here the singular values of the output are sorted in descending order. + """ + if r > min(m, n): + raise RuntimeError("rank must not exceed matrix dimensions.") + + singular_values = rand(r, dtype=dtype, comm=comm, device=device) + singular_values = sort(quantile_function(singular_values), descending=True)[0] + + return random_known_singularvalues( + m, n, singular_values, split=split, device=device, comm=comm, dtype=dtype + ) diff --git a/tests/utils/data/mnist.py b/tests/utils/data/mnist.py new file mode 100644 index 0000000000..007e493445 --- /dev/null +++ b/tests/utils/data/mnist.py @@ -0,0 +1,129 @@ +""" +File for the MNIST dataset definition in heat +""" + +import torch + +from torchvision import datasets +from typing import Callable, Union + +from heat import factories +from heat import datatools + +__all__ = ["MNISTDataset"] + + +class MNISTDataset(datasets.MNIST): + """ + Dataset wrapper for `torchvision.datasets.MNIST `_. + This implements all of the required functions mentioned in :class:`heat.utils.data.Dataset`. 
The ``__getitem__`` and ``__len__`` functions are inherited from + `torchvision.datasets.MNIST `_. + + Parameters + ---------- + root : str + Directory containing the MNIST dataset + train : bool, optional + If True (default), load the training dataset, otherwise the test dataset + transform : Callable, optional + Transform to be applied to the data in the ``__getitem__`` function, default is ``None`` + target_transform : Callable, optional + Transform to be applied to the targets in the ``__getitem__`` function, default is ``None`` + download : bool, optional + If True (default), download the data if it does not already exist in the directory + split : int, optional + The axis along which to split the data when it is loaded into a ``DNDarray`` + ishuffle : bool, optional + Flag indicating whether to use non-blocking communications for shuffling the data between epochs + Note: if True, the ``Ishuffle()`` function must be defined within the class + Default: ``False`` + test_set : bool, optional + If this dataset is the testing set then keep all of the data local + Default: ``False`` + + Attributes + ---------- + htdata : DNDarray + full data + httargets : DNDarray + full target data + comm : communication.MPICommunication + heat communicator for sending data between processes + _cut_slice : slice + slice to remove the last element if all are not equal in length + lcl_half : int + integer value of half of the data on the process + data : torch.Tensor + the local data on a process + targets : torch.Tensor + the local targets on a process + ishuffle : bool + flag indicating if non-blocking communications are used for shuffling the data between epochs + test_set : bool + if this dataset is the testing set then keep all of the data local + + Notes + ----- + For other attributes see `torchvision.datasets.MNIST `_.
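+ + Examples + -------- + A minimal sketch; the ``root`` path and batch size are placeholders, and the loader call assumes the heat + DataLoader wrapper from :func:`heat.utils.data.datatools.DataLoader`: + + >>> dset = MNISTDataset("data/mnist", train=True, split=0) + >>> loader = datatools.DataLoader(dataset=dset, batch_size=64)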
+ """ + + def __init__( + self, + root: str, + train: bool = True, + transform: Callable = None, + target_transform: Callable = None, + download: bool = True, + split: int = 0, + ishuffle: bool = False, + test_set: bool = False, + ): # noqa: D107 + super().__init__( + root, + train=train, + transform=transform, + target_transform=target_transform, + download=download, + ) + if split != 0 and split is not None: + raise ValueError("split must be 0 or None") + split = None if test_set else split + array = factories.array(self.data, split=split) + targets = factories.array(self.targets, split=split) + self.test_set = test_set + self.partial_dataset = False + self.comm = array.comm + self.htdata = array + self.httargets = targets + self.ishuffle = ishuffle + if split is not None: + min_data_split = array.gshape[0] // array.comm.size + arb_slice = slice(min_data_split) + self._cut_slice = arb_slice + self.lcl_half = min_data_split // 2 + self.data = array._DNDarray__array[self._cut_slice] + self.targets = targets._DNDarray__array[self._cut_slice] + else: + self._cut_slice = None + self.lcl_half = array.gshape[0] // 2 + self.data = array._DNDarray__array + self.targets = targets._DNDarray__array + # getitem and len are defined by torch's MNIST class + + def Shuffle(self): + """ + Uses the :func:`datatools.dataset_shuffle` function to shuffle the data between the processes + """ + if not self.test_set: + datatools.dataset_shuffle( + dataset=self, attrs=[["data", "htdata"], ["targets", "httargets"]] + ) + + def Ishuffle(self): + """ + Uses the :func:`datatools.dataset_ishuffle` function to shuffle the data between the processes + """ + if not self.test_set: + datatools.dataset_ishuffle( + dataset=self, attrs=[["data", "htdata"], ["targets", "httargets"]] + ) diff --git a/tests/utils/data/partial_dataset.py b/tests/utils/data/partial_dataset.py new file mode 100644 index 0000000000..9e1ba362b7 --- /dev/null +++ b/tests/utils/data/partial_dataset.py @@ -0,0 +1,360 @@ +""" +Tool for using a dataset which will not fit in memory with neural networks +""" + +import math +import queue +import threading +import torch +import time + +from torch.utils import data as torch_data +from typing import Callable, List, Iterator, Union + +from heat.communication import MPICommunication +from heat.communication import MPI_WORLD + +__all__ = ["PartialH5Dataset", "PartialH5DataLoaderIter"] + + +def queue_thread(q: queue.Queue): + while True: + items = q.get() + if isinstance(items, tuple): + func = items[0] + args = items[1:] + func(*args) + else: + items() + q.task_done() + + +class PartialH5Dataset(torch_data.Dataset): + """ + Create a Dataset object for a dataset which loads portions of data from an HDF5 file. Very similar to + :func:``. This will create 2 threads, one for loading the data from the target file, + and one for converting items before being passed to the network. The conversion is done by the iterator. + A portion of the data of length ``initial_load`` is loaded upon initialization, the rest of the data is loaded + after the loaded data is returned by :func:`PartialH5DataLoaderIter`. This iterator will be used by the HeAT + :func:`heat.utils.data.datatools.DataLoader` automatically with this type of dataset. + + Notes + ----- + H5 datasets require the GIL to load data. This can be a bottleneck if data needs to be loaded multiple times (e.g. + the case for using this dataset). It is recommended to find another way to preprocess the data and avoid using + H5 files for this reason. 
+ + Parameters + ---------- + file: str + H5 file to use + comm: MPICommunication + Global MPI communicator generated by HeAT + dataset_names: Union[str, List[str]], optional + Name/s of dataset/s to load from ``file``. If a string is given, it will be the only dataset loaded. + Default is "data". + transforms : List[Callable], optional + Transforms to apply to the data after it is gotten from the loaded data before it is used by the network. + This should be a list of Callable torch functions for each item returned by the ``__getitem__`` function + of the individual dataset. If a list element is ``None`` then no transform will be applied to the + corresponding element returned by ``__getitem__``. I.e. if ``__getitem__`` returns an image and a label + then the list would look like this: ``transforms = [image_transforms, None]``. If this is ``None``, no + transforms will be applied to any elements. Default is ``None``. + use_gpu : bool, optional + Use GPUs if available. Defaults to True. + validate_set : bool, optional + Load the entire dataset onto each node upon initialization and skip loaded in iterator + This is typically the case needed for validation sets when the network should be tested against the whole + dataset. Default is False. + initial_load : int, optional + How many elements to load from the file in the 0th dimension. Default is 7000 elements + load_length : int, optional + How many elements to load from the file in the iterator. Default is 1000 elements + """ + + def __init__( + self, + file: str, + comm: MPICommunication = MPI_WORLD, + dataset_names: Union[str, List[str]] = "data", + transforms: List[Callable] = None, + use_gpu: bool = True, + validate_set: bool = False, + initial_load: int = 7000, + load_length: int = 1000, + ): # noqa: D107 + import h5py + + super(PartialH5Dataset, self).__init__() + self.ishuffle = False + self.file = file + self.comm = comm + self.transforms = transforms if isinstance(transforms, (list, tuple)) else [transforms] + self.gpu = True if torch.cuda.device_count() > 0 and use_gpu else False + self.torch_device = "cpu" + if torch.cuda.is_available() and use_gpu: + dev_id = MPI_WORLD.rank % torch.cuda.device_count() + self.torch_device = torch.device(f"cuda:{str(dev_id)}") + torch.cuda.set_device(dev_id) + + f = h5py.File(file, "r") + # too much data for the process + fkeys = list(f.keys()) + + sz = f[fkeys[0]].len() + for k in fkeys[1:]: + # ensure that all of the datasets are the same length + if f[k].len() != sz: + raise ValueError(f"all datasets in {file} must be the same length") + self.total_size = sz + # how many indices will go onto each process (len) + self.lcl_full_sz = sz // comm.size + # load data that is half of of the available memory + self.local_data_start = comm.rank * self.lcl_full_sz + self.local_data_end = (comm.rank + 1) * self.lcl_full_sz + + if validate_set or initial_load > self.lcl_full_sz: + # if its the validation set then load the whole dataset for each process + self.lcl_full_sz = sz + self.local_data_start = 0 + self.local_data_end = sz + self.load_initial = sz + self.partial_dataset = False + self.load_len = 0 + self.loads_needed = 0 + else: + self.local_length = self.local_data_end - self.local_data_start + self.load_initial = initial_load + self.load_len = load_length # int(local_data_end / 3) + self.loads_needed = math.ceil(self.lcl_full_sz / self.load_len) + self.partial_dataset = True + + self.loads_left = self.loads_needed + self.load_start = self.local_data_start + self.load_end = self.local_data_start + 
self.load_initial + + # data being loaded from dataset_names parameter + if isinstance(dataset_names, str): + dataset_names = [dataset_names] + self.dataset_names = dataset_names + self.dataset_order = [] + for d in dataset_names: + hld = f[d][self.load_start : self.load_end] + self.__setattr__(d, hld) + self.load_start = self.load_end + self.load_end += self.load_len + f.close() + self.load_thread = None + self.epoch_end = False + # need the number of loads required for an epoch + self.loading_queue = queue.Queue() + self.loading_condition = threading.Condition() + threading.Thread(target=queue_thread, args=[self.loading_queue], daemon=True).start() + self.convert_queue = queue.Queue() + threading.Thread(target=queue_thread, args=[self.convert_queue], daemon=True).start() + self.used_indices = [] + + def Shuffle(self): + """ + Send half of the local data to the process ``self.comm.rank + 1`` if available, else wrap around. After + receiving the new data, shuffle the local tensor. + + Not implemented for partial dataset + """ + return NotImplementedError + + def Ishuffle(self): + """ + Send half of the local data to the process ``self.comm.rank + 1`` if available, else wrap around. After + receiving the new data, shuffle the local tensor. + + Not implemented for partial dataset + """ + return NotImplementedError + + def __getitem__(self, index: Union[int, slice, List[int], torch.Tensor]) -> torch.Tensor: + """ + Abstract __getitem__ method. + This should be defined by the user at runtime. This function needs to be designed such + that the data is in the 0th dimension and the indexes called are only in the 0th dim! + """ + raise NotImplementedError("__getitem__ must be overwritten") + + def __len__(self) -> int: + """ + Get the total length of the dataset + """ + return self.total_size + + def thread_replace_converted_batches(self): + """ + Replace the elements of the dataset with newly loaded elements. :func:'PartialH5DataLoaderIter' will + put the used indices in the ``used_indices`` parameter. This object is reset to an empty list after + these elements are overwritten with new data. + """ + import h5py + + self.loads_left = self.loads_needed + ll = self.loads_left + for _ in range(ll): + with h5py.File(self.file, "r") as f: + for d in self.dataset_names: + hld = f[d][self.load_start : self.load_end] + self.__setattr__("hold" + d, hld) + if self.load_end + self.comm.size > self.total_size: + self.load_end = 0 + self.load_start = self.load_end + self.load_end += self.load_len + + # wait for lock1 *from* convert thread + with self.loading_condition: + self.loading_condition.wait() + for d in self.dataset_names: + new = self.__getattribute__("hold" + d) + dset = self.__getattribute__(d) + new_top = new[: len(self.used_indices)] + lnew = len(new_top) + dset[self.used_indices[:lnew]] = new_top + self.__setattr__(d, dset) + self.__setattr__("hold" + d, new[lnew:]) + # give up lock / notify convert thread + self.used_indices = [] + self.loads_left -= 1 + + +class PartialH5DataLoaderIter(object): + """ + Iterator to be used with :func:'PartialH5Dataset'. It closely mirrors the standard torch iterator while loading + new data to replace the loaded batches automatically. It also pre-fetches the batches and begins their + preparation, collation, and device setting in the background. + """ + + def __init__(self, loader): # noqa: D107 + # todo: make note that h5py is required for this...move load to dataset? 
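+ # ``loader`` is expected to be the heat DataLoader wrapper: ``loader.dataset`` is the PartialH5Dataset instance and + # ``loader.DataLoader`` is the underlying torch DataLoader whose settings (collate_fn, batch size, pin_memory, ...) are mirrored below.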
+ self.dataset = loader.dataset + self._dataset_kind = loader.DataLoader._dataset_kind + self._IterableDataset_len_called = loader.DataLoader._IterableDataset_len_called + self._auto_collation = loader.DataLoader._auto_collation + self._drop_last = loader.DataLoader.drop_last + self._index_sampler = loader.DataLoader._index_sampler + self._num_workers = loader.DataLoader.num_workers + self._pin_memory = loader.DataLoader.pin_memory and torch.cuda.is_available() + self._timeout = loader.DataLoader.timeout + self._collate_fn = loader.DataLoader.collate_fn + self._sampler_iter = iter(self._index_sampler) + self._base_seed = torch.empty((), dtype=torch.int64).random_().item() + self._num_yielded = 0 + self.batch_size = loader.DataLoader.batch_size + self.comm = self.dataset.comm + rand_samp_list = torch.randperm(self.dataset.load_initial).tolist() + + # todo: support other samplers: for now its only random + if self.dataset.partial_dataset: + self.ready_batches = [] + mod_batch = self.dataset.load_len % self.batch_size + if mod_batch != 0: + self.dataset.load_len += self.batch_size - mod_batch + self.dataset.load_end = self.dataset.load_start + self.dataset.load_len + # generate all indices + index_list = [] + idx_repeats = math.ceil(self.dataset.lcl_full_sz / self.dataset.load_initial) + for _ in range(idx_repeats): + index_list.extend(torch.randperm(self.dataset.load_initial).tolist()) + # start the conversion + self.dataset.convert_queue.put((self.__thread_convert_all, index_list)) + + self.length = len(index_list) // self.batch_size + self.dataset.loading_queue.put(self.dataset.thread_replace_converted_batches) + else: + self.rand_samp_list = rand_samp_list + self.length = len(self._index_sampler) + + self._dataset_fetcher = torch_data.dataloader._DatasetKind.create_fetcher( + self._dataset_kind, + loader.DataLoader.dataset, + self._auto_collation, + self._collate_fn, + self._drop_last, + ) + + def __len__(self): + """ + Get the length of the iterator + """ + return self.length + + def _next_data(self): + # get the next batch + if not self.dataset.partial_dataset: + index = next(self._sampler_iter) # may raise StopIteration + data = self._dataset_fetcher.fetch(index) # may raise StopIteration + if self._pin_memory: + data = torch_data._utils.pin_memory.pin_memory(data) + return data + if self._num_yielded == self.__len__(): + raise StopIteration + while len(self.ready_batches) < 1: + time.sleep(0.1) + batch = self.ready_batches.pop(0) + for b in range(len(batch)): + if batch[b].device != self.dataset.torch_device: + batch[b] = batch[b].to(self.dataset.torch_device) + return batch + + def __next__(self): + """ + Get the next batch of data. Shamelessly taken from torch. + """ + # shamelessly taken from torch + data = self._next_data() + self._num_yielded += 1 + # note: the warnings raised by torch for iterable datasets were removed here, look for these in + # the base class of the single process iterator + return data + + def __iter__(self): + """ + Get a new iterator of this class + + Returns + ------- + PartialH5DataLoaderIter + """ + return self + + def __thread_convert_all(self, index_list): + # convert all of the elements, collate them into batches, and send the batches to the correct device + # this function als communicates with the data loading thread from the PartialH5Dataset to notify it + # when it has the correct amount of data to write. + converted_items = [] + for ind in index_list: + # get the desired image/target/... 
to begin composing a batch + single_item = self.dataset[ind] + if not isinstance(single_item, tuple) and self.dataset.transforms[0] is not None: + single_item = self.dataset.transforms[0](single_item) + if isinstance(single_item, tuple): + single_item = list(single_item) + for ii in range(len(single_item)): + # do transforms (have all torch stuff here) + if self.dataset.transforms[ii] is not None: + single_item[ii] = self.dataset.transforms[ii](single_item[ii]) + converted_items.append(single_item) + self.dataset.used_indices.append(ind) + if len(converted_items) == self.batch_size: + if ( + len(self.dataset.used_indices) == self.dataset.load_len + and self.dataset.loads_left > 0 + ): + with self.dataset.loading_condition: + self.dataset.loading_condition.notify() + batch = self._collate_fn(converted_items) + try: + for bb in range(2): + bb_batch = self.ready_batches[bb] + for b in range(len(batch)): + bb_batch[b] = bb_batch[b].to(self.dataset.torch_device) + self.ready_batches[bb] = bb_batch + except IndexError: + pass + self.ready_batches.append(batch) + converted_items = [] diff --git a/tests/utils/data/spherical.py b/tests/utils/data/spherical.py new file mode 100644 index 0000000000..133f25c89a --- /dev/null +++ b/tests/utils/data/spherical.py @@ -0,0 +1,160 @@ +"""Create a spherical dataset.""" + +import heat as ht +import torch + + +def create_spherical_dataset( + num_samples_cluster, radius=1.0, offset=4.0, dtype=ht.float32, random_state=1 +): + """ + Creates k=4 spherical clusters in 3D space along the space diagonal + + Parameters + ---------- + num_samples_cluster: int + Number of samples per cluster. Each process will create n // MPI_WORLD.size elements for each cluster + radius: float + Radius of the sphere + offset: float + Shift of the clusters along the axes. The 4 clusters will be centered around c1=(offset, offset, offset), + c2=(2*offset, 2*offset, 2*offset), c3=(-offset, -offset, -offset) and c4=(-2*offset, -2*offset, -2*offset) + dtype: ht.datatype + Dataset dtype + random_state: int + seed of the torch random number generator + """ + # contains num_samples + + p = ht.MPI_WORLD.size + # create k spherical clusters with n elements per cluster. Each process creates k * n/p elements + num_ele = num_samples_cluster // p + ht.random.seed(random_state) + # radius between 0 and 1 + r = ht.random.rand(num_ele, split=0) * radius + # theta between 0 and pi + theta = ht.random.rand(num_ele, split=0) * 3.1415 + # phi between 0 and 2pi + phi = ht.random.rand(num_ele, split=0) * 2 * 3.1415 + # Cartesian coordinates + x = r * ht.sin(theta) * ht.cos(phi) + x.astype(dtype, copy=False) + y = r * ht.sin(theta) * ht.sin(phi) + y.astype(dtype, copy=False) + z = r * ht.cos(theta) + z.astype(dtype, copy=False) + + cluster1 = ht.stack((x + offset, y + offset, z + offset), axis=1) + cluster2 = ht.stack((x + 2 * offset, y + 2 * offset, z + 2 * offset), axis=1) + cluster3 = ht.stack((x - offset, y - offset, z - offset), axis=1) + cluster4 = ht.stack((x - 2 * offset, y - 2 * offset, z - 2 * offset), axis=1) + + data = ht.concatenate((cluster1, cluster2, cluster3, cluster4), axis=0) + # Note: enhance when shuffle is available + return data + + +def create_clusters( + n_samples, n_features, n_clusters, cluster_mean, cluster_std, cluster_weight=None, device=None +): + """ + Creates a DNDarray of shape (n_samples, n_features), split=0, and dtype=ht.float32, that is balanced (i.e. roughly same size of samples on each process).
+ The data set consists of n_clusters clusters, each of which is sampled from a multivariate normal distribution with mean cluster_mean[k,:] and covariance matrix cluster_std[k,:,:]. + The clusters are of the same size (quantitatively) and distributed evenly over the processes, unless cluster_weight is specified. + + Parameters + ---------- + n_samples: int + Number of overall samples + n_features: int + Number of features + n_clusters: int + Number of clusters + cluster_mean: torch.Tensor of shape (n_clusters, n_features) + featurewise mean (center) of each cluster; of course not the true mean, but rather the mean according to which the elements of the cluster are sampled. + cluster_std: torch.Tensor of shape (n_clusters, n_features, n_features), or (n_clusters,) + featurewise standard deviation of each cluster from the mean value; of course not the true std, but rather the std according to which the elements of the cluster are sampled. + If shape is (n_clusters,), std is assumed to be the same in each direction for each cluster + cluster_weight: torch.Tensor of shape (n_clusters,), optional + On each process, cluster_weight is assumed to be a torch.Tensor whose entries add up to 1. The i-th entry of cluster_weight on process p specified which amount of the samples on process p + is sampled according to the distribution of cluster i. Thus, this parameter allows to distribute the n_cluster clusters unevenly over the processes. + If None, each cluster is distributed evenly over all processes. + device: Optional[str] = None, + The device on which the data is stored. If None, the default device is used. + """ + device = ht.devices.sanitize_device(device) + + if cluster_weight is None: + cluster_weight = torch.ones(n_clusters) / n_clusters + else: + if not isinstance(cluster_weight, torch.Tensor): + raise TypeError( + "cluster_weight must be None or a torch.Tensor, but is {}".format( + type(cluster_weight) + ) + ) + elif not cluster_weight.shape == (n_clusters,): + raise ValueError( + "If a torch.Tensor, cluster_weight must be of shape (n_clusters,), but is {}".format( + cluster_weight.shape + ) + ) + elif not torch.allclose(torch.sum(cluster_weight), torch.tensor(1.0)): + raise ValueError( + "If a torch.Tensor, cluster_weight must add up to 1, but adds up to {}".format( + torch.sum(cluster_weight) + ) + ) + if not isinstance(cluster_mean, torch.Tensor): + raise TypeError("cluster_mean must be a torch.Tensor, but is {}".format(type(cluster_mean))) + elif not cluster_mean.shape == (n_clusters, n_features): + raise ValueError( + "cluster_mean must be of shape (n_clusters, n_features), but is {}".format( + cluster_mean.shape + ) + ) + if not isinstance(cluster_std, torch.Tensor): + raise TypeError("cluster_std must be a torch.Tensor, but is {}".format(type(cluster_std))) + elif not cluster_std.shape == ( + n_clusters, + n_features, + n_features, + ) and not cluster_std.shape == (n_clusters,): + raise ValueError( + "cluster_std must be of shape (n_clusters, n_features, n_features) or (n_clusters,), but is {}".format( + cluster_std.shape + ) + ) + if cluster_std.shape == (n_clusters,): + cluster_std = torch.stack( + [torch.eye(n_features) * cluster_std[k] for k in range(n_clusters)], dim=0 + ) + + global_shape = (n_samples, n_features) + local_shape = ht.MPI_WORLD.chunk(global_shape, 0)[1] + local_size_of_clusters = [int(local_shape[0] * cluster_weight[k]) for k in range(n_clusters)] + if sum(local_size_of_clusters) != local_shape[0]: + local_size_of_clusters[0] += local_shape[0] - 
sum(local_size_of_clusters) + distributions = [ + torch.distributions.multivariate_normal.MultivariateNormal( + cluster_mean[k, :], cluster_std[k] + ) + for k in range(n_clusters) + ] + local_data = [ + distributions[k].sample((local_size_of_clusters[k],)).to(device.torch_device) + for k in range(n_clusters) + ] + local_data = torch.cat(local_data, dim=0) + rand_perm = torch.randperm(local_shape[0], device=device.torch_device) + local_data = local_data[rand_perm, :] + data = ht.DNDarray( + local_data, + global_shape, + dtype=ht.float32, + split=0, + device=device, + comm=ht.MPI_WORLD, + balanced=True, + ) + return data diff --git a/heat/utils/tests/test_vision_transforms.py b/tests/utils/test_vision_transforms.py similarity index 100% rename from heat/utils/tests/test_vision_transforms.py rename to tests/utils/test_vision_transforms.py