diff --git a/.github/workflows/pytests.yml b/.github/workflows/pytests.yml index 0fba0b5..c420878 100644 --- a/.github/workflows/pytests.yml +++ b/.github/workflows/pytests.yml @@ -14,10 +14,10 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ 3.8 ] + python-version: [ 3.9 ] max-parallel: 5 env: - coverage-on-version: 3.8 + coverage-on-version: 3.9 use-mpi: True steps: @@ -39,7 +39,7 @@ jobs: run: conda install pip - name: Install pytest requirements from pip (specific pymatnext dependencies will be automatically installed when it is) - run: pip install wheel setuptools ruff pytest pytest-cov + run: pip install wheel setuptools ruff pytest pytest-cov pytest-timeout - name: Install latest ASE from gitlab run: pip install git+https://gitlab.com/ase/ase.git @@ -72,7 +72,7 @@ jobs: - name: Test with pytest - coverage if: env.coverage-on-version == matrix.python-version run: | - pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -rxXs + pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -s -rxXs # # DEBUGGING # - name: Setup tmate session @@ -84,13 +84,13 @@ jobs: if: ${{ env.use-mpi && env.coverage-on-version != matrix.python-version}} run: | # envvar and test run - No coverage - mpirun -n 2 pytest --with-mpi -k mpi + mpirun -n 2 pytest --with-mpi -k mpi -rxXs - name: MPI tests -- coverage if: ${{ env.use-mpi && env.coverage-on-version == matrix.python-version}} run: | # envvar and coverage Appended to the previous - mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --with-mpi -k mpi --cov-append + mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --cov-append --with-mpi -k mpi -s -rxXs - name: 'Upload Coverage Data' uses: actions/upload-artifact@v4 diff --git a/pymatnext/ns.py b/pymatnext/ns.py index 0a0bba9..6c55ccf 100644 --- a/pymatnext/ns.py +++ b/pymatnext/ns.py @@ -73,6 +73,7 @@ def __init__(self, params_ns, comm, MPI, random_seed, params_configs, output_fil self.max_val = None self.local_configs = [] + self.extra_config = False old_state_files = NS._old_state_files(output_filename_prefix) if len(old_state_files) > 0: @@ -202,9 +203,14 @@ def init_configs(self, params_configs, configs_file=None, extra=False): self.local_configs = [] if self.comm.rank == 0: for config_i, new_config in enumerate(new_configs_generator()): + if config_i == 0: + first_config = new_config if config_i >= self.n_configs_global: raise RuntimeError(f"Got too many configs (expected {self.n_configs_global}) from new config generator {new_configs_generator}") + # Check that all step sizes are the same. Maybe instead we should just copy from first? + assert new_config.step_size == first_config.step_size, f"Mismatched step size for config {config_i} {new_config.step_size} != 0 {first_config.step_size}" + target_rank = config_i // self.max_n_configs_local if target_rank == self.comm.rank: self.local_configs.append(new_config) @@ -352,6 +358,13 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5, print("step_size_tune initial", name, "size", size, "max", max_size, "freq", freq) first_iter = False + # It looks like the following should always give the same values, hence exit + # condition, on all MPI tasks, but this is not guaranteed and can lead to deadlocks + # in the allreduce. The reason is that the value of done_i in the loop depends + # on the value returned from _tune_from_accept_rate, which depends on the previous + # step size, and if those are inconsistent between MPI tasks (as in + # https://github.com/libAtoms/pymatnext/issues/20), a deadlock may occur. + # Only fix is to make sure this doesn't happen (https://github.com/libAtoms/pymatnext/pull/23) done = [] for param_i in range(n_params): if accept_freq[param_i][0] > 0: @@ -369,6 +382,9 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5, new_step_size = {k: v * m for k, v, m in zip(step_size_names, step_size, max_step_size)} for ns_config in self.local_configs: ns_config.step_size = new_step_size + # make sure that config used as buffer also has correct step_size + if self.extra_config: + self.extra_config.step_size = new_step_size # if self.comm.rank == 0: # print("step_size_tune done", list(zip(done, accept_freq, step_size))) diff --git a/tests/assets/do_Morse_ASE/params.toml b/tests/assets/do_Morse_ASE/params.toml index a3eb281..d4e9af9 100644 --- a/tests/assets/do_Morse_ASE/params.toml +++ b/tests/assets/do_Morse_ASE/params.toml @@ -3,7 +3,7 @@ output_filename_prefix = "Morse_ASE" random_seed = 5 - max_iter = 100 + max_iter = 110 stdout_report_interval_s = 10 @@ -11,7 +11,7 @@ [global.step_size_tune] - interval = 1000 + interval = 50 [ns] diff --git a/tests/test_sample.py b/tests/test_sample.py index f76518f..c603e9f 100644 --- a/tests/test_sample.py +++ b/tests/test_sample.py @@ -46,8 +46,13 @@ def test_Morse_ASE_mpi(mpi_tmp_path, monkeypatch): do_Morse_ASE(mpi_tmp_path, monkeypatch, using_mpi=True) @pytest.mark.mpi +# github CI working test takes ~90 s +@pytest.mark.timeout(120, method="thread") def test_Morse_ASE_restart_mpi(mpi_tmp_path, monkeypatch): + import time + t0 = time.time() do_Morse_ASE_restart(mpi_tmp_path, monkeypatch, using_mpi=True) + print("BOB time", time.time() - t0) @pytest.mark.mpi def test_EAM_LAMMPS_mpi(mpi_tmp_path, monkeypatch): @@ -151,13 +156,13 @@ def do_Morse_ASE(tmp_path, monkeypatch, using_mpi, max_iter=None): # files exist assert (tmp_path / 'Morse_ASE.test.NS_samples').is_file() assert len(list(tmp_path.glob('Morse_ASE.test.traj.*xyz'))) == 1 - assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == max_iter_use // traj_interval + assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == int(np.ceil(max_iter_use / traj_interval)) - # from test run 12/8/2022 + # from test run 6/13/2025, when max iter was extended to 110 to catch deadlock in old buggy snapshot step_size writing if using_mpi: - samples_fields_ref = np.asarray([9.90000000e+01, 8.04691943e+00, 1.28925862e+04, 1.60000000e+01]) + samples_fields_ref = np.asarray([1.09000000e+02, 8.02719236e+00, 1.28609799e+04, 1.60000000e+01]) else: - samples_fields_ref = np.asarray([9.90000000e+01, 8.08011910e+00, 1.29457779e+04, 1.60000000e+01]) + samples_fields_ref = np.asarray([1.09000000e+02, 8.06422068e+00, 1.29203058e+04, 1.60000000e+01]) with open(tmp_path / 'Morse_ASE.test.NS_samples') as fin: for l in fin: