Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .github/workflows/pytests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ 3.8 ]
python-version: [ 3.9 ]
max-parallel: 5
env:
coverage-on-version: 3.8
coverage-on-version: 3.9
use-mpi: True

steps:
Expand All @@ -39,7 +39,7 @@ jobs:
run: conda install pip

- name: Install pytest requirements from pip (specific pymatnext dependencies will be automatically installed when it is)
run: pip install wheel setuptools ruff pytest pytest-cov
run: pip install wheel setuptools ruff pytest pytest-cov pytest-timeout

- name: Install latest ASE from gitlab
run: pip install git+https://gitlab.com/ase/ase.git
Expand Down Expand Up @@ -72,7 +72,7 @@ jobs:
- name: Test with pytest - coverage
if: env.coverage-on-version == matrix.python-version
run: |
pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -rxXs
pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -s -rxXs

# # DEBUGGING
# - name: Setup tmate session
Expand All @@ -84,13 +84,13 @@ jobs:
if: ${{ env.use-mpi && env.coverage-on-version != matrix.python-version}}
run: |
# envvar and test run - No coverage
mpirun -n 2 pytest --with-mpi -k mpi
mpirun -n 2 pytest --with-mpi -k mpi -rxXs

- name: MPI tests -- coverage
if: ${{ env.use-mpi && env.coverage-on-version == matrix.python-version}}
run: |
# envvar and coverage Appended to the previous
mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --with-mpi -k mpi --cov-append
mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --cov-append --with-mpi -k mpi -s -rxXs

- name: 'Upload Coverage Data'
uses: actions/upload-artifact@v4
Expand Down
16 changes: 16 additions & 0 deletions pymatnext/ns.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def __init__(self, params_ns, comm, MPI, random_seed, params_configs, output_fil
self.max_val = None

self.local_configs = []
self.extra_config = False

old_state_files = NS._old_state_files(output_filename_prefix)
if len(old_state_files) > 0:
Expand Down Expand Up @@ -202,9 +203,14 @@ def init_configs(self, params_configs, configs_file=None, extra=False):
self.local_configs = []
if self.comm.rank == 0:
for config_i, new_config in enumerate(new_configs_generator()):
if config_i == 0:
first_config = new_config
if config_i >= self.n_configs_global:
raise RuntimeError(f"Got too many configs (expected {self.n_configs_global}) from new config generator {new_configs_generator}")

# Check that every config has the same step size as the first one. Alternatively, we could just copy the step size from the first config.
assert new_config.step_size == first_config.step_size, f"Mismatched step size for config {config_i} {new_config.step_size} != 0 {first_config.step_size}"

target_rank = config_i // self.max_n_configs_local
if target_rank == self.comm.rank:
self.local_configs.append(new_config)
Expand Down Expand Up @@ -352,6 +358,13 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
print("step_size_tune initial", name, "size", size, "max", max_size, "freq", freq)
first_iter = False

# It looks like the following should always give the same values, and hence the same
# exit condition, on all MPI tasks, but this is not guaranteed, and a mismatch can lead
# to a deadlock in the allreduce. The reason is that the value of done_i in the loop
# depends on the value returned from _tune_from_accept_rate, which depends on the previous
# step size, and if those are inconsistent between MPI tasks (as in
# https://github.com/libAtoms/pymatnext/issues/20), a deadlock may occur.
# The only fix is to make sure this doesn't happen (https://github.com/libAtoms/pymatnext/pull/23)
done = []
for param_i in range(n_params):
if accept_freq[param_i][0] > 0:
Expand All @@ -369,6 +382,9 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
new_step_size = {k: v * m for k, v, m in zip(step_size_names, step_size, max_step_size)}
for ns_config in self.local_configs:
ns_config.step_size = new_step_size
# make sure that the config used as a buffer also has the correct step_size
if self.extra_config:
self.extra_config.step_size = new_step_size

# if self.comm.rank == 0:
# print("step_size_tune done", list(zip(done, accept_freq, step_size)))
Expand Down
4 changes: 2 additions & 2 deletions tests/assets/do_Morse_ASE/params.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
output_filename_prefix = "Morse_ASE"
random_seed = 5

max_iter = 100
max_iter = 110

stdout_report_interval_s = 10

snapshot_interval = 50

[global.step_size_tune]

interval = 1000
interval = 50

[ns]

Expand Down
13 changes: 9 additions & 4 deletions tests/test_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,13 @@ def test_Morse_ASE_mpi(mpi_tmp_path, monkeypatch):
do_Morse_ASE(mpi_tmp_path, monkeypatch, using_mpi=True)

@pytest.mark.mpi
# On GitHub CI a passing run of this test takes ~90 s; the 120 s timeout below
# is presumably there so that a hang fails the test instead of stalling the job.
@pytest.mark.timeout(120, method="thread")
def test_Morse_ASE_restart_mpi(mpi_tmp_path, monkeypatch):
# Run the Morse/ASE restart scenario with using_mpi=True and print the elapsed
# wall-clock time of the call.
import time
t0 = time.time()
do_Morse_ASE_restart(mpi_tmp_path, monkeypatch, using_mpi=True)
# NOTE(review): "BOB time" looks like a leftover debugging label - confirm whether it should stay.
print("BOB time", time.time() - t0)

@pytest.mark.mpi
def test_EAM_LAMMPS_mpi(mpi_tmp_path, monkeypatch):
Expand Down Expand Up @@ -151,13 +156,13 @@ def do_Morse_ASE(tmp_path, monkeypatch, using_mpi, max_iter=None):
# files exist
assert (tmp_path / 'Morse_ASE.test.NS_samples').is_file()
assert len(list(tmp_path.glob('Morse_ASE.test.traj.*xyz'))) == 1
assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == max_iter_use // traj_interval
assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == int(np.ceil(max_iter_use / traj_interval))

# from test run 12/8/2022
# from test run 6/13/2025, when max iter was extended to 110 to catch deadlock in old buggy snapshot step_size writing
if using_mpi:
samples_fields_ref = np.asarray([9.90000000e+01, 8.04691943e+00, 1.28925862e+04, 1.60000000e+01])
samples_fields_ref = np.asarray([1.09000000e+02, 8.02719236e+00, 1.28609799e+04, 1.60000000e+01])
else:
samples_fields_ref = np.asarray([9.90000000e+01, 8.08011910e+00, 1.29457779e+04, 1.60000000e+01])
samples_fields_ref = np.asarray([1.09000000e+02, 8.06422068e+00, 1.29203058e+04, 1.60000000e+01])

with open(tmp_path / 'Morse_ASE.test.NS_samples') as fin:
for l in fin:
Expand Down