Skip to content

Simulation Tests (Nightly) #231

Simulation Tests (Nightly)

Simulation Tests (Nightly) #231

name: Simulation Tests (Nightly)

# Long-running and large-scale simulation tests that are too slow for PR CI.
# Medium-scale fdev tests (50 nodes, fault tolerance, high latency, churn
# resilience) now run in the main CI workflow. This workflow covers:
#   - Nightly-gated nextest tests (1h virtual time, 250-contract scale)
#   - Large scale (500 nodes, 10000 events)

on:
  schedule:
    # Run at 3 AM UTC every day
    - cron: "0 3 * * *"
  workflow_dispatch:
    # Allow manual triggering from any branch
    inputs:
      ref:
        description: "Git ref to checkout (branch, tag, or SHA). Leave empty to use the branch selected in the UI."
        required: false
        default: ""
      seed:
        description: "Simulation seed for reproducibility (hex, e.g., 0xDEADBEEF)"
        required: false
        default: ""

# Cancel in-progress runs when a new run for the same target is triggered.
# The group is keyed on the ref that is actually checked out (the `ref` input
# when provided, otherwise the triggering ref). Keying on `github.ref` alone
# would let a manual run against a different branch/tag/SHA cancel the
# scheduled nightly run that shares the same triggering branch.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.inputs.ref || github.ref }}
  cancel-in-progress: true
jobs:
# Builds fdev once, then runs the nightly simulation suite in order:
# a medium-scale fdev run with an alternate seed, the nextest tests gated by
# the nightly_tests feature, and finally the 500-node large-scale fdev run.
large-scale-simulation:
  name: Large Scale Simulation
  runs-on: ubicloud-standard-16
  timeout-minutes: 150
  env:
    RUST_LOG: info,turmoil=warn
    CARGO_TARGET_DIR: ${{ github.workspace }}/target
    # 16 MiB. Quoted so the value stays a string instead of a YAML integer.
    RUST_MIN_STACK: "16777216"
    # Use mold linker to avoid rust-lld crashes
    CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER: clang
    CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: "-C link-arg=-fuse-ld=mold"
    # Simulation seed - use input if provided, otherwise use fixed seed for reproducibility
    # NOTE(review): no active step in this job references SIMULATION_SEED
    # explicitly (both fdev steps pass a fixed --seed); presumably the nextest
    # simulation tests read it from the environment - confirm.
    SIMULATION_SEED: ${{ github.event.inputs.seed || '0xDEADBEEF' }}
  steps:
    - uses: actions/checkout@v7
      with:
        ref: ${{ github.event.inputs.ref || github.ref }}

    - name: Show checked out ref
      run: |
        # `git rev-parse --abbrev-ref HEAD` prints the literal "HEAD" and exits 0
        # on a detached checkout (tag or SHA), so a `||` fallback after it never
        # runs. `git symbolic-ref --quiet` exits non-zero when HEAD is detached,
        # which lets the fallback print the commit SHA instead.
        echo "Checked out ref: $(git symbolic-ref --quiet --short HEAD || git rev-parse HEAD)"
        echo "Commit: $(git rev-parse HEAD)"

    - uses: dtolnay/rust-toolchain@stable
      with:
        toolchain: 1.93.0

    - uses: Swatinem/rust-cache@v2
      with:
        prefix-key: simulation-large
        # Only runs on main write the cache; other refs restore but never save.
        save-if: ${{ github.ref == 'refs/heads/main' }}

    - name: Install mold linker
      run: |
        sudo apt-get update
        sudo apt-get install -y mold

    - name: Install nextest
      run: curl -LsSf https://get.nexte.st/latest/linux | tar zxf - -C ${CARGO_HOME:-~/.cargo}/bin

    - name: Clean test directories
      run: |
        rm -rf /tmp/freenet /tmp/freenet-* 2>/dev/null || true

    # Build fdev for simulation tests
    - name: Build fdev
      run: cargo build -p fdev --release

    # Medium scale with alternate seed (explores different code paths).
    # The primary seed (0xDEADBEEF) runs in PR CI; this covers a second path.
    - name: Medium scale test (50 nodes, seed 0xCAFEBABE)
      run: |
        target/release/fdev test \
          --name "nightly-medium-50-alt" \
          --seed 0xCAFEBABE \
          --gateways 4 --nodes 46 --events 2000 \
          --ring-max-htl 12 --max-connections 20 --min-connections 6 \
          --latency-min 10 --latency-max 50 \
          --min-success-rate 1.0 \
          --print-summary --print-network-stats \
          single-process

    # Nightly-gated simulation tests (long-running, high-scale regression tests)
    # Uses Turmoil deterministic scheduler for reproducible results
    # Includes:
    #   - test_long_running_deterministic: 1 hour virtual time
    #   - test_subscription_renewal_at_scale: 250 contracts regression test
    # NOTE: Gated by nightly_tests feature — does NOT run in regular CI
    - name: Nightly simulation tests (long-running + high-scale)
      run: |
        echo "Running all nightly simulation tests (gated by nightly_tests feature)"
        cargo nextest run -p freenet \
          --features "simulation_tests,testing,nightly_tests" \
          --test simulation_integration \
          --test-threads 1 \
          --no-capture \
          --profile nightly

    # Large scale test (500 nodes) - uses direct runner (no turmoil overhead)
    # With simulation-mode timer optimizations (~5x fewer timer firings),
    # completes in ~4 min locally for 10000 events.
    - name: Large scale test (500 nodes)
      # Step-level cap, tighter than the 150-minute job timeout.
      timeout-minutes: 60
      env:
        # Overrides the job-level RUST_LOG for this step only.
        RUST_LOG: warn
      run: |
        echo "Running large scale simulation (500 nodes, 10000 events)"
        target/release/fdev test \
          --name "nightly-large-500" \
          --seed 0x500BEEF \
          --gateways 10 \
          --nodes 490 \
          --events 10000 \
          --ring-max-htl 15 \
          --max-connections 30 \
          --min-connections 10 \
          --latency-min 10 \
          --latency-max 50 \
          --min-success-rate 1.0 \
          --print-summary \
          --print-network-stats \
          single-process
# Real-process soak test - disabled until the test runs properly
# TODO: Re-enable once large_network test is stable
# - name: Run large network soak test (50+ real nodes)
# run: |
# echo "Running large network soak test with seed: $SIMULATION_SEED"
# if command -v riverctl &> /dev/null; then
# cargo nextest run -p freenet --test large_network \
# -E 'test(large_network_soak)' \
# --test-threads 1 \
# --no-capture \
# --profile nightly
# else
# echo "Skipping large_network test - riverctl not installed"
# fi
# Notify the official (public) Freenet River room on failure.
#
# This used to post to Matrix via a direct curl against the matrix.org REST
# API using a MATRIX_ACCESS_TOKEN secret. That token expired at some point
# and every nightly failure since 2026-04-07 was also failing to deliver
# the notification (HTTP 401 M_UNKNOWN_TOKEN). Rather than rotate another
# standalone token, consolidate on the same transport
# `river_pr_merge_notify.yml` already uses: `riverctl` against the Freenet
# gateway, authenticated with the shared `RIVER_SIGNING_KEY` secret. One
# set of credentials to maintain, and the message ends up in the same
# Freenet room subscribers are already watching.
# Best-effort failure notification: installs riverctl and posts one message
# describing the failed run, retrying with exponential backoff. The job never
# fails the workflow (continue-on-error plus an unconditional `exit 0`).
notify-failure:
  name: Notify River on Failure
  runs-on: ubuntu-latest
  timeout-minutes: 20
  needs: large-scale-simulation
  if: failure() && needs.large-scale-simulation.result == 'failure'
  continue-on-error: true
  steps:
    - name: Install Rust
      uses: dtolnay/rust-toolchain@stable
      with:
        toolchain: 1.93.0

    - name: Install riverctl
      run: cargo install riverctl

    - name: Send failure message to River (with exponential backoff)
      # All workflow expressions are passed through the environment instead of
      # being interpolated into the script text. Values such as the branch name
      # or the manually supplied seed would otherwise be pasted into the shell
      # source and could break out of the quoted string (script injection).
      # The NOTIFY_ prefix keeps these names from colliding with variables the
      # invoked tools might read themselves.
      env:
        RIVER_SIGNING_KEY: ${{ secrets.RIVER_SIGNING_KEY }}
        NOTIFY_NODE_URL: ${{ secrets.RIVER_GATEWAY_URL }}
        NOTIFY_ROOM_ID: ${{ secrets.RIVER_ROOM_ID }}
        NOTIFY_REPOSITORY: ${{ github.repository }}
        NOTIFY_REF_NAME: ${{ github.ref_name }}
        NOTIFY_SHA: ${{ github.sha }}
        NOTIFY_SEED: ${{ github.event.inputs.seed || '0xDEADBEEF' }}
        NOTIFY_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
      run: |
        MESSAGE="🚨 Nightly Simulation Tests Failed - ${NOTIFY_REPOSITORY} - branch: ${NOTIFY_REF_NAME} - commit: ${NOTIFY_SHA} - seed: ${NOTIFY_SEED} - ${NOTIFY_RUN_URL}"
        NODE_URL="${NOTIFY_NODE_URL}"
        ROOM_ID="${NOTIFY_ROOM_ID}"
        max_attempts=7
        delay=10
        # Delay doubles after every failed attempt: 10s, 20s, 40s, ...
        for attempt in $(seq 1 $max_attempts); do
          echo "Attempt $attempt/$max_attempts (delay: ${delay}s)..."
          if riverctl --node-url "$NODE_URL" message send "$ROOM_ID" "$MESSAGE" 2>&1; then
            echo "Message sent successfully on attempt $attempt"
            exit 0
          fi
          # No sleep after the final attempt.
          if [ "$attempt" -lt "$max_attempts" ]; then
            echo "Failed, retrying in ${delay}s..."
            sleep "$delay"
            delay=$((delay * 2))
          fi
        done
        # Delivery failure is reported as a warning only; the step still succeeds.
        echo "::warning::Failed to send River notification after $max_attempts attempts (gateway may be restarting)"
        exit 0
# Mirror the nightly-failure notification into the private freenet-dev room
# (the existing notify-failure job above posts to the public official room).
# Posts the nightly-failure message through the repository-local
# river-dev-notify action. Runs only when large-scale-simulation failed and
# never fails the workflow itself (continue-on-error).
notify-dev-room-failure:
  name: Notify dev room on Failure
  runs-on: ubuntu-latest
  # Same bound as the notify-failure job. Without it a hung notifier would
  # hold a runner until the default 360-minute job timeout.
  timeout-minutes: 20
  needs: large-scale-simulation
  if: failure() && needs.large-scale-simulation.result == 'failure'
  continue-on-error: true
  steps:
    # The notifier is referenced by a local path, so the repository must be
    # checked out before it can be used.
    - uses: actions/checkout@v7
    - uses: ./.github/actions/river-dev-notify
      with:
        message: "🚨 Nightly simulation tests failed — ${{ github.repository }} — branch ${{ github.ref_name }} — commit ${{ github.sha }} — seed ${{ github.event.inputs.seed || '0xDEADBEEF' }} — ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        bot-config: ${{ secrets.RIVER_DEV_BOT_CONFIG }}
        room-id: ${{ secrets.RIVER_DEV_ROOM_ID }}
        gateway-url: ${{ secrets.RIVER_GATEWAY_URL }}