perf(broadphase): integrate commits 1+2+3 into LDS path (no cooperative phase)

gpinkert · yaoliu13 · commit 8ebf5c6360e1 · 2026-05-05T17:37:59.000-07:00
Adapts perf/broadphase-stack commits 1+2+3 to live on top of PR #56's LDS infrastructure. Cooperative parallel re-fill (the original commit 4) is intentionally excluded from this experiment because adding qd.simt.block.sync() to a loop that doesn't fill block_dim=64 (e.g. n_envs=1 -> 4 active lanes) triggers workgroup-barrier undefined behavior (UB) and a GPU memory-access fault.

Changes inside func_broad_phase_lds:
* Replace the call to the now-removed func_collision_clear() kernel with the per-env helper func_collision_clear_per_env(i_b, ...).
* Hoist n_eq_static / n_eq_dyn to the top of the per-env block and pass them to all 3 func_check_collision_valid call sites in this path.
* Switch the lds_sort_packed dtype from gs.qd_int to qd.u32 to match the StructSortBuffer.i_g_packed encoding contract.
* Replace the open-coded (i_g << 1) / >> 1 / & 1 patterns with func_pack_event / func_unpack_i_g / func_unpack_is_max throughout.
* Fix the sort_buffer.{is_max, i_g} references broken by the cherry-pick (8 sites) so they use the renamed sort_buffer.i_g_packed field via the helpers.
* Collapse the write-back loop from 2 stores per event (separate i_g + is_max) to 1 store (i_g_packed).
diff --git a/genesis/engine/solvers/rigid/collider/broadphase.py b/genesis/engine/solvers/rigid/collider/broadphase.py
@@ -190,30 +190,29 @@ def func_broad_phase_lds(
     collider_info: array_class.ColliderInfo,
     errno: array_class.V_ANNOTATION,
 ):
-    """
-    Sweep and Prune (SAP) for broad-phase collision detection.
-
-    This function sorts the geometry axis-aligned bounding boxes (AABBs) along a specified axis and checks for
-    potential collision pairs based on the AABB overlap.
-
-    The optimized LDS path primarily targets use_hibernation=False.
-    The hibernation path keeps the original active_buffer_awake/hib logic.
+    """Sweep and Prune (SAP) for broad-phase collision detection -- LDS path.
+
+    Integrates broadphase-stack commits 1+2+3 on top of PR #56's LDS scaffolding:
+    * Commit 1: replace func_collision_clear() kernel-call with the per-env
+      helper func_collision_clear_per_env(i_b, ...) inlined here.
+    * Commit 2: hoist n_eq_static / n_eq_dyn out of the per-pair check.
+    * Commit 3: pack i_g + is_max into a u32 single-word; use func_pack_event /
+      func_unpack_i_g / func_unpack_is_max helpers; collapse 2-store sort_buffer
+      write-back to a single i_g_packed store.
+
+    Cooperative parallel warm-start re-fill (the original commit 4) is
+    intentionally omitted from this experiment: adding qd.simt.block.sync() to
+    a loop that doesn't fill block_dim=64 (e.g. n_envs=1 -> 4 active lanes)
+    triggers workgroup-barrier undefined behavior and a GPU memory-access
+    fault. Lane-0-only execution is preserved here.
     """
     n_geoms, _B = collider_state.active_buffer.shape
     n_links = links_info.geom_start.shape[0]
 
-    # Clear collider state
-    func_collision_clear(links_state, links_info, collider_state, static_rigid_sim_config)
-
     MAX_GEOMS_NUM = qd.static(MAX_GEOMS_IN_LDS)
     MAX_SORT_ELEM_NUM = qd.static(MAX_GEOMS_NUM * 2)
-
     BLOCK_DIM = qd.static(64)
     ENVS_PER_BLOCK = qd.static(16)
-
-    # Only one lane out of THREADS_PER_ENV currently processes one env.
-    # THREADS_PER_ENV is used to map 16 envs to one 64-thread workgroup and
-    # reserve one LDS slot per env.
     THREADS_PER_ENV = qd.static(BLOCK_DIM // ENVS_PER_BLOCK)
 
     qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=BLOCK_DIM)
@@ -223,107 +222,97 @@ def func_broad_phase_lds(
             continue
 
         lds_sort_value = qd.simt.block.SharedArray((ENVS_PER_BLOCK, MAX_SORT_ELEM_NUM), gs.qd_float)
-
-        # Packed format: lds_sort_i_g_packed = (i_g << 1) | is_max_bit
-        lds_sort_packed = qd.simt.block.SharedArray((ENVS_PER_BLOCK, MAX_SORT_ELEM_NUM), gs.qd_int)
-
-        # Don't need to copy `collider_state.active_buffer` into `lds_active` before using it.
-        # Because the sweep below starts with `n_active = 0` and rebuilds the set from scratch.
+        # Packed format: bit 0 = is_max, bits 1..31 = i_g (matches StructSortBuffer.i_g_packed).
+        lds_sort_packed = qd.simt.block.SharedArray((ENVS_PER_BLOCK, MAX_SORT_ELEM_NUM), qd.u32)
+        # No need to copy collider_state.active_buffer; sweep starts with n_active=0.
         lds_active = qd.simt.block.SharedArray((ENVS_PER_BLOCK, MAX_GEOMS_NUM), gs.qd_int)
-        
-        i_b_lds = i_b % ENVS_PER_BLOCK
 
+        i_b_lds = i_b % ENVS_PER_BLOCK
         axis = 0
 
-        # Calculate the number of active geoms for this environment
-        # (for heterogeneous entities, different envs may have different geoms)
+        # Commit 1: per-env contact-clear helper, inlined into this kernel.
+        func_collision_clear_per_env(i_b, links_state, links_info, collider_state, static_rigid_sim_config)
+
+        # Commit 2: hoist equality bounds out of the per-pair check.
+        n_eq_static = rigid_global_info.n_equalities[None]
+        n_eq_dyn = constraint_state.qd_n_equalities[i_b]
+
+        # Calculate the number of active geoms for this environment.
         env_n_geoms = 0
         for i_l in range(n_links):
             I_l = [i_l, i_b] if qd.static(static_rigid_sim_config.batch_links_info) else i_l
             env_n_geoms = env_n_geoms + links_info.geom_end[I_l] - links_info.geom_start[I_l]
 
-        # copy updated geom aabbs to buffer for sorting
+        # copy updated geom aabbs to LDS for sorting
         if collider_state.first_time[i_b]:
             i_buffer = 0
             for i_l in range(n_links):
                 I_l = [i_l, i_b] if qd.static(static_rigid_sim_config.batch_links_info) else i_l
                 for i_g in range(links_info.geom_start[I_l], links_info.geom_end[I_l]):
                     lds_sort_value[i_b_lds, 2 * i_buffer] = geoms_state.aabb_min[i_g, i_b][axis]
-                    lds_sort_packed[i_b_lds, 2 * i_buffer] = i_g << 1 # is_max = 0
-
+                    lds_sort_packed[i_b_lds, 2 * i_buffer] = func_pack_event(i_g, False)
                     lds_sort_value[i_b_lds, 2 * i_buffer + 1] = geoms_state.aabb_max[i_g, i_b][axis]
-                    lds_sort_packed[i_b_lds, 2 * i_buffer + 1] = (i_g << 1) | 1 # is_max = 1
-
+                    lds_sort_packed[i_b_lds, 2 * i_buffer + 1] = func_pack_event(i_g, True)
                     geoms_state.min_buffer_idx[i_buffer, i_b] = 2 * i_g
                     geoms_state.max_buffer_idx[i_buffer, i_b] = 2 * i_g + 1
                     i_buffer = i_buffer + 1
-
             collider_state.first_time[i_b] = False
-
         else:
-            if qd.static(not static_rigid_sim_config.use_hibernation):
-                for i in range(env_n_geoms * 2):
-                    is_max = collider_state.sort_buffer.is_max[i, i_b]
-                    i_g = collider_state.sort_buffer.i_g[i, i_b]
+            # Warm-start re-fill: read packed event from global sort_buffer, decode via helpers, and
+            # write its sort key to LDS (current AABB extent, or the saved value with hibernation).
+            for i in range(env_n_geoms * 2):
+                packed = collider_state.sort_buffer.i_g_packed[i, i_b]
+                is_max = func_unpack_is_max(packed)
+                i_g = func_unpack_i_g(packed)
+                lds_sort_packed[i_b_lds, i] = packed
+                if qd.static(not static_rigid_sim_config.use_hibernation):
                     if is_max:
                         lds_sort_value[i_b_lds, i] = geoms_state.aabb_max[i_g, i_b][axis]
                     else:
                         lds_sort_value[i_b_lds, i] = geoms_state.aabb_min[i_g, i_b][axis]
+                else:
+                    lds_sort_value[i_b_lds, i] = collider_state.sort_buffer.value[i, i_b]
 
-                    lds_sort_packed[i_b_lds, i] = (i_g << 1) | qd.cast(is_max, gs.qd_int)
-            else:
-                for i in range(env_n_geoms * 2):
-                    is_max = collider_state.sort_buffer.is_max[i, i_b]
-                    i_g = collider_state.sort_buffer.i_g[i, i_b]
-                    value = collider_state.sort_buffer.value[i, i_b]
-                    lds_sort_packed[i_b_lds, i] = (i_g << 1) | qd.cast(is_max, gs.qd_int)
-                    lds_sort_value[i_b_lds, i] = value
-
-
-        # insertion sort, which has complexity near O(n) for nearly sorted array
+        # insertion sort, near O(n) for nearly sorted input
         for i in range(1, 2 * env_n_geoms):
             key_value = lds_sort_value[i_b_lds, i]
-            key_packed_ig_ismax = lds_sort_packed[i_b_lds, i]
+            key_packed = lds_sort_packed[i_b_lds, i]
 
             j = i - 1
             while j >= 0 and key_value < lds_sort_value[i_b_lds, j]:
-                packed_ig_ismax = lds_sort_packed[i_b_lds, j]
+                slid_packed = lds_sort_packed[i_b_lds, j]
                 lds_sort_value[i_b_lds, j + 1] = lds_sort_value[i_b_lds, j]
-                lds_sort_packed[i_b_lds, j + 1] = packed_ig_ismax
+                lds_sort_packed[i_b_lds, j + 1] = slid_packed
 
                 if qd.static(static_rigid_sim_config.use_hibernation):
-                    shifted_i_g = packed_ig_ismax >> 1
-                    if packed_ig_ismax & 1:
-                        geoms_state.max_buffer_idx[shifted_i_g, i_b] = j + 1
+                    slid_i_g = func_unpack_i_g(slid_packed)
+                    if func_unpack_is_max(slid_packed):
+                        geoms_state.max_buffer_idx[slid_i_g, i_b] = j + 1
                     else:
-                        geoms_state.min_buffer_idx[shifted_i_g, i_b] = j + 1
+                        geoms_state.min_buffer_idx[slid_i_g, i_b] = j + 1
 
                 j -= 1
             lds_sort_value[i_b_lds, j + 1] = key_value
-            lds_sort_packed[i_b_lds, j + 1] = key_packed_ig_ismax
+            lds_sort_packed[i_b_lds, j + 1] = key_packed
 
             if qd.static(static_rigid_sim_config.use_hibernation):
-                key_i_g = key_packed_ig_ismax >> 1
-                if key_packed_ig_ismax & 1:
+                key_i_g = func_unpack_i_g(key_packed)
+                if func_unpack_is_max(key_packed):
                     geoms_state.max_buffer_idx[key_i_g, i_b] = j + 1
                 else:
                     geoms_state.min_buffer_idx[key_i_g, i_b] = j + 1
 
-        
         n_broad = 0
         if qd.static(not static_rigid_sim_config.use_hibernation):
             n_active = 0
-
             for i in range(2 * env_n_geoms):
-                packed_ig_ismax = lds_sort_packed[i_b_lds, i]
-                is_max = packed_ig_ismax & 1
-                i_g = packed_ig_ismax >> 1
-                
+                packed = lds_sort_packed[i_b_lds, i]
+                is_max = func_unpack_is_max(packed)
+                i_g = func_unpack_i_g(packed)
 
                 if not is_max:
                     min_b0, min_b1, min_b2 = geoms_state.aabb_min[i_g, i_b]
                     max_b0, max_b1, max_b2 = geoms_state.aabb_max[i_g, i_b]
-
                     for j in range(n_active):
                         i_ga = lds_active[i_b_lds, j]
 
@@ -338,7 +327,6 @@ def func_broad_phase_lds(
 
                         min_a0, min_a1, min_a2 = geoms_state.aabb_min[i_ga, i_b]
                         max_a0, max_a1, max_a2 = geoms_state.aabb_max[i_ga, i_b]
-                        
 
                         if (min_a0 > max_b0 or min_a1 > max_b1 or min_a2 > max_b2 or
                             max_a0 < min_b0 or max_a1 < min_b1 or max_a2 < min_b2):
@@ -348,6 +336,8 @@ def func_broad_phase_lds(
                             i_ga_c,
                             i_gb_c,
                             i_b,
+                            n_eq_static,
+                            n_eq_dyn,
                             links_state,
                             links_info,
                             geoms_info,
@@ -369,29 +359,24 @@ def func_broad_phase_lds(
                     lds_active[i_b_lds, n_active] = i_g
                     geoms_state.active_buffer_idx[i_g, i_b] = n_active
                     n_active += 1
-
                 else:
                     j_remove = geoms_state.active_buffer_idx[i_g, i_b]
                     if j_remove < n_active - 1:
-                        # Swap with last element
                         i_g_last = lds_active[i_b_lds, n_active - 1]
                         lds_active[i_b_lds, j_remove] = i_g_last
                         geoms_state.active_buffer_idx[i_g_last, i_b] = j_remove
                     n_active -= 1
-
-            collider_state.n_broad_pairs[i_b] = n_broad
         else:
             if rigid_global_info.n_awake_dofs[i_b] > 0:
                 n_active_awake = 0
                 n_active_hib = 0
                 for i in range(2 * env_n_geoms):
-                    packed_ig_ismax = lds_sort_packed[i_b_lds, i]
-                    i_gb_origin = packed_ig_ismax >> 1
-                    is_max = packed_ig_ismax & 1
+                    packed = lds_sort_packed[i_b_lds, i]
+                    i_gb_origin = func_unpack_i_g(packed)
+                    is_max = func_unpack_is_max(packed)
                     is_incoming_geom_hibernated = geoms_state.hibernated[i_gb_origin, i_b]
 
                     if not is_max:
-                        # both awake and hibernated geom check with active awake geoms
                         for j in range(n_active_awake):
                             i_ga = collider_state.active_buffer_awake[j, i_b]
                             i_gb = i_gb_origin
@@ -402,6 +387,8 @@ def func_broad_phase_lds(
                                 i_ga,
                                 i_gb,
                                 i_b,
+                                n_eq_static,
+                                n_eq_dyn,
                                 links_state,
                                 links_info,
                                 geoms_info,
@@ -414,7 +401,6 @@ def func_broad_phase_lds(
                                 continue
 
                             if not func_is_geom_aabbs_overlap(geoms_state, i_ga, i_gb, i_b):
-                                # Clear collision normal cache if not in contact
                                 if qd.static(not static_rigid_sim_config.enable_mujoco_compatibility):
                                     i_pair = collider_info.collision_pair_idx[i_ga, i_gb]
                                     collider_state.contact_cache.normal[i_pair, i_b] = qd.Vector.zero(gs.qd_float, 3)
@@ -424,7 +410,6 @@ def func_broad_phase_lds(
                             collider_state.broad_collision_pairs[n_broad, i_b][1] = i_gb
                             n_broad = n_broad + 1
 
-                        # if incoming geom is awake, also need to check with hibernated geoms
                         if not is_incoming_geom_hibernated:
                             for j in range(n_active_hib):
                                 i_ga = collider_state.active_buffer_hib[j, i_b]
@@ -436,6 +421,8 @@ def func_broad_phase_lds(
                                     i_ga,
                                     i_gb,
                                     i_b,
+                                    n_eq_static,
+                                    n_eq_dyn,
                                     links_state,
                                     links_info,
                                     geoms_info,
@@ -448,7 +435,6 @@ def func_broad_phase_lds(
                                     continue
 
                                 if not func_is_geom_aabbs_overlap(geoms_state, i_ga, i_gb, i_b):
-                                    # Clear collision normal cache if not in contact
                                     i_pair = collider_info.collision_pair_idx[i_ga, i_gb]
                                     collider_state.contact_cache.normal[i_pair, i_b] = qd.Vector.zero(gs.qd_float, 3)
                                     continue
@@ -470,42 +456,34 @@ def func_broad_phase_lds(
                                 if collider_state.active_buffer_hib[j, i_b] == i_g_to_remove:
                                     if j < n_active_hib - 1:
                                         for k in range(j, n_active_hib - 1):
-                                            collider_state.active_buffer_hib[k, i_b] = collider_state.active_buffer_hib[
-                                                k + 1, i_b
-                                            ]
+                                            collider_state.active_buffer_hib[k, i_b] = collider_state.active_buffer_hib[k + 1, i_b]
                                     n_active_hib = n_active_hib - 1
                                     break
                         else:
                             for j in range(n_active_awake):
                                 if collider_state.active_buffer_awake[j, i_b] == i_g_to_remove:
                                     if j < n_active_awake - 1:
                                         for k in range(j, n_active_awake - 1):
-                                            collider_state.active_buffer_awake[k, i_b] = (
-                                                collider_state.active_buffer_awake[k + 1, i_b]
-                                            )
+                                            collider_state.active_buffer_awake[k, i_b] = collider_state.active_buffer_awake[k + 1, i_b]
                                     n_active_awake = n_active_awake - 1
                                     break
 
+        # Write-back to global sort_buffer for next step's warm-start.
+        # Single i_g_packed store per event (commit 3 dtype change collapsed
+        # what used to be two stores into one).
         for i in range(env_n_geoms):
-
             if qd.static(static_rigid_sim_config.use_hibernation):
                 collider_state.sort_buffer.value[2 * i, i_b] = lds_sort_value[i_b_lds, 2 * i]
                 collider_state.sort_buffer.value[2 * i + 1, i_b] = lds_sort_value[i_b_lds, 2 * i + 1]
-
-            packed_ig_ismax = lds_sort_packed[i_b_lds, 2 * i]
-            collider_state.sort_buffer.i_g[2 * i, i_b] = packed_ig_ismax >> 1
-            collider_state.sort_buffer.is_max[2 * i, i_b] = qd.cast(packed_ig_ismax & 1, gs.qd_bool)
-
-            packed_ig_ismax = lds_sort_packed[i_b_lds, 2 * i + 1]
-            collider_state.sort_buffer.i_g[2 * i + 1, i_b] = packed_ig_ismax >> 1
-            collider_state.sort_buffer.is_max[2 * i + 1, i_b] = qd.cast(packed_ig_ismax & 1, gs.qd_bool)
-
+            collider_state.sort_buffer.i_g_packed[2 * i, i_b] = lds_sort_packed[i_b_lds, 2 * i]
+            collider_state.sort_buffer.i_g_packed[2 * i + 1, i_b] = lds_sort_packed[i_b_lds, 2 * i + 1]
             if qd.static(not static_rigid_sim_config.use_hibernation):
                 collider_state.active_buffer[i, i_b] = lds_active[i_b_lds, i]
 
         collider_state.n_broad_pairs[i_b] = n_broad
 
 
+
 @qd.func
 def func_broad_phase_global_mem(
     links_state: array_class.LinksState,