
Commit 9824cee

Fix races related to OS scheduling and spurious wakeup (#759)
This PR fixes two race conditions.

1. If a GC worker spuriously wakes up after the `Finish` coordinator message is sent, the coordinator may deactivate work buckets while the GC worker is trying to schedule sentinels or open new buckets.
2. If `Plan::schedule_collection` runs too slowly due to OS scheduling, workers may open subsequent work buckets while `Plan::schedule_collection` still has pending packets to be added to prior buckets.

Fixes: #758
1 parent 675fece commit 9824cee
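
For the second race, the fix gates bucket opening on a counter of in-flight coordinator work: the counter is incremented before a coordinator packet starts, decremented when it finishes, and workers refuse to open new buckets while it is non-zero. Below is a minimal sketch of that gate. The `Scheduler` struct and `try_open_buckets` method here are hypothetical stand-ins, not mmtk-core's actual worker code.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

// Hypothetical stand-in for the scheduler; only the gating logic is shown.
struct Scheduler {
    /// Number of coordinator work packets that are queued or still executing.
    pending_coordinator_packets: AtomicUsize,
}

impl Scheduler {
    /// Called by a worker when it wakes up (possibly spuriously).
    /// Workers must not open new buckets while coordinator work is in flight,
    /// because that work may still be adding packets to earlier buckets.
    fn try_open_buckets(&self) -> bool {
        if self.pending_coordinator_packets.load(Ordering::SeqCst) > 0 {
            return false; // coordinator work pending: do not advance stages
        }
        // ... examine bucket states and open the next bucket here ...
        true
    }
}

fn main() {
    let s = Scheduler {
        pending_coordinator_packets: AtomicUsize::new(1),
    };
    assert!(!s.try_open_buckets()); // gated while a coordinator packet runs
    s.pending_coordinator_packets.fetch_sub(1, Ordering::SeqCst);
    assert!(s.try_open_buckets()); // safe to open buckets again
}
```

Per the doc comment in the diff below, the real worker also examines the states of the work buckets before opening one; the sketch elides that check.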

1 file changed

+58 −18 lines changed
src/scheduler/controller.rs

Lines changed: 58 additions & 18 deletions
```diff
@@ -14,7 +14,7 @@ use crate::vm::VMBinding;
 use crate::MMTK;
 use atomic::Ordering;
 
-use super::{GCWork, GCWorkScheduler, GCWorker};
+use super::{CoordinatorWork, GCWorkScheduler, GCWorker};
 
 /// The thread local struct for the GC controller, the counterpart of `GCWorker`.
 pub struct GCController<VM: VMBinding> {
@@ -69,22 +69,9 @@ impl<VM: VMBinding> GCController<VM> {
 
     /// Process a message. Return true if the GC is finished.
     fn process_message(&mut self, message: CoordinatorMessage<VM>) -> bool {
-        let worker = &mut self.coordinator_worker;
-        let mmtk = self.mmtk;
         match message {
             CoordinatorMessage::Work(mut work) => {
-                work.do_work_with_stat(worker, mmtk);
-                let old_count = self
-                    .scheduler
-                    .pending_coordinator_packets
-                    .fetch_sub(1, Ordering::SeqCst);
-                if old_count == 1 {
-                    // When the coordinator finishes executing all coordinator work packets,
-                    // it is a chance to open more work buckets.
-                    // Notify one worker so it can open buckets.
-                    let _guard = self.scheduler.worker_monitor.0.lock().unwrap();
-                    self.scheduler.worker_monitor.1.notify_one();
-                }
+                self.execute_coordinator_work(work.as_mut(), true);
                 false
             }
             CoordinatorMessage::Finish => {
@@ -101,7 +88,7 @@ impl<VM: VMBinding> GCController<VM> {
     /// Coordinate workers to perform GC in response to a GC request.
     pub fn do_gc_until_completion(&mut self) {
         // Schedule collection.
-        ScheduleCollection.do_work_with_stat(&mut self.coordinator_worker, self.mmtk);
+        self.initiate_coordinator_work(&mut ScheduleCollection, true);
 
         // Tell GC trigger that GC started - this happens after ScheduleCollection so we
         // will know what kind of GC this is (e.g. nursery vs mature in gen copy, defrag vs fast in Immix)
@@ -128,7 +115,14 @@ impl<VM: VMBinding> GCController<VM> {
                 CoordinatorMessage::Finish => {}
             }
         }
-        self.scheduler.deactivate_all();
+
+        {
+            // Note: GC workers may spuriously wake up, examining the states of work buckets and
+            // trying to open them. Use lock to ensure workers do not wake up when we deactivate
+            // buckets.
+            let _guard = self.scheduler.worker_monitor.0.lock().unwrap();
+            self.scheduler.deactivate_all();
+        }
 
         // Tell GC trigger that GC ended - this happens before EndOfGC where we resume mutators.
         self.mmtk.plan.base().gc_trigger.policy.on_gc_end(self.mmtk);
@@ -138,8 +132,54 @@ impl<VM: VMBinding> GCController<VM> {
         // Otherwise, for generational GCs, workers will receive and process
         // newly generated remembered-sets from those open buckets.
         // But these remsets should be preserved until next GC.
-        EndOfGC.do_work_with_stat(&mut self.coordinator_worker, self.mmtk);
+        self.initiate_coordinator_work(&mut EndOfGC, false);
 
         self.scheduler.debug_assert_all_buckets_deactivated();
     }
+
+    /// The controller uses this method to start executing a coordinator work immediately.
+    ///
+    /// Note: GC workers will start executing work packets as soon as individual work packets
+    /// are added. If the coordinator work (such as `ScheduleCollection`) adds multiple work
+    /// packets into different buckets, workers may open subsequent buckets while the coordinator
+    /// work still has packets to be added to prior buckets. For this reason, we use the
+    /// `pending_coordinator_packets` to prevent the workers from opening any work buckets while
+    /// this coordinator work is being executed.
+    ///
+    /// # Arguments
+    ///
+    /// - `work`: The work to execute.
+    /// - `notify_workers`: Notify one worker after the work is finished. Useful for proceeding
+    ///   to the next work bucket stage.
+    fn initiate_coordinator_work(
+        &mut self,
+        work: &mut dyn CoordinatorWork<VM>,
+        notify_workers: bool,
+    ) {
+        self.scheduler
+            .pending_coordinator_packets
+            .fetch_add(1, Ordering::SeqCst);
+
+        self.execute_coordinator_work(work, notify_workers)
+    }
+
+    fn execute_coordinator_work(
+        &mut self,
+        work: &mut dyn CoordinatorWork<VM>,
+        notify_workers: bool,
+    ) {
+        work.do_work_with_stat(&mut self.coordinator_worker, self.mmtk);
+
+        self.scheduler
+            .pending_coordinator_packets
+            .fetch_sub(1, Ordering::SeqCst);
+
+        if notify_workers {
+            // When a coordinator work finishes, there is a chance that all GC workers parked, and
+            // no work packets are added to any open buckets. We need to wake up one GC worker so
+            // that it can open more work buckets.
+            let _guard = self.scheduler.worker_monitor.0.lock().unwrap();
+            self.scheduler.worker_monitor.1.notify_one();
+        };
+    }
 }
```
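
The first race is closed with a standard monitor discipline: `deactivate_all` now runs while holding the same mutex that parked workers' condition-variable waits use, so a spuriously woken worker re-checks shared state under the lock and can never observe the buckets mid-deactivation. The sketch below shows the pattern in isolation; the `Monitor` type and its single boolean are hypothetical stand-ins for mmtk-core's `worker_monitor` (a `(Mutex, Condvar)` pair, as the diff shows) and its bucket state.

```rust
use std::sync::{Arc, Condvar, Mutex};
use std::thread;

// Hypothetical monitor: the mutex guards the state that workers inspect,
// and the condvar is what parked workers wait on.
struct Monitor {
    buckets_active: Mutex<bool>,
    cond: Condvar,
}

fn main() {
    let m = Arc::new(Monitor {
        buckets_active: Mutex::new(true),
        cond: Condvar::new(),
    });

    let worker = {
        let m = Arc::clone(&m);
        thread::spawn(move || {
            let mut active = m.buckets_active.lock().unwrap();
            // `Condvar::wait` may return spuriously, so the worker re-checks
            // the shared state in a loop. Because the coordinator only mutates
            // that state while holding the same mutex, a spurious wakeup can
            // never observe a partially deactivated state.
            while *active {
                active = m.cond.wait(active).unwrap();
            }
        })
    };

    // Coordinator side: deactivate under the lock, then notify.
    {
        let mut active = m.buckets_active.lock().unwrap();
        *active = false; // stands in for `self.scheduler.deactivate_all()`
        m.cond.notify_all();
    }
    worker.join().unwrap();
}
```

In the real scheduler the guarded state is a set of work buckets rather than one flag, which is why deactivating them outside the lock let a spuriously woken worker race with the coordinator.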
