39 changes: 26 additions & 13 deletions src/policy/compressor/compressorspace.rs
@@ -24,6 +24,12 @@ use std::sync::Arc;
pub(crate) const TRACE_KIND_MARK: TraceKind = 0;
pub(crate) const TRACE_KIND_FORWARD_ROOT: TraceKind = 1;

/// The number of bytes of the heap that each CalculateOffsetVector
/// work packet should process. Calculating the offset vector is very fast,
/// and we are often swamped by scheduling overhead when we
/// only process one region per work packet.
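/// With `CompressorRegion::LOG_BYTES` = 18 (256 KiB regions), each
/// packet thus covers (1 << 21) / (1 << 18) = 8 regions.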
const OFFSET_VECTOR_PACKET_BYTES: usize = 1 << 21;

/// [`CompressorSpace`] is a stop-the-world implementation of
/// the Compressor, as described in Kermany and Petrank,
/// [The Compressor: concurrent, incremental, and parallel compaction](https://dl.acm.org/doi/10.1145/1133255.1134023).
@@ -224,7 +230,7 @@ impl<VM: VMBinding> CompressorSpace<VM> {
} else {
RegionPageResource::new_contiguous(common.start, common.extent, vm_map)
},
forwarding: forwarding::ForwardingMetadata::new(),
forwarding: forwarding::ForwardingMetadata::new(&common.options),
common,
scheduler,
}
@@ -305,9 +311,17 @@ impl<VM: VMBinding> CompressorSpace<VM> {
}

pub fn add_offset_vector_tasks(&'static self) {
let offset_vector_packets: Vec<Box<dyn GCWork<VM>>> = self.generate_tasks(&mut |r, _| {
Box::new(CalculateOffsetVector::<VM>::new(self, r.region, r.cursor()))
});
let offset_vector_packets: Vec<Box<dyn GCWork<VM>>> =
self.pr.with_regions(&mut |regions| {
regions
.chunks(OFFSET_VECTOR_PACKET_BYTES / forwarding::CompressorRegion::BYTES)
.map(|c| {
let chunk = c.iter().map(|r| (r.region, r.cursor())).collect();
Box::new(CalculateOffsetVector::<VM>::new(self, chunk))
as Box<dyn GCWork<VM>>
})
.collect()
});
self.scheduler.work_buckets[WorkBucketStage::CalculateForwarding]
.bulk_add(offset_vector_packets);
}
@@ -418,30 +432,29 @@ impl<VM: VMBinding> CompressorSpace<VM> {
}
}

/// Calculate the offset vector for a region.
/// Calculate the offset vector for some regions.
pub struct CalculateOffsetVector<VM: VMBinding> {
compressor_space: &'static CompressorSpace<VM>,
region: forwarding::CompressorRegion,
cursor: Address,
regions: Vec<(forwarding::CompressorRegion, Address)>,
}

impl<VM: VMBinding> GCWork<VM> for CalculateOffsetVector<VM> {
fn do_work(&mut self, _worker: &mut GCWorker<VM>, _mmtk: &'static MMTK<VM>) {
self.compressor_space
.calculate_offset_vector_for_region(self.region, self.cursor);
for (region, cursor) in self.regions.iter() {
self.compressor_space
.calculate_offset_vector_for_region(*region, *cursor);
}
}
}

impl<VM: VMBinding> CalculateOffsetVector<VM> {
pub fn new(
compressor_space: &'static CompressorSpace<VM>,
region: forwarding::CompressorRegion,
cursor: Address,
regions: Vec<(forwarding::CompressorRegion, Address)>,
) -> Self {
Self {
compressor_space,
region,
cursor,
regions,
}
}
}
108 changes: 105 additions & 3 deletions src/policy/compressor/forwarding.rs
@@ -2,6 +2,7 @@ use crate::util::constants::BYTES_IN_WORD;
use crate::util::linear_scan::{Region, RegionIterator};
use crate::util::metadata::side_metadata::spec_defs::{COMPRESSOR_MARK, COMPRESSOR_OFFSET_VECTOR};
use crate::util::metadata::side_metadata::SideMetadataSpec;
use crate::util::options::Options;
use crate::util::{Address, ObjectReference};
use crate::vm::object_model::ObjectModel;
use crate::vm::VMBinding;
@@ -15,7 +16,7 @@ use std::sync::atomic::AtomicBool;
#[derive(Copy, Clone, PartialEq, PartialOrd)]
pub(crate) struct CompressorRegion(Address);
impl Region for CompressorRegion {
const LOG_BYTES: usize = 20; // 1 MiB
const LOG_BYTES: usize = 18; // 256 KiB
fn from_aligned_address(address: Address) -> Self {
assert!(
address.is_aligned_to(Self::BYTES),
@@ -94,6 +95,8 @@ impl Transducer {
pub struct ForwardingMetadata<VM: VMBinding> {
calculated: AtomicBool,
vm: PhantomData<VM>,
// This field is only used on x86_64.
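// (Options in mmtk-core are typically also settable from the
// environment, so this would correspond to MMTK_COMPRESSOR_USE_CLMUL.)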
_use_clmul: bool,
}

// A block in the Compressor is the granularity at which we cache
@@ -116,10 +119,11 @@ pub(crate) const MARK_SPEC: SideMetadataSpec = COMPRESSOR_MARK;
pub(crate) const OFFSET_VECTOR_SPEC: SideMetadataSpec = COMPRESSOR_OFFSET_VECTOR;

impl<VM: VMBinding> ForwardingMetadata<VM> {
pub fn new() -> ForwardingMetadata<VM> {
pub fn new(options: &Options) -> ForwardingMetadata<VM> {
ForwardingMetadata {
calculated: AtomicBool::new(false),
vm: PhantomData,
_use_clmul: *options.compressor_use_clmul,
}
}

@@ -148,7 +152,65 @@ impl<VM: VMBinding> ForwardingMetadata<VM> {
MARK_SPEC.fetch_or_atomic::<u8>(last_word_of_object, 1, Ordering::Relaxed);
}

pub fn calculate_offset_vector(&self, region: CompressorRegion, cursor: Address) {
// TODO: We could compute a prefix-sum by Hillis-Steele too, for which
// the same offset-vector algorithm works. Would it be faster than the
// branchy version?

// SAFETY: Only call this function when the processor supports
// pclmulqdq and popcnt, i.e. when processor_can_clmul().
#[cfg(target_arch = "x86_64")]
unsafe fn calculate_offset_vector_clmul(&self, region: CompressorRegion, cursor: Address) {
// This function implements Geoff Langdale's
// algorithm to find quote pairs using prefix sums:
// https://branchfree.org/2019/03/06/code-fragment-finding-quote-pairs-with-carry-less-multiply-pclmulqdq/
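// A worked example (illustrative): an object spanning words 2..=5 of a
// region marks bits 2 and 5, so one bitmap word reads 0b0010_0100.
// The carry-less multiply by all-ones XOR-prefix-sums it to 0b0001_1100
// (the bits strictly inside the object); OR-ing the original word back
// in yields 0b0011_1100, whose popcount (4) times 8 bytes recovers the
// object's 32 live bytes in this word.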
debug_assert!(processor_can_clmul());
// We need a local function to use #[target_feature], which in turn
// allows rustc to generate the POPCNT and PCLMULQDQ instructions.
#[target_feature(enable = "pclmulqdq,popcnt")]
unsafe fn inner(to: &mut Address, carry: &mut i64, word: usize, addr: Address) {
use std::arch::x86_64;
if addr.is_aligned_to(Block::BYTES) {
// Write the state at the start of the block.
// The carry has all bits set the same way,
// so extract the least significant bit.
let in_object = (*carry as usize) & 1;
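// `to` only ever advances in multiples of 8 bytes from a
// region-aligned start, so its low bit is free to carry the
// in-object flag.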
let encoded = (*to).as_usize() + in_object;
OFFSET_VECTOR_SPEC.store_atomic::<usize>(addr, encoded, Ordering::Relaxed);
}
// Compute the prefix sum of this word of mark bitmap.
let ones = x86_64::_mm_set1_epi8(0xFFu8 as i8);
let vector = x86_64::_mm_set_epi64x(0, word as i64);
let sum: i64 = x86_64::_mm_cvtsi128_si64(x86_64::_mm_clmulepi64_si128(vector, ones, 0));
debug_assert_eq!(sum, prefix_sum(word) as i64);
// Carry-in from the last word. If the last word ended in the
// middle of an object, we need to invert the in/out-of-object
// states in this word.
let flipped = sum ^ *carry;
// Produce a carry-out for the next word. This shift will
// replicate the most significant bit to all bit positions.
*carry = flipped >> 63;
// Now count the in-object bits. The mark bits at both ends of an
// object are in the object, even though the prefix sum is zero at
// the bit which ends an object, so we bitwise-or the original word
// with the prefix sum to find all in-object bits.
*to += (((flipped as usize | word).count_ones()) * 8) as usize;
}

let mut to = region.start();
let mut carry: i64 = 0;
MARK_SPEC.scan_words::<u8>(
region.start(),
cursor.align_up(Block::BYTES),
&mut |_, _| panic!("should be word aligned, got a bit instead"),
&mut |_, _| panic!("should be word aligned, got a byte instead"),
&mut |word: usize, addr: Address| {
inner(&mut to, &mut carry, word, addr);
},
);
}

fn calculate_offset_vector_base(&self, region: CompressorRegion, cursor: Address) {
let mut state = Transducer::new(region.start());
let first_block = Block::from_aligned_address(region.start());
let last_block = Block::from_aligned_address(cursor);
@@ -166,6 +228,26 @@ impl<VM: VMBinding> ForwardingMetadata<VM> {
},
);
}
}

pub fn calculate_offset_vector(&self, region: CompressorRegion, cursor: Address) {
#[cfg(target_arch = "x86_64")]
{
// The mark bitmap holds one bit per word (2^3 bytes) of heap, so
// one usize (2^6 bits) of bitmap covers 2^9 bytes of heap; blocks
// must be at least that large for each block's mark bits to begin
// on a word boundary of the bitmap.
let blocks_large_enough = Block::LOG_BYTES >= 9;
if self._use_clmul && blocks_large_enough && processor_can_clmul() {
unsafe {
// SAFETY: We checked the processor supports the
// necessary instructions.
self.calculate_offset_vector_clmul(region, cursor)
}
} else {
self.calculate_offset_vector_base(region, cursor)
}
}
#[cfg(not(target_arch = "x86_64"))]
self.calculate_offset_vector_base(region, cursor);
self.calculated.store(true, Ordering::Relaxed);
}

@@ -212,3 +294,23 @@ impl<VM: VMBinding> ForwardingMetadata<VM> {
self.calculated.load(Ordering::Relaxed)
}
}

#[cfg(target_arch = "x86_64")]
fn processor_can_clmul() -> bool {
is_x86_feature_detected!("pclmulqdq") && is_x86_feature_detected!("popcnt")
}

// This function is only used in a debug assertion for the x86_64-only
// calculate_offset_vector_clmul.
#[cfg(target_arch = "x86_64")]
fn prefix_sum(x: usize) -> usize {
// This function implements a bit-parallel version of the Hillis-Steele prefix sum algorithm:
// https://en.wikipedia.org/wiki/Prefix_sum#Algorithm_1:_Shorter_span,_more_parallel
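// For example, prefix_sum(0b0010_0100) == 0b0001_1100: each result
// bit i is the XOR of input bits 0..=i.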
let mut result = x;
let mut n = 1;
while n < usize::BITS {
result ^= result << n;
n <<= 1;
}
result
}
32 changes: 32 additions & 0 deletions src/util/address.rs
@@ -347,6 +347,16 @@
pub fn range_intersection(r1: &Range<Address>, r2: &Range<Address>) -> Range<Address> {
r1.start.max(r2.start)..r1.end.min(r2.end)
}

/// Returns an iterator over addresses from this address up to (but not
/// including) the `end` address, in steps of `step` bytes.
pub fn iter_to(&self, end: Address, step: usize) -> AddressIterator {
AddressIterator {
start: *self,
end,
step,
}
}
}

/// allows print Address as upper-case hex value
@@ -386,6 +396,28 @@
}
}

/// Iterates over addresses from a start address up to (but not including)
/// an end address, with a given step size.
pub struct AddressIterator {
start: Address,
end: Address,
step: usize,
}

impl Iterator for AddressIterator {
type Item = Address;

fn next(&mut self) -> Option<Self::Item> {
if self.start >= self.end {
None
} else {
let current = self.start;
self.start += self.step;
Some(current)
}
}
}

#[cfg(test)]
mod tests {
use crate::util::Address;
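// A minimal sketch (not part of this diff) of iter_to's contract:
// `end` is exclusive, and a final partial step stops short of it.
#[test]
fn test_iter_to_steps() {
    let base = unsafe { Address::from_usize(0x1000) };
    let visited: Vec<Address> = base.iter_to(base + 20usize, 8).collect();
    // 0x1000, 0x1008 and 0x1010 lie below 0x1014; 0x1018 does not.
    assert_eq!(visited, vec![base, base + 8usize, base + 16usize]);
}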
79 changes: 78 additions & 1 deletion src/util/metadata/side_metadata/global.rs
@@ -8,7 +8,7 @@ use crate::util::metadata::metadata_val_traits::*;
use crate::util::metadata::vo_bit::VO_BIT_SIDE_METADATA_SPEC;
use crate::util::Address;
use num_traits::FromPrimitive;
use ranges::BitByteRange;
use ranges::{BitByteRange, ByteWordRange};
use std::fmt;
use std::io::Result;
use std::sync::atomic::{AtomicU8, Ordering};
@@ -1247,6 +1247,83 @@ impl SideMetadataSpec {
&mut visitor,
);
}

/// Walk the metadata between two addresses, calling visitor functions with
/// varying sizes of aligned metadata.
/// - `scan_words` calls its `visit_word` argument with each `usize` of metadata
/// which is word-aligned, and the data address that the metadata word starts at.
/// - `scan_words` calls its `visit_byte` argument with each `u8` of metadata
/// which is byte-aligned but word-unaligned, and the data address that the
/// metadata byte starts at.
/// - `scan_words` calls its `visit_value` argument with each metadata value
/// which is byte-unaligned, and the data address of the metadata value.
///
/// `scan_words` calls each function with arguments in order of lowest to
/// highest addresses.
pub fn scan_words<T: MetadataValue>(
&self,
data_start_addr: Address,
data_end_addr: Address,
visit_value: &mut impl FnMut(T, Address),
visit_byte: &mut impl FnMut(u8, Address),
visit_word: &mut impl FnMut(usize, Address),
) {
assert!(self.uses_contiguous_side_metadata());
let start_meta_addr = address_to_contiguous_meta_address(self, data_start_addr);
let start_meta_shift = meta_byte_lshift(self, data_start_addr);
let end_meta_addr = address_to_contiguous_meta_address(self, data_end_addr);
let end_meta_shift = meta_byte_lshift(self, data_end_addr);

let mut visit_bytes = |start: Address, end: Address| {
ranges::break_byte_range(start, end, &mut |range| match range {
ByteWordRange::Bytes { start, end } => {
for meta in start.iter_to(end, 1) {
let addr = contiguous_meta_address_to_address(self, meta, 0);
let byte = unsafe { meta.load::<u8>() };
visit_byte(byte, addr);
}
}
ByteWordRange::Words { start, end } => {
for meta in start.iter_to(end, crate::util::constants::BYTES_IN_ADDRESS) {
let addr = contiguous_meta_address_to_address(self, meta, 0);
let word = unsafe { meta.load::<usize>() };
visit_word(word, addr);
}
}
});
};

ranges::break_bit_range(
start_meta_addr,
start_meta_shift,
end_meta_addr,
end_meta_shift,
true,
&mut |range| {
match range {
BitByteRange::Bytes { start, end } => {
visit_bytes(start, end);
}
BitByteRange::BitsInByte {
addr,
bit_start,
bit_end,
} => {
let start = contiguous_meta_address_to_address(self, addr, bit_start);
let end = contiguous_meta_address_to_address(self, addr, bit_end);
let region_bytes = 1usize << self.log_bytes_in_region;
let mut cursor = start;
while cursor < end {
let value = unsafe { self.load::<T>(cursor) };
visit_value(value, cursor);
cursor += region_bytes;
}
}
}
false
},
);
}
}
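// A usage sketch (illustrative, not part of this diff): tally the set
// bits of a contiguous spec across all three granularities. A `Cell`
// lets the three visitors share one counter without needing three
// simultaneous mutable borrows.
fn count_set_bits(spec: &SideMetadataSpec, start: Address, end: Address) -> usize {
    use std::cell::Cell;
    let total = Cell::new(0usize);
    spec.scan_words::<u8>(
        start,
        end,
        &mut |v: u8, _| total.set(total.get() + v.count_ones() as usize),
        &mut |b: u8, _| total.set(total.get() + b.count_ones() as usize),
        &mut |w: usize, _| total.set(total.get() + w.count_ones() as usize),
    );
    total.get()
}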

impl fmt::Debug for SideMetadataSpec {