diff --git a/src/policy/compressor/compressorspace.rs b/src/policy/compressor/compressorspace.rs
index bb22ecbeec..c0352fdd6b 100644
--- a/src/policy/compressor/compressorspace.rs
+++ b/src/policy/compressor/compressorspace.rs
@@ -24,6 +24,12 @@ use std::sync::Arc;
 pub(crate) const TRACE_KIND_MARK: TraceKind = 0;
 pub(crate) const TRACE_KIND_FORWARD_ROOT: TraceKind = 1;
 
+/// The number of bytes of the heap that each CalculateOffsetVector
+/// work packet should process. Calculating the offset vector is very fast,
+/// and we are often swamped by scheduling overhead when we
+/// only process one region per work packet.
+const OFFSET_VECTOR_PACKET_BYTES: usize = 1 << 21;
+
 /// [`CompressorSpace`] is a stop-the-world implementation of
 /// the Compressor, as described in Kermany and Petrank,
 /// [The Compressor: concurrent, incremental, and parallel compaction](https://dl.acm.org/doi/10.1145/1133255.1134023).
@@ -224,7 +230,7 @@ impl<VM: VMBinding> CompressorSpace<VM> {
             } else {
                 RegionPageResource::new_contiguous(common.start, common.extent, vm_map)
             },
-            forwarding: forwarding::ForwardingMetadata::new(),
+            forwarding: forwarding::ForwardingMetadata::new(&common.options),
             common,
             scheduler,
         }
@@ -305,9 +311,17 @@ impl<VM: VMBinding> CompressorSpace<VM> {
     }
 
     pub fn add_offset_vector_tasks(&'static self) {
-        let offset_vector_packets: Vec<Box<dyn GCWork<VM>>> = self.generate_tasks(&mut |r, _| {
-            Box::new(CalculateOffsetVector::<VM>::new(self, r.region, r.cursor()))
-        });
+        let offset_vector_packets: Vec<Box<dyn GCWork<VM>>> =
+            self.pr.with_regions(&mut |regions| {
+                regions
+                    .chunks(OFFSET_VECTOR_PACKET_BYTES / forwarding::CompressorRegion::BYTES)
+                    .map(|c| {
+                        let chunk = c.iter().map(|r| (r.region, r.cursor())).collect();
+                        Box::new(CalculateOffsetVector::<VM>::new(self, chunk))
+                            as Box<dyn GCWork<VM>>
+                    })
+                    .collect()
+            });
         self.scheduler.work_buckets[WorkBucketStage::CalculateForwarding]
             .bulk_add(offset_vector_packets);
     }
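With CompressorRegion::LOG_BYTES dropping to 18 in the forwarding.rs hunks below, the batching above works out to eight regions per packet. A minimal standalone sketch of that arithmetic (illustrative only, not part of the patch):

// Illustrative only: how many regions one CalculateOffsetVector packet covers.
fn main() {
    let offset_vector_packet_bytes = 1usize << 21; // 2 MiB per packet
    let compressor_region_bytes = 1usize << 18; // 256 KiB per region
    // 2^21 / 2^18 = 2^3 = 8 regions per work packet.
    assert_eq!(offset_vector_packet_bytes / compressor_region_bytes, 8);
}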
@@ -418,30 +432,29 @@ impl<VM: VMBinding> CompressorSpace<VM> {
     }
 }
 
-/// Calculate the offset vector for a region.
+/// Calculate the offset vector for some regions.
 pub struct CalculateOffsetVector<VM: VMBinding> {
     compressor_space: &'static CompressorSpace<VM>,
-    region: forwarding::CompressorRegion,
-    cursor: Address,
+    regions: Vec<(forwarding::CompressorRegion, Address)>,
 }
 
 impl<VM: VMBinding> GCWork<VM> for CalculateOffsetVector<VM> {
     fn do_work(&mut self, _worker: &mut GCWorker<VM>, _mmtk: &'static MMTK<VM>) {
-        self.compressor_space
-            .calculate_offset_vector_for_region(self.region, self.cursor);
+        for (region, cursor) in self.regions.iter() {
+            self.compressor_space
+                .calculate_offset_vector_for_region(*region, *cursor);
+        }
     }
 }
 
 impl<VM: VMBinding> CalculateOffsetVector<VM> {
     pub fn new(
         compressor_space: &'static CompressorSpace<VM>,
-        region: forwarding::CompressorRegion,
-        cursor: Address,
+        regions: Vec<(forwarding::CompressorRegion, Address)>,
     ) -> Self {
         Self {
            compressor_space,
-            region,
-            cursor,
+            regions,
         }
     }
 }
diff --git a/src/policy/compressor/forwarding.rs b/src/policy/compressor/forwarding.rs
index dc0922f39e..999cf4f5f6 100644
--- a/src/policy/compressor/forwarding.rs
+++ b/src/policy/compressor/forwarding.rs
@@ -2,6 +2,7 @@ use crate::util::constants::BYTES_IN_WORD;
 use crate::util::linear_scan::{Region, RegionIterator};
 use crate::util::metadata::side_metadata::spec_defs::{COMPRESSOR_MARK, COMPRESSOR_OFFSET_VECTOR};
 use crate::util::metadata::side_metadata::SideMetadataSpec;
+use crate::util::options::Options;
 use crate::util::{Address, ObjectReference};
 use crate::vm::object_model::ObjectModel;
 use crate::vm::VMBinding;
@@ -15,7 +16,7 @@ use std::sync::atomic::AtomicBool;
 #[derive(Copy, Clone, PartialEq, PartialOrd)]
 pub(crate) struct CompressorRegion(Address);
 impl Region for CompressorRegion {
-    const LOG_BYTES: usize = 20; // 1 MiB
+    const LOG_BYTES: usize = 18; // 256 KiB
     fn from_aligned_address(address: Address) -> Self {
         assert!(
             address.is_aligned_to(Self::BYTES),
@@ -94,6 +95,8 @@ impl Transducer {
 pub struct ForwardingMetadata<VM> {
     calculated: AtomicBool,
     vm: PhantomData<VM>,
+    // This field is only used on x86_64.
+    _use_clmul: bool,
 }
 
 // A block in the Compressor is the granularity at which we cache
@@ -116,10 +119,11 @@
 pub(crate) const MARK_SPEC: SideMetadataSpec = COMPRESSOR_MARK;
 pub(crate) const OFFSET_VECTOR_SPEC: SideMetadataSpec = COMPRESSOR_OFFSET_VECTOR;
 
 impl<VM: VMBinding> ForwardingMetadata<VM> {
-    pub fn new() -> ForwardingMetadata<VM> {
+    pub fn new(options: &Options) -> ForwardingMetadata<VM> {
         ForwardingMetadata {
             calculated: AtomicBool::new(false),
             vm: PhantomData,
+            _use_clmul: *options.compressor_use_clmul,
         }
     }
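The hunk that follows leans on the usual std::arch recipe: a #[target_feature]-annotated local function paired with a runtime is_x86_feature_detected! check at the call site. A minimal standalone sketch of that recipe, using only POPCNT and hypothetical names (count_mark_bits, popcount_all); illustrative only, not part of the patch:

// Illustrative only: runtime feature detection plus #[target_feature].
#[cfg(target_arch = "x86_64")]
fn count_mark_bits(words: &[u64]) -> u32 {
    // The annotated inner function lets rustc emit the POPCNT instruction.
    #[target_feature(enable = "popcnt")]
    unsafe fn popcount_all(words: &[u64]) -> u32 {
        words.iter().map(|w| w.count_ones()).sum()
    }
    if is_x86_feature_detected!("popcnt") {
        // SAFETY: the CPU supports POPCNT, as checked above.
        unsafe { popcount_all(words) }
    } else {
        words.iter().map(|w| w.count_ones()).sum()
    }
}

fn main() {
    #[cfg(target_arch = "x86_64")]
    assert_eq!(count_mark_bits(&[0b1_0010, 0b1]), 3);
}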
@@ -148,7 +152,65 @@ impl<VM: VMBinding> ForwardingMetadata<VM> {
         MARK_SPEC.fetch_or_atomic::<u8>(last_word_of_object, 1, Ordering::Relaxed);
     }
 
-    pub fn calculate_offset_vector(&self, region: CompressorRegion, cursor: Address) {
+    // TODO: We could compute a prefix-sum by Hillis-Steele too, for which
+    // the same offset-vector algorithm works. Would it be faster than the
+    // branchy version?
+
+    // SAFETY: Only call this function when the processor supports
+    // pclmulqdq and popcnt, i.e. when processor_can_clmul().
+    #[cfg(target_arch = "x86_64")]
+    unsafe fn calculate_offset_vector_clmul(&self, region: CompressorRegion, cursor: Address) {
+        // This function implements Geoff Langdale's
+        // algorithm to find quote pairs using prefix sums:
+        // https://branchfree.org/2019/03/06/code-fragment-finding-quote-pairs-with-carry-less-multiply-pclmulqdq/
+        debug_assert!(processor_can_clmul());
+        // We need a local function to use #[target_feature], which in turn
+        // allows rustc to generate the POPCNT and PCLMULQDQ instructions.
+        #[target_feature(enable = "pclmulqdq,popcnt")]
+        unsafe fn inner(to: &mut Address, carry: &mut i64, word: usize, addr: Address) {
+            use std::arch::x86_64;
+            if addr.is_aligned_to(Block::BYTES) {
+                // Write the state at the start of the block.
+                // The carry has all bits set the same way,
+                // so extract the least significant bit.
+                let in_object = (*carry as usize) & 1;
+                let encoded = (*to).as_usize() + in_object;
+                OFFSET_VECTOR_SPEC.store_atomic::<usize>(addr, encoded, Ordering::Relaxed);
+            }
+            // Compute the prefix sum of this word of mark bitmap.
+            let ones = x86_64::_mm_set1_epi8(0xFFu8 as i8);
+            let vector = x86_64::_mm_set_epi64x(0, word as i64);
+            let sum: i64 = x86_64::_mm_cvtsi128_si64(x86_64::_mm_clmulepi64_si128::<0>(vector, ones));
+            debug_assert_eq!(sum, prefix_sum(word) as i64);
+            // Carry-in from the last word. If the last word ended in the
+            // middle of an object, we need to invert the in/out-of-object
+            // states in this word.
+            let flipped = sum ^ *carry;
+            // Produce a carry-out for the next word. This shift will
+            // replicate the most significant bit to all bit positions.
+            *carry = flipped >> 63;
+            // Now count the in-object bits. The marked bits on either
+            // end of an object are both inside the object, even though
+            // the prefix sum for the bit at the end of an object is zero,
+            // so we bitwise-or the original word with the prefix sum to
+            // find all in-object bits.
+            *to += (((flipped as usize | word).count_ones()) * 8) as usize;
+        }
+
+        let mut to = region.start();
+        let mut carry: i64 = 0;
+        MARK_SPEC.scan_words::<u8>(
+            region.start(),
+            cursor.align_up(Block::BYTES),
+            &mut |_, _| panic!("should be word aligned, got a bit instead"),
+            &mut |_, _| panic!("should be word aligned, got a byte instead"),
+            &mut |word: usize, addr: Address| {
+                inner(&mut to, &mut carry, word, addr);
+            },
+        );
+    }
+
+    fn calculate_offset_vector_base(&self, region: CompressorRegion, cursor: Address) {
         let mut state = Transducer::new(region.start());
         let first_block = Block::from_aligned_address(region.start());
         let last_block = Block::from_aligned_address(cursor);
@@ -166,6 +228,26 @@ impl<VM: VMBinding> ForwardingMetadata<VM> {
                 },
             );
         }
+    }
+
+    pub fn calculate_offset_vector(&self, region: CompressorRegion, cursor: Address) {
+        #[cfg(target_arch = "x86_64")]
+        {
+            // Blocks must be at least 2^9 = 512 bytes so that the mark bits
+            // for one block (one bit per 8-byte word) fill at least one word.
+            let blocks_large_enough = Block::LOG_BYTES >= 9;
+            if self._use_clmul && blocks_large_enough && processor_can_clmul() {
+                unsafe {
+                    // SAFETY: We checked the processor supports the
+                    // necessary instructions.
+                    self.calculate_offset_vector_clmul(region, cursor)
+                }
+            } else {
+                self.calculate_offset_vector_base(region, cursor)
+            }
+        }
+        #[cfg(not(target_arch = "x86_64"))]
+        self.calculate_offset_vector_base(region, cursor);
         self.calculated.store(true, Ordering::Relaxed);
     }
 
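A worked example of the prefix-XOR trick that calculate_offset_vector_clmul relies on, and that prefix_sum below reproduces, written as a standalone sketch with the portable Hillis-Steele formulation instead of PCLMULQDQ; illustrative only, not part of the patch:

// Illustrative only: carry-less multiplication by all-ones computes, for each
// bit position, the XOR of all mark bits at or below it.
fn prefix_xor(x: u64) -> u64 {
    let mut result = x;
    let mut n = 1;
    while n < u64::BITS {
        result ^= result << n;
        n <<= 1;
    }
    result
}

fn main() {
    // One object whose first word is marked at bit 1 and last word at bit 4.
    let marks: u64 = 0b1_0010;
    let prefix = prefix_xor(marks);
    // The prefix XOR is set strictly between the two marks (bits 1..=3)...
    assert_eq!(prefix, 0b0_1110);
    // ...so OR-ing the marks back in recovers every in-object word:
    // 4 words of 8 bytes, i.e. this object advances `to` by 32 bytes.
    assert_eq!((prefix | marks).count_ones() * 8, 32);
    // The sign bit of the prefix is the carry into the next mark word;
    // it is 0 here because the object ends inside this word.
    assert_eq!((prefix as i64) >> 63, 0);
}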
@@ -212,3 +294,23 @@ impl<VM: VMBinding> ForwardingMetadata<VM> {
         self.calculated.load(Ordering::Relaxed)
     }
 }
+
+#[cfg(target_arch = "x86_64")]
+fn processor_can_clmul() -> bool {
+    is_x86_feature_detected!("pclmulqdq") && is_x86_feature_detected!("popcnt")
+}
+
+// This function is only used in a debug assertion for the x86_64-only
+// calculate_offset_vector_clmul.
+#[cfg(target_arch = "x86_64")]
+fn prefix_sum(x: usize) -> usize {
+    // This function implements a bit-parallel version of the Hillis-Steele prefix sum algorithm:
+    // https://en.wikipedia.org/wiki/Prefix_sum#Algorithm_1:_Shorter_span,_more_parallel
+    let mut result = x;
+    let mut n = 1;
+    while n < usize::BITS {
+        result ^= result << n;
+        n <<= 1;
+    }
+    result
+}
diff --git a/src/util/address.rs b/src/util/address.rs
index c87a5d3abb..01e2b3a635 100644
--- a/src/util/address.rs
+++ b/src/util/address.rs
@@ -347,6 +347,16 @@ impl Address {
     pub fn range_intersection(r1: &Range<Address>, r2: &Range<Address>) -> Range<Address> {
         r1.start.max(r2.start)..r1.end.min(r2.end)
     }
+
+    /// Returns an iterator which steps from this address to below the
+    /// `end` address, in steps of `step` bytes.
+    pub fn iter_to(&self, end: Address, step: usize) -> AddressIterator {
+        AddressIterator {
+            start: *self,
+            end,
+            step,
+        }
+    }
 }
 
 /// allows print Address as upper-case hex value
@@ -386,6 +396,28 @@ impl std::str::FromStr for Address {
     }
 }
 
+/// Iterate addresses from a start address to below an end address,
+/// with a given step size.
+pub struct AddressIterator {
+    start: Address,
+    end: Address,
+    step: usize,
+}
+
+impl Iterator for AddressIterator {
+    type Item = Address;
+
+    fn next(&mut self) -> Option<Address> {
+        if self.start >= self.end {
+            None
+        } else {
+            let current = self.start;
+            self.start += self.step;
+            Some(current)
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::util::Address;
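A standalone sketch of the stepping behaviour that iter_to and AddressIterator provide, with plain usize standing in for Address; illustrative only, not part of the patch:

// Illustrative only: yields start, start + step, ... while strictly below end,
// which is how scan_words below walks metadata bytes and words.
fn iter_to(start: usize, end: usize, step: usize) -> impl Iterator<Item = usize> {
    std::iter::successors(Some(start), move |a| Some(a + step)).take_while(move |a| *a < end)
}

fn main() {
    // Stepping one word (8 bytes) at a time.
    let slots: Vec<usize> = iter_to(0x1000, 0x1020, 8).collect();
    assert_eq!(slots, vec![0x1000, 0x1008, 0x1010, 0x1018]);
}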
diff --git a/src/util/metadata/side_metadata/global.rs b/src/util/metadata/side_metadata/global.rs
index 65ce7443e1..768df0e023 100644
--- a/src/util/metadata/side_metadata/global.rs
+++ b/src/util/metadata/side_metadata/global.rs
@@ -8,7 +8,7 @@ use crate::util::metadata::metadata_val_traits::*;
 use crate::util::metadata::vo_bit::VO_BIT_SIDE_METADATA_SPEC;
 use crate::util::Address;
 use num_traits::FromPrimitive;
-use ranges::BitByteRange;
+use ranges::{BitByteRange, ByteWordRange};
 use std::fmt;
 use std::io::Result;
 use std::sync::atomic::{AtomicU8, Ordering};
@@ -1247,6 +1247,83 @@ impl SideMetadataSpec {
             &mut visitor,
         );
     }
+
+    /// Walk the metadata between two addresses, calling visitor functions with
+    /// varying sizes of aligned metadata.
+    /// - `scan_words` calls its `visit_word` argument with each `usize` of metadata
+    ///   which is word-aligned, and the data address that the metadata word starts at.
+    /// - `scan_words` calls its `visit_byte` argument with each `u8` of metadata
+    ///   which is byte-aligned but word-unaligned, and the data address that the
+    ///   metadata byte starts at.
+    /// - `scan_words` calls its `visit_value` argument with each metadata value
+    ///   which is byte-unaligned, and the data address of the metadata value.
+    ///
+    /// `scan_words` calls each function with arguments in order of lowest to
+    /// highest addresses.
+    pub fn scan_words<T: MetadataValue>(
+        &self,
+        data_start_addr: Address,
+        data_end_addr: Address,
+        visit_value: &mut impl FnMut(T, Address),
+        visit_byte: &mut impl FnMut(u8, Address),
+        visit_word: &mut impl FnMut(usize, Address),
+    ) {
+        assert!(self.uses_contiguous_side_metadata());
+        let start_meta_addr = address_to_contiguous_meta_address(self, data_start_addr);
+        let start_meta_shift = meta_byte_lshift(self, data_start_addr);
+        let end_meta_addr = address_to_contiguous_meta_address(self, data_end_addr);
+        let end_meta_shift = meta_byte_lshift(self, data_end_addr);
+
+        let mut visit_bytes = |start: Address, end: Address| {
+            ranges::break_byte_range(start, end, &mut |range| match range {
+                ByteWordRange::Bytes { start, end } => {
+                    for meta in start.iter_to(end, 1) {
+                        let addr = contiguous_meta_address_to_address(self, meta, 0);
+                        let byte = unsafe { meta.load::<u8>() };
+                        visit_byte(byte, addr);
+                    }
+                }
+                ByteWordRange::Words { start, end } => {
+                    for meta in start.iter_to(end, crate::util::constants::BYTES_IN_ADDRESS) {
+                        let addr = contiguous_meta_address_to_address(self, meta, 0);
+                        let word = unsafe { meta.load::<usize>() };
+                        visit_word(word, addr);
+                    }
+                }
+            });
+        };
+
+        ranges::break_bit_range(
+            start_meta_addr,
+            start_meta_shift,
+            end_meta_addr,
+            end_meta_shift,
+            true,
+            &mut |range| {
+                match range {
+                    BitByteRange::Bytes { start, end } => {
+                        visit_bytes(start, end);
+                    }
+                    BitByteRange::BitsInByte {
+                        addr,
+                        bit_start,
+                        bit_end,
+                    } => {
+                        let start = contiguous_meta_address_to_address(self, addr, bit_start);
+                        let end = contiguous_meta_address_to_address(self, addr, bit_end);
+                        let region_bytes = 1usize << self.log_bytes_in_region;
+                        let mut cursor = start;
+                        while cursor < end {
+                            let value = unsafe { self.load::<T>(cursor) };
+                            visit_value(value, cursor);
+                            cursor += region_bytes;
+                        }
+                    }
+                }
+                false
+            },
+        );
+    }
 }
 
 impl fmt::Debug for SideMetadataSpec {
diff --git a/src/util/metadata/side_metadata/ranges.rs b/src/util/metadata/side_metadata/ranges.rs
index 48d12a7068..8916bc599e 100644
--- a/src/util/metadata/side_metadata/ranges.rs
+++ b/src/util/metadata/side_metadata/ranges.rs
@@ -167,6 +167,39 @@ where
     }
 }
 
+pub enum ByteWordRange {
+    Bytes { start: Address, end: Address },
+    Words { start: Address, end: Address },
+}
+
+pub fn break_byte_range(
+    start_addr: Address,
+    end_addr: Address,
+    visitor: &mut impl FnMut(ByteWordRange),
+) {
+    use crate::util::constants::BYTES_IN_ADDRESS;
+    let start_word = start_addr.align_up(BYTES_IN_ADDRESS);
+    let end_word = end_addr.align_down(BYTES_IN_ADDRESS);
+    if start_word != start_addr {
+        visitor(ByteWordRange::Bytes {
+            start: start_addr,
+            end: start_word,
+        });
+    }
+    if start_word != end_word {
+        visitor(ByteWordRange::Words {
+            start: start_word,
+            end: end_word,
+        });
+    }
+    if end_word != end_addr {
+        visitor(ByteWordRange::Bytes {
+            start: end_word,
+            end: end_addr,
+        });
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::util::constants::BITS_IN_BYTE;
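A standalone sketch of how break_byte_range carves an unaligned metadata range into head bytes, whole words, and tail bytes, again with plain usize standing in for Address; illustrative only, not part of the patch:

// Illustrative only: the three sub-ranges reported for [0x1003, 0x1015)
// on a 64-bit target (BYTES_IN_ADDRESS = 8).
fn main() {
    let (start, end) = (0x1003usize, 0x1015usize);
    let start_word = (start + 7) & !7; // align_up(8)   -> 0x1008
    let end_word = end & !7; // align_down(8) -> 0x1010
    // Head bytes, which scan_words hands to visit_byte one u8 at a time:
    assert_eq!((start, start_word), (0x1003, 0x1008));
    // Aligned middle, handed to visit_word one usize at a time:
    assert_eq!((start_word, end_word), (0x1008, 0x1010));
    // Tail bytes, handed to visit_byte again:
    assert_eq!((end_word, end), (0x1010, 0x1015));
}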
diff --git a/src/util/options.rs b/src/util/options.rs
index fd8952d1da..80a79a3982 100644
--- a/src/util/options.rs
+++ b/src/util/options.rs
@@ -964,7 +964,10 @@
     /// Percentage of heap size reserved for defragmentation.
     /// According to [this paper](https://doi.org/10.1145/1375581.1375586), Immix works well with
     /// headroom between 1% to 3% of the heap size.
-    immix_defrag_headroom_percent: usize [|v: &usize| *v <= 50] = 2
+    immix_defrag_headroom_percent: usize [|v: &usize| *v <= 50] = 2,
+    /// Enable the use of an algorithm based on carryless multiplication to
+    /// compute the offset vector in the Compressor.
+    compressor_use_clmul: bool [always_valid] = true
 }
 
 #[cfg(test)]
diff --git a/src/vm/slot.rs b/src/vm/slot.rs
index d57e3379a9..ccd641007e 100644
--- a/src/vm/slot.rs
+++ b/src/vm/slot.rs
@@ -8,6 +8,7 @@ use std::{fmt::Debug, ops::Range};
 
 use atomic::Atomic;
 
+use crate::util::address::AddressIterator;
 use crate::util::constants::{BYTES_IN_ADDRESS, LOG_BYTES_IN_ADDRESS};
 use crate::util::{Address, ObjectReference};
 
@@ -237,22 +238,13 @@ pub trait MemorySlice: Send + Debug + PartialEq + Eq + Clone + Hash {
 }
 
 /// Iterate slots within `Range<Address>`.
-pub struct AddressRangeIterator {
-    cursor: Address,
-    limit: Address,
-}
+pub struct AddressRangeIterator(AddressIterator);
 
 impl Iterator for AddressRangeIterator {
     type Item = Address;
 
     fn next(&mut self) -> Option<Address> {
-        if self.cursor >= self.limit {
-            None
-        } else {
-            let slot = self.cursor;
-            self.cursor += BYTES_IN_ADDRESS;
-            Some(slot)
-        }
+        self.0.next()
     }
 }
 
@@ -261,10 +253,7 @@ impl MemorySlice for Range<Address> {
     type SlotIterator = AddressRangeIterator;
 
     fn iter_slots(&self) -> Self::SlotIterator {
-        AddressRangeIterator {
-            cursor: self.start,
-            limit: self.end,
-        }
+        AddressRangeIterator(self.start.iter_to(self.end, BYTES_IN_ADDRESS))
     }
 
     fn object(&self) -> Option<ObjectReference> {