diff --git a/Cargo.toml b/Cargo.toml
index 86ec4afe1..45e433fff 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,6 +47,9 @@ serde_test = "1.0"
 doc-comment = "0.3.1"
 bumpalo = { version = "3.13.0", features = ["allocator-api2"] }
 
+[target.'cfg(unix)'.dev-dependencies]
+libc = "0.2.155"
+
 [features]
 default = ["default-hasher", "inline-more", "allocator-api2", "equivalent", "raw-entry"]
 
diff --git a/benches/with_capacity.rs b/benches/with_capacity.rs
new file mode 100644
index 000000000..eeb85b59a
--- /dev/null
+++ b/benches/with_capacity.rs
@@ -0,0 +1,38 @@
+#![feature(test)]
+
+extern crate test;
+
+use hashbrown::HashMap;
+use test::{black_box, Bencher};
+
+type Map<K, V> = HashMap<K, V>; // the map type exercised by every benchmark below
+
+macro_rules! bench_with_capacity { // generates one #[bench] fn per (name, capacity) pair
+    ($name:ident, $cap:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            b.iter(|| {
+                // Build an empty map with capacity `$cap` and return it through `black_box`
+                // so it is not optimized away. Dropping it measures allocation + minimal setup.
+                let m: Map<usize, usize> = Map::with_capacity($cap);
+                black_box(m)
+            });
+        }
+    };
+}
+
+bench_with_capacity!(with_capacity_000000, 0); // name suffix = capacity, zero-padded to 6 digits
+bench_with_capacity!(with_capacity_000001, 1);
+bench_with_capacity!(with_capacity_000003, 3);
+bench_with_capacity!(with_capacity_000007, 7);
+bench_with_capacity!(with_capacity_000008, 8);
+bench_with_capacity!(with_capacity_000016, 16);
+bench_with_capacity!(with_capacity_000032, 32);
+bench_with_capacity!(with_capacity_000064, 64);
+bench_with_capacity!(with_capacity_000128, 128);
+bench_with_capacity!(with_capacity_000256, 256);
+bench_with_capacity!(with_capacity_000512, 512);
+bench_with_capacity!(with_capacity_001024, 1024);
+bench_with_capacity!(with_capacity_004096, 4096);
+bench_with_capacity!(with_capacity_016384, 16384);
+bench_with_capacity!(with_capacity_065536, 65536);
diff --git a/src/map.rs b/src/map.rs
index 86f0ce09a..9890dc3d6 100644
--- a/src/map.rs
+++ b/src/map.rs
@@ -6631,3 +6631,136 @@ mod test_map {
         );
     }
 }
+
+#[cfg(all(test, unix, any(feature = "nightly", feature = "allocator-api2")))]
+mod test_map_with_mmap_allocations {
+    use super::HashMap;
+    use crate::raw::prev_pow2;
+    use core::alloc::Layout;
+    use core::ptr::{null_mut, NonNull};
+
+    #[cfg(feature = "nightly")]
+    use core::alloc::{AllocError, Allocator};
+
+    #[cfg(all(feature = "allocator-api2", not(feature = "nightly")))]
+    use allocator_api2::alloc::{AllocError, Allocator};
+
+    /// Test-only allocator that backs every allocation with anonymous `mmap` pages.
+    /// This is not a production quality allocator, just good enough for basic tests.
+    #[derive(Clone, Copy, Debug)]
+    struct MmapAllocator {
+        /// Page size in bytes; `MmapAllocator::new` guarantees it is a power of 2.
+        page_size: usize,
+    }
+
+    impl MmapAllocator {
+        fn new() -> Result<Self, AllocError> {
+            let result = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
+            if result < 1 { // no usable page size was reported
+                return Err(AllocError);
+            }
+
+            let page_size = result as usize; // result >= 1 here, so no sign wrap
+            if !page_size.is_power_of_two() { // `fit_to_page_size` masks with `page_size - 1`
+                Err(AllocError)
+            } else {
+                Ok(Self { page_size })
+            }
+        }
+
+        fn fit_to_page_size(&self, n: usize) -> Result<usize, AllocError> {
+            // Round n up to whole pages; if n=0, give a single page (wasteful, I know).
+            let n = if n == 0 { self.page_size } else { n };
+
+            match n & (self.page_size - 1) { // remainder of n / page_size (power-of-2 mask)
+                0 => Ok(n), // already a whole number of pages
+                rem => n.checked_add(self.page_size - rem).ok_or(AllocError),
+            }
+        }
+    }
+
+    unsafe impl Allocator for MmapAllocator {
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
+            if layout.align() > self.page_size { // presumably mappings are only page-aligned
+                return Err(AllocError);
+            }
+
+            let null = null_mut();
+            let len = self.fit_to_page_size(layout.size())? as libc::size_t;
+            let prot = libc::PROT_READ | libc::PROT_WRITE;
+            let flags = libc::MAP_PRIVATE | libc::MAP_ANON;
+            let addr = unsafe { libc::mmap(null, len, prot, flags, -1, 0) };
+
+            // mmap signals failure by returning MAP_FAILED, not a null pointer.
+            if addr == libc::MAP_FAILED {
+                return Err(AllocError);
+            }
+
+            match NonNull::new(addr.cast()) {
+                Some(data) => {
+                    // SAFETY: `data` is non-null; this is NonNull::slice_from_raw_parts.
+                    Ok(unsafe {
+                        NonNull::new_unchecked(core::ptr::slice_from_raw_parts_mut(
+                            data.as_ptr(),
+                            len, // the page-rounded length, not `layout.size()`
+                        ))
+                    })
+                }
+
+                // This branch shouldn't be taken in practice, but since we
+                // cannot return null as a valid pointer in our type system,
+                // we unmap the block (ignoring the result) and report failure.
+                None => {
+                    _ = unsafe { libc::munmap(addr, len) };
+                    Err(AllocError)
+                }
+            }
+        }
+
+        unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
+            // Rounding succeeded when this layout was allocated, so it must succeed again.
+            let size = self.fit_to_page_size(layout.size()).unwrap();
+            let _result = libc::munmap(ptr.as_ptr().cast(), size);
+            debug_assert_eq!(0, _result)
+        }
+    }
+
+    #[test] // a capacity-1 request should still yield a table sized for the whole page
+    fn test_tiny_allocation_gets_rounded_to_page_size() {
+        let alloc = MmapAllocator::new().unwrap();
+        let mut map: HashMap<usize, (), _, _> = HashMap::with_capacity_in(1, alloc);
+
+        // Size of an element plus its control byte.
+        let rough_bucket_size = core::mem::size_of::<(usize, ())>() + 1;
+
+        // Rough allowance for misc. padding that's likely in the allocation
+        // due to rounding to group width, etc.
+        let overhead = 3 * core::mem::size_of::<usize>();
+        let num_buckets = (alloc.page_size - overhead) / rough_bucket_size;
+        // Buckets are always powers of 2, so round the estimate down to one.
+        let min_elems = prev_pow2(num_buckets);
+        // The real load factor is 7/8; use 1/2 because this is only a lower bound.
+        let min_capacity = min_elems >> 1;
+        let capacity = map.capacity();
+        assert!(
+            capacity >= min_capacity,
+            "failed: {capacity} >= {min_capacity}"
+        );
+
+        // Fill it up to the reported capacity.
+        for i in 0..capacity {
+            map.insert(i, ());
+        }
+        // Capacity should not have changed and the map should be exactly full.
+        assert_eq!(capacity, map.len());
+        assert_eq!(capacity, map.capacity());
+
+        // One more element must make it grow.
+        map.insert(capacity, ());
+        assert!(
+            capacity < map.capacity(),
+            "failed: {capacity} < {}",
+            map.capacity()
+        );
+    }
+}
diff --git a/src/raw/alloc.rs b/src/raw/alloc.rs
index c01e2a45c..bacb4a149 100644
--- a/src/raw/alloc.rs
+++ b/src/raw/alloc.rs
@@ -15,9 +15,9 @@ mod inner {
     use core::ptr::NonNull;
 
     #[allow(clippy::map_err_ignore)]
-    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<u8>, ()> {
+    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<[u8]>, ()> {
         match alloc.allocate(layout) {
-            Ok(ptr) => Ok(ptr.as_non_null_ptr()),
+            Ok(ptr) => Ok(ptr),
             Err(_) => Err(()),
         }
     }
@@ -38,9 +38,9 @@ mod inner {
     use core::ptr::NonNull;
 
     #[allow(clippy::map_err_ignore)]
-    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<u8>, ()> {
+    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<[u8]>, ()> {
         match alloc.allocate(layout) {
-            Ok(ptr) => Ok(ptr.cast()),
+            Ok(ptr) => Ok(ptr),
             Err(_) => Err(()),
         }
     }
@@ -61,7 +61,7 @@ mod inner {
 
     #[allow(clippy::missing_safety_doc)] // not exposed outside of this crate
     pub unsafe trait Allocator {
-        fn allocate(&self, layout: Layout) -> Result<NonNull<u8>, ()>;
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, ()>;
         unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout);
     }
 
@@ -70,8 +70,19 @@ mod inner {
 
     unsafe impl Allocator for Global {
         #[inline]
-        fn allocate(&self, layout: Layout) -> Result<NonNull<u8>, ()> {
-            unsafe { NonNull::new(alloc(layout)).ok_or(()) }
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, ()> {
+            match unsafe { NonNull::new(alloc(layout)) } {
+                Some(data) => {
+                    // SAFETY: this is NonNull::slice_from_raw_parts.
+                    Ok(unsafe {
+                        NonNull::new_unchecked(core::ptr::slice_from_raw_parts_mut(
+                            data.as_ptr(),
+                            layout.size(),
+                        ))
+                    })
+                }
+                None => Err(()),
+            }
         }
         #[inline]
         unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
@@ -86,7 +97,7 @@ mod inner {
         }
     }
 
-    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<u8>, ()> {
+    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<[u8]>, ()> {
         alloc.allocate(layout)
     }
 }
diff --git a/src/raw/mod.rs b/src/raw/mod.rs
index 6a8d37d82..a96282b70 100644
--- a/src/raw/mod.rs
+++ b/src/raw/mod.rs
@@ -97,6 +97,8 @@ impl ProbeSeq {
 /// taking the maximum load factor into account.
 ///
 /// Returns `None` if an overflow occurs.
+///
+/// This ensures that `buckets * table_layout.size >= table_layout.ctrl_align`.
 // Workaround for emscripten bug emscripten-core/emscripten-fastcomp#258
 #[cfg_attr(target_os = "emscripten", inline(never))]
 #[cfg_attr(not(target_os = "emscripten"), inline)]
@@ -138,13 +140,15 @@ fn capacity_to_buckets(cap: usize, table_layout: TableLayout) -> Option<usize> {
         // We don't bother with a table size of 2 buckets since that can only
         // hold a single element. Instead, we skip directly to a 4 bucket table
         // which can hold 3 elements.
-        return Some(if cap < 4 {
+        let buckets = if cap < 4 {
             4
         } else if cap < 8 {
             8
         } else {
             16
-        });
+        };
+        ensure_bucket_bytes_at_least_ctrl_align(table_layout, buckets);
+        return Some(buckets);
     }
 
     // Otherwise require 1/8 buckets to be empty (87.5% load)
@@ -156,7 +160,22 @@ fn capacity_to_buckets(cap: usize, table_layout: TableLayout) -> Option<usize> {
     // Any overflows will have been caught by the checked_mul. Also, any
     // rounding errors from the division above will be cleaned up by
     // next_power_of_two (which can't overflow because of the previous division).
-    Some(adjusted_cap.next_power_of_two())
+    let buckets = adjusted_cap.next_power_of_two();
+    ensure_bucket_bytes_at_least_ctrl_align(table_layout, buckets);
+    Some(buckets)
+}
+
+// Debug-only check of an invariant `maximum_buckets_in` relies on: for non-ZST
+// `T`, any chosen `buckets` satisfies `buckets * table_layout.size >=
+// table_layout.ctrl_align`, which lets it ignore the `ctrl_align` padding
+// applied by `calculate_layout_for`. If small-table bucket selection or growth
+// policy changes, revisit `maximum_buckets_in`.
+#[inline]
+fn ensure_bucket_bytes_at_least_ctrl_align(table_layout: TableLayout, buckets: usize) {
+    if table_layout.size != 0 { // ZSTs are exempt
+        let prod = table_layout.size.saturating_mul(buckets);
+        debug_assert!(prod >= table_layout.ctrl_align);
+    }
 }
 
 /// Returns the maximum effective capacity for the given bucket mask, taking
@@ -1442,6 +1461,45 @@ impl RawTableInner {
     }
 }
 
+/// Returns the largest power of 2 that is `<= z`; powers of 2 are returned unchanged.
+/// `z` must be non-zero: zero has no previous power of 2 and underflows the shift.
+pub(crate) fn prev_pow2(z: usize) -> usize {
+    debug_assert!(z != 0, "prev_pow2 is not defined for zero");
+    1 << (usize::BITS - 1 - z.leading_zeros())
+}
+
+/// Finds the largest power-of-2 number of buckets that can fit in
+/// `allocation_size` bytes for the given `TableLayout` and group width.
+///
+/// This relies on some invariants of `capacity_to_buckets`, so only feed in
+/// an `allocation_size` calculated from `capacity_to_buckets`.
+fn maximum_buckets_in(
+    allocation_size: usize,
+    table_layout: TableLayout,
+    group_width: usize,
+) -> usize {
+    // Given an inequality like:
+    //   z >= x * y + x + g
+    // x can be maximized by doing:
+    //   x = (z - g) / (y + 1)
+    // where:
+    //   x is the number of buckets
+    //   y is the table_layout.size
+    //   z is the size of the allocation
+    //   g is the group width
+    // But this is ignoring the padding needed for ctrl_align.
+    // If we remember these restrictions:
+    //   x is always a power of 2
+    //   Layout size for T must always be a multiple of the alignment of T
+    // Then the alignment can be ignored if we add the constraint:
+    //   x * y >= table_layout.ctrl_align
+    // This is taken care of by `capacity_to_buckets`.
+    // It may be helpful to understand this if you remember that:
+    //   ctrl_offset = align(x * y, ctrl_align)
+    let x = (allocation_size - group_width) / (table_layout.size + 1); // needs z >= g
+    prev_pow2(x) // x must be non-zero; rounds down to a power of 2
+}
+
 impl RawTableInner {
     /// Allocates a new [`RawTableInner`] with the given number of buckets.
     /// The control bytes and buckets are left uninitialized.
@@ -1459,7 +1517,7 @@ impl RawTableInner {
     unsafe fn new_uninitialized<A>(
         alloc: &A,
         table_layout: TableLayout,
-        buckets: usize,
+        mut buckets: usize,
         fallibility: Fallibility,
     ) -> Result<Self, TryReserveError>
     where
@@ -1468,13 +1526,33 @@ impl RawTableInner {
         debug_assert!(buckets.is_power_of_two());
 
         // Avoid `Option::ok_or_else` because it bloats LLVM IR.
-        let (layout, ctrl_offset) = match table_layout.calculate_layout_for(buckets) {
+        let (layout, mut ctrl_offset) = match table_layout.calculate_layout_for(buckets) {
             Some(lco) => lco,
             None => return Err(fallibility.capacity_overflow()),
         };
 
         let ptr: NonNull<u8> = match do_alloc(alloc, layout) {
-            Ok(block) => block.cast(),
+            Ok(block) => {
+                // The allocator can't return a value smaller than was
+                // requested, so this can be != instead of >=.
+                if block.len() != layout.size() {
+                    // Utilize over-sized allocations.
+                    let x = maximum_buckets_in(block.len(), table_layout, Group::WIDTH);
+                    debug_assert!(x >= buckets);
+                    // Calculate the new ctrl_offset.
+                    let (oversized_layout, oversized_ctrl_offset) =
+                        match table_layout.calculate_layout_for(x) {
+                            Some(lco) => lco,
+                            None => unsafe { hint::unreachable_unchecked() },
+                        };
+                    debug_assert!(oversized_layout.size() <= block.len());
+                    debug_assert!(oversized_ctrl_offset >= ctrl_offset);
+                    ctrl_offset = oversized_ctrl_offset;
+                    buckets = x;
+                }
+
+                block.cast()
+            }
             Err(_) => return Err(fallibility.alloc_err(layout)),
         };
 
@@ -4168,6 +4246,23 @@ impl<T, A: Allocator> RawExtractIf<'_, T, A> {
 mod test_map {
     use super::*;
 
+    #[test]
+    fn test_prev_pow2() {
+        // Walk the powers of 2 from 1 upward. Skip 0, not defined for that input.
+        let mut pow2: usize = 1;
+        while (pow2 << 1) > 0 { // exits at the top bit, which itself is never asserted
+            let next_pow2 = pow2 << 1;
+            assert_eq!(pow2, prev_pow2(pow2));
+            // When pow2 is 1, `pow2 + 1` is 2, which is itself a power of 2 and
+            // so maps to 2 rather than 1; skip the in-between probes there.
+            if next_pow2 > 2 {
+                assert_eq!(pow2, prev_pow2(pow2 + 1));
+                assert_eq!(pow2, prev_pow2(next_pow2 - 1));
+            }
+            pow2 = next_pow2;
+        }
+    }
+
     #[test]
     fn test_minimum_capacity_for_small_types() {
         #[track_caller]