From c6fa5c8ce06aa5a39c8c45c5cd0ad298c5eb5ab4 Mon Sep 17 00:00:00 2001 From: Jonas Malaco Date: Sun, 12 Jan 2025 03:08:22 -0300 Subject: [PATCH 1/8] argon2: add parallelism with cooperative views into memory Coordinated shared access in the memory blocks is implemented with `SegmentViewIter` and associated types, which provide views into Argon2 memory that can be processed in parallel. These views alias in the regions that are read-only, but are disjoint in the regions where mutation happens. Effectively, they implement, with a combination of mutable borrowing and runtime checking, the cooperative contract outlined in RFC 9106. To avoid aliasing mutable references into the entire buffer of blocks (which would be UB), pointers are used up to the moment where a reference (shared or mutable) into a specific block is returned. At that point, aliasing is no longer possible, as argued in SAFETY comments and/or checked at runtime. Finally, add a `parallel` feature and parallelize filling the blocks using the memory views mentioned above and rayon. --- Cargo.lock | 1 + argon2/Cargo.toml | 2 + argon2/src/lib.rs | 210 ++++++++++++++++++++++-------------------- argon2/src/memory.rs | 211 +++++++++++++++++++++++++++++++++++++++++++ benches/Cargo.toml | 4 + 5 files changed, 329 insertions(+), 99 deletions(-) create mode 100644 argon2/src/memory.rs diff --git a/Cargo.lock b/Cargo.lock index 910b9fa4..2b686b56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,7 @@ dependencies = [ "cpufeatures", "hex-literal", "password-hash", + "rayon", "zeroize", ] diff --git a/argon2/Cargo.toml b/argon2/Cargo.toml index f7c14f7e..4348bcc4 100644 --- a/argon2/Cargo.toml +++ b/argon2/Cargo.toml @@ -21,6 +21,7 @@ base64ct = "1.7" blake2 = { version = "0.11.0-rc.0", default-features = false } # optional dependencies +rayon = { version = "1.7", optional = true } password-hash = { version = "0.6.0-rc.1", optional = true } zeroize = { version = "1", default-features = false, optional = true } @@ -36,6 +37,7 @@ default = ["alloc", "password-hash", "rand"] alloc = ["password-hash?/alloc"] std = ["alloc", "password-hash?/os_rng", "base64ct/std"] +parallel = ["dep:rayon", "std"] rand = ["password-hash?/rand_core"] simple = ["password-hash"] zeroize = ["dep:zeroize"] diff --git a/argon2/src/lib.rs b/argon2/src/lib.rs index c8469c96..1919d6da 100644 --- a/argon2/src/lib.rs +++ b/argon2/src/lib.rs @@ -153,6 +153,7 @@ mod algorithm; mod blake2b_long; mod block; mod error; +mod memory; mod params; mod version; @@ -173,10 +174,14 @@ pub use { use crate::blake2b_long::blake2b_long; use blake2::{Blake2b512, Digest, digest}; use core::fmt; +use memory::SegmentViews; #[cfg(all(feature = "alloc", feature = "password-hash"))] use password_hash::{Decimal, Ident, ParamsString, Salt}; +#[cfg(feature = "parallel")] +use rayon::prelude::*; + #[cfg(feature = "zeroize")] use zeroize::Zeroize; @@ -347,7 +352,7 @@ impl<'key> Argon2<'key> { mut initial_hash: digest::Output, ) -> Result<()> { let block_count = self.params.block_count(); - let memory_blocks = memory_blocks + let mut memory_blocks = memory_blocks .get_mut(..block_count) .ok_or(Error::MemoryTooLittle)?; @@ -387,54 +392,29 @@ impl<'key> Argon2<'key> { && pass == 0 && slice < SYNC_POINTS / 2); - for lane in 0..lanes { - let mut address_block = Block::default(); - let mut input_block = Block::default(); - let zero_block = Block::default(); - - if data_independent_addressing { - input_block.as_mut()[..6].copy_from_slice(&[ - pass as u64, - lane as u64, - slice as u64, - memory_blocks.len() as u64, - iterations as u64, - self.algorithm as u64, - ]); - } - - let first_block = if pass == 0 && slice == 0 { + memory_blocks + .segment_views(slice, lanes) + .for_each(|mut memory_view| { + let lane = memory_view.lane(); + //for (lane, mut memory_view) in memory.as_parallel_views(slice, lanes).enumerate() { + let mut address_block = Block::default(); + let mut input_block = Block::default(); + let zero_block = Block::default(); + if data_independent_addressing { - // Generate first set of addresses - self.update_address_block( - &mut address_block, - &mut input_block, - &zero_block, - ); + input_block.as_mut()[..6].copy_from_slice(&[ + pass as u64, + lane as u64, + slice as u64, + memory_view.block_count() as u64, + iterations as u64, + self.algorithm as u64, + ]); } - // The first two blocks of each lane are already initialized - 2 - } else { - 0 - }; - - let mut cur_index = lane * lane_length + slice * segment_length + first_block; - let mut prev_index = if slice == 0 && first_block == 0 { - // Last block in current lane - cur_index + lane_length - 1 - } else { - // Previous block - cur_index - 1 - }; - - // Fill blocks in the segment - for block in first_block..segment_length { - // Extract entropy - let rand = if data_independent_addressing { - let address_index = block % ADDRESSES_IN_BLOCK; - - if address_index == 0 { + let first_block = if pass == 0 && slice == 0 { + if data_independent_addressing { + // Generate first set of addresses self.update_address_block( &mut address_block, &mut input_block, @@ -442,71 +422,103 @@ impl<'key> Argon2<'key> { ); } - address_block.as_ref()[address_index] + // The first two blocks of each lane are already initialized + 2 } else { - memory_blocks[prev_index].as_ref()[0] + 0 }; - // Calculate source block index for compress function - let ref_lane = if pass == 0 && slice == 0 { - // Cannot reference other lanes yet - lane + let mut cur_index = + lane * lane_length + slice * segment_length + first_block; + let mut prev_index = if slice == 0 && first_block == 0 { + // Last block in current lane + cur_index + lane_length - 1 } else { - (rand >> 32) as usize % lanes + // Previous block + cur_index - 1 }; - let reference_area_size = if pass == 0 { - // First pass - if slice == 0 { - // First slice - block - 1 // all but the previous - } else if ref_lane == lane { - // The same lane => add current segment - slice * segment_length + block - 1 - } else { - slice * segment_length - if block == 0 { 1 } else { 0 } - } - } else { - // Second pass - if ref_lane == lane { - lane_length - segment_length + block - 1 + // Fill blocks in the segment + for block in first_block..segment_length { + // Extract entropy + let rand = if data_independent_addressing { + let address_index = block % ADDRESSES_IN_BLOCK; + + if address_index == 0 { + self.update_address_block( + &mut address_block, + &mut input_block, + &zero_block, + ); + } + + address_block.as_ref()[address_index] } else { - lane_length - segment_length - if block == 0 { 1 } else { 0 } - } - }; + memory_view.get_block(prev_index).as_ref()[0] + }; - // 1.2.4. Mapping rand to 0.. and produce - // relative position - let mut map = rand & 0xFFFFFFFF; - map = (map * map) >> 32; - let relative_position = reference_area_size - - 1 - - ((reference_area_size as u64 * map) >> 32) as usize; - - // 1.2.5 Computing starting position - let start_position = if pass != 0 && slice != SYNC_POINTS - 1 { - (slice + 1) * segment_length - } else { - 0 - }; + // Calculate source block index for compress function + let ref_lane = if pass == 0 && slice == 0 { + // Cannot reference other lanes yet + lane + } else { + (rand >> 32) as usize % lanes + }; + + let reference_area_size = if pass == 0 { + // First pass + if slice == 0 { + // First slice + block - 1 // all but the previous + } else if ref_lane == lane { + // The same lane => add current segment + slice * segment_length + block - 1 + } else { + slice * segment_length - if block == 0 { 1 } else { 0 } + } + } else { + // Second pass + if ref_lane == lane { + lane_length - segment_length + block - 1 + } else { + lane_length - segment_length - if block == 0 { 1 } else { 0 } + } + }; + + // 1.2.4. Mapping rand to 0.. and produce + // relative position + let mut map = rand & 0xFFFFFFFF; + map = (map * map) >> 32; + let relative_position = reference_area_size + - 1 + - ((reference_area_size as u64 * map) >> 32) as usize; + + // 1.2.5 Computing starting position + let start_position = if pass != 0 && slice != SYNC_POINTS - 1 { + (slice + 1) * segment_length + } else { + 0 + }; - let lane_index = (start_position + relative_position) % lane_length; - let ref_index = ref_lane * lane_length + lane_index; + let lane_index = (start_position + relative_position) % lane_length; + let ref_index = ref_lane * lane_length + lane_index; - // Calculate new block - let result = - self.compress(&memory_blocks[prev_index], &memory_blocks[ref_index]); + // Calculate new block + let result = self.compress( + memory_view.get_block(prev_index), + memory_view.get_block(ref_index), + ); - if self.version == Version::V0x10 || pass == 0 { - memory_blocks[cur_index] = result; - } else { - memory_blocks[cur_index] ^= &result; - }; + if self.version == Version::V0x10 || pass == 0 { + *memory_view.get_block_mut(cur_index) = result; + } else { + *memory_view.get_block_mut(cur_index) ^= &result; + }; - prev_index = cur_index; - cur_index += 1; - } - } + prev_index = cur_index; + cur_index += 1; + } + }); } } diff --git a/argon2/src/memory.rs b/argon2/src/memory.rs new file mode 100644 index 00000000..28fe6efc --- /dev/null +++ b/argon2/src/memory.rs @@ -0,0 +1,211 @@ +//! Views into Argon2 memory that can be processed in parallel. +//! +//! This module implements, with a combination of compile-time borrowing and runtime checking, the +//! cooperative contract described in section 3.4 (Indexing) of RFC 9106: +//! +//! > To enable parallel block computation, we further partition the memory matrix into SL = 4 +//! > vertical slices. The intersection of a slice and a lane is called a segment, which has a +//! > length of q/SL. Segments of the same slice can be computed in parallel and do not reference +//! > blocks from each other. All other blocks can be referenced. + +#![warn( + clippy::undocumented_unsafe_blocks, + clippy::missing_safety_doc, + unsafe_op_in_unsafe_fn +)] + +use core::marker::PhantomData; +use core::ptr::NonNull; + +#[cfg(feature = "parallel")] +use rayon::iter::{IntoParallelIterator, ParallelIterator}; + +use crate::{Block, SYNC_POINTS}; + +pub trait SegmentViews<'a> { + /// Construct an iterator of parallelizable views into a set of Argon2 memory blocks. + /// + /// If the `parallel` feature is enabled, the returned type implements + /// [`rayon::iter::ParallelIterator`]; otherwise it implements [`core::iter::Iterator`]. + fn segment_views(&mut self, slice: usize, lanes: usize) -> SegmentViewIter<'_>; +} + +impl<'a> SegmentViews<'a> for &'a mut [Block] { + fn segment_views(&mut self, slice: usize, lanes: usize) -> SegmentViewIter<'_> { + // The pointer needs to be derived from a mutable reference because (later) mutating the + // blocks through a pointer derived from a shared reference would be UB. + let blocks = NonNull::from(&mut **self); + // SAFETY: we take `&mut self` and any views derived from the returned iterator carry this + // mutable borrow. Therefore, it's impossible to create a `MemoryViewIter` while another + // one, or any views derived from it, still exist. Additionally, the pointer and number of + // blocks are created from `self`. + unsafe { SegmentViewIter::new(blocks.cast(), self.len(), slice, lanes) } + } +} + +/// Iterator of parallelizable views into a set of Argon2 memory blocks. +pub struct SegmentViewIter<'a> { + inner: SegmentViewInner<'a>, + #[cfg(not(feature = "parallel"))] + minted: usize, +} + +impl SegmentViewIter<'_> { + /// Construct an Iterator of parallelizable views into a set of Argon2 memory blocks. + /// + /// # Safety + /// + /// `blocks` must point to the start of a Rust slice buffer with `block_count` blocks, and + /// there currently are no views or view iterators into that memory region. + unsafe fn new(blocks: NonNull, block_count: usize, slice: usize, lanes: usize) -> Self { + // SAFETY: the pointer is valid and there currently are no views into the memory region. + let inner = unsafe { SegmentViewInner::new(blocks, block_count, slice, lanes) }; + SegmentViewIter { + inner, + #[cfg(not(feature = "parallel"))] + minted: 0, + } + } +} + +#[cfg(not(feature = "parallel"))] +impl<'a> Iterator for SegmentViewIter<'a> { + type Item = SegmentView<'a>; + + fn next(&mut self) -> Option { + if self.minted < self.inner.lanes { + // SAFETY: `self` mutably borrows the underlying memory region for a single Argon2 + // slice, and we create exactly one memory view per lane. + let view = unsafe { SegmentView::new(self.inner.unsafe_copy(), self.minted) }; + self.minted += 1; + Some(view) + } else { + None + } + } +} + +#[cfg(feature = "parallel")] +impl<'a> ParallelIterator for SegmentViewIter<'a> { + type Item = SegmentView<'a>; + + fn drive_unindexed(self, consumer: C) -> C::Result + where + C: rayon::iter::plumbing::UnindexedConsumer, + { + (0..self.inner.lanes) + .into_par_iter() + .map(|lane| { + // SAFETY: `self` mutably borrows the underlying memory region for a single Argon2 + // slice, and we create exactly one memory view per lane. + unsafe { SegmentView::new(self.inner.unsafe_copy(), lane) } + }) + .drive_unindexed(consumer) + } +} + +/// A view into an Argon2 memory region for a particular Argon2 slice and lane. +pub struct SegmentView<'a> { + inner: SegmentViewInner<'a>, + lane: usize, +} + +impl<'a> SegmentView<'a> { + /// Create a new segment view into Argon2 memory. + /// + /// # Safety + /// + /// There can simultaneously exist at most one view per lane into the same memory region, and + /// all of them must refer to the same Argon2 slice. + unsafe fn new(inner: SegmentViewInner<'a>, lane: usize) -> Self { + Self { inner, lane } + } +} + +impl SegmentView<'_> { + pub fn get_block(&self, index: usize) -> &Block { + assert!(index < self.inner.block_count); + assert!( + index / self.lane_length() == self.lane + || index % self.lane_length() / self.segment_length() != self.inner.slice + ); + + // SAFETY: constructing `self` required the pointer to be valid, and `index` is in bounds. + let ptr = unsafe { self.inner.blocks.add(index) }; + // SAFETY: constructing `self` required that this be the only segment view for this lane, + // and that no segment views exist for other Argon2 slices. We check that `index` is is + // either on this lane -- in which case there is mutable aliasing because `get_block_mut` + // takes `&mut self` -- or on a different Argon2 slice -- in which case there are no + // mutable references to it at all. + unsafe { ptr.as_ref() } + } + + pub fn get_block_mut(&mut self, index: usize) -> &mut Block { + assert!(index < self.inner.block_count); + assert!(index / self.lane_length() == self.lane); + + // SAFETY: constructing `self` required the pointer to be valid, and `index` is in bounds. + let mut ptr = unsafe { self.inner.blocks.add(index) }; + // SAFETY: constructing `self` required this be the only segment view for this lane, and + // that no segment views exist for other Argon2 slices. We check that `index` is on this + // lane, and there is no aliasing because we take `&mut self`. + unsafe { ptr.as_mut() } + } + + pub fn block_count(&self) -> usize { + self.inner.block_count + } + + pub fn lane(&self) -> usize { + self.lane + } + + fn lane_length(&self) -> usize { + self.inner.block_count / self.inner.lanes + } + + fn segment_length(&self) -> usize { + self.inner.block_count / self.inner.lanes / SYNC_POINTS + } +} + +/// Underlying pointer and associated data for segment views (and view iterators). +struct SegmentViewInner<'a> { + blocks: NonNull, + block_count: usize, + slice: usize, + lanes: usize, + phantom: PhantomData<&'a mut Block>, +} + +// SAFETY: this is a private type, and `SegmentView` enforces the aliasing rules at runtime. +unsafe impl Send for SegmentViewInner<'_> {} +// SAFETY: this is a private type, and `SegmentView` enforces the aliasing rules at runtime. +unsafe impl Sync for SegmentViewInner<'_> {} + +impl SegmentViewInner<'_> { + /// Wrap the underlying pointer and associated data for a segment view. + /// + /// # Safety + /// + /// This method must not be called in a way that causes memory views to mutably alias. + /// Additionally, `blocks` must point to the start of a Rust slice buffer with `block_count` blocks. + unsafe fn new(blocks: NonNull, block_count: usize, slice: usize, lanes: usize) -> Self { + Self { + blocks, + block_count, + slice, + lanes, + phantom: PhantomData, + } + } + + /// Copy the underlying pointer and associated data. + /// + /// # Safety + /// + /// This method must not be called in a way that causes memory views to mutably alias. + unsafe fn unsafe_copy(&self) -> Self { + Self { ..*self } + } +} diff --git a/benches/Cargo.toml b/benches/Cargo.toml index 6109e6af..502a894f 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -11,6 +11,10 @@ argon2 = { path = "../argon2" } criterion = { version = "0.4", features = ["html_reports"] } pprof = { version = "0.14", features = ["flamegraph", "criterion"] } +[features] +default = [] +parallel = ["argon2/parallel"] + [[bench]] name = "argon2" path = "src/argon2.rs" From 0e2d2e89c3d9f8152f7ba35b6add2b4e950db553 Mon Sep 17 00:00:00 2001 From: Jonas Malaco Date: Sat, 8 Mar 2025 13:37:31 -0300 Subject: [PATCH 2/8] benches: bump criterion to prevent Profiler not implemented error This was cause by having multiple different versions of criterion, and therefore the train, in use: we specified ^0.4, but pprof 0.14.0 already required ^0.5. --- benches/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benches/Cargo.toml b/benches/Cargo.toml index 502a894f..978dd901 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -8,7 +8,7 @@ publish = false [dev-dependencies] argon2 = { path = "../argon2" } -criterion = { version = "0.4", features = ["html_reports"] } +criterion = { version = "0.5", features = ["html_reports"] } pprof = { version = "0.14", features = ["flamegraph", "criterion"] } [features] From cbc5b4b820a1abb17718cade7859851f1dc75680 Mon Sep 17 00:00:00 2001 From: Jonas Malaco Date: Sat, 8 Mar 2025 13:44:09 -0300 Subject: [PATCH 3/8] benches: expand argon2 benchmarks varying p_cost Additionally, use a set instead of trying to avoid repeating a particular set of params by hand. --- benches/src/argon2.rs | 62 +++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/benches/src/argon2.rs b/benches/src/argon2.rs index b26b6371..d2690d04 100644 --- a/benches/src/argon2.rs +++ b/benches/src/argon2.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use argon2::*; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use pprof::criterion::{Output, PProfProfiler}; @@ -26,46 +28,26 @@ fn bench_default_params(c: &mut Criterion) { } } -fn bench_vary_m(c: &mut Criterion) { - let t_cost = 4; - let p_cost = 4; - for m_cost in [2 * 1024, 16 * 1024, 64 * 1024, 256 * 1024] { - let test_name = format!("argon2id V0x13 m={m_cost} t={t_cost} p={p_cost}"); - c.bench_function(&test_name, |b| { - let mut out = [0u8; 32]; - let params = Params::new(m_cost, t_cost, p_cost, Some(32)).unwrap(); - let argon2 = Argon2::new(Algorithm::Argon2id, Version::V0x13, params); - b.iter(|| { - argon2 - .hash_password_into(black_box(BENCH_PASSWORD), black_box(BENCH_SALT), &mut out) - .unwrap() - }) - }); +fn bench_vary_params(c: &mut Criterion) { + let mut tests = HashSet::new(); + // Vary `m_cost`. + for m_cost in [2 * 1024, 16 * 1024, 32 * 1024, 64 * 1024, 256 * 1024] { + tests.insert((m_cost, 4, 4)); } -} - -fn bench_vary_t(c: &mut Criterion) { - let m_cost = 32 * 1024; - let p_cost = 4; - for t_cost in [2, 8, 16, 24] { - let test_name = format!("argon2id V0x13 m={m_cost} t={t_cost} p={p_cost}"); - c.bench_function(&test_name, |b| { - let mut out = [0u8; 32]; - let params = Params::new(m_cost, t_cost, p_cost, Some(32)).unwrap(); - let argon2 = Argon2::new(Algorithm::Argon2id, Version::V0x13, params); - b.iter(|| { - argon2 - .hash_password_into(black_box(BENCH_PASSWORD), black_box(BENCH_SALT), &mut out) - .unwrap() - }) - }); + // Vary `t_cost`. + for t_cost in [1, 2, 4, 8, 16] { + tests.insert((32 * 1024, t_cost, 4)); } -} - -fn bench_vary_p(c: &mut Criterion) { - let m_cost = 32 * 1024; - let t_cost = 4; - for p_cost in [2, 8, 16, 64] { + // Vary `p_cost`. + for p_cost in [1, 2, 4, 8, 16] { + for m_mib in [256 * 1024, 1024 * 1024] { + tests.insert((m_mib, 1, p_cost)); + } + for t_cost in [1, 2, 4] { + tests.insert((32 * 1024, t_cost, p_cost)); + } + } + for (m_cost, t_cost, p_cost) in tests { let test_name = format!("argon2id V0x13 m={m_cost} t={t_cost} p={p_cost}"); c.bench_function(&test_name, |b| { let mut out = [0u8; 32]; @@ -85,8 +67,6 @@ criterion_group!( config = Criterion::default().with_profiler(PProfProfiler::new(300, Output::Flamegraph(None))); targets = bench_default_params, - bench_vary_m, - bench_vary_t, - bench_vary_p, + bench_vary_params, ); criterion_main!(benches); From 28583a9587ba8bdb9fc89f5145f2a368da22230c Mon Sep 17 00:00:00 2001 From: Jonas Malaco Date: Sat, 8 Mar 2025 15:07:39 -0300 Subject: [PATCH 4/8] benches: patch password-hash due to new os-rng feature --- benches/Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benches/Cargo.toml b/benches/Cargo.toml index 978dd901..491ce0ea 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -19,3 +19,6 @@ parallel = ["argon2/parallel"] name = "argon2" path = "src/argon2.rs" harness = false + +[patch.crates-io] +password-hash = { git = "https://github.com/RustCrypto/traits.git" } From 32db77c5c539d9545dca09edb23b7966631a461b Mon Sep 17 00:00:00 2001 From: Jonas Malaco Date: Sat, 15 Mar 2025 05:17:06 -0300 Subject: [PATCH 5/8] argon2: simplify memory view implementation and internal API --- argon2/src/lib.rs | 246 ++++++++++++++++++++--------------------- argon2/src/memory.rs | 253 +++++++++++++++++-------------------------- 2 files changed, 220 insertions(+), 279 deletions(-) diff --git a/argon2/src/lib.rs b/argon2/src/lib.rs index 1919d6da..0d55b245 100644 --- a/argon2/src/lib.rs +++ b/argon2/src/lib.rs @@ -174,14 +174,11 @@ pub use { use crate::blake2b_long::blake2b_long; use blake2::{Blake2b512, Digest, digest}; use core::fmt; -use memory::SegmentViews; +use memory::Memory; #[cfg(all(feature = "alloc", feature = "password-hash"))] use password_hash::{Decimal, Ident, ParamsString, Salt}; -#[cfg(feature = "parallel")] -use rayon::prelude::*; - #[cfg(feature = "zeroize")] use zeroize::Zeroize; @@ -386,140 +383,133 @@ impl<'key> Argon2<'key> { // Run passes on blocks for pass in 0..iterations { - for slice in 0..SYNC_POINTS { + memory_blocks.for_each_segment(lanes, |mut memory_view, slice, lane| { let data_independent_addressing = self.algorithm == Algorithm::Argon2i || (self.algorithm == Algorithm::Argon2id && pass == 0 && slice < SYNC_POINTS / 2); - memory_blocks - .segment_views(slice, lanes) - .for_each(|mut memory_view| { - let lane = memory_view.lane(); - //for (lane, mut memory_view) in memory.as_parallel_views(slice, lanes).enumerate() { - let mut address_block = Block::default(); - let mut input_block = Block::default(); - let zero_block = Block::default(); - - if data_independent_addressing { - input_block.as_mut()[..6].copy_from_slice(&[ - pass as u64, - lane as u64, - slice as u64, - memory_view.block_count() as u64, - iterations as u64, - self.algorithm as u64, - ]); + let mut address_block = Block::default(); + let mut input_block = Block::default(); + let zero_block = Block::default(); + + if data_independent_addressing { + input_block.as_mut()[..6].copy_from_slice(&[ + pass as u64, + lane as u64, + slice as u64, + block_count as u64, + iterations as u64, + self.algorithm as u64, + ]); + } + + let first_block = if pass == 0 && slice == 0 { + if data_independent_addressing { + // Generate first set of addresses + self.update_address_block( + &mut address_block, + &mut input_block, + &zero_block, + ); + } + + // The first two blocks of each lane are already initialized + 2 + } else { + 0 + }; + + let mut cur_index = lane * lane_length + slice * segment_length + first_block; + let mut prev_index = if slice == 0 && first_block == 0 { + // Last block in current lane + cur_index + lane_length - 1 + } else { + // Previous block + cur_index - 1 + }; + + // Fill blocks in the segment + for block in first_block..segment_length { + // Extract entropy + let rand = if data_independent_addressing { + let address_index = block % ADDRESSES_IN_BLOCK; + + if address_index == 0 { + self.update_address_block( + &mut address_block, + &mut input_block, + &zero_block, + ); } - let first_block = if pass == 0 && slice == 0 { - if data_independent_addressing { - // Generate first set of addresses - self.update_address_block( - &mut address_block, - &mut input_block, - &zero_block, - ); - } - - // The first two blocks of each lane are already initialized - 2 + address_block.as_ref()[address_index] + } else { + memory_view.get_block(prev_index).as_ref()[0] + }; + + // Calculate source block index for compress function + let ref_lane = if pass == 0 && slice == 0 { + // Cannot reference other lanes yet + lane + } else { + (rand >> 32) as usize % lanes + }; + + let reference_area_size = if pass == 0 { + // First pass + if slice == 0 { + // First slice + block - 1 // all but the previous + } else if ref_lane == lane { + // The same lane => add current segment + slice * segment_length + block - 1 } else { - 0 - }; - - let mut cur_index = - lane * lane_length + slice * segment_length + first_block; - let mut prev_index = if slice == 0 && first_block == 0 { - // Last block in current lane - cur_index + lane_length - 1 + slice * segment_length - if block == 0 { 1 } else { 0 } + } + } else { + // Second pass + if ref_lane == lane { + lane_length - segment_length + block - 1 } else { - // Previous block - cur_index - 1 - }; - - // Fill blocks in the segment - for block in first_block..segment_length { - // Extract entropy - let rand = if data_independent_addressing { - let address_index = block % ADDRESSES_IN_BLOCK; - - if address_index == 0 { - self.update_address_block( - &mut address_block, - &mut input_block, - &zero_block, - ); - } - - address_block.as_ref()[address_index] - } else { - memory_view.get_block(prev_index).as_ref()[0] - }; - - // Calculate source block index for compress function - let ref_lane = if pass == 0 && slice == 0 { - // Cannot reference other lanes yet - lane - } else { - (rand >> 32) as usize % lanes - }; - - let reference_area_size = if pass == 0 { - // First pass - if slice == 0 { - // First slice - block - 1 // all but the previous - } else if ref_lane == lane { - // The same lane => add current segment - slice * segment_length + block - 1 - } else { - slice * segment_length - if block == 0 { 1 } else { 0 } - } - } else { - // Second pass - if ref_lane == lane { - lane_length - segment_length + block - 1 - } else { - lane_length - segment_length - if block == 0 { 1 } else { 0 } - } - }; - - // 1.2.4. Mapping rand to 0.. and produce - // relative position - let mut map = rand & 0xFFFFFFFF; - map = (map * map) >> 32; - let relative_position = reference_area_size - - 1 - - ((reference_area_size as u64 * map) >> 32) as usize; - - // 1.2.5 Computing starting position - let start_position = if pass != 0 && slice != SYNC_POINTS - 1 { - (slice + 1) * segment_length - } else { - 0 - }; - - let lane_index = (start_position + relative_position) % lane_length; - let ref_index = ref_lane * lane_length + lane_index; - - // Calculate new block - let result = self.compress( - memory_view.get_block(prev_index), - memory_view.get_block(ref_index), - ); - - if self.version == Version::V0x10 || pass == 0 { - *memory_view.get_block_mut(cur_index) = result; - } else { - *memory_view.get_block_mut(cur_index) ^= &result; - }; - - prev_index = cur_index; - cur_index += 1; + lane_length - segment_length - if block == 0 { 1 } else { 0 } } - }); - } + }; + + // 1.2.4. Mapping rand to 0.. and produce + // relative position + let mut map = rand & 0xFFFFFFFF; + map = (map * map) >> 32; + let relative_position = reference_area_size + - 1 + - ((reference_area_size as u64 * map) >> 32) as usize; + + // 1.2.5 Computing starting position + let start_position = if pass != 0 && slice != SYNC_POINTS - 1 { + (slice + 1) * segment_length + } else { + 0 + }; + + let lane_index = (start_position + relative_position) % lane_length; + let ref_index = ref_lane * lane_length + lane_index; + + // Calculate new block + let result = self.compress( + memory_view.get_block(prev_index), + memory_view.get_block(ref_index), + ); + + if self.version == Version::V0x10 || pass == 0 { + *memory_view.get_block_mut(cur_index) = result; + } else { + *memory_view.get_block_mut(cur_index) ^= &result; + }; + + prev_index = cur_index; + cur_index += 1; + } + }); } Ok(()) diff --git a/argon2/src/memory.rs b/argon2/src/memory.rs index 28fe6efc..3802d9c0 100644 --- a/argon2/src/memory.rs +++ b/argon2/src/memory.rs @@ -22,190 +22,141 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use crate::{Block, SYNC_POINTS}; -pub trait SegmentViews<'a> { - /// Construct an iterator of parallelizable views into a set of Argon2 memory blocks. +/// Extension trait for Argon2 memory blocks. +pub(crate) trait Memory<'a> { + /// Compute each Argon2 segment. /// - /// If the `parallel` feature is enabled, the returned type implements - /// [`rayon::iter::ParallelIterator`]; otherwise it implements [`core::iter::Iterator`]. - fn segment_views(&mut self, slice: usize, lanes: usize) -> SegmentViewIter<'_>; -} - -impl<'a> SegmentViews<'a> for &'a mut [Block] { - fn segment_views(&mut self, slice: usize, lanes: usize) -> SegmentViewIter<'_> { - // The pointer needs to be derived from a mutable reference because (later) mutating the - // blocks through a pointer derived from a shared reference would be UB. - let blocks = NonNull::from(&mut **self); - // SAFETY: we take `&mut self` and any views derived from the returned iterator carry this - // mutable borrow. Therefore, it's impossible to create a `MemoryViewIter` while another - // one, or any views derived from it, still exist. Additionally, the pointer and number of - // blocks are created from `self`. - unsafe { SegmentViewIter::new(blocks.cast(), self.len(), slice, lanes) } - } + /// By default computation is single threaded. Parallel computation can be enabled with the + /// `parallel` feature, in which case [rayon] is used to compute as many lanes in parallel as + /// possible. + fn for_each_segment(&mut self, lanes: usize, f: F) + where + F: Fn(SegmentView<'_>, usize, usize) + Sync + Send; } -/// Iterator of parallelizable views into a set of Argon2 memory blocks. -pub struct SegmentViewIter<'a> { - inner: SegmentViewInner<'a>, +impl Memory<'_> for &mut [Block] { #[cfg(not(feature = "parallel"))] - minted: usize, -} - -impl SegmentViewIter<'_> { - /// Construct an Iterator of parallelizable views into a set of Argon2 memory blocks. - /// - /// # Safety - /// - /// `blocks` must point to the start of a Rust slice buffer with `block_count` blocks, and - /// there currently are no views or view iterators into that memory region. - unsafe fn new(blocks: NonNull, block_count: usize, slice: usize, lanes: usize) -> Self { - // SAFETY: the pointer is valid and there currently are no views into the memory region. - let inner = unsafe { SegmentViewInner::new(blocks, block_count, slice, lanes) }; - SegmentViewIter { - inner, - #[cfg(not(feature = "parallel"))] - minted: 0, - } - } -} - -#[cfg(not(feature = "parallel"))] -impl<'a> Iterator for SegmentViewIter<'a> { - type Item = SegmentView<'a>; - - fn next(&mut self) -> Option { - if self.minted < self.inner.lanes { - // SAFETY: `self` mutably borrows the underlying memory region for a single Argon2 - // slice, and we create exactly one memory view per lane. - let view = unsafe { SegmentView::new(self.inner.unsafe_copy(), self.minted) }; - self.minted += 1; - Some(view) - } else { - None + fn for_each_segment(&mut self, lanes: usize, f: F) + where + F: Fn(SegmentView<'_>, usize, usize) + Sync + Send, + { + let inner = MemoryInner::new(self, lanes); + for slice in 0..SYNC_POINTS { + for lane in 0..lanes { + // SAFETY: `self` exclusively borrows the blocks, and we sequentially process + // slices and segments. + let segment = unsafe { SegmentView::new(inner, slice, lane) }; + f(segment, slice, lane); + } } } -} -#[cfg(feature = "parallel")] -impl<'a> ParallelIterator for SegmentViewIter<'a> { - type Item = SegmentView<'a>; - - fn drive_unindexed(self, consumer: C) -> C::Result + #[cfg(feature = "parallel")] + fn for_each_segment(&mut self, lanes: usize, f: F) where - C: rayon::iter::plumbing::UnindexedConsumer, + F: Fn(SegmentView<'_>, usize, usize) + Sync + Send, { - (0..self.inner.lanes) - .into_par_iter() - .map(|lane| { - // SAFETY: `self` mutably borrows the underlying memory region for a single Argon2 - // slice, and we create exactly one memory view per lane. - unsafe { SegmentView::new(self.inner.unsafe_copy(), lane) } - }) - .drive_unindexed(consumer) + let inner = MemoryInner::new(self, lanes); + for slice in 0..SYNC_POINTS { + (0..lanes).into_par_iter().for_each(|lane| { + // SAFETY: `self` exclusively borrows the blocks, we sequentially process slices, + // and we create exactly one segment view per lane in a slice. + let segment = unsafe { SegmentView::new(inner, slice, lane) }; + f(segment, slice, lane); + }); + } } } -/// A view into an Argon2 memory region for a particular Argon2 slice and lane. -pub struct SegmentView<'a> { - inner: SegmentViewInner<'a>, - lane: usize, +/// Low-level pointer and metadata for an Argon2 memory region. +#[derive(Clone, Copy)] +struct MemoryInner<'a> { + blocks: NonNull, + block_count: usize, + lane_length: usize, + phantom: PhantomData<&'a mut Block>, } -impl<'a> SegmentView<'a> { - /// Create a new segment view into Argon2 memory. - /// - /// # Safety - /// - /// There can simultaneously exist at most one view per lane into the same memory region, and - /// all of them must refer to the same Argon2 slice. - unsafe fn new(inner: SegmentViewInner<'a>, lane: usize) -> Self { - Self { inner, lane } - } -} +impl MemoryInner<'_> { + fn new(memory_blocks: &mut [Block], lanes: usize) -> Self { + let block_count = memory_blocks.len(); + let lane_length = block_count / lanes; -impl SegmentView<'_> { - pub fn get_block(&self, index: usize) -> &Block { - assert!(index < self.inner.block_count); - assert!( - index / self.lane_length() == self.lane - || index % self.lane_length() / self.segment_length() != self.inner.slice - ); - - // SAFETY: constructing `self` required the pointer to be valid, and `index` is in bounds. - let ptr = unsafe { self.inner.blocks.add(index) }; - // SAFETY: constructing `self` required that this be the only segment view for this lane, - // and that no segment views exist for other Argon2 slices. We check that `index` is is - // either on this lane -- in which case there is mutable aliasing because `get_block_mut` - // takes `&mut self` -- or on a different Argon2 slice -- in which case there are no - // mutable references to it at all. - unsafe { ptr.as_ref() } - } + // SAFETY: the pointer needs to be derived from a mutable reference because (later) + // mutating the blocks through a pointer derived from a shared reference would be UB. + let blocks = NonNull::from(memory_blocks); - pub fn get_block_mut(&mut self, index: usize) -> &mut Block { - assert!(index < self.inner.block_count); - assert!(index / self.lane_length() == self.lane); - - // SAFETY: constructing `self` required the pointer to be valid, and `index` is in bounds. - let mut ptr = unsafe { self.inner.blocks.add(index) }; - // SAFETY: constructing `self` required this be the only segment view for this lane, and - // that no segment views exist for other Argon2 slices. We check that `index` is on this - // lane, and there is no aliasing because we take `&mut self`. - unsafe { ptr.as_mut() } + MemoryInner { + blocks: blocks.cast(), + block_count, + lane_length, + phantom: PhantomData, + } } - pub fn block_count(&self) -> usize { - self.inner.block_count + fn lane_of(&self, index: usize) -> usize { + index / self.lane_length } - pub fn lane(&self) -> usize { - self.lane + fn slice_of(&self, index: usize) -> usize { + index / (self.lane_length / SYNC_POINTS) % SYNC_POINTS } +} - fn lane_length(&self) -> usize { - self.inner.block_count / self.inner.lanes - } +// SAFETY: private type, and just a pointer with some metadata. +unsafe impl Send for MemoryInner<'_> {} - fn segment_length(&self) -> usize { - self.inner.block_count / self.inner.lanes / SYNC_POINTS - } -} +// SAFETY: private type, and just a pointer with some metadata. +unsafe impl Sync for MemoryInner<'_> {} -/// Underlying pointer and associated data for segment views (and view iterators). -struct SegmentViewInner<'a> { - blocks: NonNull, - block_count: usize, +/// A view into Argon2 memory for a particular segment (i.e. slice × lane). +pub(crate) struct SegmentView<'a> { + inner: MemoryInner<'a>, slice: usize, - lanes: usize, - phantom: PhantomData<&'a mut Block>, + lane: usize, } -// SAFETY: this is a private type, and `SegmentView` enforces the aliasing rules at runtime. -unsafe impl Send for SegmentViewInner<'_> {} -// SAFETY: this is a private type, and `SegmentView` enforces the aliasing rules at runtime. -unsafe impl Sync for SegmentViewInner<'_> {} - -impl SegmentViewInner<'_> { - /// Wrap the underlying pointer and associated data for a segment view. +impl<'a> SegmentView<'a> { + /// Create a view into Argon2 memory for a particular segment (i.e. slice × lane). /// /// # Safety /// - /// This method must not be called in a way that causes memory views to mutably alias. - /// Additionally, `blocks` must point to the start of a Rust slice buffer with `block_count` blocks. - unsafe fn new(blocks: NonNull, block_count: usize, slice: usize, lanes: usize) -> Self { - Self { - blocks, - block_count, - slice, - lanes, - phantom: PhantomData, - } + /// At any time, there can be at most one view for a given Argon2 segment. Additionally, all + /// concurrent segment views must be for the same slice. + unsafe fn new(inner: MemoryInner<'a>, slice: usize, lane: usize) -> Self { + SegmentView { inner, slice, lane } } - /// Copy the underlying pointer and associated data. + /// Get a shared reference to a block. /// - /// # Safety + /// # Panics /// - /// This method must not be called in a way that causes memory views to mutably alias. - unsafe fn unsafe_copy(&self) -> Self { - Self { ..*self } + /// Panics if the index is out of bounds or if the desired block *could* be mutably aliased (if + /// it is on the current slice but on a different lane/segment). + pub fn get_block(&self, index: usize) -> &Block { + assert!(index < self.inner.block_count); + assert!(self.inner.lane_of(index) == self.lane || self.inner.slice_of(index) != self.slice); + + // SAFETY: by construction, the base pointer is valid for reads, and we assert that the + // index is in bounds. We also assert that the index either lies on this lane, or is on + // another slice. Finally, we're the only view into this segment, and mutating through it + // requires `&mut self` and is restricted to blocks within the segment. + unsafe { self.inner.blocks.add(index).as_ref() } + } + + /// Get a mutable reference to a block. + /// + /// # Panics + /// + /// Panics if the index is out of bounds or if the desired block lies outside this segment. + pub fn get_block_mut(&mut self, index: usize) -> &mut Block { + assert!(index < self.inner.block_count); + assert_eq!(self.inner.lane_of(index), self.lane); + assert_eq!(self.inner.slice_of(index), self.slice); + + // SAFETY: by construction, the base pointer is valid for reads and writes, and we assert + // that the index is in bounds. We also assert that the index lies on this segment, and + // we're the only view for it, taking `&mut self`. + unsafe { self.inner.blocks.add(index).as_mut() } } } From 3a380c3465b8ddd37b65cd84a223683f595410e7 Mon Sep 17 00:00:00 2001 From: Jonas Malaco Date: Sat, 15 Mar 2025 05:17:36 -0300 Subject: [PATCH 6/8] benches: run benches in a predictable order --- benches/src/argon2.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benches/src/argon2.rs b/benches/src/argon2.rs index d2690d04..b939c5d1 100644 --- a/benches/src/argon2.rs +++ b/benches/src/argon2.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use argon2::*; use criterion::{black_box, criterion_group, criterion_main, Criterion}; @@ -29,7 +29,7 @@ fn bench_default_params(c: &mut Criterion) { } fn bench_vary_params(c: &mut Criterion) { - let mut tests = HashSet::new(); + let mut tests = BTreeSet::new(); // Vary `m_cost`. for m_cost in [2 * 1024, 16 * 1024, 32 * 1024, 64 * 1024, 256 * 1024] { tests.insert((m_cost, 4, 4)); From 267c7530ffc7d3ead72e8744e509b5618159b122 Mon Sep 17 00:00:00 2001 From: Jonas Malaco Date: Fri, 18 Jul 2025 19:41:24 -0300 Subject: [PATCH 7/8] argon2: don't enable std by enabling parallel --- argon2/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argon2/Cargo.toml b/argon2/Cargo.toml index 4348bcc4..7339b4b4 100644 --- a/argon2/Cargo.toml +++ b/argon2/Cargo.toml @@ -37,7 +37,7 @@ default = ["alloc", "password-hash", "rand"] alloc = ["password-hash?/alloc"] std = ["alloc", "password-hash?/os_rng", "base64ct/std"] -parallel = ["dep:rayon", "std"] +parallel = ["dep:rayon"] rand = ["password-hash?/rand_core"] simple = ["password-hash"] zeroize = ["dep:zeroize"] From fb4b0f49b92725d5af02a2ad3d7a6d9346b631b0 Mon Sep 17 00:00:00 2001 From: Jonas Malaco Date: Fri, 18 Jul 2025 19:36:27 -0300 Subject: [PATCH 8/8] argon2: configure lints at the toplevel To pass the newly enabled clippy lints, add a missing safety comment to the compress_avx2 call. --- argon2/src/lib.rs | 3 +++ argon2/src/memory.rs | 6 ------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/argon2/src/lib.rs b/argon2/src/lib.rs index 0d55b245..5ab7f3ee 100644 --- a/argon2/src/lib.rs +++ b/argon2/src/lib.rs @@ -13,8 +13,10 @@ clippy::cast_sign_loss, clippy::checked_conversions, clippy::implicit_saturating_sub, + clippy::missing_safety_doc, clippy::panic, clippy::panic_in_result_fn, + clippy::undocumented_unsafe_blocks, clippy::unwrap_used, missing_docs, rust_2018_idioms, @@ -525,6 +527,7 @@ impl<'key> Argon2<'key> { } if self.cpu_feat_avx2.get() { + // SAFETY: checked that AVX2 was detected. return unsafe { compress_avx2(rhs, lhs) }; } } diff --git a/argon2/src/memory.rs b/argon2/src/memory.rs index 3802d9c0..0565857f 100644 --- a/argon2/src/memory.rs +++ b/argon2/src/memory.rs @@ -8,12 +8,6 @@ //! > length of q/SL. Segments of the same slice can be computed in parallel and do not reference //! > blocks from each other. All other blocks can be referenced. -#![warn( - clippy::undocumented_unsafe_blocks, - clippy::missing_safety_doc, - unsafe_op_in_unsafe_fn -)] - use core::marker::PhantomData; use core::ptr::NonNull;