From 3477b1be4ff619745aab300a79199e727f7145e5 Mon Sep 17 00:00:00 2001 From: NexusXe Date: Sat, 23 May 2026 18:34:14 -0500 Subject: [PATCH 01/10] CPU feature detection Adds AVX-512F feature detection and uses VAES presence alongside to detect "good" AVX-512 support, present on Ice Lake/Zen 4 and later. This is to prevent "bad" implementations (specifically early Intel implementations) from automatically being used. --- src/cpu.cpp | 39 +++++++++++++++++++++++++++++++++++++++ src/cpu.hpp | 3 +++ 2 files changed, 42 insertions(+) diff --git a/src/cpu.cpp b/src/cpu.cpp index 3178d037..05cf4671 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -67,17 +67,56 @@ namespace randomx { { #ifdef HAVE_CPUID int info[4]; + bool os_avx512_support = false; + cpuid(info, 0); int nIds = info[0]; if (nIds >= 0x00000001) { cpuid(info, 0x00000001); ssse3_ = (info[2] & (1 << 9)) != 0; aes_ = (info[2] & (1 << 25)) != 0; + + // AVX-512 support + // Must have OSXSAVE enabled + bool osxsave = (info[2] & (1 << 27)) != 0; + if (osxsave) { + // Check if OS saves AVX-512 state + unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + os_avx512_support = (xcrFeatureMask & 0xE6) == 0xE6; + } } + if (nIds >= 0x00000007) { cpuid(info, 0x00000007); avx2_ = (info[1] & (1 << 5)) != 0; + + if (os_avx512_support) { + /* + AVX-512 is primarily used to decrease instruction cache and + decoder pressure. + + On early Intel implementations of AVX-512, the excessive state + transition penalty makes this implementation detrimental to + performance. + + However, on Ice Lake/Zen 4 and above, there is little to no + state transition penalty for using AVX-512, so using it is + overall beneficial. + + VAES support alongside AVX-512F is used to differentiate "poor" + implementations from "good" ones. + */ + + bool has_vaes = (info[2] & (1 << 9)) != 0; + + if (has_vaes) { + // Only auto-enable AVX-512 if "good" implementation detected + avx512_ = (info[1] & (1 << 16)) != 0; // AVX-512F + } + } } + + #elif defined(__aarch64__) #if defined(HWCAP_AES) long hwcaps = getauxval(AT_HWCAP); diff --git a/src/cpu.hpp b/src/cpu.hpp index 7db03311..21585046 100644 --- a/src/cpu.hpp +++ b/src/cpu.hpp @@ -37,6 +37,8 @@ namespace randomx { inline bool hasAes() const { return aes_; } inline bool hasSsse3() const { return ssse3_; } inline bool hasAvx2() const { return avx2_; } + /// Specifically AVX-512F + inline bool hasAvx512() const { return avx512_; } #ifdef __riscv inline bool hasRVV() const { return rvv_; } inline int getRVV_Length() const { return rvv_length; } @@ -46,6 +48,7 @@ namespace randomx { bool aes_ = false; bool ssse3_ = false; bool avx2_ = false; + bool avx512_ = false; #ifdef __riscv bool rvv_ = false; int rvv_length = 0; From 4d68a076504d11317291b97c19b33beafc948c7b Mon Sep 17 00:00:00 2001 From: NexusXe Date: Sat, 23 May 2026 18:38:11 -0500 Subject: [PATCH 02/10] AVX-512F blamka round implementation Based on src/blake2/blamka-round-avx2.h --- src/blake2/blamka-round-avx512.h | 147 +++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 src/blake2/blamka-round-avx512.h diff --git a/src/blake2/blamka-round-avx512.h b/src/blake2/blamka-round-avx512.h new file mode 100644 index 00000000..80acaa76 --- /dev/null +++ b/src/blake2/blamka-round-avx512.h @@ -0,0 +1,147 @@ +/* +Copyright (c) 2026, NexusXe +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +// Requires avx512f, + +#ifndef BLAKE_ROUND_MKA_OPT_H +#define BLAKE_ROUND_MKA_OPT_H + +#include "blake2-impl.h" + +#ifdef __GNUC__ +#include +#else +#include +#endif + +#define rotr32(x) _mm512_ror_epi64((x), 32) +#define rotr24(x) _mm512_ror_epi64((x), 24) +#define rotr16(x) _mm512_ror_epi64((x), 16) +#define rotr63(x) _mm512_ror_epi64((x), 63) + +#define G1_AVX512(A, B, C, D) \ + do { \ + __m512i ml = _mm512_mul_epu32(A, B); \ + ml = _mm512_add_epi64(ml, ml); \ + A = _mm512_add_epi64(A, _mm512_add_epi64(B, ml)); \ + D = _mm512_xor_si512(D, A); \ + D = rotr32(D); \ + \ + ml = _mm512_mul_epu32(C, D); \ + ml = _mm512_add_epi64(ml, ml); \ + C = _mm512_add_epi64(C, _mm512_add_epi64(D, ml)); \ + \ + B = _mm512_xor_si512(B, C); \ + B = rotr24(B); \ + } while((void)0, 0); + +#define G2_AVX512(A, B, C, D) \ +do { \ + __m512i ml = _mm512_mul_epu32(A, B); \ + ml = _mm512_add_epi64(ml, ml); \ + A = _mm512_add_epi64(A, _mm512_add_epi64(B, ml)); \ + D = _mm512_xor_si512(D, A); \ + D = rotr16(D); \ + \ + ml = _mm512_mul_epu32(C, D); \ + ml = _mm512_add_epi64(ml, ml); \ + C = _mm512_add_epi64(C, _mm512_add_epi64(D, ml)); \ + B = _mm512_xor_si512(B, C); \ + B = rotr63(B); \ +} while((void)0, 0); + +#define DIAGONALIZE_1(A, B, C, D) \ + do { \ + B = _mm512_permutex_epi64(B, _MM_SHUFFLE(0, 3, 2, 1)); \ + C = _mm512_permutex_epi64(C, _MM_SHUFFLE(1, 0, 3, 2)); \ + D = _mm512_permutex_epi64(D, _MM_SHUFFLE(2, 1, 0, 3)); \ + } while((void)0, 0); + +#define DIAGONALIZE_2(A, B, C, D) \ + do { \ + __m512i idx_B = _mm512_setr_epi64(1, 4, 3, 6, 5, 0, 7, 2); \ + __m512i idx_C = _mm512_setr_epi64(4, 5, 6, 7, 0, 1, 2, 3); \ + __m512i idx_D = _mm512_setr_epi64(5, 0, 7, 2, 1, 4, 3, 6); \ + \ + B = _mm512_permutexvar_epi64(idx_B, B); \ + C = _mm512_permutexvar_epi64(idx_C, C); \ + D = _mm512_permutexvar_epi64(idx_D, D); \ + } while((void)0, 0); + +#define UNDIAGONALIZE_1(A, B, C, D) \ + do { \ + B = _mm512_permutex_epi64(B, _MM_SHUFFLE(2, 1, 0, 3)); \ + C = _mm512_permutex_epi64(C, _MM_SHUFFLE(1, 0, 3, 2)); \ + D = _mm512_permutex_epi64(D, _MM_SHUFFLE(0, 3, 2, 1)); \ + } while((void)0, 0); + +#define UNDIAGONALIZE_2(A, B, C, D) \ + do { \ + __m512i idx_B = _mm512_setr_epi64(5, 0, 7, 2, 1, 4, 3, 6); \ + __m512i idx_C = _mm512_setr_epi64(4, 5, 6, 7, 0, 1, 2, 3); \ + __m512i idx_D = _mm512_setr_epi64(1, 4, 3, 6, 5, 0, 7, 2); \ + \ + B = _mm512_permutexvar_epi64(idx_B, B); \ + C = _mm512_permutexvar_epi64(idx_C, C); \ + D = _mm512_permutexvar_epi64(idx_D, D); \ + } while((void)0, 0); + +#define BLAKE2_ROUND_1(A, B, C, D) \ + do { \ + G1_AVX512(A, B, C, D) \ + G2_AVX512(A, B, C, D) \ + \ + DIAGONALIZE_1(A, B, C, D) \ + \ + G1_AVX512(A, B, C, D) \ + G2_AVX512(A, B, C, D) \ + \ + UNDIAGONALIZE_1(A, B, C, D) \ + } while((void)0, 0); + +#define BLAKE2_ROUND_2(A, B, C, D) \ + do { \ + G1_AVX512(A, B, C, D) \ + G2_AVX512(A, B, C, D) \ + \ + DIAGONALIZE_2(A, B, C, D) \ + \ + G1_AVX512(A, B, C, D) \ + G2_AVX512(A, B, C, D) \ + \ + UNDIAGONALIZE_2(A, B, C, D) \ + } while((void)0, 0); + +#endif /* BLAKE_ROUND_MKA_OPT_H */ From 11663446aaae3a17629d11e6293830c18e55dca4 Mon Sep 17 00:00:00 2001 From: NexusXe Date: Sat, 23 May 2026 18:39:02 -0500 Subject: [PATCH 03/10] AVX-512F Argon2 implementation Based on src/argon2_avx2.c --- src/argon2.h | 1 + src/argon2_avx512.c | 240 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 src/argon2_avx512.c diff --git a/src/argon2.h b/src/argon2.h index 9052f42a..ed103dfe 100644 --- a/src/argon2.h +++ b/src/argon2.h @@ -255,6 +255,7 @@ void randomx_argon2_fill_segment_ref(const argon2_instance_t* instance, randomx_argon2_impl *randomx_argon2_impl_ssse3(); randomx_argon2_impl *randomx_argon2_impl_avx2(); +randomx_argon2_impl *randomx_argon2_impl_avx512(); #if defined(__cplusplus) } diff --git a/src/argon2_avx512.c b/src/argon2_avx512.c new file mode 100644 index 00000000..78179920 --- /dev/null +++ b/src/argon2_avx512.c @@ -0,0 +1,240 @@ +/* +Copyright (c) 2026, NexusXe +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include +#include +#include + +#include "argon2.h" + +void randomx_argon2_fill_segment_avx512(const argon2_instance_t* instance, + argon2_position_t position); + +randomx_argon2_impl* randomx_argon2_impl_avx512() { +#if defined(__AVX512F__) + return &randomx_argon2_fill_segment_avx512; +#endif + return NULL; +} + +#if defined(__AVX512F__) + +#include "argon2_core.h" + +#include "blake2/blamka-round-avx512.h" +#include "blake2/blake2-impl.h" +#include "blake2/blake2.h" + +/* + Since the AVX-512 implementation processes v0 and v1 together, these helpers + are used to help un-interleave these values from the state block. +*/ + +/// Packs the lower 256-bit halves of v0 and v1 into a single 512-bit register +static inline __m512i avx512_pack_lower(__m512i v0, __m512i v1) { + return _mm512_inserti64x4( + _mm512_castsi256_si512(_mm512_castsi512_si256(v0)), + _mm512_castsi512_si256(v1), + 1 + ); +} + +/// Packs the upper 256-bit halves of v0 and v1 into a single 512-bit register +static inline __m512i avx512_pack_upper(__m512i v0, __m512i v1) { + return _mm512_inserti64x4( + _mm512_castsi256_si512(_mm512_extracti64x4_epi64(v0, 1)), + _mm512_extracti64x4_epi64(v1, 1), + 1 + ); +} + +static void fill_block(__m512i* state, const block* ref_block, + block* next_block, int with_xor) { + + __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK]; + unsigned int i; + + if (with_xor) { + for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { + // 1. state[i] = state[i] ^ ref_block[i] + state[i] = _mm512_xor_si512( + state[i], + _mm512_loadu_si512((const __m512i*)ref_block->v + i) + ); + + // 2. block_XY[i] = state[i] ^ next_block[i] + block_XY[i] = _mm512_xor_si512( + state[i], + _mm512_loadu_si512((const __m512i*)next_block->v + i) + ); + } + } + else { + for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { + // block_XY[i] = state[i] = state[i] ^ ref_block[i] + block_XY[i] = state[i] = _mm512_xor_si512( + state[i], + _mm512_loadu_si512((const __m512i*)ref_block->v + i) + ); + } + } + + for (i = 0; i < 4; ++i) { + __m512i A = avx512_pack_lower(state[4 * i + 0], state[4 * i + 2]); + __m512i B = avx512_pack_upper(state[4 * i + 0], state[4 * i + 2]); + __m512i C = avx512_pack_lower(state[4 * i + 1], state[4 * i + 3]); + __m512i D = avx512_pack_upper(state[4 * i + 1], state[4 * i + 3]); + + BLAKE2_ROUND_1(A, B, C, D); + + state[4 * i + 0] = avx512_pack_lower(A, B); + state[4 * i + 2] = avx512_pack_upper(A, B); + state[4 * i + 1] = avx512_pack_lower(C, D); + state[4 * i + 3] = avx512_pack_upper(C, D); + } + + for (int j = 0; j < 2; ++j) { + // j=0 handles AVX2 i=0 and i=1 (evens) + // j=1 handles AVX2 i=2 and i=3 (odds) + + __m512i A_0 = avx512_pack_lower(state[j], state[j + 2]); + __m512i B_0 = avx512_pack_lower(state[j + 4], state[j + 6]); + __m512i C_0 = avx512_pack_lower(state[j + 8], state[j + 10]); + __m512i D_0 = avx512_pack_lower(state[j + 12], state[j + 14]); + + __m512i A_1 = avx512_pack_upper(state[j], state[j + 2]); + __m512i B_1 = avx512_pack_upper(state[j + 4], state[j + 6]); + __m512i C_1 = avx512_pack_upper(state[j + 8], state[j + 10]); + __m512i D_1 = avx512_pack_upper(state[j + 12], state[j + 14]); + + BLAKE2_ROUND_2(A_0, B_0, C_0, D_0); + BLAKE2_ROUND_2(A_1, B_1, C_1, D_1); + + state[j] = avx512_pack_lower(A_0, A_1); + state[j + 2] = avx512_pack_upper(A_0, A_1); + state[j + 4] = avx512_pack_lower(B_0, B_1); + state[j + 6] = avx512_pack_upper(B_0, B_1); + state[j + 8] = avx512_pack_lower(C_0, C_1); + state[j + 10] = avx512_pack_upper(C_0, C_1); + state[j + 12] = avx512_pack_lower(D_0, D_1); + state[j + 14] = avx512_pack_upper(D_0, D_1); + } + + for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { + state[i] = _mm512_xor_si512(state[i], block_XY[i]); + _mm512_storeu_si512((__m512i*)next_block->v + i, state[i]); + } +} + +void randomx_argon2_fill_segment_avx512(const argon2_instance_t* instance, + argon2_position_t position) { + block* ref_block = NULL, * curr_block = NULL; + block address_block, input_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index, i; + __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK]; + + if (instance == NULL) { + return; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* We have already generated the first two blocks */ + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } + else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /* 1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + pseudo_rand = instance->memory[prev_offset].v[0]; + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(state, ref_block, curr_block, 0); + } + else { + if (0 == position.pass) { + fill_block(state, ref_block, curr_block, 0); + } + else { + fill_block(state, ref_block, curr_block, 1); + } + } + } +} + +#endif From 33fe8360c1887cd0dd48768b8e57ac8cc8880e68 Mon Sep 17 00:00:00 2001 From: NexusXe Date: Sat, 23 May 2026 18:39:54 -0500 Subject: [PATCH 04/10] Use AVX-512F Argon2 --- src/dataset.hpp | 3 +++ src/randomx.cpp | 3 +++ src/randomx.h | 1 + 3 files changed, 7 insertions(+) diff --git a/src/dataset.hpp b/src/dataset.hpp index 26bc0b28..c84c6e25 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -92,6 +92,9 @@ namespace randomx { void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); inline randomx_argon2_impl* selectArgonImpl(randomx_flags flags) { + if (flags & RANDOMX_FLAG_ARGON2_AVX512) { + return randomx_argon2_impl_avx512(); + } if (flags & RANDOMX_FLAG_ARGON2_AVX2) { return randomx_argon2_impl_avx2(); } diff --git a/src/randomx.cpp b/src/randomx.cpp index 32790289..c09f251e 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -57,6 +57,9 @@ extern "C" { if (HAVE_AES && cpu.hasAes()) { flags |= RANDOMX_FLAG_HARD_AES; } + if (randomx_argon2_impl_avx512() != nullptr && cpu.hasAvx512()) { + flags |= RANDOMX_FLAG_ARGON2_AVX512; + } if (randomx_argon2_impl_avx2() != nullptr && cpu.hasAvx2()) { flags |= RANDOMX_FLAG_ARGON2_AVX2; } diff --git a/src/randomx.h b/src/randomx.h index 63076983..368d91ea 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -50,6 +50,7 @@ typedef enum { RANDOMX_FLAG_ARGON2_AVX2 = 64, RANDOMX_FLAG_ARGON2 = 96, RANDOMX_FLAG_V2 = 128, + RANDOMX_FLAG_ARGON2_AVX512 = 256, } randomx_flags; typedef struct randomx_dataset randomx_dataset; From 1bf1e087b98b02b1c1e7c062231c00c7f53b0ca6 Mon Sep 17 00:00:00 2001 From: NexusXe Date: Sat, 23 May 2026 18:43:27 -0500 Subject: [PATCH 05/10] Add AVX-512F to benchmarks & tests --- src/tests/benchmark.cpp | 12 ++++++++++-- src/tests/tests.cpp | 12 ++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index d4e6e8d2..6a9618ff 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -94,6 +94,7 @@ void printUsage(const char* executable) { std::cout << " --seed S seed for cache initialization (default: 0)" << std::endl; std::cout << " --ssse3 use optimized Argon2 for SSSE3 CPUs" << std::endl; std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl; + std::cout << " --avx512 use optimized Argon2 for AVX-512F CPUs" << std::endl; std::cout << " --auto select the best options for the current CPU" << std::endl; std::cout << " --noBatch calculate hashes one by one (default: batch)" << std::endl; std::cout << " --commit calculate commitments instead of hashes (default: hashes)" << std::endl; @@ -152,7 +153,7 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result int main(int argc, char** argv) { bool softAes, miningMode, verificationMode, help, largePages, jit, secure, commit, v2; - bool ssse3, avx2, autoFlags, noBatch; + bool ssse3, avx2, avx512, autoFlags, noBatch; int noncesCount, threadCount, initThreadCount; uint64_t threadAffinity; int32_t seedValue; @@ -175,6 +176,7 @@ int main(int argc, char** argv) { readOption("--secure", argc, argv, secure); readOption("--ssse3", argc, argv, ssse3); readOption("--avx2", argc, argv, avx2); + readOption("--avx512", argc, argv, avx512); readOption("--auto", argc, argv, autoFlags); readOption("--noBatch", argc, argv, noBatch); readOption("--commit", argc, argv, commit); @@ -215,6 +217,9 @@ int main(int argc, char** argv) { if (avx2) { flags |= RANDOMX_FLAG_ARGON2_AVX2; } + if (avx512) { + flags |= RANDOMX_FLAG_ARGON2_AVX512; + } if (!softAes) { flags |= RANDOMX_FLAG_HARD_AES; } @@ -242,7 +247,10 @@ int main(int argc, char** argv) { flags |= RANDOMX_FLAG_V2; } - if (flags & RANDOMX_FLAG_ARGON2_AVX2) { + if (flags & RANDOMX_FLAG_ARGON2_AVX512) { + std::cout << " - Argon2 implementation: AVX-512F" << std::endl; + } + else if (flags & RANDOMX_FLAG_ARGON2_AVX2) { std::cout << " - Argon2 implementation: AVX2" << std::endl; } else if (flags & RANDOMX_FLAG_ARGON2_SSSE3) { diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index dee37d24..b0078630 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -1099,6 +1099,18 @@ int main() { assert(cacheMemory[33554431] == 0x1f47f056d05cd99b); }); + if (cache != nullptr) + randomx_release_cache(cache); + cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_AVX512); + + runTest("Cache initialization: AVX-512F", (flags & RANDOMX_FLAG_ARGON2_AVX512) && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { + initCache("test key 000"); + uint64_t* cacheMemory = (uint64_t*)cache->memory; + assert(cacheMemory[0] == 0x191e0e1d23c02186); + assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1); + assert(cacheMemory[33554431] == 0x1f47f056d05cd99b); + }); + if (cache != nullptr) randomx_release_cache(cache); cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT); From 51f8369d7ad4ab85dc4818c2916460a3e1bac954 Mon Sep 17 00:00:00 2001 From: NexusXe Date: Sat, 23 May 2026 18:44:01 -0500 Subject: [PATCH 06/10] Add AVX-512 Argon2 files to MSVC and Clang config files --- CMakeLists.txt | 6 ++++++ vcxproj/randomx-dll.vcxproj | 3 +++ vcxproj/randomx-dll.vcxproj.filters | 3 +++ vcxproj/randomx.vcxproj | 4 ++++ vcxproj/randomx.vcxproj.filters | 5 +++++ 5 files changed, 21 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d34e2fe..0547a266 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ src/aes_hash.cpp src/argon2_ref.c src/argon2_ssse3.c src/argon2_avx2.c +src/argon2_avx512.c src/bytecode_machine.cpp src/cpu.cpp src/dataset.cpp @@ -107,6 +108,7 @@ if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STRE set_property(SOURCE src/jit_compiler_x86_static.asm PROPERTY LANGUAGE ASM_MASM) set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS /arch:AVX2) + set_source_files_properties(src/argon2_avx512.c COMPILE_FLAGS /arch:AVX512) set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /DRELWITHDEBINFO") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /DRELWITHDEBINFO") @@ -137,6 +139,10 @@ if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STRE if(HAVE_AVX2) set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS -mavx2) endif() + check_c_compiler_flag(-mavx512f HAVE_AVX512F) + if(HAVE_AVX512F) + set_source_files_properties(src/argon2_avx512.c COMPILE_FLAGS -mavx512f) + endif() endif() endif() endif() diff --git a/vcxproj/randomx-dll.vcxproj b/vcxproj/randomx-dll.vcxproj index 4eaae9be..2976d1d7 100644 --- a/vcxproj/randomx-dll.vcxproj +++ b/vcxproj/randomx-dll.vcxproj @@ -57,6 +57,9 @@ AdvancedVectorExtensions2 + + AdvancedVectorExtensions512 + diff --git a/vcxproj/randomx-dll.vcxproj.filters b/vcxproj/randomx-dll.vcxproj.filters index 5b51f9f7..9ce0786a 100644 --- a/vcxproj/randomx-dll.vcxproj.filters +++ b/vcxproj/randomx-dll.vcxproj.filters @@ -175,6 +175,9 @@ Source Files + + Source Files + Source Files diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index cefdc8fb..40714e7e 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -134,6 +134,9 @@ SET ERRORLEVEL = 0 AdvancedVectorExtensions2 + + AdvancedVectorExtensions512 + @@ -169,6 +172,7 @@ SET ERRORLEVEL = 0 + diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters index 7f055b5b..7e09b801 100644 --- a/vcxproj/randomx.vcxproj.filters +++ b/vcxproj/randomx.vcxproj.filters @@ -84,6 +84,9 @@ Source Files + + Source Files + Source Files @@ -196,6 +199,8 @@ Header Files + + Header Files Header Files From e940c9b2cad394540261eb92d3d0d7d59e35ebba Mon Sep 17 00:00:00 2001 From: NexusXe Date: Sat, 23 May 2026 18:53:07 -0500 Subject: [PATCH 07/10] Remove old comment I was unsure if extensions past AVX-512F would be needed, but it turned out that since the primary data element for this code is a 64-bit integer, only AVX-512F is needed. --- src/blake2/blamka-round-avx512.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/blake2/blamka-round-avx512.h b/src/blake2/blamka-round-avx512.h index 80acaa76..0a540965 100644 --- a/src/blake2/blamka-round-avx512.h +++ b/src/blake2/blamka-round-avx512.h @@ -33,8 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves */ -// Requires avx512f, - #ifndef BLAKE_ROUND_MKA_OPT_H #define BLAKE_ROUND_MKA_OPT_H From 679db5d22623349bf91ec0ebae0a11b6360c41ac Mon Sep 17 00:00:00 2001 From: NexusXe Date: Mon, 25 May 2026 11:02:06 -0500 Subject: [PATCH 08/10] ensure `_XCR_XFEATURE_ENABLED_MASK` gets defined --- src/cpu.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cpu.cpp b/src/cpu.cpp index 05cf4671..932c2973 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -40,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); } #endif + #ifndef _XCR_XFEATURE_ENABLED_MASK + #define _XCR_XFEATURE_ENABLED_MASK 0 + #endif #endif #if defined(HAVE_HWCAP) From 9f583baee55a8da0be72065e24013f3b45d30a7f Mon Sep 17 00:00:00 2001 From: NexusXe Date: Mon, 25 May 2026 11:36:26 -0500 Subject: [PATCH 09/10] specify macro section as for both GCC and Clang --- src/cpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index 932c2973..e7961fda 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(_MSC_VER) #include #define cpuid(info, x) __cpuidex(info, x, 0) - #else //GCC + #else // GCC/Clang #include void cpuid(int info[4], int InfoType) { __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); From e6726b928083ba681f3a5a2ae72e831725f14395 Mon Sep 17 00:00:00 2001 From: NexusXe Date: Mon, 25 May 2026 11:36:52 -0500 Subject: [PATCH 10/10] Use assembly for `_xgetbv` on non-MSVC GCC/Clang more strictly ensures that the `_xgetbv` macro is only used when the `XSAVE` target feature is enabled. This project is (intentionally) built without strict target features, so instead use an assembly shim that manually uses the intrinsic. Since this is only run when `OSXSAVE` is enabled (and thus the `XSAVE` feature *must* be enabled on the host), this is safe. --- src/cpu.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index e7961fda..3b2440c1 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -39,6 +39,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. void cpuid(int info[4], int InfoType) { __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); } + + __attribute__((target("xsave"))) + static inline unsigned long long _xgetbv(unsigned int index) { + unsigned int eax, edx; + __asm__ __volatile__( + "xgetbv" + : "=a"(eax), "=d"(edx) + : "c"(index) + ); + return ((unsigned long long)edx << 32) | eax; + } #endif #ifndef _XCR_XFEATURE_ENABLED_MASK #define _XCR_XFEATURE_ENABLED_MASK 0 @@ -83,7 +94,11 @@ namespace randomx { // Must have OSXSAVE enabled bool osxsave = (info[2] & (1 << 27)) != 0; if (osxsave) { - // Check if OS saves AVX-512 state + /* + Check if OS saves AVX-512 state + Requires XSAVE, which is guaranteed present due to the presence + of the OSXSAVE enabled bit + */ unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); os_avx512_support = (xcrFeatureMask & 0xE6) == 0xE6; }