diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d34e2fe..0547a266 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ src/aes_hash.cpp src/argon2_ref.c src/argon2_ssse3.c src/argon2_avx2.c +src/argon2_avx512.c src/bytecode_machine.cpp src/cpu.cpp src/dataset.cpp @@ -107,6 +108,7 @@ if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STRE set_property(SOURCE src/jit_compiler_x86_static.asm PROPERTY LANGUAGE ASM_MASM) set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS /arch:AVX2) + set_source_files_properties(src/argon2_avx512.c COMPILE_FLAGS /arch:AVX512) set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /DRELWITHDEBINFO") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /DRELWITHDEBINFO") @@ -137,6 +139,10 @@ if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STRE if(HAVE_AVX2) set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS -mavx2) endif() + check_c_compiler_flag(-mavx512f HAVE_AVX512F) + if(HAVE_AVX512F) + set_source_files_properties(src/argon2_avx512.c COMPILE_FLAGS -mavx512f) + endif() endif() endif() endif() diff --git a/src/argon2.h b/src/argon2.h index 9052f42a..ed103dfe 100644 --- a/src/argon2.h +++ b/src/argon2.h @@ -255,6 +255,7 @@ void randomx_argon2_fill_segment_ref(const argon2_instance_t* instance, randomx_argon2_impl *randomx_argon2_impl_ssse3(); randomx_argon2_impl *randomx_argon2_impl_avx2(); +randomx_argon2_impl *randomx_argon2_impl_avx512(); #if defined(__cplusplus) } diff --git a/src/argon2_avx512.c b/src/argon2_avx512.c new file mode 100644 index 00000000..78179920 --- /dev/null +++ b/src/argon2_avx512.c @@ -0,0 +1,240 @@ +/* +Copyright (c) 2026, NexusXe +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include +#include +#include + +#include "argon2.h" + +void randomx_argon2_fill_segment_avx512(const argon2_instance_t* instance, + argon2_position_t position); + +randomx_argon2_impl* randomx_argon2_impl_avx512() { +#if defined(__AVX512F__) + return &randomx_argon2_fill_segment_avx512; +#endif + return NULL; +} + +#if defined(__AVX512F__) + +#include "argon2_core.h" + +#include "blake2/blamka-round-avx512.h" +#include "blake2/blake2-impl.h" +#include "blake2/blake2.h" + +/* + Since the AVX-512 implementation processes v0 and v1 together, these helpers + are used to help un-interleave these values from the state block. +*/ + +/// Packs the lower 256-bit halves of v0 and v1 into a single 512-bit register +static inline __m512i avx512_pack_lower(__m512i v0, __m512i v1) { + return _mm512_inserti64x4( + _mm512_castsi256_si512(_mm512_castsi512_si256(v0)), + _mm512_castsi512_si256(v1), + 1 + ); +} + +/// Packs the upper 256-bit halves of v0 and v1 into a single 512-bit register +static inline __m512i avx512_pack_upper(__m512i v0, __m512i v1) { + return _mm512_inserti64x4( + _mm512_castsi256_si512(_mm512_extracti64x4_epi64(v0, 1)), + _mm512_extracti64x4_epi64(v1, 1), + 1 + ); +} + +static void fill_block(__m512i* state, const block* ref_block, + block* next_block, int with_xor) { + + __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK]; + unsigned int i; + + if (with_xor) { + for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { + // 1. state[i] = state[i] ^ ref_block[i] + state[i] = _mm512_xor_si512( + state[i], + _mm512_loadu_si512((const __m512i*)ref_block->v + i) + ); + + // 2. block_XY[i] = state[i] ^ next_block[i] + block_XY[i] = _mm512_xor_si512( + state[i], + _mm512_loadu_si512((const __m512i*)next_block->v + i) + ); + } + } + else { + for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { + // block_XY[i] = state[i] = state[i] ^ ref_block[i] + block_XY[i] = state[i] = _mm512_xor_si512( + state[i], + _mm512_loadu_si512((const __m512i*)ref_block->v + i) + ); + } + } + + for (i = 0; i < 4; ++i) { + __m512i A = avx512_pack_lower(state[4 * i + 0], state[4 * i + 2]); + __m512i B = avx512_pack_upper(state[4 * i + 0], state[4 * i + 2]); + __m512i C = avx512_pack_lower(state[4 * i + 1], state[4 * i + 3]); + __m512i D = avx512_pack_upper(state[4 * i + 1], state[4 * i + 3]); + + BLAKE2_ROUND_1(A, B, C, D); + + state[4 * i + 0] = avx512_pack_lower(A, B); + state[4 * i + 2] = avx512_pack_upper(A, B); + state[4 * i + 1] = avx512_pack_lower(C, D); + state[4 * i + 3] = avx512_pack_upper(C, D); + } + + for (int j = 0; j < 2; ++j) { + // j=0 handles AVX2 i=0 and i=1 (evens) + // j=1 handles AVX2 i=2 and i=3 (odds) + + __m512i A_0 = avx512_pack_lower(state[j], state[j + 2]); + __m512i B_0 = avx512_pack_lower(state[j + 4], state[j + 6]); + __m512i C_0 = avx512_pack_lower(state[j + 8], state[j + 10]); + __m512i D_0 = avx512_pack_lower(state[j + 12], state[j + 14]); + + __m512i A_1 = avx512_pack_upper(state[j], state[j + 2]); + __m512i B_1 = avx512_pack_upper(state[j + 4], state[j + 6]); + __m512i C_1 = avx512_pack_upper(state[j + 8], state[j + 10]); + __m512i D_1 = avx512_pack_upper(state[j + 12], state[j + 14]); + + BLAKE2_ROUND_2(A_0, B_0, C_0, D_0); + BLAKE2_ROUND_2(A_1, B_1, C_1, D_1); + + state[j] = avx512_pack_lower(A_0, A_1); + state[j + 2] = avx512_pack_upper(A_0, A_1); + state[j + 4] = avx512_pack_lower(B_0, B_1); + state[j + 6] = avx512_pack_upper(B_0, B_1); + state[j + 8] = avx512_pack_lower(C_0, C_1); + state[j + 10] = avx512_pack_upper(C_0, C_1); + state[j + 12] = avx512_pack_lower(D_0, D_1); + state[j + 14] = avx512_pack_upper(D_0, D_1); + } + + for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { + state[i] = _mm512_xor_si512(state[i], block_XY[i]); + _mm512_storeu_si512((__m512i*)next_block->v + i, state[i]); + } +} + +void randomx_argon2_fill_segment_avx512(const argon2_instance_t* instance, + argon2_position_t position) { + block* ref_block = NULL, * curr_block = NULL; + block address_block, input_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index, i; + __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK]; + + if (instance == NULL) { + return; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* We have already generated the first two blocks */ + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } + else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /* 1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + pseudo_rand = instance->memory[prev_offset].v[0]; + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(state, ref_block, curr_block, 0); + } + else { + if (0 == position.pass) { + fill_block(state, ref_block, curr_block, 0); + } + else { + fill_block(state, ref_block, curr_block, 1); + } + } + } +} + +#endif diff --git a/src/blake2/blamka-round-avx512.h b/src/blake2/blamka-round-avx512.h new file mode 100644 index 00000000..0a540965 --- /dev/null +++ b/src/blake2/blamka-round-avx512.h @@ -0,0 +1,145 @@ +/* +Copyright (c) 2026, NexusXe +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef BLAKE_ROUND_MKA_OPT_H +#define BLAKE_ROUND_MKA_OPT_H + +#include "blake2-impl.h" + +#ifdef __GNUC__ +#include +#else +#include +#endif + +#define rotr32(x) _mm512_ror_epi64((x), 32) +#define rotr24(x) _mm512_ror_epi64((x), 24) +#define rotr16(x) _mm512_ror_epi64((x), 16) +#define rotr63(x) _mm512_ror_epi64((x), 63) + +#define G1_AVX512(A, B, C, D) \ + do { \ + __m512i ml = _mm512_mul_epu32(A, B); \ + ml = _mm512_add_epi64(ml, ml); \ + A = _mm512_add_epi64(A, _mm512_add_epi64(B, ml)); \ + D = _mm512_xor_si512(D, A); \ + D = rotr32(D); \ + \ + ml = _mm512_mul_epu32(C, D); \ + ml = _mm512_add_epi64(ml, ml); \ + C = _mm512_add_epi64(C, _mm512_add_epi64(D, ml)); \ + \ + B = _mm512_xor_si512(B, C); \ + B = rotr24(B); \ + } while((void)0, 0); + +#define G2_AVX512(A, B, C, D) \ +do { \ + __m512i ml = _mm512_mul_epu32(A, B); \ + ml = _mm512_add_epi64(ml, ml); \ + A = _mm512_add_epi64(A, _mm512_add_epi64(B, ml)); \ + D = _mm512_xor_si512(D, A); \ + D = rotr16(D); \ + \ + ml = _mm512_mul_epu32(C, D); \ + ml = _mm512_add_epi64(ml, ml); \ + C = _mm512_add_epi64(C, _mm512_add_epi64(D, ml)); \ + B = _mm512_xor_si512(B, C); \ + B = rotr63(B); \ +} while((void)0, 0); + +#define DIAGONALIZE_1(A, B, C, D) \ + do { \ + B = _mm512_permutex_epi64(B, _MM_SHUFFLE(0, 3, 2, 1)); \ + C = _mm512_permutex_epi64(C, _MM_SHUFFLE(1, 0, 3, 2)); \ + D = _mm512_permutex_epi64(D, _MM_SHUFFLE(2, 1, 0, 3)); \ + } while((void)0, 0); + +#define DIAGONALIZE_2(A, B, C, D) \ + do { \ + __m512i idx_B = _mm512_setr_epi64(1, 4, 3, 6, 5, 0, 7, 2); \ + __m512i idx_C = _mm512_setr_epi64(4, 5, 6, 7, 0, 1, 2, 3); \ + __m512i idx_D = _mm512_setr_epi64(5, 0, 7, 2, 1, 4, 3, 6); \ + \ + B = _mm512_permutexvar_epi64(idx_B, B); \ + C = _mm512_permutexvar_epi64(idx_C, C); \ + D = _mm512_permutexvar_epi64(idx_D, D); \ + } while((void)0, 0); + +#define UNDIAGONALIZE_1(A, B, C, D) \ + do { \ + B = _mm512_permutex_epi64(B, _MM_SHUFFLE(2, 1, 0, 3)); \ + C = _mm512_permutex_epi64(C, _MM_SHUFFLE(1, 0, 3, 2)); \ + D = _mm512_permutex_epi64(D, _MM_SHUFFLE(0, 3, 2, 1)); \ + } while((void)0, 0); + +#define UNDIAGONALIZE_2(A, B, C, D) \ + do { \ + __m512i idx_B = _mm512_setr_epi64(5, 0, 7, 2, 1, 4, 3, 6); \ + __m512i idx_C = _mm512_setr_epi64(4, 5, 6, 7, 0, 1, 2, 3); \ + __m512i idx_D = _mm512_setr_epi64(1, 4, 3, 6, 5, 0, 7, 2); \ + \ + B = _mm512_permutexvar_epi64(idx_B, B); \ + C = _mm512_permutexvar_epi64(idx_C, C); \ + D = _mm512_permutexvar_epi64(idx_D, D); \ + } while((void)0, 0); + +#define BLAKE2_ROUND_1(A, B, C, D) \ + do { \ + G1_AVX512(A, B, C, D) \ + G2_AVX512(A, B, C, D) \ + \ + DIAGONALIZE_1(A, B, C, D) \ + \ + G1_AVX512(A, B, C, D) \ + G2_AVX512(A, B, C, D) \ + \ + UNDIAGONALIZE_1(A, B, C, D) \ + } while((void)0, 0); + +#define BLAKE2_ROUND_2(A, B, C, D) \ + do { \ + G1_AVX512(A, B, C, D) \ + G2_AVX512(A, B, C, D) \ + \ + DIAGONALIZE_2(A, B, C, D) \ + \ + G1_AVX512(A, B, C, D) \ + G2_AVX512(A, B, C, D) \ + \ + UNDIAGONALIZE_2(A, B, C, D) \ + } while((void)0, 0); + +#endif /* BLAKE_ROUND_MKA_OPT_H */ diff --git a/src/cpu.cpp b/src/cpu.cpp index 3178d037..3b2440c1 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -34,11 +34,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(_MSC_VER) #include #define cpuid(info, x) __cpuidex(info, x, 0) - #else //GCC + #else // GCC/Clang #include void cpuid(int info[4], int InfoType) { __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); } + + __attribute__((target("xsave"))) + static inline unsigned long long _xgetbv(unsigned int index) { + unsigned int eax, edx; + __asm__ __volatile__( + "xgetbv" + : "=a"(eax), "=d"(edx) + : "c"(index) + ); + return ((unsigned long long)edx << 32) | eax; + } + #endif + #ifndef _XCR_XFEATURE_ENABLED_MASK + #define _XCR_XFEATURE_ENABLED_MASK 0 #endif #endif @@ -67,17 +81,60 @@ namespace randomx { { #ifdef HAVE_CPUID int info[4]; + bool os_avx512_support = false; + cpuid(info, 0); int nIds = info[0]; if (nIds >= 0x00000001) { cpuid(info, 0x00000001); ssse3_ = (info[2] & (1 << 9)) != 0; aes_ = (info[2] & (1 << 25)) != 0; + + // AVX-512 support + // Must have OSXSAVE enabled + bool osxsave = (info[2] & (1 << 27)) != 0; + if (osxsave) { + /* + Check if OS saves AVX-512 state + Requires XSAVE, which is guaranteed present due to the presence + of the OSXSAVE enabled bit + */ + unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + os_avx512_support = (xcrFeatureMask & 0xE6) == 0xE6; + } } + if (nIds >= 0x00000007) { cpuid(info, 0x00000007); avx2_ = (info[1] & (1 << 5)) != 0; + + if (os_avx512_support) { + /* + AVX-512 is primarily used to decrease instruction cache and + decoder pressure. + + On early Intel implementations of AVX-512, the excessive state + transition penalty makes this implementation detrimental to + performance. + + However, on Ice Lake/Zen 4 and above, there is little to no + state transition penalty for using AVX-512, so using it is + overall beneficial. + + VAES support alongside AVX-512F is used to differentiate "poor" + implementations from "good" ones. + */ + + bool has_vaes = (info[2] & (1 << 9)) != 0; + + if (has_vaes) { + // Only auto-enable AVX-512 if "good" implementation detected + avx512_ = (info[1] & (1 << 16)) != 0; // AVX-512F + } + } } + + #elif defined(__aarch64__) #if defined(HWCAP_AES) long hwcaps = getauxval(AT_HWCAP); diff --git a/src/cpu.hpp b/src/cpu.hpp index 7db03311..21585046 100644 --- a/src/cpu.hpp +++ b/src/cpu.hpp @@ -37,6 +37,8 @@ namespace randomx { inline bool hasAes() const { return aes_; } inline bool hasSsse3() const { return ssse3_; } inline bool hasAvx2() const { return avx2_; } + /// Specifically AVX-512F + inline bool hasAvx512() const { return avx512_; } #ifdef __riscv inline bool hasRVV() const { return rvv_; } inline int getRVV_Length() const { return rvv_length; } @@ -46,6 +48,7 @@ namespace randomx { bool aes_ = false; bool ssse3_ = false; bool avx2_ = false; + bool avx512_ = false; #ifdef __riscv bool rvv_ = false; int rvv_length = 0; diff --git a/src/dataset.hpp b/src/dataset.hpp index 26bc0b28..c84c6e25 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -92,6 +92,9 @@ namespace randomx { void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); inline randomx_argon2_impl* selectArgonImpl(randomx_flags flags) { + if (flags & RANDOMX_FLAG_ARGON2_AVX512) { + return randomx_argon2_impl_avx512(); + } if (flags & RANDOMX_FLAG_ARGON2_AVX2) { return randomx_argon2_impl_avx2(); } diff --git a/src/randomx.cpp b/src/randomx.cpp index 32790289..c09f251e 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -57,6 +57,9 @@ extern "C" { if (HAVE_AES && cpu.hasAes()) { flags |= RANDOMX_FLAG_HARD_AES; } + if (randomx_argon2_impl_avx512() != nullptr && cpu.hasAvx512()) { + flags |= RANDOMX_FLAG_ARGON2_AVX512; + } if (randomx_argon2_impl_avx2() != nullptr && cpu.hasAvx2()) { flags |= RANDOMX_FLAG_ARGON2_AVX2; } diff --git a/src/randomx.h b/src/randomx.h index 63076983..368d91ea 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -50,6 +50,7 @@ typedef enum { RANDOMX_FLAG_ARGON2_AVX2 = 64, RANDOMX_FLAG_ARGON2 = 96, RANDOMX_FLAG_V2 = 128, + RANDOMX_FLAG_ARGON2_AVX512 = 256, } randomx_flags; typedef struct randomx_dataset randomx_dataset; diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index d4e6e8d2..6a9618ff 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -94,6 +94,7 @@ void printUsage(const char* executable) { std::cout << " --seed S seed for cache initialization (default: 0)" << std::endl; std::cout << " --ssse3 use optimized Argon2 for SSSE3 CPUs" << std::endl; std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl; + std::cout << " --avx512 use optimized Argon2 for AVX-512F CPUs" << std::endl; std::cout << " --auto select the best options for the current CPU" << std::endl; std::cout << " --noBatch calculate hashes one by one (default: batch)" << std::endl; std::cout << " --commit calculate commitments instead of hashes (default: hashes)" << std::endl; @@ -152,7 +153,7 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result int main(int argc, char** argv) { bool softAes, miningMode, verificationMode, help, largePages, jit, secure, commit, v2; - bool ssse3, avx2, autoFlags, noBatch; + bool ssse3, avx2, avx512, autoFlags, noBatch; int noncesCount, threadCount, initThreadCount; uint64_t threadAffinity; int32_t seedValue; @@ -175,6 +176,7 @@ int main(int argc, char** argv) { readOption("--secure", argc, argv, secure); readOption("--ssse3", argc, argv, ssse3); readOption("--avx2", argc, argv, avx2); + readOption("--avx512", argc, argv, avx512); readOption("--auto", argc, argv, autoFlags); readOption("--noBatch", argc, argv, noBatch); readOption("--commit", argc, argv, commit); @@ -215,6 +217,9 @@ int main(int argc, char** argv) { if (avx2) { flags |= RANDOMX_FLAG_ARGON2_AVX2; } + if (avx512) { + flags |= RANDOMX_FLAG_ARGON2_AVX512; + } if (!softAes) { flags |= RANDOMX_FLAG_HARD_AES; } @@ -242,7 +247,10 @@ int main(int argc, char** argv) { flags |= RANDOMX_FLAG_V2; } - if (flags & RANDOMX_FLAG_ARGON2_AVX2) { + if (flags & RANDOMX_FLAG_ARGON2_AVX512) { + std::cout << " - Argon2 implementation: AVX-512F" << std::endl; + } + else if (flags & RANDOMX_FLAG_ARGON2_AVX2) { std::cout << " - Argon2 implementation: AVX2" << std::endl; } else if (flags & RANDOMX_FLAG_ARGON2_SSSE3) { diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index dee37d24..b0078630 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -1099,6 +1099,18 @@ int main() { assert(cacheMemory[33554431] == 0x1f47f056d05cd99b); }); + if (cache != nullptr) + randomx_release_cache(cache); + cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_AVX512); + + runTest("Cache initialization: AVX-512F", (flags & RANDOMX_FLAG_ARGON2_AVX512) && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { + initCache("test key 000"); + uint64_t* cacheMemory = (uint64_t*)cache->memory; + assert(cacheMemory[0] == 0x191e0e1d23c02186); + assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1); + assert(cacheMemory[33554431] == 0x1f47f056d05cd99b); + }); + if (cache != nullptr) randomx_release_cache(cache); cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT); diff --git a/vcxproj/randomx-dll.vcxproj b/vcxproj/randomx-dll.vcxproj index 4eaae9be..2976d1d7 100644 --- a/vcxproj/randomx-dll.vcxproj +++ b/vcxproj/randomx-dll.vcxproj @@ -57,6 +57,9 @@ AdvancedVectorExtensions2 + + AdvancedVectorExtensions512 + diff --git a/vcxproj/randomx-dll.vcxproj.filters b/vcxproj/randomx-dll.vcxproj.filters index 5b51f9f7..9ce0786a 100644 --- a/vcxproj/randomx-dll.vcxproj.filters +++ b/vcxproj/randomx-dll.vcxproj.filters @@ -175,6 +175,9 @@ Source Files + + Source Files + Source Files diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index cefdc8fb..40714e7e 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -134,6 +134,9 @@ SET ERRORLEVEL = 0 AdvancedVectorExtensions2 + + AdvancedVectorExtensions512 + @@ -169,6 +172,7 @@ SET ERRORLEVEL = 0 + diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters index 7f055b5b..7e09b801 100644 --- a/vcxproj/randomx.vcxproj.filters +++ b/vcxproj/randomx.vcxproj.filters @@ -84,6 +84,9 @@ Source Files + + Source Files + Source Files @@ -196,6 +199,8 @@ Header Files + + Header Files Header Files