From 3477b1be4ff619745aab300a79199e727f7145e5 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Sat, 23 May 2026 18:34:14 -0500
Subject: [PATCH 01/10] CPU feature detection

Adds AVX-512F feature detection and uses VAES presence alongside to
detect "good" AVX-512 support, present on Ice Lake/Zen 4 and later.

This is to prevent "bad" implementations (specifically early Intel
implementations) from automatically being used.
---
 src/cpu.cpp | 39 +++++++++++++++++++++++++++++++++++++++
 src/cpu.hpp |  3 +++
 2 files changed, 42 insertions(+)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 3178d037..05cf4671 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -67,17 +67,56 @@ namespace randomx {
 	{
 #ifdef HAVE_CPUID
 		int info[4];
+		bool os_avx512_support = false;
+
 		cpuid(info, 0);
 		int nIds = info[0];
 		if (nIds >= 0x00000001) {
 			cpuid(info, 0x00000001);
 			ssse3_ = (info[2] & (1 << 9)) != 0;
 			aes_ = (info[2] & (1 << 25)) != 0;
+
+			// AVX-512 support
+			// Must have OSXSAVE enabled
+			bool osxsave = (info[2] & (1 << 27)) != 0;
+			if (osxsave) {
+				// Check if OS saves AVX-512 state
+				unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+                os_avx512_support = (xcrFeatureMask & 0xE6) == 0xE6;
+			}
 		}
+
 		if (nIds >= 0x00000007) {
 			cpuid(info, 0x00000007);
 			avx2_ = (info[1] & (1 << 5)) != 0;
+
+			if (os_avx512_support) {
+				/*
+				AVX-512 is primarily used to decrease instruction cache and
+				decoder pressure.
+
+				On early Intel implementations of AVX-512, the excessive state
+				transition penalty makes this implementation detrimental to
+				performance.
+
+				However, on Ice Lake/Zen 4 and above, there is little to no
+				state transition penalty for using AVX-512, so using it is
+				overall beneficial.
+
+				VAES support alongside AVX-512F is used to differentiate "poor"
+				implementations from "good" ones.
+				*/
+
+				bool has_vaes = (info[2] & (1 << 9)) != 0;
+
+				if (has_vaes) {
+					// Only auto-enable AVX-512 if "good" implementation detected
+					avx512_ = (info[1] & (1 << 16)) != 0; // AVX-512F
+				}
+			}
 		}
+
+		
 #elif defined(__aarch64__)
 	#if defined(HWCAP_AES)
 		long hwcaps = getauxval(AT_HWCAP);
diff --git a/src/cpu.hpp b/src/cpu.hpp
index 7db03311..21585046 100644
--- a/src/cpu.hpp
+++ b/src/cpu.hpp
@@ -37,6 +37,8 @@ namespace randomx {
 		inline bool hasAes() const { return aes_; }
 		inline bool hasSsse3() const { return ssse3_; }
 		inline bool hasAvx2() const { return avx2_; }
+		/// Specifically AVX-512F
+		inline bool hasAvx512() const { return avx512_; }
 #ifdef __riscv
 		inline bool hasRVV() const { return rvv_; }
 		inline int getRVV_Length() const { return rvv_length; }
@@ -46,6 +48,7 @@ namespace randomx {
 		bool aes_ = false;
 		bool ssse3_ = false;
 		bool avx2_ = false;
+		bool avx512_ = false;
 #ifdef __riscv
 		bool rvv_ = false;
 		int rvv_length = 0;

From 4d68a076504d11317291b97c19b33beafc948c7b Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Sat, 23 May 2026 18:38:11 -0500
Subject: [PATCH 02/10] AVX-512F blamka round implementation

Based on src/blake2/blamka-round-avx2.h
---
 src/blake2/blamka-round-avx512.h | 147 +++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 src/blake2/blamka-round-avx512.h

diff --git a/src/blake2/blamka-round-avx512.h b/src/blake2/blamka-round-avx512.h
new file mode 100644
index 00000000..80acaa76
--- /dev/null
+++ b/src/blake2/blamka-round-avx512.h
@@ -0,0 +1,147 @@
+/*
+Copyright (c) 2026, NexusXe <nex@nexusxe.com>
+Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Original code from Argon2 reference source code package used under CC0 Licence
+ * https://github.com/P-H-C/phc-winner-argon2
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+*/
+
+// Requires avx512f, 
+
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#ifdef __GNUC__
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif
+
+#define rotr32(x)   _mm512_ror_epi64((x), 32)
+#define rotr24(x)   _mm512_ror_epi64((x), 24)
+#define rotr16(x)   _mm512_ror_epi64((x), 16)
+#define rotr63(x)   _mm512_ror_epi64((x), 63)
+
+#define G1_AVX512(A, B, C, D) \
+    do { \
+        __m512i ml = _mm512_mul_epu32(A, B); \
+        ml = _mm512_add_epi64(ml, ml); \
+        A = _mm512_add_epi64(A, _mm512_add_epi64(B, ml)); \
+        D = _mm512_xor_si512(D, A); \
+        D = rotr32(D); \
+        \
+        ml = _mm512_mul_epu32(C, D); \
+        ml = _mm512_add_epi64(ml, ml); \
+        C = _mm512_add_epi64(C, _mm512_add_epi64(D, ml)); \
+        \
+        B = _mm512_xor_si512(B, C); \
+        B = rotr24(B); \
+    } while((void)0, 0);
+
+#define G2_AVX512(A, B, C, D) \
+do { \
+    __m512i ml = _mm512_mul_epu32(A, B); \
+    ml = _mm512_add_epi64(ml, ml); \
+    A = _mm512_add_epi64(A, _mm512_add_epi64(B, ml)); \
+    D = _mm512_xor_si512(D, A); \
+    D = rotr16(D); \
+    \
+    ml = _mm512_mul_epu32(C, D); \
+    ml = _mm512_add_epi64(ml, ml); \
+    C = _mm512_add_epi64(C, _mm512_add_epi64(D, ml)); \
+    B = _mm512_xor_si512(B, C); \
+    B = rotr63(B); \
+} while((void)0, 0);
+
+#define DIAGONALIZE_1(A, B, C, D) \
+    do { \
+        B = _mm512_permutex_epi64(B, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C = _mm512_permutex_epi64(C, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D = _mm512_permutex_epi64(D, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_2(A, B, C, D) \
+    do { \
+        __m512i idx_B = _mm512_setr_epi64(1, 4, 3, 6, 5, 0, 7, 2); \
+        __m512i idx_C = _mm512_setr_epi64(4, 5, 6, 7, 0, 1, 2, 3); \
+        __m512i idx_D = _mm512_setr_epi64(5, 0, 7, 2, 1, 4, 3, 6); \
+        \
+        B = _mm512_permutexvar_epi64(idx_B, B); \
+        C = _mm512_permutexvar_epi64(idx_C, C); \
+        D = _mm512_permutexvar_epi64(idx_D, D); \
+    } while((void)0, 0);
+
+#define UNDIAGONALIZE_1(A, B, C, D) \
+    do { \
+        B = _mm512_permutex_epi64(B, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C = _mm512_permutex_epi64(C, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D = _mm512_permutex_epi64(D, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while((void)0, 0);
+
+#define UNDIAGONALIZE_2(A, B, C, D) \
+    do { \
+        __m512i idx_B = _mm512_setr_epi64(5, 0, 7, 2, 1, 4, 3, 6); \
+        __m512i idx_C = _mm512_setr_epi64(4, 5, 6, 7, 0, 1, 2, 3); \
+        __m512i idx_D = _mm512_setr_epi64(1, 4, 3, 6, 5, 0, 7, 2); \
+        \
+        B = _mm512_permutexvar_epi64(idx_B, B); \
+        C = _mm512_permutexvar_epi64(idx_C, C); \
+        D = _mm512_permutexvar_epi64(idx_D, D); \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_1(A, B, C, D) \
+    do { \
+        G1_AVX512(A, B, C, D) \
+        G2_AVX512(A, B, C, D) \
+        \
+        DIAGONALIZE_1(A, B, C, D) \
+        \
+        G1_AVX512(A, B, C, D) \
+        G2_AVX512(A, B, C, D) \
+        \
+        UNDIAGONALIZE_1(A, B, C, D) \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_2(A, B, C, D) \
+    do { \
+        G1_AVX512(A, B, C, D) \
+        G2_AVX512(A, B, C, D) \
+        \
+        DIAGONALIZE_2(A, B, C, D) \
+        \
+        G1_AVX512(A, B, C, D) \
+        G2_AVX512(A, B, C, D) \
+        \
+        UNDIAGONALIZE_2(A, B, C, D) \
+    } while((void)0, 0);
+
+#endif /* BLAKE_ROUND_MKA_OPT_H */

From 11663446aaae3a17629d11e6293830c18e55dca4 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Sat, 23 May 2026 18:39:02 -0500
Subject: [PATCH 03/10] AVX-512F Argon2 implementation

Based on src/argon2_avx2.c
---
 src/argon2.h        |   1 +
 src/argon2_avx512.c | 240 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 241 insertions(+)
 create mode 100644 src/argon2_avx512.c

diff --git a/src/argon2.h b/src/argon2.h
index 9052f42a..ed103dfe 100644
--- a/src/argon2.h
+++ b/src/argon2.h
@@ -255,6 +255,7 @@ void randomx_argon2_fill_segment_ref(const argon2_instance_t* instance,
 
 randomx_argon2_impl *randomx_argon2_impl_ssse3();
 randomx_argon2_impl *randomx_argon2_impl_avx2();
+randomx_argon2_impl *randomx_argon2_impl_avx512();
 
 #if defined(__cplusplus)
 }
diff --git a/src/argon2_avx512.c b/src/argon2_avx512.c
new file mode 100644
index 00000000..78179920
--- /dev/null
+++ b/src/argon2_avx512.c
@@ -0,0 +1,240 @@
+/*
+Copyright (c) 2026, NexusXe <nex@nexusxe.com>
+Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Original code from Argon2 reference source code package used under CC0 Licence
+ * https://github.com/P-H-C/phc-winner-argon2
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "argon2.h"
+
+void randomx_argon2_fill_segment_avx512(const argon2_instance_t* instance,
+	argon2_position_t position);
+
+randomx_argon2_impl* randomx_argon2_impl_avx512() {
+#if defined(__AVX512F__)
+	return &randomx_argon2_fill_segment_avx512;
+#endif
+	return NULL;
+}
+
+#if defined(__AVX512F__)
+
+#include "argon2_core.h"
+
+#include "blake2/blamka-round-avx512.h"
+#include "blake2/blake2-impl.h"
+#include "blake2/blake2.h"
+
+/*
+	Since the AVX-512 implementation processes v0 and v1 together, these helpers
+	are used to help un-interleave these values from the state block.
+*/
+
+/// Packs the lower 256-bit halves of v0 and v1 into a single 512-bit register
+static inline __m512i avx512_pack_lower(__m512i v0, __m512i v1) {
+    return _mm512_inserti64x4(
+        _mm512_castsi256_si512(_mm512_castsi512_si256(v0)), 
+        _mm512_castsi512_si256(v1), 
+        1
+    );
+}
+
+/// Packs the upper 256-bit halves of v0 and v1 into a single 512-bit register
+static inline __m512i avx512_pack_upper(__m512i v0, __m512i v1) {
+    return _mm512_inserti64x4(
+        _mm512_castsi256_si512(_mm512_extracti64x4_epi64(v0, 1)), 
+        _mm512_extracti64x4_epi64(v1, 1), 
+        1
+    );
+}
+
+static void fill_block(__m512i* state, const block* ref_block,
+    block* next_block, int with_xor) {
+    
+    __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            // 1. state[i] = state[i] ^ ref_block[i]
+            state[i] = _mm512_xor_si512(
+                state[i], 
+                _mm512_loadu_si512((const __m512i*)ref_block->v + i)
+            );
+            
+            // 2. block_XY[i] = state[i] ^ next_block[i]
+            block_XY[i] = _mm512_xor_si512(
+                state[i], 
+                _mm512_loadu_si512((const __m512i*)next_block->v + i)
+            );
+        }
+    }
+    else {
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            // block_XY[i] = state[i] = state[i] ^ ref_block[i]
+            block_XY[i] = state[i] = _mm512_xor_si512(
+                state[i], 
+                _mm512_loadu_si512((const __m512i*)ref_block->v + i)
+            );
+        }
+    }
+
+	for (i = 0; i < 4; ++i) {
+        __m512i A = avx512_pack_lower(state[4 * i + 0], state[4 * i + 2]);
+        __m512i B = avx512_pack_upper(state[4 * i + 0], state[4 * i + 2]);
+        __m512i C = avx512_pack_lower(state[4 * i + 1], state[4 * i + 3]);
+        __m512i D = avx512_pack_upper(state[4 * i + 1], state[4 * i + 3]);
+
+        BLAKE2_ROUND_1(A, B, C, D);
+
+        state[4 * i + 0] = avx512_pack_lower(A, B);
+        state[4 * i + 2] = avx512_pack_upper(A, B);
+        state[4 * i + 1] = avx512_pack_lower(C, D);
+        state[4 * i + 3] = avx512_pack_upper(C, D);
+    }
+
+	for (int j = 0; j < 2; ++j) {
+        // j=0 handles AVX2 i=0 and i=1 (evens)
+        // j=1 handles AVX2 i=2 and i=3 (odds)
+        
+        __m512i A_0 = avx512_pack_lower(state[j], state[j + 2]);
+        __m512i B_0 = avx512_pack_lower(state[j + 4], state[j + 6]);
+        __m512i C_0 = avx512_pack_lower(state[j + 8], state[j + 10]);
+        __m512i D_0 = avx512_pack_lower(state[j + 12], state[j + 14]);
+
+        __m512i A_1 = avx512_pack_upper(state[j], state[j + 2]);
+        __m512i B_1 = avx512_pack_upper(state[j + 4], state[j + 6]);
+        __m512i C_1 = avx512_pack_upper(state[j + 8], state[j + 10]);
+        __m512i D_1 = avx512_pack_upper(state[j + 12], state[j + 14]);
+
+        BLAKE2_ROUND_2(A_0, B_0, C_0, D_0);
+        BLAKE2_ROUND_2(A_1, B_1, C_1, D_1);
+
+        state[j]      = avx512_pack_lower(A_0, A_1); 
+        state[j + 2]  = avx512_pack_upper(A_0, A_1);
+        state[j + 4]  = avx512_pack_lower(B_0, B_1);
+        state[j + 6]  = avx512_pack_upper(B_0, B_1);
+        state[j + 8]  = avx512_pack_lower(C_0, C_1);
+        state[j + 10] = avx512_pack_upper(C_0, C_1);
+        state[j + 12] = avx512_pack_lower(D_0, D_1);
+        state[j + 14] = avx512_pack_upper(D_0, D_1);
+    }
+
+	for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+		state[i] = _mm512_xor_si512(state[i], block_XY[i]);
+		_mm512_storeu_si512((__m512i*)next_block->v + i, state[i]);
+	}
+}
+
+void randomx_argon2_fill_segment_avx512(const argon2_instance_t* instance,
+	argon2_position_t position) {
+	block* ref_block = NULL, * curr_block = NULL;
+	block address_block, input_block;
+	uint64_t pseudo_rand, ref_index, ref_lane;
+	uint32_t prev_offset, curr_offset;
+	uint32_t starting_index, i;
+	__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
+	
+	if (instance == NULL) {
+		return;
+	}
+
+	starting_index = 0;
+
+	if ((0 == position.pass) && (0 == position.slice)) {
+		starting_index = 2; /* We have already generated the first two blocks */
+	}
+
+	/* Offset of the current block */
+	curr_offset = position.lane * instance->lane_length +
+		position.slice * instance->segment_length + starting_index;
+
+	if (0 == curr_offset % instance->lane_length) {
+		/* Last block in this lane */
+		prev_offset = curr_offset + instance->lane_length - 1;
+	}
+	else {
+		/* Previous block */
+		prev_offset = curr_offset - 1;
+	}
+
+	memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
+
+	for (i = starting_index; i < instance->segment_length;
+		++i, ++curr_offset, ++prev_offset) {
+		/* 1.1 Rotating prev_offset if needed */
+		if (curr_offset % instance->lane_length == 1) {
+			prev_offset = curr_offset - 1;
+		}
+
+		/* 1.2 Computing the index of the reference block */
+		/* 1.2.1 Taking pseudo-random value from the previous block */
+		pseudo_rand = instance->memory[prev_offset].v[0];
+
+		/* 1.2.2 Computing the lane of the reference block */
+		ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
+
+		if ((position.pass == 0) && (position.slice == 0)) {
+			/* Can not reference other lanes yet */
+			ref_lane = position.lane;
+		}
+
+		/* 1.2.3 Computing the number of possible reference block within the
+		* lane.
+		*/
+		position.index = i;
+		ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
+			ref_lane == position.lane);
+
+		/* 2 Creating a new block */
+		ref_block =
+			instance->memory + instance->lane_length * ref_lane + ref_index;
+	curr_block = instance->memory + curr_offset;
+		if (ARGON2_VERSION_10 == instance->version) {
+			/* version 1.2.1 and earlier: overwrite, not XOR */
+			fill_block(state, ref_block, curr_block, 0);
+		}
+		else {
+			if (0 == position.pass) {
+				fill_block(state, ref_block, curr_block, 0);
+			}
+			else {
+				fill_block(state, ref_block, curr_block, 1);
+			}
+		}
+	}
+}
+
+#endif

From 33fe8360c1887cd0dd48768b8e57ac8cc8880e68 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Sat, 23 May 2026 18:39:54 -0500
Subject: [PATCH 04/10] Use AVX-512F Argon2

---
 src/dataset.hpp | 3 +++
 src/randomx.cpp | 3 +++
 src/randomx.h   | 1 +
 3 files changed, 7 insertions(+)

diff --git a/src/dataset.hpp b/src/dataset.hpp
index 26bc0b28..c84c6e25 100644
--- a/src/dataset.hpp
+++ b/src/dataset.hpp
@@ -92,6 +92,9 @@ namespace randomx {
 	void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock);
 
 	inline randomx_argon2_impl* selectArgonImpl(randomx_flags flags) {
+		if (flags & RANDOMX_FLAG_ARGON2_AVX512) {
+			return randomx_argon2_impl_avx512();
+		}
 		if (flags & RANDOMX_FLAG_ARGON2_AVX2) {
 			return randomx_argon2_impl_avx2();
 		}
diff --git a/src/randomx.cpp b/src/randomx.cpp
index 32790289..c09f251e 100644
--- a/src/randomx.cpp
+++ b/src/randomx.cpp
@@ -57,6 +57,9 @@ extern "C" {
 		if (HAVE_AES && cpu.hasAes()) {
 			flags |= RANDOMX_FLAG_HARD_AES;
 		}
+		if (randomx_argon2_impl_avx512() != nullptr && cpu.hasAvx512()) {
+    		flags |= RANDOMX_FLAG_ARGON2_AVX512;
+		}
 		if (randomx_argon2_impl_avx2() != nullptr && cpu.hasAvx2()) {
 			flags |= RANDOMX_FLAG_ARGON2_AVX2;
 		}
diff --git a/src/randomx.h b/src/randomx.h
index 63076983..368d91ea 100644
--- a/src/randomx.h
+++ b/src/randomx.h
@@ -50,6 +50,7 @@ typedef enum {
   RANDOMX_FLAG_ARGON2_AVX2 = 64,
   RANDOMX_FLAG_ARGON2 = 96,
   RANDOMX_FLAG_V2 = 128,
+  RANDOMX_FLAG_ARGON2_AVX512 = 256,
 } randomx_flags;
 
 typedef struct randomx_dataset randomx_dataset;

From 1bf1e087b98b02b1c1e7c062231c00c7f53b0ca6 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Sat, 23 May 2026 18:43:27 -0500
Subject: [PATCH 05/10] Add AVX-512F to benchmarks & tests

---
 src/tests/benchmark.cpp | 12 ++++++++++--
 src/tests/tests.cpp     | 12 ++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp
index d4e6e8d2..6a9618ff 100644
--- a/src/tests/benchmark.cpp
+++ b/src/tests/benchmark.cpp
@@ -94,6 +94,7 @@ void printUsage(const char* executable) {
 	std::cout << "  --seed S      seed for cache initialization (default: 0)" << std::endl;
 	std::cout << "  --ssse3       use optimized Argon2 for SSSE3 CPUs" << std::endl;
 	std::cout << "  --avx2        use optimized Argon2 for AVX2 CPUs" << std::endl;
+	std::cout << "  --avx512      use optimized Argon2 for AVX-512F CPUs" << std::endl;
 	std::cout << "  --auto        select the best options for the current CPU" << std::endl;
 	std::cout << "  --noBatch     calculate hashes one by one (default: batch)" << std::endl;
 	std::cout << "  --commit      calculate commitments instead of hashes (default: hashes)" << std::endl;
@@ -152,7 +153,7 @@ void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result
 
 int main(int argc, char** argv) {
 	bool softAes, miningMode, verificationMode, help, largePages, jit, secure, commit, v2;
-	bool ssse3, avx2, autoFlags, noBatch;
+	bool ssse3, avx2, avx512, autoFlags, noBatch;
 	int noncesCount, threadCount, initThreadCount;
 	uint64_t threadAffinity;
 	int32_t seedValue;
@@ -175,6 +176,7 @@ int main(int argc, char** argv) {
 	readOption("--secure", argc, argv, secure);
 	readOption("--ssse3", argc, argv, ssse3);
 	readOption("--avx2", argc, argv, avx2);
+	readOption("--avx512", argc, argv, avx512);
 	readOption("--auto", argc, argv, autoFlags);
 	readOption("--noBatch", argc, argv, noBatch);
 	readOption("--commit", argc, argv, commit);
@@ -215,6 +217,9 @@ int main(int argc, char** argv) {
 		if (avx2) {
 			flags |= RANDOMX_FLAG_ARGON2_AVX2;
 		}
+		if (avx512) {
+			flags |= RANDOMX_FLAG_ARGON2_AVX512;
+		}
 		if (!softAes) {
 			flags |= RANDOMX_FLAG_HARD_AES;
 		}
@@ -242,7 +247,10 @@ int main(int argc, char** argv) {
 		flags |= RANDOMX_FLAG_V2;
 	}
 
-	if (flags & RANDOMX_FLAG_ARGON2_AVX2) {
+	if (flags & RANDOMX_FLAG_ARGON2_AVX512) {
+		std::cout << " - Argon2 implementation: AVX-512F" << std::endl;
+	}
+	else if (flags & RANDOMX_FLAG_ARGON2_AVX2) {
 		std::cout << " - Argon2 implementation: AVX2" << std::endl;
 	}
 	else if (flags & RANDOMX_FLAG_ARGON2_SSSE3) {
diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp
index dee37d24..b0078630 100644
--- a/src/tests/tests.cpp
+++ b/src/tests/tests.cpp
@@ -1099,6 +1099,18 @@ int main() {
 		assert(cacheMemory[33554431] == 0x1f47f056d05cd99b);
 	});
 
+	if (cache != nullptr)
+		randomx_release_cache(cache);
+	cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_AVX512);
+
+	runTest("Cache initialization: AVX-512F", (flags & RANDOMX_FLAG_ARGON2_AVX512) && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() {
+		initCache("test key 000");
+		uint64_t* cacheMemory = (uint64_t*)cache->memory;
+		assert(cacheMemory[0] == 0x191e0e1d23c02186);
+		assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1);
+		assert(cacheMemory[33554431] == 0x1f47f056d05cd99b);
+	});
+
 	if (cache != nullptr)
 		randomx_release_cache(cache);
 	cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);

From 51f8369d7ad4ab85dc4818c2916460a3e1bac954 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Sat, 23 May 2026 18:44:01 -0500
Subject: [PATCH 06/10] Add AVX-512 Argon2 files to MSVC and Clang config files

---
 CMakeLists.txt                      | 6 ++++++
 vcxproj/randomx-dll.vcxproj         | 3 +++
 vcxproj/randomx-dll.vcxproj.filters | 3 +++
 vcxproj/randomx.vcxproj             | 4 ++++
 vcxproj/randomx.vcxproj.filters     | 5 +++++
 5 files changed, 21 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d34e2fe..0547a266 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -35,6 +35,7 @@ src/aes_hash.cpp
 src/argon2_ref.c
 src/argon2_ssse3.c
 src/argon2_avx2.c
+src/argon2_avx512.c
 src/bytecode_machine.cpp
 src/cpu.cpp
 src/dataset.cpp
@@ -107,6 +108,7 @@ if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STRE
     set_property(SOURCE src/jit_compiler_x86_static.asm PROPERTY LANGUAGE ASM_MASM)
 
     set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS /arch:AVX2)
+    set_source_files_properties(src/argon2_avx512.c COMPILE_FLAGS /arch:AVX512)
 
     set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /DRELWITHDEBINFO")
     set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /DRELWITHDEBINFO")
@@ -137,6 +139,10 @@ if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STRE
       if(HAVE_AVX2)
         set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS -mavx2)
       endif()
+      check_c_compiler_flag(-mavx512f HAVE_AVX512F)
+      if(HAVE_AVX512F)
+        set_source_files_properties(src/argon2_avx512.c COMPILE_FLAGS -mavx512f)
+      endif()
     endif()
   endif()
 endif()
diff --git a/vcxproj/randomx-dll.vcxproj b/vcxproj/randomx-dll.vcxproj
index 4eaae9be..2976d1d7 100644
--- a/vcxproj/randomx-dll.vcxproj
+++ b/vcxproj/randomx-dll.vcxproj
@@ -57,6 +57,9 @@
     <ClCompile Include="..\src\argon2_avx2.c">
       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
     </ClCompile>
+    <ClCompile Include="..\src\argon2_avx512.c">
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
+    </ClCompile>
     <ClCompile Include="..\src\argon2_core.c" />
     <ClCompile Include="..\src\argon2_ref.c" />
     <ClCompile Include="..\src\argon2_ssse3.c" />
diff --git a/vcxproj/randomx-dll.vcxproj.filters b/vcxproj/randomx-dll.vcxproj.filters
index 5b51f9f7..9ce0786a 100644
--- a/vcxproj/randomx-dll.vcxproj.filters
+++ b/vcxproj/randomx-dll.vcxproj.filters
@@ -175,6 +175,9 @@
     <ClCompile Include="..\src\argon2_avx2.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\src\argon2_avx512.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\src\argon2_ssse3.c">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj
index cefdc8fb..40714e7e 100644
--- a/vcxproj/randomx.vcxproj
+++ b/vcxproj/randomx.vcxproj
@@ -134,6 +134,9 @@ SET ERRORLEVEL = 0</Command>
     <ClCompile Include="..\src\argon2_avx2.c">
       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
     </ClCompile>
+    <ClCompile Include="..\src\argon2_avx512.c">
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
+    </ClCompile>
     <ClCompile Include="..\src\argon2_core.c" />
     <ClCompile Include="..\src\argon2_ref.c" />
     <ClCompile Include="..\src\argon2_ssse3.c" />
@@ -169,6 +172,7 @@ SET ERRORLEVEL = 0</Command>
     <ClInclude Include="..\src\blake2\blake2-impl.h" />
     <ClInclude Include="..\src\blake2\blake2.h" />
     <ClInclude Include="..\src\blake2\blamka-round-avx2.h" />
+    <ClInclude Include="..\src\blake2\blamka-round-avx512.h" />
     <ClInclude Include="..\src\blake2\blamka-round-ref.h" />
     <ClInclude Include="..\src\blake2\blamka-round-ssse3.h" />
     <ClInclude Include="..\src\blake2\endian.h" />
diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters
index 7f055b5b..7e09b801 100644
--- a/vcxproj/randomx.vcxproj.filters
+++ b/vcxproj/randomx.vcxproj.filters
@@ -84,6 +84,9 @@
     <ClCompile Include="..\src\argon2_avx2.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\src\argon2_avx512.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\src\argon2_ssse3.c">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -196,6 +199,8 @@
     </ClInclude>
     <ClInclude Include="..\src\blake2\blamka-round-avx2.h">
       <Filter>Header Files</Filter>
+    <ClInclude Include="..\src\blake2\blamka-round-avx512.h">
+      <Filter>Header Files</Filter>
     </ClInclude>
     <ClInclude Include="..\src\blake2\blamka-round-ssse3.h">
       <Filter>Header Files</Filter>

From e940c9b2cad394540261eb92d3d0d7d59e35ebba Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Sat, 23 May 2026 18:53:07 -0500
Subject: [PATCH 07/10] Remove old comment

I was unsure if extensions past AVX-512F would be needed, but it turned
out that since the primary data element for this code is a 64-bit
integer, only AVX-512F is needed.
---
 src/blake2/blamka-round-avx512.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/blake2/blamka-round-avx512.h b/src/blake2/blamka-round-avx512.h
index 80acaa76..0a540965 100644
--- a/src/blake2/blamka-round-avx512.h
+++ b/src/blake2/blamka-round-avx512.h
@@ -33,8 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 */
 
-// Requires avx512f, 
-
 #ifndef BLAKE_ROUND_MKA_OPT_H
 #define BLAKE_ROUND_MKA_OPT_H
 

From 679db5d22623349bf91ec0ebae0a11b6360c41ac Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Mon, 25 May 2026 11:02:06 -0500
Subject: [PATCH 08/10] ensure `_XCR_XFEATURE_ENABLED_MASK` gets defined

---
 src/cpu.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 05cf4671..932c2973 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -40,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 			__cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
 		}
 	#endif
+	#ifndef _XCR_XFEATURE_ENABLED_MASK
+		#define _XCR_XFEATURE_ENABLED_MASK 0
+	#endif
 #endif
 
 #if defined(HAVE_HWCAP)

From 9f583baee55a8da0be72065e24013f3b45d30a7f Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Mon, 25 May 2026 11:36:26 -0500
Subject: [PATCH 09/10] specify macro section as for both GCC and Clang

---
 src/cpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 932c2973..e7961fda 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	#if defined(_MSC_VER)
 		#include <intrin.h>
 		#define cpuid(info, x) __cpuidex(info, x, 0)
-	#else //GCC
+	#else // GCC/Clang
 		#include <cpuid.h>
 		void cpuid(int info[4], int InfoType) {
 			__cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);

From e6726b928083ba681f3a5a2ae72e831725f14395 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike@gmail.com>
Date: Mon, 25 May 2026 11:36:52 -0500
Subject: [PATCH 10/10] Use assembly for `_xgetbv` on non-MSVC

GCC/Clang more strictly ensures that the `_xgetbv` macro is only used
when the `XSAVE` target feature is enabled. This project is
(intentionally) built without strict target features, so instead use an
assembly shim that manually uses the intrinsic. Since this is only run
when `OSXSAVE` is enabled (and thus the `XSAVE` feature *must* be
enabled on the host), this is safe.
---
 src/cpu.cpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index e7961fda..3b2440c1 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -39,6 +39,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		void cpuid(int info[4], int InfoType) {
 			__cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
 		}
+
+		__attribute__((target("xsave")))
+		static inline unsigned long long _xgetbv(unsigned int index) {
+			unsigned int eax, edx;
+			__asm__ __volatile__(
+				"xgetbv"
+				: "=a"(eax), "=d"(edx)
+				: "c"(index)
+			);
+			return ((unsigned long long)edx << 32) | eax;
+		}
 	#endif
 	#ifndef _XCR_XFEATURE_ENABLED_MASK
 		#define _XCR_XFEATURE_ENABLED_MASK 0
@@ -83,7 +94,11 @@ namespace randomx {
 			// Must have OSXSAVE enabled
 			bool osxsave = (info[2] & (1 << 27)) != 0;
 			if (osxsave) {
-				// Check if OS saves AVX-512 state
+				/*
+				Check if OS saves AVX-512 state
+				Requires XSAVE, which is guaranteed present due to the presence
+				of the OSXSAVE enabled bit
+				*/
 				unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
                 os_avx512_support = (xcrFeatureMask & 0xE6) == 0xE6;
 			}