Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
9d3b988
chore: add patches_ptr to BitunpackParams and AlpParams
0ax1 Apr 17, 2026
d6486c0
wip
0ax1 Apr 20, 2026
339e5d0
feat(cuda): wire patches through fused dynamic dispatch
0ax1 Apr 21, 2026
2e4bb3f
perf(cuda): remove shared-memory Stage — inline PatchesCursor avoids …
0ax1 Apr 21, 2026
dd82236
refactor(cuda): extract scatter_patches_to_output helper for ALP outp…
0ax1 Apr 21, 2026
895a198
refactor(cuda): apply patches inline in their respective ops
0ax1 Apr 21, 2026
4bfe76c
refactor(cuda): move ALP patches_ptr to AlpParams — remove alp_patche…
0ax1 Apr 21, 2026
8ac5ddd
test(cuda): add fused dispatch tests for BitPacked and ALP patches
0ax1 Apr 21, 2026
a481cf2
feat(cuda): support sliced arrays with patches in fused dispatch
0ax1 Apr 21, 2026
1bcf3a4
fix(cuda): slice ALP patches with Patches::slice for correct offset m…
0ax1 Apr 21, 2026
90ed644
fix(cuda): use linear patch scan for ALP — fixes sliced ALP patches
0ax1 Apr 21, 2026
069b050
fix(cuda): correct ALP patches for sliced arrays — compare within-chu…
0ax1 Apr 21, 2026
469dad7
fix(cuda): revert to per-value PatchesCursor for ALP — tiles span chu…
0ax1 Apr 21, 2026
26053e5
chore(cuda): clippy fixes, rustfmt (nightly), clang-format
0ax1 Apr 21, 2026
707f688
test(cuda): comprehensive fused dispatch patch tests
0ax1 Apr 21, 2026
19d1e1b
fix(cuda): ALP patches offset>=1024, PatchesCursor overflow, padding UB
0ax1 Apr 21, 2026
163154c
chore: fix clippy truncation warnings in upload_gpu_patches
0ax1 Apr 21, 2026
296b541
style(cuda): replace bare 1024 with FL_CHUNK in patches.cuh, dynamic_…
0ax1 Apr 21, 2026
bd547a0
Merge remote-tracking branch 'origin/develop' into ad/cuda-patches-v2
0ax1 Apr 21, 2026
b9ac775
fix: update BitPacked::encode calls for new ExecutionCtx parameter
0ax1 Apr 21, 2026
19e1256
style: rustfmt (nightly)
0ax1 Apr 21, 2026
dc42095
style(cuda): rename loop variable i to elem_idx in execute_input_stage
0ax1 Apr 21, 2026
c6700ae
docs(cuda): explain patches_ptr placement on union variants vs per-st…
0ax1 Apr 21, 2026
40df862
refactor: tie patches to owning ops, consolidate tests into rstest
0ax1 Apr 21, 2026
8c7987f
refactor: merge upload_patches, uniform match pattern for source/scal…
0ax1 Apr 21, 2026
0f32673
docs: compact patches_ptr placement comment
0ax1 Apr 21, 2026
640d017
docs: compact ALP slice comment
0ax1 Apr 21, 2026
308c4fa
clippy
0ax1 Apr 21, 2026
e3f0aee
fix: remove broken match patterns, clippy fixes, unused import
0ax1 Apr 21, 2026
30d299f
ref: reuse load patches
0ax1 Apr 21, 2026
a187e0c
fix: remove unnecessary mut from cuda_ctx declarations
0ax1 Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions vortex-cuda/benches/dynamic_dispatch_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ struct BenchRunner {
}

impl BenchRunner {
fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &CudaExecutionCtx) -> Self {
fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &mut CudaExecutionCtx) -> Self {
let plan = match DispatchPlan::new(array).vortex_expect("build_dyn_dispatch_plan") {
DispatchPlan::Fused(plan) => plan,
_ => unreachable!("encoding not fusable"),
Expand Down Expand Up @@ -201,7 +201,7 @@ fn bench_for_bitpacked(c: &mut Criterion) {
let mut cuda_ctx =
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");

let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);

b.iter_custom(|iters| {
let mut total_time = Duration::ZERO;
Expand Down Expand Up @@ -246,7 +246,7 @@ fn bench_dict_bp_codes(c: &mut Criterion) {
let mut cuda_ctx =
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");

let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);

b.iter_custom(|iters| {
let mut total_time = Duration::ZERO;
Expand Down Expand Up @@ -290,7 +290,7 @@ fn bench_runend(c: &mut Criterion) {
let mut cuda_ctx =
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");

let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);

b.iter_custom(|iters| {
let mut total_time = Duration::ZERO;
Expand Down Expand Up @@ -344,7 +344,7 @@ fn bench_dict_bp_codes_bp_for_values(c: &mut Criterion) {
let mut cuda_ctx =
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");

let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);

b.iter_custom(|iters| {
let mut total_time = Duration::ZERO;
Expand Down Expand Up @@ -409,7 +409,7 @@ fn bench_alp_for_bitpacked(c: &mut Criterion) {
let mut cuda_ctx =
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");

let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);

b.iter_custom(|iters| {
let mut total_time = Duration::ZERO;
Expand Down
78 changes: 39 additions & 39 deletions vortex-cuda/kernels/src/bit_unpack_16.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,148 +4,148 @@

template <int BW>
__device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx, GPUPatches& patches) {
__shared__ uint16_t shared_out[1024];
__shared__ uint16_t shared_out[FL_CHUNK];

// Step 1: Unpack into shared memory
#pragma unroll
for (int i = 0; i < 2; i++) {
_bit_unpack_16_lane<BW>(in, shared_out, reference, thread_idx * 2 + i);
for (int i = 0; i < FL_LANES<uint16_t> / 32; i++) {
_bit_unpack_16_lane<BW>(in, shared_out, reference, thread_idx * (FL_LANES<uint16_t> / 32) + i);
}
__syncwarp();

// Step 2: Apply patches to shared memory in parallel
PatchesCursor<uint16_t> cursor(patches, blockIdx.x, thread_idx, 32);
auto patch = cursor.next();
while (patch.index != 1024) {
while (patch.index != FL_CHUNK) {
shared_out[patch.index] = patch.value;
patch = cursor.next();
}
__syncwarp();

// Step 3: Copy to global memory
#pragma unroll
for (int i = 0; i < 32; i++) {
for (int i = 0; i < FL_CHUNK / 32; i++) {
auto idx = i * 32 + thread_idx;
out[idx] = shared_out[idx];
}
}

extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 0));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<0>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 1));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<1>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 2));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<2>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 3));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<3>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 4));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<4>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 5));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<5>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 6));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<6>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 7));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<7>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 8));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<8>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 9));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<9>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 10));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<10>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 11));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<11>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 12));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<12>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 13));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<13>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 14));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<14>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 15));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<15>(in, out, reference, thread_idx, patches);
}

extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
int thread_idx = threadIdx.x;
auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint16_t)));
auto out = full_out + (blockIdx.x * 1024);
auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 16));
auto out = full_out + (blockIdx.x * FL_CHUNK);
_bit_unpack_16_device<16>(in, out, reference, thread_idx, patches);
}

32 changes: 16 additions & 16 deletions vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ __device__ void _bit_unpack_16_lane<0>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -59,7 +59,7 @@ __device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -101,7 +101,7 @@ __device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -145,7 +145,7 @@ __device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -191,7 +191,7 @@ __device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -239,7 +239,7 @@ __device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -289,7 +289,7 @@ __device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -341,7 +341,7 @@ __device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -395,7 +395,7 @@ __device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -451,7 +451,7 @@ __device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *

template <>
__device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -509,7 +509,7 @@ __device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t

template <>
__device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -569,7 +569,7 @@ __device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t

template <>
__device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -631,7 +631,7 @@ __device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t

template <>
__device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -695,7 +695,7 @@ __device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t

template <>
__device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -761,7 +761,7 @@ __device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t

template <>
__device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
uint16_t src;
uint16_t tmp;
src = in[lane];
Expand Down Expand Up @@ -829,7 +829,7 @@ __device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t

template <>
__device__ void _bit_unpack_16_lane<16>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
unsigned int LANE_COUNT = 64;
constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
#pragma unroll
for (int row = 0; row < 16; row++) {
out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference;
Expand Down
Loading
Loading