vortex-data · 0ax1 · Apr 17, 2026 · Apr 20, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/vortex-cuda/benches/dynamic_dispatch_cuda.rs b/vortex-cuda/benches/dynamic_dispatch_cuda.rs
@@ -123,7 +123,7 @@ struct BenchRunner {
 }
 
 impl BenchRunner {
-    fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &CudaExecutionCtx) -> Self {
+    fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &mut CudaExecutionCtx) -> Self {
         let plan = match DispatchPlan::new(array).vortex_expect("build_dyn_dispatch_plan") {
             DispatchPlan::Fused(plan) => plan,
             _ => unreachable!("encoding not fusable"),
@@ -201,7 +201,7 @@ fn bench_for_bitpacked(c: &mut Criterion) {
                 let mut cuda_ctx =
                     CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-                let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+                let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
                 b.iter_custom(|iters| {
                     let mut total_time = Duration::ZERO;
@@ -246,7 +246,7 @@ fn bench_dict_bp_codes(c: &mut Criterion) {
                 let mut cuda_ctx =
                     CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-                let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+                let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
                 b.iter_custom(|iters| {
                     let mut total_time = Duration::ZERO;
@@ -290,7 +290,7 @@ fn bench_runend(c: &mut Criterion) {
                 let mut cuda_ctx =
                     CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-                let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+                let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
                 b.iter_custom(|iters| {
                     let mut total_time = Duration::ZERO;
@@ -344,7 +344,7 @@ fn bench_dict_bp_codes_bp_for_values(c: &mut Criterion) {
                 let mut cuda_ctx =
                     CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-                let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+                let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
                 b.iter_custom(|iters| {
                     let mut total_time = Duration::ZERO;
@@ -409,7 +409,7 @@ fn bench_alp_for_bitpacked(c: &mut Criterion) {
                 let mut cuda_ctx =
                     CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-                let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+                let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
                 b.iter_custom(|iters| {
                     let mut total_time = Duration::ZERO;

diff --git a/vortex-cuda/kernels/src/bit_unpack_16.cu b/vortex-cuda/kernels/src/bit_unpack_16.cu
@@ -4,148 +4,148 @@
 
 template <int BW>
 __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx, GPUPatches& patches) {
-    __shared__ uint16_t shared_out[1024];
+    __shared__ uint16_t shared_out[FL_CHUNK];
 
     // Step 1: Unpack into shared memory
     #pragma unroll
-    for (int i = 0; i < 2; i++) {
-        _bit_unpack_16_lane<BW>(in, shared_out, reference, thread_idx * 2 + i);
+    for (int i = 0; i < FL_LANES<uint16_t> / 32; i++) {
+        _bit_unpack_16_lane<BW>(in, shared_out, reference, thread_idx * (FL_LANES<uint16_t> / 32) + i);
     }
     __syncwarp();
 
     // Step 2: Apply patches to shared memory in parallel
     PatchesCursor<uint16_t> cursor(patches, blockIdx.x, thread_idx, 32);
     auto patch = cursor.next();
-    while (patch.index != 1024) {
+    while (patch.index != FL_CHUNK) {
         shared_out[patch.index] = patch.value;
         patch = cursor.next();
     }
     __syncwarp();
 
     // Step 3: Copy to global memory
     #pragma unroll
-    for (int i = 0; i < 32; i++) {
+    for (int i = 0; i < FL_CHUNK / 32; i++) {
         auto idx = i * 32 + thread_idx;
         out[idx] = shared_out[idx];
     }
 }
 
 extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 0));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<0>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 1));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<1>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 2));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<2>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 3));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<3>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 4));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<4>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 5));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<5>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 6));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<6>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 7));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<7>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 8));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<8>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 9));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<9>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 10));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<10>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 11));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<11>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 12));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<12>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 13));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<13>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 14));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<14>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 15));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<15>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
     int thread_idx = threadIdx.x;
-    auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint16_t)));
-    auto out = full_out + (blockIdx.x * 1024);
+    auto in = full_in + (blockIdx.x * (FL_LANES<uint16_t> * 16));
+    auto out = full_out + (blockIdx.x * FL_CHUNK);
     _bit_unpack_16_device<16>(in, out, reference, thread_idx, patches);
 }
 
diff --git a/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh
@@ -19,7 +19,7 @@ __device__ void _bit_unpack_16_lane<0>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -59,7 +59,7 @@ __device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -101,7 +101,7 @@ __device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -145,7 +145,7 @@ __device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -191,7 +191,7 @@ __device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -239,7 +239,7 @@ __device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -289,7 +289,7 @@ __device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -341,7 +341,7 @@ __device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -395,7 +395,7 @@ __device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -451,7 +451,7 @@ __device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -509,7 +509,7 @@ __device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -569,7 +569,7 @@ __device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -631,7 +631,7 @@ __device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -695,7 +695,7 @@ __device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -761,7 +761,7 @@ __device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     uint16_t src;
     uint16_t tmp;
     src = in[lane];
@@ -829,7 +829,7 @@ __device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<16>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-    unsigned int LANE_COUNT = 64;
+    constexpr unsigned int LANE_COUNT = FL_LANES<uint16_t>;
     #pragma unroll
     for (int row = 0; row < 16; row++) {
         out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference;