From a678da16ff2e009e3e097734bc8c68ec812fdedc Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 16 Sep 2022 19:27:34 +0000 Subject: [PATCH 01/16] Add support for generating llvm.vp.* intrinsics. This is particularly useful for RISC V, but it may be a simpler, better optimized path, for Halide vector operations in general. Add support for a maximum vector size that might be larger than the native vector size. RISC V vector LMUL support is an example of an architecture supporting this. --- src/CodeGen_LLVM.cpp | 287 ++++++++++++++++++++++++++++++++++-------- src/CodeGen_LLVM.h | 43 ++++++- src/CodeGen_RISCV.cpp | 24 +++- 3 files changed, 295 insertions(+), 59 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 1b1e17326532..56aa4c573a27 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -217,6 +217,7 @@ CodeGen_LLVM::CodeGen_LLVM(const Target &t) inside_atomic_mutex_node(false), emit_atomic_stores(false), + use_llvm_vp_intrinsics(false), destructor_block(nullptr), strict_float(t.has_feature(Target::StrictFloat)), @@ -1536,13 +1537,18 @@ void CodeGen_LLVM::visit(const Add *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - value = builder->CreateFAdd(a, b); + if (!call_vector_predication_intrinsic("fadd", t, nullptr, a, b)) { + value = builder->CreateFAdd(a, b); + } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. + // TODO(zalman): Figure out if vector predication needs to/can work here. 
value = builder->CreateNSWAdd(a, b); } else { - value = builder->CreateAdd(a, b); + if (!call_vector_predication_intrinsic("add", t, nullptr, a, b)) { + value = builder->CreateAdd(a, b); + } } } @@ -1556,13 +1562,18 @@ void CodeGen_LLVM::visit(const Sub *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - value = builder->CreateFSub(a, b); + if (!call_vector_predication_intrinsic("fsub", t, nullptr, a, b)) { + value = builder->CreateFSub(a, b); + } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. + // TODO(zalman): Figure out if vector predication needs to/can work here. value = builder->CreateNSWSub(a, b); } else { - value = builder->CreateSub(a, b); + if (!call_vector_predication_intrinsic("sub", t, nullptr, a, b)) { + value = builder->CreateSub(a, b); + } } } @@ -1580,13 +1591,18 @@ void CodeGen_LLVM::visit(const Mul *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - value = builder->CreateFMul(a, b); + if (!call_vector_predication_intrinsic("fmul", t, nullptr, a, b)) { + value = builder->CreateFMul(a, b); + } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. + // TODO(zalman): Figure out if vector predication needs to/can work here. value = builder->CreateNSWMul(a, b); } else { - value = builder->CreateMul(a, b); + if (!call_vector_predication_intrinsic("mul", t, nullptr, a, b)) { + value = builder->CreateMul(a, b); + } } } @@ -1606,7 +1622,9 @@ void CodeGen_LLVM::visit(const Div *op) { // output hard. 
Value *a = codegen(op->a); Value *b = codegen(op->b); - value = builder->CreateFDiv(a, b); + if (!call_vector_predication_intrinsic("fdiv", t, nullptr, a, b)) { + value = builder->CreateFDiv(a, b); + } } else { value = codegen(lower_int_uint_div(op->a, op->b)); } @@ -1676,9 +1694,13 @@ void CodeGen_LLVM::visit(const EQ *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOEQ(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "oeq")) { + value = builder->CreateFCmpOEQ(a, b); + } } else { - value = builder->CreateICmpEQ(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "eq")) { + value = builder->CreateICmpEQ(a, b); + } } } @@ -1692,9 +1714,13 @@ void CodeGen_LLVM::visit(const NE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpONE(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "one")) { + value = builder->CreateFCmpONE(a, b); + } } else { - value = builder->CreateICmpNE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ne")) { + value = builder->CreateICmpNE(a, b); + } } } @@ -1708,11 +1734,17 @@ void CodeGen_LLVM::visit(const LT *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOLT(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "olt")) { + value = builder->CreateFCmpOLT(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSLT(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "slt")) { + value = builder->CreateICmpSLT(a, b); + } } else { - value = builder->CreateICmpULT(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ult")) { + value = builder->CreateICmpULT(a, b); + } } } @@ -1726,11 +1758,17 @@ void CodeGen_LLVM::visit(const LE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if 
(t.is_float()) { - value = builder->CreateFCmpOLE(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "ole")) { + value = builder->CreateFCmpOLE(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSLE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sle")) { + value = builder->CreateICmpSLE(a, b); + } } else { - value = builder->CreateICmpULE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ule")) { + value = builder->CreateICmpULE(a, b); + } } } @@ -1745,11 +1783,17 @@ void CodeGen_LLVM::visit(const GT *op) { Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOGT(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "ogt")) { + value = builder->CreateFCmpOGT(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSGT(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sgt")) { + value = builder->CreateICmpSGT(a, b); + } } else { - value = builder->CreateICmpUGT(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ugt")) { + value = builder->CreateICmpUGT(a, b); + } } } @@ -1763,11 +1807,17 @@ void CodeGen_LLVM::visit(const GE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOGE(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "oge")) { + value = builder->CreateFCmpOGE(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSGE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sge")) { + value = builder->CreateICmpSGE(a, b); + } } else { - value = builder->CreateICmpUGE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "uge")) { + value = builder->CreateICmpUGE(a, b); + } } } @@ -1778,7 +1828,9 @@ void CodeGen_LLVM::visit(const And *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - value = builder->CreateAnd(a, b); + if 
(!call_vector_predication_intrinsic("and", op->type, nullptr, a, b)) { + value = builder->CreateAnd(a, b); + } } void CodeGen_LLVM::visit(const Or *op) { @@ -1788,19 +1840,25 @@ void CodeGen_LLVM::visit(const Or *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - value = builder->CreateOr(a, b); + if (!call_vector_predication_intrinsic("or", op->type, nullptr, a, b)) { + value = builder->CreateOr(a, b); + } } void CodeGen_LLVM::visit(const Not *op) { Value *a = codegen(op->a); - value = builder->CreateNot(a); + if (!call_vector_predication_intrinsic("not", op->type, nullptr, a)) { + value = builder->CreateNot(a); + } } void CodeGen_LLVM::visit(const Select *op) { Value *cmp = codegen(op->condition); Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); - value = builder->CreateSelect(cmp, a, b); + if (!call_vector_predication_intrinsic("select", op->type, nullptr, a, b, cmp)) { + value = builder->CreateSelect(cmp, a, b); + } } namespace { @@ -1971,7 +2029,7 @@ void CodeGen_LLVM::visit(const Load *op) { llvm::Type *load_type = llvm_type_of(op->type.element_of()); if (ramp && stride && stride->value == 1) { - value = codegen_dense_vector_load(op); + value = codegen_dense_vector_load(op, nullptr); } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { // Try to rewrite strided loads as shuffles of dense loads, // aligned to the stride. 
This makes adjacent strided loads @@ -2026,8 +2084,8 @@ void CodeGen_LLVM::visit(const Load *op) { int lanes_i = std::min(slice_lanes, op->type.lanes() - i); Expr slice_base = simplify(base + load_base_i); - Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, - op->image, op->param, align, nullptr, false); + Value *load_i = codegen_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, + op->image, op->param, align, nullptr, false, nullptr); std::vector constants; for (int j = 0; j < lanes_i; j++) { @@ -2241,8 +2299,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Halide::Type value_type = op->value.type(); Value *val = codegen(op->value); int alignment = value_type.bytes(); - int native_bits = native_vector_bits(); - int native_bytes = native_bits / 8; + int native_bytes = native_vector_bits() / 8; // Boost the alignment if possible, up to the native vector width. ModulusRemainder mod_rem = op->alignment; @@ -2265,7 +2322,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { // For dense vector stores wider than the native vector // width, bust them up into native vectors. 
int store_lanes = value_type.lanes(); - int native_lanes = native_bits / value_type.bits(); + int native_lanes = maximum_vector_bits() / value_type.bits(); for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); @@ -2277,8 +2334,13 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); Value *slice_mask = slice_vector(vpred, i, slice_lanes); - Instruction *store = - builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); + Instruction *store; + if (call_vector_predication_intrinsic("store", value_type.with_lanes(slice_lanes), slice_mask, slice_val, + vec_ptr, nullptr, alignment, ".p0")) { + store = dyn_cast(value); + } else { + store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); + } add_tbaa_metadata(store, op->name, slice_index); } } else { // It's not dense vector store, we need to scalarize it @@ -2322,9 +2384,9 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { } } -llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, - const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, - llvm::Value *vpred, bool slice_to_native) { +llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, + llvm::Value *vpred, bool slice_to_native, llvm::Value *stride) { debug(4) << "Vectorize predicated dense vector load:\n\t" << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n"; @@ -2361,7 +2423,7 @@ llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std // For dense vector loads wider than the native vector // width, bust them up into native vectors int load_lanes = type.lanes(); - int 
native_lanes = slice_to_native ? std::max(1, native_bits / type.bits()) : load_lanes; + int native_lanes = slice_to_native ? std::max(1, maximum_vector_bits() / type.bits()) : load_lanes; vector slices; for (int i = 0; i < load_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, load_lanes - i); @@ -2372,12 +2434,27 @@ llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo()); + Value *slice_mask = (vpred != nullptr) ? slice_vector(vpred, i, slice_lanes) : nullptr; + Instruction *load_inst; - if (vpred != nullptr) { - Value *slice_mask = slice_vector(vpred, i, slice_lanes); - load_inst = builder->CreateMaskedLoad(slice_type, vec_ptr, llvm::Align(align_bytes), slice_mask); + if (stride) { + if (call_vector_predication_intrinsic("strided.load", type.with_lanes(slice_lanes), slice_mask, + vec_ptr, stride, nullptr, align_bytes, ".i64")) { + load_inst = dyn_cast(value); + } else { + internal_error << "Vector predicated strided load should not be requested if not supported.\n"; + } } else { - load_inst = builder->CreateAlignedLoad(slice_type, vec_ptr, llvm::Align(align_bytes)); + if (call_vector_predication_intrinsic("load", type.with_lanes(slice_lanes), slice_mask, + vec_ptr, nullptr, nullptr, align_bytes, ".p0")) { + load_inst = dyn_cast(value); + } else { + if (slice_mask != nullptr) { + load_inst = builder->CreateMaskedLoad(slice_type, vec_ptr, llvm::Align(align_bytes), slice_mask); + } else { + load_inst = builder->CreateAlignedLoad(slice_type, vec_ptr, llvm::Align(align_bytes)); + } + } } add_tbaa_metadata(load_inst, name, slice_index); slices.push_back(load_inst); @@ -2390,14 +2467,22 @@ Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred, b const Ramp *ramp = load->index.as(); internal_assert(ramp && is_const_one(ramp->stride)) << "Should be 
dense vector load\n"; - return codegen_dense_vector_load(load->type, load->name, ramp->base, load->image, load->param, - load->alignment, vpred, slice_to_native); + return codegen_vector_load(load->type, load->name, ramp->base, load->image, load->param, + load->alignment, vpred, slice_to_native, nullptr); } void CodeGen_LLVM::codegen_predicated_load(const Load *op) { const Ramp *ramp = op->index.as(); const IntImm *stride = ramp ? ramp->stride.as() : nullptr; + if (use_llvm_vp_intrinsics && stride) { + Value *vpred = codegen(op->predicate); + Value *llvm_stride = codegen(stride); + value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param, + op->alignment, vpred, true, llvm_stride); + return; + } + if (ramp && is_const_one(ramp->stride)) { // Dense vector load Value *vpred = codegen(op->predicate); value = codegen_dense_vector_load(op, vpred); @@ -2594,27 +2679,37 @@ void CodeGen_LLVM::visit(const Call *op) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateAnd(a, b); + if (!call_vector_predication_intrinsic("and", op->type, nullptr, a, b)) { + value = builder->CreateAnd(a, b); + } } else if (op->is_intrinsic(Call::bitwise_xor)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateXor(a, b); + if (!call_vector_predication_intrinsic("xor", op->type, nullptr, a, b)) { + value = builder->CreateXor(a, b); + } } else if (op->is_intrinsic(Call::bitwise_or)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateOr(a, b); + if (!call_vector_predication_intrinsic("or", op->type, nullptr, a, b)) { + value = builder->CreateOr(a, b); + } } else if (op->is_intrinsic(Call::bitwise_not)) { internal_assert(op->args.size() == 1); Value *a = codegen(op->args[0]); - value = builder->CreateNot(a); + if 
(!call_vector_predication_intrinsic("not", op->type, nullptr, a)) { + value = builder->CreateNot(a); + } } else if (op->is_intrinsic(Call::shift_left)) { internal_assert(op->args.size() == 2); if (op->args[1].type().is_uint()) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateShl(a, b); + if (!call_vector_predication_intrinsic("shl", op->type, nullptr, a, b)) { + value = builder->CreateShl(a, b); + } } else { value = codegen(lower_signed_shift_left(op->args[0], op->args[1])); } @@ -2624,9 +2719,13 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (op->type.is_int()) { - value = builder->CreateAShr(a, b); + if (!call_vector_predication_intrinsic("ashr", op->type, nullptr, a, b)) { + value = builder->CreateAShr(a, b); + } } else { - value = builder->CreateLShr(a, b); + if (!call_vector_predication_intrinsic("lshr", op->type, nullptr, a, b)) { + value = builder->CreateLShr(a, b); + } } } else { value = codegen(lower_signed_shift_right(op->args[0], op->args[1])); @@ -3701,7 +3800,7 @@ void CodeGen_LLVM::visit(const Store *op) { // For dense vector stores wider than the native vector // width, bust them up into native vectors. 
int store_lanes = value_type.lanes(); - int native_lanes = native_bits / value_type.bits(); + int native_lanes = maximum_vector_bits() / value_type.bits(); for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); @@ -4020,7 +4119,7 @@ void CodeGen_LLVM::visit(const VectorReduce *op) { void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { Expr val = op->value; const int output_lanes = op->type.lanes(); - const int native_lanes = native_vector_bits() / op->type.bits(); + const int native_lanes = maximum_vector_bits() / op->type.bits(); const int factor = val.type().lanes() / output_lanes; Type elt = op->type.element_of(); @@ -4570,9 +4669,11 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::FunctionType *intrin_type = intrin->getFunctionType(); for (int i = 0; i < (int)arg_values.size(); i++) { if (arg_values[i]->getType() != intrin_type->getParamType(i)) { + debug(0) << "Normalizing fixed/scalable.\n"; arg_values[i] = normalize_fixed_scalable_vector_type(intrin_type->getParamType(i), arg_values[i]); } if (arg_values[i]->getType() != intrin_type->getParamType(i)) { + debug(0) << "Bit casting type.\n"; // There can be some mismatches in types, such as when passing scalar Halide type T // to LLVM vector type <1 x T>. arg_values[i] = builder->CreateBitCast(arg_values[i], intrin_type->getParamType(i)); @@ -4947,5 +5048,83 @@ llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, return ConstantVector::getSplat(ec, value); } +bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, const Type &result_type, + llvm::Value *mask, // Pass nullptr for constrant true. 
+ llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, + const char *overload_suffix) { + if (!use_llvm_vp_intrinsics || + result_type.is_scalar()) { + return false; + } + + llvm::Type *llvm_result_type = llvm_type_of(result_type); + int32_t length = result_type.lanes(); + const char *type_designator = result_type.is_float() ? "f" : "i"; + std::string type_string = "."; + bool is_scalable = isa(llvm_result_type); + llvm::ElementCount llvm_vector_ec; + if (is_scalable) { + const auto *vt = cast(llvm_result_type); + std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); + llvm_vector_ec = vt->getElementCount(); + type_string = ".nxv" + std::to_string(vt->getMinNumElements()) + type_designator + bits_designator; + } else { + const auto *vt = cast(llvm_result_type); + std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); + llvm_vector_ec = vt->getElementCount(); + type_string = ".v" + std::to_string(vt->getNumElements()) + type_designator + bits_designator; + } + + const char *name_base = (starts_with(name, "strided")) ? "llvm.experimental.vp." 
: "llvm.vp."; + std::string full_name = name_base + name + type_string + overload_suffix; + int arg_count = 3 + (b != nullptr) + (c != nullptr); + std::vector args(arg_count); + size_t i = 0; + + int ptr_index = -1; + if (isa(a->getType())) { + ptr_index = 0; + } + args[i++] = a; + if (b != nullptr) { + args[i++] = b; + if (isa(b->getType())) { + if (ptr_index != -1) { + ptr_index = 1; + } + } + } + if (c != nullptr) { + args[i++] = c; + } + if (mask == nullptr) { + args[i++] = ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1)); + } else { + args[i++] = mask; + } + args[i++] = ConstantInt::get(i32_t, length); + + value = call_intrin(llvm_result_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable); + if (alignment != 0 && ptr_index != -1 && isa(value)) { + llvm::CallInst *call = dyn_cast(value); + call->addParamAttr(ptr_index, Attribute::getWithAlignment(*context, llvm::Align(alignment))); + } + return true; +} + +bool CodeGen_LLVM::call_vector_predication_comparison(const std::string &name, const Type &result_type, + llvm::Value *mask, // Pass nullptr for constrant true. + llvm::Value *a, llvm::Value *b, const char *cmp_op) { + // Early out to prevent creating useless metadata. + if (!use_llvm_vp_intrinsics || + result_type.is_scalar()) { + return false; + } + + llvm::MDBuilder builder(*context); + llvm::Value *md_val = llvm::MetadataAsValue::get(*context, builder.createString(cmp_op)); + return call_vector_predication_intrinsic(name, result_type, mask, a, b, md_val); +} + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index d6ee5b26adff..c2f5d5d363eb 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -136,6 +136,11 @@ class CodeGen_LLVM : public IRVisitor { /** What's the natural vector bit-width to use for loads, stores, etc. */ virtual int native_vector_bits() const = 0; + /** Used to decide whether to break a vector up into multiple smaller + * operations. 
This is the largest size the architecture supports. */ + virtual int maximum_vector_bits() const { + return native_vector_bits(); + } /** For architectures that have vscale vectors, return the constant vscale to use. * Default of 0 means do not use vscale vectors. Generally will depend on * the target flags and vector_bits settings. @@ -557,6 +562,38 @@ class CodeGen_LLVM : public IRVisitor { llvm::Constant *get_splat(int lanes, llvm::Constant *value, VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; + /** Call an "@llvm.vp.*" intrinsic, forming the full overloaded name and argument list. + * A key detail here is that the length of the vector operation is taken from the + * Halide type while the size of the LLVM vector type used (fixed or scalable) is taken + * from the LLVM promotion of the vector type, which should be the same as the types used + * in the arguments. These can be different. It may become useful to pass an explict + * length as well. + * + * The method is virtual to allow backends to extend this for architecture specific + * intrinsics. (E.g. RISC V LMUL.) Unfortunately, this involves matching the name as + * as string to do much. (TODO(zalman): decide if this is the right way to go based + * on LMUL experiment. Really LLVM ought to do this automatically for these intrinsics + * on larger lengths.) + * + * The name is the simple name like "add" for "@llvm.vp.add.v16i32". + * If mask is nullptr, it is provided as constant true. + * If b or c is nullptr, it is assumed to be a unary or binary operator respectively. + * + * Assigns result of vp intrinsic to value and returns true if it an instuction is generated, + * otherwise returns false. 
+ */ + virtual bool call_vector_predication_intrinsic(const std::string &name, const Type &result_type, + llvm::Value *mask, llvm::Value *a, llvm::Value *b = nullptr, + llvm::Value *c = nullptr, int alignment = 0, + const char *overload_suffix = ""); + + virtual bool call_vector_predication_comparison(const std::string &name, const Type &result_type, + llvm::Value *mask, // Pass nullptr for constrant true. + llvm::Value *a, llvm::Value *b, const char *cmp_op); + + /** Controls use of vector predicated intrinsics for vector operations. */ + bool use_llvm_vp_intrinsics; + private: /** All the values in scope at the current code location during * codegen. Use sym_push and sym_pop to access. */ @@ -598,9 +635,9 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv, std::vector &arg_is_buffer); - llvm::Value *codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, - const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, - llvm::Value *vpred = nullptr, bool slice_to_native = true); + llvm::Value *codegen_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, + llvm::Value *vpred = nullptr, bool slice_to_native = true, llvm::Value *stride = nullptr); llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true); virtual void codegen_predicated_load(const Load *op); diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index 434105724c3a..c4ccf3e60850 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -25,10 +25,13 @@ class CodeGen_RISCV : public CodeGen_Posix { string mabi() const override; bool use_soft_float_abi() const override; int native_vector_bits() const override; + int maximum_vector_bits() const override; + int target_vscale() const override; }; 
CodeGen_RISCV::CodeGen_RISCV(const Target &t) : CodeGen_Posix(t) { + use_llvm_vp_intrinsics = true; } string CodeGen_RISCV::mcpu_target() const { @@ -50,7 +53,7 @@ string CodeGen_RISCV::mattrs() const { string arch_flags = "+m,+a,+f,+d,+c"; if (target.has_feature(Target::RVV)) { - arch_flags += ",+experimental-v"; + arch_flags += ",+v"; } return arch_flags; } @@ -73,7 +76,24 @@ bool CodeGen_RISCV::use_soft_float_abi() const { } int CodeGen_RISCV::native_vector_bits() const { - return 128; + if (target.vector_bits != 0 && + target.has_feature(Target::RVV)) { + return target.vector_bits; + } + return 0; +} + +int CodeGen_RISCV::maximum_vector_bits() const { + return native_vector_bits() * 8; +} + +int CodeGen_RISCV::target_vscale() const { + if (target.vector_bits != 0 && + target.has_feature(Target::RVV)) { + internal_assert((target.vector_bits % 64) == 0); + return target.vector_bits / 64; + } + return 0; } } // namespace From 541833cfa6e089140d3772c7f595bf387e11be7a Mon Sep 17 00:00:00 2001 From: Z Stern Date: Thu, 22 Sep 2022 05:09:56 +0000 Subject: [PATCH 02/16] Add vector predicated store support. --- src/CodeGen_LLVM.cpp | 107 ++++++++++++++++++++++++++++++++----------- src/CodeGen_LLVM.h | 6 ++- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 56aa4c573a27..f0ebd8263248 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1930,7 +1930,13 @@ Value *CodeGen_LLVM::codegen_buffer_pointer(Value *base_address, Halide::Type ty // Promote index to 64-bit on targets that use 64-bit pointers. 
llvm::DataLayout d(module.get()); if (d.getPointerSize() == 8) { - index = builder->CreateIntCast(index, i64_t, true); + llvm::Type *index_type = index->getType(); + llvm::Type *desired_index_type = i64_t; + if (isa(index_type)) { + desired_index_type = VectorType::get(desired_index_type, + dyn_cast(index_type)->getElementCount()); + } + index = builder->CreateIntCast(index, desired_index_type, true); } return CreateInBoundsGEP(builder, load_type, base_address, index); @@ -2336,7 +2342,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *slice_mask = slice_vector(vpred, i, slice_lanes); Instruction *store; if (call_vector_predication_intrinsic("store", value_type.with_lanes(slice_lanes), slice_mask, slice_val, - vec_ptr, nullptr, alignment, ".p0")) { + vec_ptr, nullptr, alignment, ".p0", true)) { store = dyn_cast(value); } else { store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); @@ -3774,7 +3780,11 @@ void CodeGen_LLVM::visit(const Store *op) { } else { int alignment = value_type.bytes(); const Ramp *ramp = op->index.as(); - if (ramp && is_const_one(ramp->stride)) { + // TODO(zalman): consider splitting out vector predication path. Current + // code shows how vector predication would simplify things as the + // following scalarization cases would go away. + bool is_dense = ramp && is_const_one(ramp->stride); + if (use_llvm_vp_intrinsics || is_dense) { int native_bits = native_vector_bits(); int native_bytes = native_bits / 8; @@ -3802,16 +3812,41 @@ void CodeGen_LLVM::visit(const Store *op) { int store_lanes = value_type.lanes(); int native_lanes = maximum_vector_bits() / value_type.bits(); + Expr base = (ramp != nullptr) ? ramp->base : 0; + Expr stride = (ramp != nullptr) ? ramp->stride : 0; + Value *stride_val = (!is_dense && ramp != nullptr) ? codegen(stride) : nullptr; + + Value *index = (ramp == nullptr) ? 
codegen(op->index) : nullptr; + for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); - Expr slice_base = simplify(ramp->base + i); + Expr slice_base = simplify(base + i * stride); Expr slice_stride = make_one(slice_base.type()); Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes); Value *slice_val = slice_vector(val, i, slice_lanes); Value *elt_ptr = codegen_buffer_pointer(op->name, value_type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); - StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); - annotate_store(store, slice_index); + if (is_dense || slice_lanes == 1) { + if (call_vector_predication_intrinsic("store", value_type.with_lanes(slice_lanes), nullptr, slice_val, + vec_ptr, nullptr, alignment, ".p0", true)) { + add_tbaa_metadata(dyn_cast(value), op->name, slice_index); + } else { + StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); + annotate_store(store, slice_index); + } + } else if (ramp != nullptr) { + bool generated = call_vector_predication_intrinsic("strided.store", value_type.with_lanes(slice_lanes), nullptr, slice_val, + vec_ptr, stride_val, alignment, ".i64", true); + internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; + add_tbaa_metadata(dyn_cast(value), op->name, slice_index); + } else { + Value *slice_index = slice_vector(index, i, slice_lanes); + Value *vec_ptrs = codegen_buffer_pointer(op->name, value_type, slice_index); + bool generated = call_vector_predication_intrinsic("scatter", value_type.with_lanes(slice_lanes), nullptr, slice_val, + vec_ptrs, nullptr, alignment, mangle_llvm_vector_type(vec_ptrs->getType()), true); + + internal_assert(generated) << "Using vector predicated intrinsics, but code generation 
was not successful for gathering store.\n"; + } } } else if (ramp) { Type ptr_type = value_type.element_of(); @@ -4598,7 +4633,9 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::Type *intrinsic_result_type = result_type->getScalarType(); if (intrin_lanes > 1) { - if (scalable_vector_result && effective_vscale != 0) { + if (result_type == void_t) { + intrinsic_result_type = void_t; + } else if (scalable_vector_result && effective_vscale != 0) { intrinsic_result_type = get_vector_type(result_type->getScalarType(), intrin_lanes / effective_vscale, VectorTypeConstraint::VScale); } else { @@ -4618,7 +4655,9 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::Function *intrin, vector arg_values) { internal_assert(intrin); int arg_lanes = 1; - if (result_type->isVectorTy()) { + if (result_type == void_t) { + arg_lanes = intrin_lanes; + } else if (result_type->isVectorTy()) { arg_lanes = get_vector_num_elements(result_type); } @@ -4657,7 +4696,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes } } - llvm::Type *result_slice_type = + llvm::Type *result_slice_type = (result_type == void_t) ? void_t : get_vector_type(result_type->getScalarType(), intrin_lanes); results.push_back(call_intrin(result_slice_type, intrin_lanes, intrin, args)); @@ -5048,35 +5087,41 @@ llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, return ConstantVector::getSplat(ec, value); } -bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, const Type &result_type, - llvm::Value *mask, // Pass nullptr for constrant true. 
- llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, - const char *overload_suffix) { - if (!use_llvm_vp_intrinsics || - result_type.is_scalar()) { - return false; - } - - llvm::Type *llvm_result_type = llvm_type_of(result_type); - int32_t length = result_type.lanes(); - const char *type_designator = result_type.is_float() ? "f" : "i"; +std::string CodeGen_LLVM::mangle_llvm_vector_type(llvm::Type *type) { std::string type_string = "."; - bool is_scalable = isa(llvm_result_type); + bool is_scalable = isa(type); llvm::ElementCount llvm_vector_ec; if (is_scalable) { - const auto *vt = cast(llvm_result_type); + const auto *vt = cast(type); + const char *type_designator = vt->getElementType()->isIntegerTy() ? "i" : "f"; std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); llvm_vector_ec = vt->getElementCount(); type_string = ".nxv" + std::to_string(vt->getMinNumElements()) + type_designator + bits_designator; } else { - const auto *vt = cast(llvm_result_type); + const auto *vt = cast(type); + const char *type_designator = vt->getElementType()->isIntegerTy() ? "i" : "f"; std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); llvm_vector_ec = vt->getElementCount(); type_string = ".v" + std::to_string(vt->getNumElements()) + type_designator + bits_designator; } + return type_string; +} + +bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, const Type &result_type, + llvm::Value *mask, // Pass nullptr for constrant true. + llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, + const std::string &overload_suffix, + bool void_return) { + if (!use_llvm_vp_intrinsics || + result_type.is_scalar()) { + return false; + } + + llvm::Type *llvm_result_type = llvm_type_of(result_type); + int32_t length = result_type.lanes(); const char *name_base = (starts_with(name, "strided")) ? "llvm.experimental.vp." 
: "llvm.vp."; - std::string full_name = name_base + name + type_string + overload_suffix; + std::string full_name = name_base + name + mangle_llvm_vector_type(llvm_result_type) + overload_suffix; int arg_count = 3 + (b != nullptr) + (c != nullptr); std::vector args(arg_count); size_t i = 0; @@ -5097,14 +5142,24 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co if (c != nullptr) { args[i++] = c; } + bool is_scalable = isa(llvm_result_type); if (mask == nullptr) { + llvm::ElementCount llvm_vector_ec; + if (is_scalable) { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } else { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } + args[i++] = ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1)); } else { args[i++] = mask; } args[i++] = ConstantInt::get(i32_t, length); - value = call_intrin(llvm_result_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable); + value = call_intrin(void_return ? void_t : llvm_result_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable); if (alignment != 0 && ptr_index != -1 && isa(value)) { llvm::CallInst *call = dyn_cast(value); call->addParamAttr(ptr_index, Attribute::getWithAlignment(*context, llvm::Align(alignment))); diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index c2f5d5d363eb..5ca5e985f39c 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -307,6 +307,9 @@ class CodeGen_LLVM : public IRVisitor { llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, llvm::Value *index); // @} + /** Return an appropriate type string for a type which is of VectorType. 
*/ + std::string mangle_llvm_vector_type(llvm::Type *type); + /** Turn a Halide Type into an llvm::Value representing a constant halide_type_t */ llvm::Value *make_halide_type_t(const Type &); @@ -585,7 +588,8 @@ class CodeGen_LLVM : public IRVisitor { virtual bool call_vector_predication_intrinsic(const std::string &name, const Type &result_type, llvm::Value *mask, llvm::Value *a, llvm::Value *b = nullptr, llvm::Value *c = nullptr, int alignment = 0, - const char *overload_suffix = ""); + const std::string &overload_suffix = "", + bool void_return = false); virtual bool call_vector_predication_comparison(const std::string &name, const Type &result_type, llvm::Value *mask, // Pass nullptr for constrant true. From cab2f01dee84dcef43682fad9a113de2c5fe84eb Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 23 Sep 2022 23:40:59 +0000 Subject: [PATCH 03/16] Change how void type is handled with call_intrin, other vector promotion contexts. --- src/CodeGen_LLVM.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a4b793dc82ad..a6b2869955a4 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2451,7 +2451,7 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri Value *slice_mask = (vpred != nullptr) ? 
slice_vector(vpred, i, slice_lanes) : nullptr; - Instruction *load_inst; + Instruction *load_inst = nullptr; if (stride) { if (call_vector_predication_intrinsic("strided.load", type.with_lanes(slice_lanes), slice_mask, vec_ptr, stride, nullptr, align_bytes, ".i64")) { @@ -4642,9 +4642,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::Type *intrinsic_result_type = result_type->getScalarType(); if (intrin_lanes > 1) { - if (result_type == void_t) { - intrinsic_result_type = void_t; - } else if (scalable_vector_result && effective_vscale != 0) { + if (scalable_vector_result && effective_vscale != 0) { intrinsic_result_type = get_vector_type(result_type->getScalarType(), intrin_lanes / effective_vscale, VectorTypeConstraint::VScale); } else { @@ -4705,7 +4703,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes } } - llvm::Type *result_slice_type = (result_type == void_t) ? void_t : + llvm::Type *result_slice_type = get_vector_type(result_type->getScalarType(), intrin_lanes); results.push_back(call_intrin(result_slice_type, intrin_lanes, intrin, args)); @@ -5053,6 +5051,10 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n, VectorTypeConstraint type_constraint) const { bool scalable; + if (t->isVoidTy()) { + return t; + } + switch (type_constraint) { case VectorTypeConstraint::None: scalable = effective_vscale != 0 && From a6a0ba90b7328377e7ecca71a3ed00c5c9181130 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Wed, 5 Oct 2022 22:55:02 +0000 Subject: [PATCH 04/16] Fix a few issues with types, order of arguments and name mangling in vector predication intrinsics. 
--- src/CodeGen_LLVM.cpp | 50 +++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 8617c4660d1e..9bb230d1ef65 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1863,9 +1863,15 @@ void CodeGen_LLVM::visit(const Not *op) { void CodeGen_LLVM::visit(const Select *op) { Value *cmp = codegen(op->condition); + if (use_llvm_vp_intrinsics && + op->type.is_vector() && + op->condition.type().is_scalar()) { + cmp = create_broadcast(cmp, op->type.lanes()); + } + Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); - if (!call_vector_predication_intrinsic("select", op->type, nullptr, a, b, cmp)) { + if (!call_vector_predication_intrinsic("select", op->type, nullptr, cmp, a, b)) { value = builder->CreateSelect(cmp, a, b); } } @@ -2453,8 +2459,13 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri Instruction *load_inst = nullptr; if (stride) { + if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) { + stride = builder->CreateIntCast(stride, i64_t, true); + } + const char *mangle = stride->getType()->isIntegerTy(64) ? ".p0.i64" : ".p0.i32"; + if (call_vector_predication_intrinsic("strided.load", type.with_lanes(slice_lanes), slice_mask, - vec_ptr, stride, nullptr, align_bytes, ".i64")) { + vec_ptr, stride, nullptr, align_bytes, mangle)) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; @@ -3846,8 +3857,12 @@ void CodeGen_LLVM::visit(const Store *op) { annotate_store(store, slice_index); } } else if (ramp != nullptr) { + if (get_target().bits == 64 && !stride_val->getType()->isIntegerTy(64)) { + stride_val = builder->CreateIntCast(stride_val, i64_t, true); + } + const char *mangle = stride_val->getType()->isIntegerTy(64) ? 
".p0.i64" : ".p0.i32"; bool generated = call_vector_predication_intrinsic("strided.store", value_type.with_lanes(slice_lanes), nullptr, slice_val, - vec_ptr, stride_val, alignment, ".i64", true); + vec_ptr, stride_val, alignment, mangle, true); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { @@ -5133,9 +5148,12 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co llvm::Type *llvm_result_type = llvm_type_of(result_type); int32_t length = result_type.lanes(); + // TODO(zvookin): Fix the interface here to not case on names. + bool no_mask = name == "select"; const char *name_base = (starts_with(name, "strided")) ? "llvm.experimental.vp." : "llvm.vp."; + std::string full_name = name_base + name + mangle_llvm_vector_type(llvm_result_type) + overload_suffix; - int arg_count = 3 + (b != nullptr) + (c != nullptr); + int arg_count = 2 + !no_mask + (b != nullptr) + (c != nullptr); std::vector args(arg_count); size_t i = 0; @@ -5156,19 +5174,21 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co args[i++] = c; } bool is_scalable = isa(llvm_result_type); - if (mask == nullptr) { - llvm::ElementCount llvm_vector_ec; - if (is_scalable) { - const auto *vt = cast(llvm_result_type); - llvm_vector_ec = vt->getElementCount(); + if (!no_mask) { + if (mask == nullptr) { + llvm::ElementCount llvm_vector_ec; + if (is_scalable) { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } else { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } + + args[i++] = ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1)); } else { - const auto *vt = cast(llvm_result_type); - llvm_vector_ec = vt->getElementCount(); + args[i++] = mask; } - - args[i++] = ConstantVector::getSplat(llvm_vector_ec, 
ConstantInt::get(i1_t, 1)); - } else { - args[i++] = mask; } args[i++] = ConstantInt::get(i32_t, length); From ab2a68f1c7cd2bc5f66c4ead0e09e0f55b74250e Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 7 Oct 2022 01:36:59 +0000 Subject: [PATCH 05/16] Add support for using @llvm.vp.reduce.* intrinsics in vector reductions. --- src/CodeGen_LLVM.cpp | 145 ++++++++++++++++++++++++++++--------------- src/CodeGen_LLVM.h | 7 ++- 2 files changed, 98 insertions(+), 54 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 9bb230d1ef65..57dadf2be3ba 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4253,45 +4253,53 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini op->op == VectorReduce::Mul || op->op == VectorReduce::Min || op->op == VectorReduce::Max) && - // Must be a power of two lanes - (input_lanes >= 2) && - ((input_lanes & (input_lanes - 1)) == 0) && - // int versions exist up to 1024 bits - ((!op->type.is_float() && input_bytes <= 1024) || - // float versions exist up to 16 lanes - input_lanes <= 16) && - // As of the release of llvm 10, the 64-bit experimental total - // reductions don't seem to be done yet on arm. - (val.type().bits() != 64 || - target.arch != Target::ARM)); + (use_llvm_vp_intrinsics || + // Must be a power of two lanes + ((input_lanes >= 2) && + ((input_lanes & (input_lanes - 1)) == 0) && + // int versions exist up to 1024 bits + ((!op->type.is_float() && input_bytes <= 1024) || + // float versions exist up to 16 lanes + input_lanes <= 16) && + // As of the release of llvm 10, the 64-bit experimental total + // reductions don't seem to be done yet on arm. 
+ (val.type().bits() != 64 || + target.arch != Target::ARM)))); if (llvm_has_intrinsic) { - std::stringstream name; - name << "llvm.vector.reduce."; + const char *name = ""; const int bits = op->type.bits(); - bool takes_initial_value = false; + bool takes_initial_value = use_llvm_vp_intrinsics; Expr initial_value = init; if (op->type.is_float()) { switch (op->op) { case VectorReduce::Add: - name << "fadd"; + name = "fadd"; takes_initial_value = true; if (!initial_value.defined()) { initial_value = make_zero(op->type); } break; case VectorReduce::Mul: - name << "fmul"; + name = "fmul"; takes_initial_value = true; if (!initial_value.defined()) { initial_value = make_one(op->type); } break; case VectorReduce::Min: - name << "fmin"; + name = "fmin"; + // TODO(zvookin): For signed case, whether this is Inf or the max floating-point value depends on strict_float. (Or maybe it is QNaN in strict_float.) + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.max(); + } break; case VectorReduce::Max: - name << "fmax"; + name = "fmax"; + // TODO(zvookin): For signed case, whether this is -Inf or the min floating-point value depends on strict_float. (Or maybe it is -QNaN in strict_float.) + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.min(); + } break; default: break; @@ -4299,55 +4307,82 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } else if (op->type.is_int() || op->type.is_uint()) { switch (op->op) { case VectorReduce::Add: - name << "add"; + name = "add"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = make_zero(op->type); + } break; case VectorReduce::Mul: - name << "mul"; + name = "mul"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = make_one(op->type); + } break; case VectorReduce::Min: - name << (op->type.is_int() ? 's' : 'u') << "min"; + name = op->type.is_int() ? 
"smin" : "umin"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.max(); + } break; case VectorReduce::Max: - name << (op->type.is_int() ? 's' : 'u') << "max"; + name = op->type.is_int() ? "smax" : "umax"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.min(); + } break; default: break; } } - name << ".v" << val.type().lanes() << (op->type.is_float() ? 'f' : 'i') << bits; - string intrin_name = name.str(); + if (use_llvm_vp_intrinsics) { + string vp_name = "reduce."; + vp_name += name; + codegen(initial_value); + llvm::Value *init = value; + codegen(op->value); + llvm::Value *val = value; + bool generated = call_vector_predication_intrinsic(vp_name, op->value.type(), nullptr, init, val, nullptr, 0, "", false, true); + internal_assert(generated) << "Vector predication intrinsic generation failed for vector reduction " << name << "\n"; + } else { + std::stringstream build_name; + build_name << "llvm.vector.reduce."; + build_name << name; + build_name << ".v" << val.type().lanes() << (op->type.is_float() ? 'f' : 'i') << bits; - vector args; - if (takes_initial_value) { - args.push_back(initial_value); - initial_value = Expr(); - } - args.push_back(op->value); + string intrin_name = build_name.str(); - // Make sure the declaration exists, or the codegen for - // call will assume that the args should scalarize. - if (!module->getFunction(intrin_name)) { - vector arg_types; - for (const Expr &e : args) { - arg_types.push_back(llvm_type_of(e.type())); + vector args; + if (takes_initial_value) { + args.push_back(initial_value); + initial_value = Expr(); + } + args.push_back(op->value); + + // Make sure the declaration exists, or the codegen for + // call will assume that the args should scalarize. 
+ if (!module->getFunction(intrin_name)) { + vector arg_types; + for (const Expr &e : args) { + arg_types.push_back(llvm_type_of(e.type())); + } + FunctionType *func_t = FunctionType::get(llvm_type_of(op->type), arg_types, false); + llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin_name, module.get()); } - FunctionType *func_t = FunctionType::get(llvm_type_of(op->type), arg_types, false); - llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin_name, module.get()); - } - Expr equiv = Call::make(op->type, intrin_name, args, Call::PureExtern); - if (initial_value.defined()) { - equiv = binop(initial_value, equiv); + Expr equiv = Call::make(op->type, intrin_name, args, Call::PureExtern); + if (initial_value.defined()) { + equiv = binop(initial_value, equiv); + } + equiv.accept(this); } - equiv.accept(this); return; } } if (output_lanes == 1 && factor > native_lanes && - factor % native_lanes == 0) { + (use_llvm_vp_intrinsics || (factor % native_lanes == 0))) { // It's a total reduction of multiple native // vectors. Start by adding the vectors together. 
Expr equiv; @@ -4647,9 +4682,10 @@ Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes, intrin, arg_values); } + Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, const string &name, vector arg_values, - bool scalable_vector_result) { + bool scalable_vector_result, bool is_reduction) { llvm::Function *fn = module->getFunction(name); if (!fn) { vector arg_types(arg_values.size()); @@ -4658,7 +4694,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes } llvm::Type *intrinsic_result_type = result_type->getScalarType(); - if (intrin_lanes > 1) { + if (intrin_lanes > 1 && !is_reduction) { if (scalable_vector_result && effective_vscale != 0) { intrinsic_result_type = get_vector_type(result_type->getScalarType(), intrin_lanes / effective_vscale, VectorTypeConstraint::VScale); @@ -4672,11 +4708,12 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes fn->setCallingConv(CallingConv::C); } - return call_intrin(result_type, intrin_lanes, fn, arg_values); + return call_intrin(result_type, intrin_lanes, fn, arg_values, is_reduction); } Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, - llvm::Function *intrin, vector arg_values) { + llvm::Function *intrin, vector arg_values, + bool is_reduction) { internal_assert(intrin); int arg_lanes = 1; if (result_type->isVoidTy()) { @@ -4685,7 +4722,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes arg_lanes = get_vector_num_elements(result_type); } - if (intrin_lanes != arg_lanes) { + if (!is_reduction && intrin_lanes != arg_lanes) { // Cut up each arg into appropriately-sized pieces, call the // intrinsic on each, then splice together the results. vector results; @@ -5139,7 +5176,7 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co llvm::Value *mask, // Pass nullptr for constrant true. 
llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, const std::string &overload_suffix, - bool void_return) { + bool void_return, bool is_reduction) { if (!use_llvm_vp_intrinsics || result_type.is_scalar()) { return false; @@ -5192,7 +5229,13 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co } args[i++] = ConstantInt::get(i32_t, length); - value = call_intrin(void_return ? void_t : llvm_result_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable); + llvm::Type *llvm_return_type = llvm_result_type; + if (void_return) { + llvm_return_type = void_t; + } else if (is_reduction) { + llvm_return_type = llvm_result_type->getScalarType(); + } + value = call_intrin(llvm_return_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable, is_reduction); if (alignment != 0 && ptr_index != -1 && isa(value)) { llvm::CallInst *call = dyn_cast(value); call->addParamAttr(ptr_index, Attribute::getWithAlignment(*context, llvm::Align(alignment))); diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 5ca5e985f39c..f5e0d4ed41a3 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -477,9 +477,10 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *intrin, std::vector); llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes, const std::string &name, std::vector, - bool scalable_vector_result = false); + bool scalable_vector_result = false, bool is_reduction = false); llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes, - llvm::Function *intrin, std::vector); + llvm::Function *intrin, std::vector, + bool is_reduction = false); // @} /** Take a slice of lanes out of an llvm vector. 
Pads with undefs @@ -589,7 +590,7 @@ class CodeGen_LLVM : public IRVisitor { llvm::Value *mask, llvm::Value *a, llvm::Value *b = nullptr, llvm::Value *c = nullptr, int alignment = 0, const std::string &overload_suffix = "", - bool void_return = false); + bool void_return = false, bool is_reduction = false); virtual bool call_vector_predication_comparison(const std::string &name, const Type &result_type, llvm::Value *mask, // Pass nullptr for constrant true. From af55a2ee032b30f755ad10a79adb6939fe6b4f13 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 00:45:30 +0000 Subject: [PATCH 06/16] Small refactor to clean up vector predication support. Mainly improving the calling convention and naming of the new routines to generate the intrinsics. --- src/CodeGen_LLVM.cpp | 233 +++++++++++++++++++++++-------------------- src/CodeGen_LLVM.h | 71 ++++++++----- 2 files changed, 170 insertions(+), 134 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index f5ac01cc6fd5..20a36be26c9f 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1511,16 +1511,20 @@ void CodeGen_LLVM::visit(const Add *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - if (!call_vector_predication_intrinsic("fadd", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.fadd", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateFAdd(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): Figure out if vector predication needs to/can work here. + // TODO(zalman): This needs vector predication, but I can't + // see a way to do it. May go away in introducing correct + // index type instead of using int32_t. 
value = builder->CreateNSWAdd(a, b); } else { - if (!call_vector_predication_intrinsic("add", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.add", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateAdd(a, b); } } @@ -1536,16 +1540,20 @@ void CodeGen_LLVM::visit(const Sub *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - if (!call_vector_predication_intrinsic("fsub", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.fsub", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateFSub(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): Figure out if vector predication needs to/can work here. + // TODO(zalman): This needs vector predication, but I can't + // see a way to do it. May go away in introducing correct + // index type instead of using int32_t. value = builder->CreateNSWSub(a, b); } else { - if (!call_vector_predication_intrinsic("sub", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.sub", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateSub(a, b); } } @@ -1565,16 +1573,20 @@ void CodeGen_LLVM::visit(const Mul *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - if (!call_vector_predication_intrinsic("fmul", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.fmul", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateFMul(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): Figure out if vector predication needs to/can work here. 
+ // TODO(zalman): This needs vector predication, but I can't + // see a way to do it. May go away in introducing correct + // index type instead of using int32_t. value = builder->CreateNSWMul(a, b); } else { - if (!call_vector_predication_intrinsic("mul", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.mul", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateMul(a, b); } } @@ -1596,7 +1608,8 @@ void CodeGen_LLVM::visit(const Div *op) { // output hard. Value *a = codegen(op->a); Value *b = codegen(op->b); - if (!call_vector_predication_intrinsic("fdiv", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.fdiv", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateFDiv(a, b); } } else { @@ -1668,11 +1681,11 @@ void CodeGen_LLVM::visit(const EQ *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "oeq")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "oeq")) { value = builder->CreateFCmpOEQ(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "eq")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "eq")) { value = builder->CreateICmpEQ(a, b); } } @@ -1688,11 +1701,11 @@ void CodeGen_LLVM::visit(const NE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "one")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "one")) { value = builder->CreateFCmpONE(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ne")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ne")) { value = builder->CreateICmpNE(a, b); } } @@ -1708,15 +1721,15 @@ 
void CodeGen_LLVM::visit(const LT *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "olt")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "olt")) { value = builder->CreateFCmpOLT(a, b); } } else if (t.is_int()) { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "slt")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "slt")) { value = builder->CreateICmpSLT(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ult")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ult")) { value = builder->CreateICmpULT(a, b); } } @@ -1732,15 +1745,15 @@ void CodeGen_LLVM::visit(const LE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "ole")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "ole")) { value = builder->CreateFCmpOLE(a, b); } } else if (t.is_int()) { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sle")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "sle")) { value = builder->CreateICmpSLE(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ule")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ule")) { value = builder->CreateICmpULE(a, b); } } @@ -1757,15 +1770,15 @@ void CodeGen_LLVM::visit(const GT *op) { Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "ogt")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "ogt")) { value = builder->CreateFCmpOGT(a, b); } } else if (t.is_int()) { - if (!call_vector_predication_comparison("icmp", t, 
nullptr, a, b, "sgt")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "sgt")) { value = builder->CreateICmpSGT(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ugt")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ugt")) { value = builder->CreateICmpUGT(a, b); } } @@ -1781,15 +1794,15 @@ void CodeGen_LLVM::visit(const GE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "oge")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "oge")) { value = builder->CreateFCmpOGE(a, b); } } else if (t.is_int()) { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sge")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "sge")) { value = builder->CreateICmpSGE(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "uge")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "uge")) { value = builder->CreateICmpUGE(a, b); } } @@ -1802,7 +1815,8 @@ void CodeGen_LLVM::visit(const And *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - if (!call_vector_predication_intrinsic("and", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateAnd(a, b); } } @@ -1814,14 +1828,16 @@ void CodeGen_LLVM::visit(const Or *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - if (!call_vector_predication_intrinsic("or", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateOr(a, b); } } void CodeGen_LLVM::visit(const Not *op) { Value *a 
= codegen(op->a); - if (!call_vector_predication_intrinsic("not", op->type, nullptr, a)) { + if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0) })) { value = builder->CreateNot(a); } } @@ -1836,7 +1852,8 @@ void CodeGen_LLVM::visit(const Select *op) { Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); - if (!call_vector_predication_intrinsic("select", op->type, nullptr, cmp, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.select", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(cmp), VPArg(a, 0), VPArg(b) })) { value = builder->CreateSelect(cmp, a, b); } } @@ -2321,8 +2338,8 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *slice_mask = slice_vector(vpred, i, slice_lanes); Instruction *store; - if (call_vector_predication_intrinsic("store", value_type.with_lanes(slice_lanes), slice_mask, slice_val, - vec_ptr, nullptr, alignment, ".p0", true)) { + if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, slice_mask, + { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment) })) { store = dyn_cast(value); } else { store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); @@ -2427,17 +2444,15 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) { stride = builder->CreateIntCast(stride, i64_t, true); } - const char *mangle = stride->getType()->isIntegerTy(64) ? 
".p0.i64" : ".p0.i32"; - - if (call_vector_predication_intrinsic("strided.load", type.with_lanes(slice_lanes), slice_mask, - vec_ptr, stride, nullptr, align_bytes, mangle)) { + if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slide_type, slice_lanes, slice_mask, + { VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1) })) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; } } else { - if (call_vector_predication_intrinsic("load", type.with_lanes(slice_lanes), slice_mask, - vec_ptr, nullptr, nullptr, align_bytes, ".p0")) { + if (try_vector_predication_intrinsic("llvm.vp.load", slice_type, slice_lanes, slice_mask, + { VPArg(vec_ptr, 0, align_bytes) })) { load_inst = dyn_cast(value); } else { if (slice_mask != nullptr) { @@ -2670,27 +2685,31 @@ void CodeGen_LLVM::visit(const Call *op) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - if (!call_vector_predication_intrinsic("and", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateAnd(a, b); } } else if (op->is_intrinsic(Call::bitwise_xor)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - if (!call_vector_predication_intrinsic("xor", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.xor", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateXor(a, b); } } else if (op->is_intrinsic(Call::bitwise_or)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - if (!call_vector_predication_intrinsic("or", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), 
op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateOr(a, b); } } else if (op->is_intrinsic(Call::bitwise_not)) { internal_assert(op->args.size() == 1); Value *a = codegen(op->args[0]); - if (!call_vector_predication_intrinsic("not", op->type, nullptr, a)) { + if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0) })) { value = builder->CreateNot(a); } } else if (op->is_intrinsic(Call::shift_left)) { @@ -2698,7 +2717,8 @@ void CodeGen_LLVM::visit(const Call *op) { if (op->args[1].type().is_uint()) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - if (!call_vector_predication_intrinsic("shl", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.shl", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateShl(a, b); } } else { @@ -2710,11 +2730,13 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (op->type.is_int()) { - if (!call_vector_predication_intrinsic("ashr", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.ashr", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateAShr(a, b); } } else { - if (!call_vector_predication_intrinsic("lshr", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.lshr", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateLShr(a, b); } } @@ -3814,8 +3836,8 @@ void CodeGen_LLVM::visit(const Store *op) { Value *elt_ptr = codegen_buffer_pointer(op->name, value_type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); if (is_dense || slice_lanes == 1) { - if (call_vector_predication_intrinsic("store", 
value_type.with_lanes(slice_lanes), nullptr, slice_val, - vec_ptr, nullptr, alignment, ".p0", true)) { + if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, AllEnabledMask(), + { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment) })) { add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); @@ -3825,17 +3847,15 @@ void CodeGen_LLVM::visit(const Store *op) { if (get_target().bits == 64 && !stride_val->getType()->isIntegerTy(64)) { stride_val = builder->CreateIntCast(stride_val, i64_t, true); } - const char *mangle = stride_val->getType()->isIntegerTy(64) ? ".p0.i64" : ".p0.i32"; - bool generated = call_vector_predication_intrinsic("strided.store", value_type.with_lanes(slice_lanes), nullptr, slice_val, - vec_ptr, stride_val, alignment, mangle, true); + bool generated = try_vector_predication_intrinsic("llvm.experimental.vp.strided.store", void_t, slice_lanes, AllEnabledMask(), + { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2) }); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { Value *slice_index = slice_vector(index, i, slice_lanes); Value *vec_ptrs = codegen_buffer_pointer(op->name, value_type, slice_index); - bool generated = call_vector_predication_intrinsic("scatter", value_type.with_lanes(slice_lanes), nullptr, slice_val, - vec_ptrs, nullptr, alignment, mangle_llvm_vector_type(vec_ptrs->getType()), true); - + bool generated = try_vector_predication_intrinsic("llvm.vp.scatter", void_t, slice_lanes, AllEnabledMask(), + { VPArg(slice_val, 0), VPArg(vec_ptrs, 1, alignment) }); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for gathering store.\n"; } } @@ -4301,13 +4321,14 @@ void 
CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } if (use_llvm_vp_intrinsics) { - string vp_name = "reduce."; + string vp_name = "llvm.vp.reduce."; vp_name += name; codegen(initial_value); llvm::Value *init = value; codegen(op->value); llvm::Value *val = value; - bool generated = call_vector_predication_intrinsic(vp_name, op->value.type(), nullptr, init, val, nullptr, 0, "", false, true); + bool generated = try_vector_predication_intrinsic(vp_name, llvm_type_of(op->value.type()), op->value.type().lanes(), + AllEnabledMask(), { VPArg(init), VPArg(val, 0) }); internal_assert(generated) << "Vector predication intrinsic generation failed for vector reduction " << name << "\n"; } else { std::stringstream build_name; @@ -5137,47 +5158,49 @@ std::string CodeGen_LLVM::mangle_llvm_vector_type(llvm::Type *type) { return type_string; } -bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, const Type &result_type, - llvm::Value *mask, // Pass nullptr for constrant true. - llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, - const std::string &overload_suffix, - bool void_return, bool is_reduction) { - if (!use_llvm_vp_intrinsics || - result_type.is_scalar()) { +bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llvm::Type *llvm_result_type, + int32_t length, MaskVariant mask, std::vector vp_args) { + if (!use_llvm_vp_intrinsics) { return false; } - llvm::Type *llvm_result_type = llvm_type_of(result_type); - int32_t length = result_type.lanes(); - - // TODO(zvookin): Fix the interface here to not case on names. - bool no_mask = name == "select"; - const char *name_base = (starts_with(name, "strided")) ? "llvm.experimental.vp." 
: "llvm.vp."; - - std::string full_name = name_base + name + mangle_llvm_vector_type(llvm_result_type) + overload_suffix; - int arg_count = 2 + !no_mask + (b != nullptr) + (c != nullptr); - std::vector args(arg_count); - size_t i = 0; - - int ptr_index = -1; - if (isa(a->getType())) { - ptr_index = 0; + bool any_scalable = isa(llvm_result_type); + bool any_fixed = isa(llvm_result_type); + bool is_reduction = !any_scalable && !any_fixed; + for (const VPArg &arg : vp_args) { + any_scalable |= isa(arg.value->getType()); + any_fixed |= isa(arg.value->getType()); + } + if (!any_fixed && !any_scalable) { + return false; } - args[i++] = a; - if (b != nullptr) { - args[i++] = b; - if (isa(b->getType())) { - if (ptr_index != -1) { - ptr_index = 1; + internal_assert(!(any_scalable && any_fixed)) << "Cannot combine fixed and scalable vectors to vector predication intrinsic.\n"; + + bool is_scalable = any_scalable; + + std::vector args; + args.reserve(2 + vp_args.size()); + std::vector mangled_types(args.size()); + + for (const VPArg &arg : vp_args) { + args.push_back(arg.value); + if (arg.mangle_index != -1) { + llvm::Type *llvm_type = arg.value->getType(); + if (isa(llvm_type)) { + mangled_types[arg.mangle_index] = ".p0"; + } else { + mangled_types[arg.mangle_index] = mangle_llvm_vector_type(llvm_type); } } } - if (c != nullptr) { - args[i++] = c; + + std::string full_name = name; + for (const std::string &mangle : mangled_types) { + full_name += mangle; } - bool is_scalable = isa(llvm_result_type); - if (!no_mask) { - if (mask == nullptr) { + + if (!std::holds_alternative(mask)) { + if (std::holds_alternative(mask)) { llvm::ElementCount llvm_vector_ec; if (is_scalable) { const auto *vt = cast(llvm_result_type); @@ -5186,31 +5209,26 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co const auto *vt = cast(llvm_result_type); llvm_vector_ec = vt->getElementCount(); } - - args[i++] = ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 
1)); + args.push_back(ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1))); } else { - args[i++] = mask; + args.push_back(std::get(mask)); } } - args[i++] = ConstantInt::get(i32_t, length); + args.push_back(ConstantInt::get(i32_t, length)); - llvm::Type *llvm_return_type = llvm_result_type; - if (void_return) { - llvm_return_type = void_t; - } else if (is_reduction) { - llvm_return_type = llvm_result_type->getScalarType(); - } - value = call_intrin(llvm_return_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable, is_reduction); - if (alignment != 0 && ptr_index != -1 && isa(value)) { - llvm::CallInst *call = dyn_cast(value); - call->addParamAttr(ptr_index, Attribute::getWithAlignment(*context, llvm::Align(alignment))); + value = call_intrin(llvm_result_type, length, full_name, args, is_scalable, is_reduction); + llvm::CallInst *call = dyn_cast(value); + for (size_t i = 0; i < args.size(); i++) { + if (vp_args[i].alignment != 0) { + call->addParamAttr(i, Attribute::getWithAlignment(*context, llvm::Align(vp_args[i].alignment))); + } } return true; } - -bool CodeGen_LLVM::call_vector_predication_comparison(const std::string &name, const Type &result_type, - llvm::Value *mask, // Pass nullptr for constrant true. - llvm::Value *a, llvm::Value *b, const char *cmp_op) { + +bool CodeGen_LLVM::try_vector_predication_comparison(const std::string &name, const Type &result_type, + MaskVariant mask, llvm::Value *a, llvm::Value *b, + const char *cmp_op) { // Early out to prevent creating useless metadata. 
if (!use_llvm_vp_intrinsics || result_type.is_scalar()) { @@ -5219,7 +5237,8 @@ bool CodeGen_LLVM::call_vector_predication_comparison(const std::string &name, c llvm::MDBuilder builder(*context); llvm::Value *md_val = llvm::MetadataAsValue::get(*context, builder.createString(cmp_op)); - return call_vector_predication_intrinsic(name, result_type, mask, a, b, md_val); + return try_vector_predication_intrinsic(name, llvm_type_of(result_type), result_type.lanes(), mask, + { VPArg(a, 0), VPArg(b), VPArg(md_val) }); } } // namespace Internal diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 778f7eb6d7bf..9df2cc322b36 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -35,6 +35,7 @@ class GlobalVariable; #include #include #include +#include #include #include "IRVisitor.h" @@ -561,38 +562,54 @@ class CodeGen_LLVM : public IRVisitor { llvm::Constant *get_splat(int lanes, llvm::Constant *value, VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; - /** Call an "@llvm.vp.*" intrinsic, forming the full overloaded name and argument list. - * A key detail here is that the length of the vector operation is taken from the - * Halide type while the size of the LLVM vector type used (fixed or scalable) is taken - * from the LLVM promotion of the vector type, which should be the same as the types used - * in the arguments. These can be different. It may become useful to pass an explict - * length as well. - * - * The method is virtual to allow backends to extend this for architecture specific - * intrinsics. (E.g. RISC V LMUL.) Unfortunately, this involves matching the name as - * as string to do much. (TODO(zalman): decide if this is the right way to go based - * on LMUL experiment. Really LLVM ought to do this automatically for these intrinsics - * on larger lengths.) - * - * The name is the simple name like "add" for "@llvm.vp.add.v16i32". - * If mask is nullptr, it is provided as constant true. 
- * If b or c is nullptr, it is assumed to be a unary or binary operator respectively. - * - * Assigns result of vp intrinsic to value and returns true if it an instuction is generated, - * otherwise returns false. + /** Support for generating LLVM vector predication intrinsics + * ("@llvm.vp.*" and "@llvm.experimental.vp.*") */ - virtual bool call_vector_predication_intrinsic(const std::string &name, const Type &result_type, - llvm::Value *mask, llvm::Value *a, llvm::Value *b = nullptr, - llvm::Value *c = nullptr, int alignment = 0, - const std::string &overload_suffix = "", - bool void_return = false, bool is_reduction = false); + // @{ + /** Struct to hold descriptor for an argument to a vector + * predicated intrinsic. This includes the value, whether the + * type of the argument should be mangled into the intrisic name + * and if so, where, and the alignment for pointer arguments. */ + struct VPArg { + llvm::Value *value; + int mangle_index; + int alignment; + VPArg(llvm::Value *value, int32_t mangle_index = -1, int32_t alignment = 0) + : value(value), mangle_index(mangle_index), alignment(alignment) { + } + }; - virtual bool call_vector_predication_comparison(const std::string &name, const Type &result_type, - llvm::Value *mask, // Pass nullptr for constrant true. - llvm::Value *a, llvm::Value *b, const char *cmp_op); + /** Type indicating an intrinsic does not take a mask. */ + struct NoMask { + }; + + /** Type indicating mask to use is all true -- all lanes enabled. */ + struct AllEnabledMask { + }; + + /** Predication mask using the above two types for special cases + * and an llvm::Value for the general one. */ + typedef std::variant MaskVariant; + + /** Generate a vector predicated comparison intrinsic call if + * use_llvm_vp_intrinsics is true and result_type is a vector + * type. If generated, assigns result of vp intrinsic to value and + * returns true if it an instuction is generated, otherwise + * returns false. 
*/ + virtual bool try_vector_predication_comparison(const std::string &name, const Type &result_type, + MaskVariant mask, llvm::Value *a, llvm::Value *b, + const char *cmp_op); + + /** Generate an intrisic call if use_llvm_vp_intrinsics is true + * and length is greater than 1. If generated, assigns result + * of vp intrinsic to value and returns true if it an instuction + * is generated, otherwise returns false. */ + bool try_vector_predication_intrinsic(const std::string &name, llvm::Type *llvm_result_type, + int32_t length, MaskVariant mask, std::vector args); /** Controls use of vector predicated intrinsics for vector operations. */ bool use_llvm_vp_intrinsics; + // @} private: /** All the values in scope at the current code location during From eaa4100f004999422c8b131f20f7ba4416b0e947 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 00:52:10 +0000 Subject: [PATCH 07/16] Typo slipped in. --- src/CodeGen_LLVM.cpp | 4 ++-- src/CodeGen_LLVM.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 20a36be26c9f..2f77b30169b0 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2444,8 +2444,8 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) { stride = builder->CreateIntCast(stride, i64_t, true); } - if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slide_type, slice_lanes, slice_mask, - { VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1) })) { + if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slice_type, slice_lanes, slice_mask, + { VPArg(vec_ptr, 0, align_bytes), VPArg(tride, 1) })) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 9df2cc322b36..c1e5d46ccbf4 100644 --- 
a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -596,7 +596,7 @@ class CodeGen_LLVM : public IRVisitor { * type. If generated, assigns result of vp intrinsic to value and * returns true if it an instuction is generated, otherwise * returns false. */ - virtual bool try_vector_predication_comparison(const std::string &name, const Type &result_type, + bool try_vector_predication_comparison(const std::string &name, const Type &result_type, MaskVariant mask, llvm::Value *a, llvm::Value *b, const char *cmp_op); From 7a8201ca66a3891a5478671cd764614fafbedd31 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 01:00:33 +0000 Subject: [PATCH 08/16] This time for sure. --- src/CodeGen_LLVM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 2f77b30169b0..55a47b2959f3 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2445,7 +2445,7 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri stride = builder->CreateIntCast(stride, i64_t, true); } if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slice_type, slice_lanes, slice_mask, - { VPArg(vec_ptr, 0, align_bytes), VPArg(tride, 1) })) { + { VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1) })) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; From c0a9679cbe8bbc73d22863ba7d687e4f0b2f70e3 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 11:47:19 +0000 Subject: [PATCH 09/16] Formatting. 
--- src/CodeGen_LLVM.cpp | 64 ++++++++++++++++++++----------------------- src/CodeGen_LLVM.h | 4 +-- src/CodeGen_RISCV.cpp | 2 +- 3 files changed, 33 insertions(+), 37 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 55a47b2959f3..73ead1c95cc4 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1512,7 +1512,7 @@ void CodeGen_LLVM::visit(const Add *op) { Value *b = codegen(op->b); if (op->type.is_float()) { if (!try_vector_predication_intrinsic("llvm.vp.fadd", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateFAdd(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { @@ -1524,7 +1524,7 @@ void CodeGen_LLVM::visit(const Add *op) { value = builder->CreateNSWAdd(a, b); } else { if (!try_vector_predication_intrinsic("llvm.vp.add", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateAdd(a, b); } } @@ -1541,7 +1541,7 @@ void CodeGen_LLVM::visit(const Sub *op) { Value *b = codegen(op->b); if (op->type.is_float()) { if (!try_vector_predication_intrinsic("llvm.vp.fsub", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateFSub(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { @@ -1553,7 +1553,7 @@ void CodeGen_LLVM::visit(const Sub *op) { value = builder->CreateNSWSub(a, b); } else { if (!try_vector_predication_intrinsic("llvm.vp.sub", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateSub(a, b); } } @@ -1574,7 +1574,7 @@ void CodeGen_LLVM::visit(const Mul *op) { Value *b = codegen(op->b); if (op->type.is_float()) { if (!try_vector_predication_intrinsic("llvm.vp.fmul", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = 
builder->CreateFMul(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { @@ -1586,7 +1586,7 @@ void CodeGen_LLVM::visit(const Mul *op) { value = builder->CreateNSWMul(a, b); } else { if (!try_vector_predication_intrinsic("llvm.vp.mul", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateMul(a, b); } } @@ -1609,7 +1609,7 @@ void CodeGen_LLVM::visit(const Div *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (!try_vector_predication_intrinsic("llvm.vp.fdiv", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateFDiv(a, b); } } else { @@ -1816,7 +1816,7 @@ void CodeGen_LLVM::visit(const And *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateAnd(a, b); } } @@ -1829,7 +1829,7 @@ void CodeGen_LLVM::visit(const Or *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateOr(a, b); } } @@ -1837,7 +1837,7 @@ void CodeGen_LLVM::visit(const Or *op) { void CodeGen_LLVM::visit(const Not *op) { Value *a = codegen(op->a); if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0) })) { + AllEnabledMask(), {VPArg(a, 0)})) { value = builder->CreateNot(a); } } @@ -1853,7 +1853,7 @@ void CodeGen_LLVM::visit(const Select *op) { Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); if (!try_vector_predication_intrinsic("llvm.vp.select", llvm_type_of(op->type), op->type.lanes(), - 
AllEnabledMask(), { VPArg(cmp), VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(cmp), VPArg(a, 0), VPArg(b)})) { value = builder->CreateSelect(cmp, a, b); } } @@ -2339,7 +2339,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *slice_mask = slice_vector(vpred, i, slice_lanes); Instruction *store; if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, slice_mask, - { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment) })) { + {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) { store = dyn_cast(value); } else { store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); @@ -2445,14 +2445,14 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri stride = builder->CreateIntCast(stride, i64_t, true); } if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slice_type, slice_lanes, slice_mask, - { VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1) })) { + {VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1)})) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; } } else { if (try_vector_predication_intrinsic("llvm.vp.load", slice_type, slice_lanes, slice_mask, - { VPArg(vec_ptr, 0, align_bytes) })) { + {VPArg(vec_ptr, 0, align_bytes)})) { load_inst = dyn_cast(value); } else { if (slice_mask != nullptr) { @@ -2485,7 +2485,7 @@ void CodeGen_LLVM::codegen_predicated_load(const Load *op) { Value *vpred = codegen(op->predicate); Value *llvm_stride = codegen(stride); value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param, - op->alignment, vpred, true, llvm_stride); + op->alignment, vpred, true, llvm_stride); return; } @@ -2686,7 +2686,7 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), - 
AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateAnd(a, b); } } else if (op->is_intrinsic(Call::bitwise_xor)) { @@ -2694,7 +2694,7 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (!try_vector_predication_intrinsic("llvm.vp.xor", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateXor(a, b); } } else if (op->is_intrinsic(Call::bitwise_or)) { @@ -2702,14 +2702,14 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateOr(a, b); } } else if (op->is_intrinsic(Call::bitwise_not)) { internal_assert(op->args.size() == 1); Value *a = codegen(op->args[0]); if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0) })) { + AllEnabledMask(), {VPArg(a, 0)})) { value = builder->CreateNot(a); } } else if (op->is_intrinsic(Call::shift_left)) { @@ -2718,7 +2718,7 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (!try_vector_predication_intrinsic("llvm.vp.shl", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateShl(a, b); } } else { @@ -2731,12 +2731,12 @@ void CodeGen_LLVM::visit(const Call *op) { Value *b = codegen(op->args[1]); if (op->type.is_int()) { if (!try_vector_predication_intrinsic("llvm.vp.ashr", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 
0), VPArg(b)})) { value = builder->CreateAShr(a, b); } } else { if (!try_vector_predication_intrinsic("llvm.vp.lshr", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateLShr(a, b); } } @@ -3824,9 +3824,8 @@ void CodeGen_LLVM::visit(const Store *op) { Expr base = (ramp != nullptr) ? ramp->base : 0; Expr stride = (ramp != nullptr) ? ramp->stride : 0; Value *stride_val = (!is_dense && ramp != nullptr) ? codegen(stride) : nullptr; - Value *index = (ramp == nullptr) ? codegen(op->index) : nullptr; - + for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); Expr slice_base = simplify(base + i * stride); @@ -3837,7 +3836,7 @@ void CodeGen_LLVM::visit(const Store *op) { Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); if (is_dense || slice_lanes == 1) { if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, AllEnabledMask(), - { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment) })) { + {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) { add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); @@ -3848,14 +3847,14 @@ void CodeGen_LLVM::visit(const Store *op) { stride_val = builder->CreateIntCast(stride_val, i64_t, true); } bool generated = try_vector_predication_intrinsic("llvm.experimental.vp.strided.store", void_t, slice_lanes, AllEnabledMask(), - { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2) }); + {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2)}); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { Value *slice_index = slice_vector(index, i, 
slice_lanes); Value *vec_ptrs = codegen_buffer_pointer(op->name, value_type, slice_index); bool generated = try_vector_predication_intrinsic("llvm.vp.scatter", void_t, slice_lanes, AllEnabledMask(), - { VPArg(slice_val, 0), VPArg(vec_ptrs, 1, alignment) }); + {VPArg(slice_val, 0), VPArg(vec_ptrs, 1, alignment)}); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for gathering store.\n"; } } @@ -4328,7 +4327,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini codegen(op->value); llvm::Value *val = value; bool generated = try_vector_predication_intrinsic(vp_name, llvm_type_of(op->value.type()), op->value.type().lanes(), - AllEnabledMask(), { VPArg(init), VPArg(val, 0) }); + AllEnabledMask(), {VPArg(init), VPArg(val, 0)}); internal_assert(generated) << "Vector predication intrinsic generation failed for vector reduction " << name << "\n"; } else { std::stringstream build_name; @@ -4668,7 +4667,6 @@ Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes, intrin, arg_values); } - Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, const string &name, vector arg_values, bool scalable_vector_result, bool is_reduction) { @@ -4755,11 +4753,9 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::FunctionType *intrin_type = intrin->getFunctionType(); for (int i = 0; i < (int)arg_values.size(); i++) { if (arg_values[i]->getType() != intrin_type->getParamType(i)) { - debug(0) << "Normalizing fixed/scalable.\n"; arg_values[i] = normalize_fixed_scalable_vector_type(intrin_type->getParamType(i), arg_values[i]); } if (arg_values[i]->getType() != intrin_type->getParamType(i)) { - debug(0) << "Bit casting type.\n"; // There can be some mismatches in types, such as when passing scalar Halide type T // to LLVM vector type <1 x T>. 
arg_values[i] = builder->CreateBitCast(arg_values[i], intrin_type->getParamType(i)); @@ -5188,7 +5184,7 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv llvm::Type *llvm_type = arg.value->getType(); if (isa(llvm_type)) { mangled_types[arg.mangle_index] = ".p0"; - } else { + } else { mangled_types[arg.mangle_index] = mangle_llvm_vector_type(llvm_type); } } @@ -5200,7 +5196,7 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv } if (!std::holds_alternative(mask)) { - if (std::holds_alternative(mask)) { + if (std::holds_alternative(mask)) { llvm::ElementCount llvm_vector_ec; if (is_scalable) { const auto *vt = cast(llvm_result_type); @@ -5211,7 +5207,7 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv } args.push_back(ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1))); } else { - args.push_back(std::get(mask)); + args.push_back(std::get(mask)); } } args.push_back(ConstantInt::get(i32_t, length)); diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index c1e5d46ccbf4..c6662a1b2ca7 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -597,8 +597,8 @@ class CodeGen_LLVM : public IRVisitor { * returns true if it an instuction is generated, otherwise * returns false. */ bool try_vector_predication_comparison(const std::string &name, const Type &result_type, - MaskVariant mask, llvm::Value *a, llvm::Value *b, - const char *cmp_op); + MaskVariant mask, llvm::Value *a, llvm::Value *b, + const char *cmp_op); /** Generate an intrisic call if use_llvm_vp_intrinsics is true * and length is greater than 1. 
If generated, assigns result diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index db0d10fa24ca..60a5c3feff19 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -86,7 +86,7 @@ int CodeGen_RISCV::native_vector_bits() const { int CodeGen_RISCV::maximum_vector_bits() const { return native_vector_bits() * 8; } - + int CodeGen_RISCV::target_vscale() const { if (target.vector_bits != 0 && target.has_feature(Target::RVV)) { From 100a5c1299ad1b2d6682b8ed4ebe12ee94d7c0fd Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 14:28:32 +0000 Subject: [PATCH 10/16] Formatting. --- src/CodeGen_LLVM.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 73ead1c95cc4..fe23a334bc62 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2032,7 +2032,7 @@ void CodeGen_LLVM::visit(const Load *op) { llvm::Type *load_type = llvm_type_of(op->type.element_of()); if (ramp && stride && stride->value == 1) { - value = codegen_dense_vector_load(op, nullptr); + value = codegen_dense_vector_load(op, nullptr); } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { // Try to rewrite strided loads as shuffles of dense loads, // aligned to the stride. 
This makes adjacent strided loads @@ -3848,7 +3848,7 @@ void CodeGen_LLVM::visit(const Store *op) { } bool generated = try_vector_predication_intrinsic("llvm.experimental.vp.strided.store", void_t, slice_lanes, AllEnabledMask(), {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2)}); - internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; + internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { Value *slice_index = slice_vector(index, i, slice_lanes); @@ -5173,7 +5173,7 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv internal_assert(!(any_scalable && any_fixed)) << "Cannot combine fixed and scalable vectors to vector predication intrinsic.\n"; bool is_scalable = any_scalable; - + std::vector args; args.reserve(2 + vp_args.size()); std::vector mangled_types(args.size()); From db0ea7c4331eb21e1b8ab5cf6a6474721b9c1ac7 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 14:31:04 +0000 Subject: [PATCH 11/16] More formatting. 
--- src/CodeGen_LLVM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index fe23a334bc62..f5523ccaae45 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5234,7 +5234,7 @@ bool CodeGen_LLVM::try_vector_predication_comparison(const std::string &name, co llvm::MDBuilder builder(*context); llvm::Value *md_val = llvm::MetadataAsValue::get(*context, builder.createString(cmp_op)); return try_vector_predication_intrinsic(name, llvm_type_of(result_type), result_type.lanes(), mask, - { VPArg(a, 0), VPArg(b), VPArg(md_val) }); + {VPArg(a, 0), VPArg(b), VPArg(md_val)}); } } // namespace Internal From 96dcd93351d1ba35abfa5970a99361b9ba02ded8 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 18:06:24 +0000 Subject: [PATCH 12/16] Use std::optional instead of -1 bottom value for mangle_index. Simple caveperson programmer habits die hard. Improve comments. --- src/CodeGen_LLVM.cpp | 6 +++--- src/CodeGen_LLVM.h | 15 +++++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index f5523ccaae45..a3fe55ec6e6c 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5180,12 +5180,12 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv for (const VPArg &arg : vp_args) { args.push_back(arg.value); - if (arg.mangle_index != -1) { + if (arg.mangle_index) { llvm::Type *llvm_type = arg.value->getType(); if (isa(llvm_type)) { - mangled_types[arg.mangle_index] = ".p0"; + mangled_types[arg.mangle_index.value()] = ".p0"; } else { - mangled_types[arg.mangle_index] = mangle_llvm_vector_type(llvm_type); + mangled_types[arg.mangle_index.value()] = mangle_llvm_vector_type(llvm_type); } } } diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index c6662a1b2ca7..029e181f5806 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -34,6 +34,7 @@ class GlobalVariable; #include #include +#include #include 
#include #include @@ -303,7 +304,11 @@ class CodeGen_LLVM : public IRVisitor { llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, llvm::Value *index); // @} - /** Return an appropriate type string for a type which is of VectorType. */ + /** Return type string for LLVM vector type using LLVM IR intrinsic type mangling. + * E.g. ".nxv4i32" for a scalable vector of four 32-bit integers, + * or ".v4f32" for a fixed vector of four 32-bit floats. + * The dot is included in the result. + */ std::string mangle_llvm_vector_type(llvm::Type *type); /** Turn a Halide Type into an llvm::Value representing a constant halide_type_t */ @@ -572,9 +577,10 @@ class CodeGen_LLVM : public IRVisitor { * and if so, where, and the alignment for pointer arguments. */ struct VPArg { llvm::Value *value; - int mangle_index; + // If provided, put argument's type into the intrinsic name via LLVM IR type mangling. + std::optional mangle_index; int alignment; - VPArg(llvm::Value *value, int32_t mangle_index = -1, int32_t alignment = 0) + VPArg(llvm::Value *value, std::optional mangle_index = std::nullopt, int32_t alignment = 0) : value(value), mangle_index(mangle_index), alignment(alignment) { } }; @@ -607,7 +613,8 @@ class CodeGen_LLVM : public IRVisitor { bool try_vector_predication_intrinsic(const std::string &name, llvm::Type *llvm_result_type, int32_t length, MaskVariant mask, std::vector args); - /** Controls use of vector predicated intrinsics for vector operations. */ + /** Controls use of vector predicated intrinsics for vector operations. + * Will be set by certain backends (e.g. RISC V) to control codegen. */ bool use_llvm_vp_intrinsics; // @} From 51f3e35b81e7f9ac6de61e8791bafec68c955d42 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 18:09:28 +0000 Subject: [PATCH 13/16] Switch to using instead of typedef per review feedback. 
--- src/CodeGen_LLVM.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 029e181f5806..10d5268ec0bd 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -595,7 +595,7 @@ class CodeGen_LLVM : public IRVisitor { /** Predication mask using the above two types for special cases * and an llvm::Value for the general one. */ - typedef std::variant MaskVariant; + using MaskVariant = std::variant; /** Generate a vector predicated comparison intrinsic call if * use_llvm_vp_intrinsics is true and result_type is a vector From cb0cbbc96e5158fd3abaab6cc03054b2b581665b Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 18:39:59 +0000 Subject: [PATCH 14/16] Address review feedback re: default arguments, moving string concatenation into one line. --- src/CodeGen_LLVM.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a3fe55ec6e6c..398849a193c5 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2032,7 +2032,7 @@ void CodeGen_LLVM::visit(const Load *op) { llvm::Type *load_type = llvm_type_of(op->type.element_of()); if (ramp && stride && stride->value == 1) { - value = codegen_dense_vector_load(op, nullptr); + value = codegen_dense_vector_load(op); } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { // Try to rewrite strided loads as shuffles of dense loads, // aligned to the stride. 
This makes adjacent strided loads @@ -2088,7 +2088,8 @@ void CodeGen_LLVM::visit(const Load *op) { Expr slice_base = simplify(base + load_base_i); Value *load_i = codegen_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, - op->image, op->param, align, nullptr, false, nullptr); + op->image, op->param, align, /*vpred=*/nullptr, + /*slice_to_native=*/false); std::vector constants; for (int j = 0; j < lanes_i; j++) { @@ -4320,8 +4321,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } if (use_llvm_vp_intrinsics) { - string vp_name = "llvm.vp.reduce."; - vp_name += name; + string vp_name = "llvm.vp.reduce." + name; codegen(initial_value); llvm::Value *init = value; codegen(op->value); From 740f1210c687947da456c0fee420cdfb6b072872 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 18:55:09 +0000 Subject: [PATCH 15/16] Add GitHub issue for fmax/fmin strict_float TODO. Change TODO(zalman) to TODO(zvookin) uniformly. Few other cleanups. --- src/CodeGen_LLVM.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 398849a193c5..ea5e681a3ab1 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1518,7 +1518,7 @@ void CodeGen_LLVM::visit(const Add *op) { } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): This needs vector predication, but I can't + // TODO(zvookin): This needs vector predication, but I can't // see a way to do it. May go away in introducing correct // index type instead of using int32_t. value = builder->CreateNSWAdd(a, b); @@ -1547,7 +1547,7 @@ void CodeGen_LLVM::visit(const Sub *op) { } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. 
- // TODO(zalman): This needs vector predication, but I can't + // TODO(zvookin): This needs vector predication, but I can't // see a way to do it. May go away in introducing correct // index type instead of using int32_t. value = builder->CreateNSWSub(a, b); @@ -1580,7 +1580,7 @@ void CodeGen_LLVM::visit(const Mul *op) { } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): This needs vector predication, but I can't + // TODO(zvookin): This needs vector predication, but I can't // see a way to do it. May go away in introducing correct // index type instead of using int32_t. value = builder->CreateNSWMul(a, b); @@ -3679,7 +3679,7 @@ void CodeGen_LLVM::visit(const For *op) { Value *extent = codegen(op->extent); const Acquire *acquire = op->body.as(); - // TODO(zalman): remove this after validating it doesn't happen + // TODO(zvookin): remove this after validating it doesn't happen internal_assert(!(op->for_type == ForType::Parallel || (op->for_type == ForType::Serial && acquire && @@ -3790,7 +3790,7 @@ void CodeGen_LLVM::visit(const Store *op) { } else { int alignment = value_type.bytes(); const Ramp *ramp = op->index.as(); - // TODO(zalman): consider splitting out vector predication path. Current + // TODO(zvookin): consider splitting out vector predication path. Current // code shows how vector predication would simplify things as the // following scalarization cases would go away. bool is_dense = ramp && is_const_one(ramp->stride); @@ -4274,14 +4274,14 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini break; case VectorReduce::Min: name = "fmin"; - // TODO(zvookin): For signed case, whether this is Inf or the max floating-point value depends on strict_float. (Or maybe it is QNaN in strict_float.) + // TODO(zvookin): Not correct for strict_float.
See: https://github.com/halide/Halide/issues/7118 if (takes_initial_value && !initial_value.defined()) { initial_value = op->type.max(); } break; case VectorReduce::Max: name = "fmax"; - // TODO(zvookin): For signed case, whether this is -Inf or the min floating-point value depends on strict_float. (Or maybe it is -QNaN in strict_float.) + // TODO(zvookin): Not correct for strict_float. See: https://github.com/halide/Halide/issues/7118 if (takes_initial_value && !initial_value.defined()) { initial_value = op->type.min(); } break; @@ -4321,7 +4321,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } if (use_llvm_vp_intrinsics) { - string vp_name = "llvm.vp.reduce." + name; + string vp_name = "llvm.vp.reduce." + std::string(name); codegen(initial_value); llvm::Value *init = value; codegen(op->value); From 2c0df5f5334f20806be0b1311a24e4bfd4482db8 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 23:49:23 +0000 Subject: [PATCH 16/16] Rearrange the maze of twisty passages to not use vector predicated strided load for dense case. Add some comments. --- src/CodeGen_LLVM.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index ea5e681a3ab1..df9f331cfe15 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2441,6 +2441,11 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri Value *slice_mask = (vpred != nullptr) ? slice_vector(vpred, i, slice_lanes) : nullptr; Instruction *load_inst = nullptr; + // In this path, strided predicated loads are only handled if vector + // predication is enabled. Otherwise this would be scalarized at a higher + // level. Assume that if stride is passed, this is not dense, though + // LLVM should codegen the same thing for a constant 1 strided load as + // for a non-strided load.
if (stride) { if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) { stride = builder->CreateIntCast(stride, i64_t, true); @@ -2482,17 +2487,14 @@ void CodeGen_LLVM::codegen_predicated_load(const Load *op) { const Ramp *ramp = op->index.as(); const IntImm *stride = ramp ? ramp->stride.as() : nullptr; - if (use_llvm_vp_intrinsics && stride) { - Value *vpred = codegen(op->predicate); - Value *llvm_stride = codegen(stride); - value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param, - op->alignment, vpred, true, llvm_stride); - return; - } - if (ramp && is_const_one(ramp->stride)) { // Dense vector load Value *vpred = codegen(op->predicate); value = codegen_dense_vector_load(op, vpred); + } else if (use_llvm_vp_intrinsics && stride) { // Case only handled by vector predication, otherwise must scalarize. + Value *vpred = codegen(op->predicate); + Value *llvm_stride = codegen(stride); // Not 1 (dense) as that was caught above. + value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param, + op->alignment, vpred, true, llvm_stride); } else if (ramp && stride && stride->value == -1) { debug(4) << "Predicated dense vector load with stride -1\n\t" << Expr(op) << "\n"; vector indices(ramp->lanes);