From a678da16ff2e009e3e097734bc8c68ec812fdedc Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 16 Sep 2022 19:27:34 +0000 Subject: [PATCH 01/16] Add support for generating llvm.vp.* intrinsics. This is particularly useful for RISC V, but it may be a simpler, better optimized path, for Halide vector operations in general. Add support for a maximum vector size that might be larger than the native vector size. RISC V vector LMUL support is an example of an architecture supporting this. --- src/CodeGen_LLVM.cpp | 287 ++++++++++++++++++++++++++++++++++-------- src/CodeGen_LLVM.h | 43 ++++++- src/CodeGen_RISCV.cpp | 24 +++- 3 files changed, 295 insertions(+), 59 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 1b1e17326532..56aa4c573a27 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -217,6 +217,7 @@ CodeGen_LLVM::CodeGen_LLVM(const Target &t) inside_atomic_mutex_node(false), emit_atomic_stores(false), + use_llvm_vp_intrinsics(false), destructor_block(nullptr), strict_float(t.has_feature(Target::StrictFloat)), @@ -1536,13 +1537,18 @@ void CodeGen_LLVM::visit(const Add *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - value = builder->CreateFAdd(a, b); + if (!call_vector_predication_intrinsic("fadd", t, nullptr, a, b)) { + value = builder->CreateFAdd(a, b); + } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. + // TODO(zalman): Figure out if vector predication needs to/can work here. 
value = builder->CreateNSWAdd(a, b); } else { - value = builder->CreateAdd(a, b); + if (!call_vector_predication_intrinsic("add", t, nullptr, a, b)) { + value = builder->CreateAdd(a, b); + } } } @@ -1556,13 +1562,18 @@ void CodeGen_LLVM::visit(const Sub *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - value = builder->CreateFSub(a, b); + if (!call_vector_predication_intrinsic("fsub", t, nullptr, a, b)) { + value = builder->CreateFSub(a, b); + } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. + // TODO(zalman): Figure out if vector predication needs to/can work here. value = builder->CreateNSWSub(a, b); } else { - value = builder->CreateSub(a, b); + if (!call_vector_predication_intrinsic("sub", t, nullptr, a, b)) { + value = builder->CreateSub(a, b); + } } } @@ -1580,13 +1591,18 @@ void CodeGen_LLVM::visit(const Mul *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - value = builder->CreateFMul(a, b); + if (!call_vector_predication_intrinsic("fmul", t, nullptr, a, b)) { + value = builder->CreateFMul(a, b); + } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. + // TODO(zalman): Figure out if vector predication needs to/can work here. value = builder->CreateNSWMul(a, b); } else { - value = builder->CreateMul(a, b); + if (!call_vector_predication_intrinsic("mul", t, nullptr, a, b)) { + value = builder->CreateMul(a, b); + } } } @@ -1606,7 +1622,9 @@ void CodeGen_LLVM::visit(const Div *op) { // output hard. 
Value *a = codegen(op->a); Value *b = codegen(op->b); - value = builder->CreateFDiv(a, b); + if (!call_vector_predication_intrinsic("fdiv", t, nullptr, a, b)) { + value = builder->CreateFDiv(a, b); + } } else { value = codegen(lower_int_uint_div(op->a, op->b)); } @@ -1676,9 +1694,13 @@ void CodeGen_LLVM::visit(const EQ *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOEQ(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "oeq")) { + value = builder->CreateFCmpOEQ(a, b); + } } else { - value = builder->CreateICmpEQ(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "eq")) { + value = builder->CreateICmpEQ(a, b); + } } } @@ -1692,9 +1714,13 @@ void CodeGen_LLVM::visit(const NE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpONE(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "one")) { + value = builder->CreateFCmpONE(a, b); + } } else { - value = builder->CreateICmpNE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ne")) { + value = builder->CreateICmpNE(a, b); + } } } @@ -1708,11 +1734,17 @@ void CodeGen_LLVM::visit(const LT *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOLT(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "olt")) { + value = builder->CreateFCmpOLT(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSLT(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "slt")) { + value = builder->CreateICmpSLT(a, b); + } } else { - value = builder->CreateICmpULT(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ult")) { + value = builder->CreateICmpULT(a, b); + } } } @@ -1726,11 +1758,17 @@ void CodeGen_LLVM::visit(const LE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if 
(t.is_float()) { - value = builder->CreateFCmpOLE(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "ole")) { + value = builder->CreateFCmpOLE(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSLE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sle")) { + value = builder->CreateICmpSLE(a, b); + } } else { - value = builder->CreateICmpULE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ule")) { + value = builder->CreateICmpULE(a, b); + } } } @@ -1745,11 +1783,17 @@ void CodeGen_LLVM::visit(const GT *op) { Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOGT(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "ogt")) { + value = builder->CreateFCmpOGT(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSGT(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sgt")) { + value = builder->CreateICmpSGT(a, b); + } } else { - value = builder->CreateICmpUGT(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ugt")) { + value = builder->CreateICmpUGT(a, b); + } } } @@ -1763,11 +1807,17 @@ void CodeGen_LLVM::visit(const GE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOGE(a, b); + if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "oge")) { + value = builder->CreateFCmpOGE(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSGE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sge")) { + value = builder->CreateICmpSGE(a, b); + } } else { - value = builder->CreateICmpUGE(a, b); + if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "uge")) { + value = builder->CreateICmpUGE(a, b); + } } } @@ -1778,7 +1828,9 @@ void CodeGen_LLVM::visit(const And *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - value = builder->CreateAnd(a, b); + if 
(!call_vector_predication_intrinsic("and", op->type, nullptr, a, b)) { + value = builder->CreateAnd(a, b); + } } void CodeGen_LLVM::visit(const Or *op) { @@ -1788,19 +1840,25 @@ void CodeGen_LLVM::visit(const Or *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - value = builder->CreateOr(a, b); + if (!call_vector_predication_intrinsic("or", op->type, nullptr, a, b)) { + value = builder->CreateOr(a, b); + } } void CodeGen_LLVM::visit(const Not *op) { Value *a = codegen(op->a); - value = builder->CreateNot(a); + if (!call_vector_predication_intrinsic("not", op->type, nullptr, a)) { + value = builder->CreateNot(a); + } } void CodeGen_LLVM::visit(const Select *op) { Value *cmp = codegen(op->condition); Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); - value = builder->CreateSelect(cmp, a, b); + if (!call_vector_predication_intrinsic("select", op->type, nullptr, a, b, cmp)) { + value = builder->CreateSelect(cmp, a, b); + } } namespace { @@ -1971,7 +2029,7 @@ void CodeGen_LLVM::visit(const Load *op) { llvm::Type *load_type = llvm_type_of(op->type.element_of()); if (ramp && stride && stride->value == 1) { - value = codegen_dense_vector_load(op); + value = codegen_dense_vector_load(op, nullptr); } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { // Try to rewrite strided loads as shuffles of dense loads, // aligned to the stride. 
This makes adjacent strided loads @@ -2026,8 +2084,8 @@ void CodeGen_LLVM::visit(const Load *op) { int lanes_i = std::min(slice_lanes, op->type.lanes() - i); Expr slice_base = simplify(base + load_base_i); - Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, - op->image, op->param, align, nullptr, false); + Value *load_i = codegen_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, + op->image, op->param, align, nullptr, false, nullptr); std::vector constants; for (int j = 0; j < lanes_i; j++) { @@ -2241,8 +2299,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Halide::Type value_type = op->value.type(); Value *val = codegen(op->value); int alignment = value_type.bytes(); - int native_bits = native_vector_bits(); - int native_bytes = native_bits / 8; + int native_bytes = native_vector_bits() / 8; // Boost the alignment if possible, up to the native vector width. ModulusRemainder mod_rem = op->alignment; @@ -2265,7 +2322,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { // For dense vector stores wider than the native vector // width, bust them up into native vectors. 
int store_lanes = value_type.lanes(); - int native_lanes = native_bits / value_type.bits(); + int native_lanes = maximum_vector_bits() / value_type.bits(); for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); @@ -2277,8 +2334,13 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); Value *slice_mask = slice_vector(vpred, i, slice_lanes); - Instruction *store = - builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); + Instruction *store; + if (call_vector_predication_intrinsic("store", value_type.with_lanes(slice_lanes), slice_mask, slice_val, + vec_ptr, nullptr, alignment, ".p0")) { + store = dyn_cast(value); + } else { + store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); + } add_tbaa_metadata(store, op->name, slice_index); } } else { // It's not dense vector store, we need to scalarize it @@ -2322,9 +2384,9 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { } } -llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, - const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, - llvm::Value *vpred, bool slice_to_native) { +llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, + llvm::Value *vpred, bool slice_to_native, llvm::Value *stride) { debug(4) << "Vectorize predicated dense vector load:\n\t" << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n"; @@ -2361,7 +2423,7 @@ llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std // For dense vector loads wider than the native vector // width, bust them up into native vectors int load_lanes = type.lanes(); - int 
native_lanes = slice_to_native ? std::max(1, native_bits / type.bits()) : load_lanes; + int native_lanes = slice_to_native ? std::max(1, maximum_vector_bits() / type.bits()) : load_lanes; vector slices; for (int i = 0; i < load_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, load_lanes - i); @@ -2372,12 +2434,27 @@ llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo()); + Value *slice_mask = (vpred != nullptr) ? slice_vector(vpred, i, slice_lanes) : nullptr; + Instruction *load_inst; - if (vpred != nullptr) { - Value *slice_mask = slice_vector(vpred, i, slice_lanes); - load_inst = builder->CreateMaskedLoad(slice_type, vec_ptr, llvm::Align(align_bytes), slice_mask); + if (stride) { + if (call_vector_predication_intrinsic("strided.load", type.with_lanes(slice_lanes), slice_mask, + vec_ptr, stride, nullptr, align_bytes, ".i64")) { + load_inst = dyn_cast(value); + } else { + internal_error << "Vector predicated strided load should not be requested if not supported.\n"; + } } else { - load_inst = builder->CreateAlignedLoad(slice_type, vec_ptr, llvm::Align(align_bytes)); + if (call_vector_predication_intrinsic("load", type.with_lanes(slice_lanes), slice_mask, + vec_ptr, nullptr, nullptr, align_bytes, ".p0")) { + load_inst = dyn_cast(value); + } else { + if (slice_mask != nullptr) { + load_inst = builder->CreateMaskedLoad(slice_type, vec_ptr, llvm::Align(align_bytes), slice_mask); + } else { + load_inst = builder->CreateAlignedLoad(slice_type, vec_ptr, llvm::Align(align_bytes)); + } + } } add_tbaa_metadata(load_inst, name, slice_index); slices.push_back(load_inst); @@ -2390,14 +2467,22 @@ Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred, b const Ramp *ramp = load->index.as(); internal_assert(ramp && is_const_one(ramp->stride)) << "Should be 
dense vector load\n"; - return codegen_dense_vector_load(load->type, load->name, ramp->base, load->image, load->param, - load->alignment, vpred, slice_to_native); + return codegen_vector_load(load->type, load->name, ramp->base, load->image, load->param, + load->alignment, vpred, slice_to_native, nullptr); } void CodeGen_LLVM::codegen_predicated_load(const Load *op) { const Ramp *ramp = op->index.as(); const IntImm *stride = ramp ? ramp->stride.as() : nullptr; + if (use_llvm_vp_intrinsics && stride) { + Value *vpred = codegen(op->predicate); + Value *llvm_stride = codegen(stride); + value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param, + op->alignment, vpred, true, llvm_stride); + return; + } + if (ramp && is_const_one(ramp->stride)) { // Dense vector load Value *vpred = codegen(op->predicate); value = codegen_dense_vector_load(op, vpred); @@ -2594,27 +2679,37 @@ void CodeGen_LLVM::visit(const Call *op) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateAnd(a, b); + if (!call_vector_predication_intrinsic("and", op->type, nullptr, a, b)) { + value = builder->CreateAnd(a, b); + } } else if (op->is_intrinsic(Call::bitwise_xor)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateXor(a, b); + if (!call_vector_predication_intrinsic("xor", op->type, nullptr, a, b)) { + value = builder->CreateXor(a, b); + } } else if (op->is_intrinsic(Call::bitwise_or)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateOr(a, b); + if (!call_vector_predication_intrinsic("or", op->type, nullptr, a, b)) { + value = builder->CreateOr(a, b); + } } else if (op->is_intrinsic(Call::bitwise_not)) { internal_assert(op->args.size() == 1); Value *a = codegen(op->args[0]); - value = builder->CreateNot(a); + if 
(!call_vector_predication_intrinsic("not", op->type, nullptr, a)) { + value = builder->CreateNot(a); + } } else if (op->is_intrinsic(Call::shift_left)) { internal_assert(op->args.size() == 2); if (op->args[1].type().is_uint()) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateShl(a, b); + if (!call_vector_predication_intrinsic("shl", op->type, nullptr, a, b)) { + value = builder->CreateShl(a, b); + } } else { value = codegen(lower_signed_shift_left(op->args[0], op->args[1])); } @@ -2624,9 +2719,13 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (op->type.is_int()) { - value = builder->CreateAShr(a, b); + if (!call_vector_predication_intrinsic("ashr", op->type, nullptr, a, b)) { + value = builder->CreateAShr(a, b); + } } else { - value = builder->CreateLShr(a, b); + if (!call_vector_predication_intrinsic("lshr", op->type, nullptr, a, b)) { + value = builder->CreateLShr(a, b); + } } } else { value = codegen(lower_signed_shift_right(op->args[0], op->args[1])); @@ -3701,7 +3800,7 @@ void CodeGen_LLVM::visit(const Store *op) { // For dense vector stores wider than the native vector // width, bust them up into native vectors. 
int store_lanes = value_type.lanes(); - int native_lanes = native_bits / value_type.bits(); + int native_lanes = maximum_vector_bits() / value_type.bits(); for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); @@ -4020,7 +4119,7 @@ void CodeGen_LLVM::visit(const VectorReduce *op) { void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { Expr val = op->value; const int output_lanes = op->type.lanes(); - const int native_lanes = native_vector_bits() / op->type.bits(); + const int native_lanes = maximum_vector_bits() / op->type.bits(); const int factor = val.type().lanes() / output_lanes; Type elt = op->type.element_of(); @@ -4570,9 +4669,11 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::FunctionType *intrin_type = intrin->getFunctionType(); for (int i = 0; i < (int)arg_values.size(); i++) { if (arg_values[i]->getType() != intrin_type->getParamType(i)) { + debug(0) << "Normalizing fixed/scalable.\n"; arg_values[i] = normalize_fixed_scalable_vector_type(intrin_type->getParamType(i), arg_values[i]); } if (arg_values[i]->getType() != intrin_type->getParamType(i)) { + debug(0) << "Bit casting type.\n"; // There can be some mismatches in types, such as when passing scalar Halide type T // to LLVM vector type <1 x T>. arg_values[i] = builder->CreateBitCast(arg_values[i], intrin_type->getParamType(i)); @@ -4947,5 +5048,83 @@ llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, return ConstantVector::getSplat(ec, value); } +bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, const Type &result_type, + llvm::Value *mask, // Pass nullptr for constrant true. 
+ llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, + const char *overload_suffix) { + if (!use_llvm_vp_intrinsics || + result_type.is_scalar()) { + return false; + } + + llvm::Type *llvm_result_type = llvm_type_of(result_type); + int32_t length = result_type.lanes(); + const char *type_designator = result_type.is_float() ? "f" : "i"; + std::string type_string = "."; + bool is_scalable = isa(llvm_result_type); + llvm::ElementCount llvm_vector_ec; + if (is_scalable) { + const auto *vt = cast(llvm_result_type); + std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); + llvm_vector_ec = vt->getElementCount(); + type_string = ".nxv" + std::to_string(vt->getMinNumElements()) + type_designator + bits_designator; + } else { + const auto *vt = cast(llvm_result_type); + std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); + llvm_vector_ec = vt->getElementCount(); + type_string = ".v" + std::to_string(vt->getNumElements()) + type_designator + bits_designator; + } + + const char *name_base = (starts_with(name, "strided")) ? "llvm.experimental.vp." 
: "llvm.vp."; + std::string full_name = name_base + name + type_string + overload_suffix; + int arg_count = 3 + (b != nullptr) + (c != nullptr); + std::vector args(arg_count); + size_t i = 0; + + int ptr_index = -1; + if (isa(a->getType())) { + ptr_index = 0; + } + args[i++] = a; + if (b != nullptr) { + args[i++] = b; + if (isa(b->getType())) { + if (ptr_index != -1) { + ptr_index = 1; + } + } + } + if (c != nullptr) { + args[i++] = c; + } + if (mask == nullptr) { + args[i++] = ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1)); + } else { + args[i++] = mask; + } + args[i++] = ConstantInt::get(i32_t, length); + + value = call_intrin(llvm_result_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable); + if (alignment != 0 && ptr_index != -1 && isa(value)) { + llvm::CallInst *call = dyn_cast(value); + call->addParamAttr(ptr_index, Attribute::getWithAlignment(*context, llvm::Align(alignment))); + } + return true; +} + +bool CodeGen_LLVM::call_vector_predication_comparison(const std::string &name, const Type &result_type, + llvm::Value *mask, // Pass nullptr for constrant true. + llvm::Value *a, llvm::Value *b, const char *cmp_op) { + // Early out to prevent creating useless metadata. + if (!use_llvm_vp_intrinsics || + result_type.is_scalar()) { + return false; + } + + llvm::MDBuilder builder(*context); + llvm::Value *md_val = llvm::MetadataAsValue::get(*context, builder.createString(cmp_op)); + return call_vector_predication_intrinsic(name, result_type, mask, a, b, md_val); +} + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index d6ee5b26adff..c2f5d5d363eb 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -136,6 +136,11 @@ class CodeGen_LLVM : public IRVisitor { /** What's the natural vector bit-width to use for loads, stores, etc. */ virtual int native_vector_bits() const = 0; + /** Used to decide whether to break a vector up into multiple smaller + * operations. 
This is the largest size the architecture supports. */ + virtual int maximum_vector_bits() const { + return native_vector_bits(); + } /** For architectures that have vscale vectors, return the constant vscale to use. * Default of 0 means do not use vscale vectors. Generally will depend on * the target flags and vector_bits settings. @@ -557,6 +562,38 @@ class CodeGen_LLVM : public IRVisitor { llvm::Constant *get_splat(int lanes, llvm::Constant *value, VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; + /** Call an "@llvm.vp.*" intrinsic, forming the full overloaded name and argument list. + * A key detail here is that the length of the vector operation is taken from the + * Halide type while the size of the LLVM vector type used (fixed or scalable) is taken + * from the LLVM promotion of the vector type, which should be the same as the types used + * in the arguments. These can be different. It may become useful to pass an explict + * length as well. + * + * The method is virtual to allow backends to extend this for architecture specific + * intrinsics. (E.g. RISC V LMUL.) Unfortunately, this involves matching the name as + * as string to do much. (TODO(zalman): decide if this is the right way to go based + * on LMUL experiment. Really LLVM ought to do this automatically for these intrinsics + * on larger lengths.) + * + * The name is the simple name like "add" for "@llvm.vp.add.v16i32". + * If mask is nullptr, it is provided as constant true. + * If b or c is nullptr, it is assumed to be a unary or binary operator respectively. + * + * Assigns result of vp intrinsic to value and returns true if it an instuction is generated, + * otherwise returns false. 
+ */ + virtual bool call_vector_predication_intrinsic(const std::string &name, const Type &result_type, + llvm::Value *mask, llvm::Value *a, llvm::Value *b = nullptr, + llvm::Value *c = nullptr, int alignment = 0, + const char *overload_suffix = ""); + + virtual bool call_vector_predication_comparison(const std::string &name, const Type &result_type, + llvm::Value *mask, // Pass nullptr for constrant true. + llvm::Value *a, llvm::Value *b, const char *cmp_op); + + /** Controls use of vector predicated intrinsics for vector operations. */ + bool use_llvm_vp_intrinsics; + private: /** All the values in scope at the current code location during * codegen. Use sym_push and sym_pop to access. */ @@ -598,9 +635,9 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv, std::vector &arg_is_buffer); - llvm::Value *codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, - const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, - llvm::Value *vpred = nullptr, bool slice_to_native = true); + llvm::Value *codegen_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, + llvm::Value *vpred = nullptr, bool slice_to_native = true, llvm::Value *stride = nullptr); llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true); virtual void codegen_predicated_load(const Load *op); diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index 434105724c3a..c4ccf3e60850 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -25,10 +25,13 @@ class CodeGen_RISCV : public CodeGen_Posix { string mabi() const override; bool use_soft_float_abi() const override; int native_vector_bits() const override; + int maximum_vector_bits() const override; + int target_vscale() const override; }; 
CodeGen_RISCV::CodeGen_RISCV(const Target &t) : CodeGen_Posix(t) { + use_llvm_vp_intrinsics = true; } string CodeGen_RISCV::mcpu_target() const { @@ -50,7 +53,7 @@ string CodeGen_RISCV::mattrs() const { string arch_flags = "+m,+a,+f,+d,+c"; if (target.has_feature(Target::RVV)) { - arch_flags += ",+experimental-v"; + arch_flags += ",+v"; } return arch_flags; } @@ -73,7 +76,24 @@ bool CodeGen_RISCV::use_soft_float_abi() const { } int CodeGen_RISCV::native_vector_bits() const { - return 128; + if (target.vector_bits != 0 && + target.has_feature(Target::RVV)) { + return target.vector_bits; + } + return 0; +} + +int CodeGen_RISCV::maximum_vector_bits() const { + return native_vector_bits() * 8; +} + +int CodeGen_RISCV::target_vscale() const { + if (target.vector_bits != 0 && + target.has_feature(Target::RVV)) { + internal_assert((target.vector_bits % 64) == 0); + return target.vector_bits / 64; + } + return 0; } } // namespace From 541833cfa6e089140d3772c7f595bf387e11be7a Mon Sep 17 00:00:00 2001 From: Z Stern Date: Thu, 22 Sep 2022 05:09:56 +0000 Subject: [PATCH 02/16] Add vector predicated store support. --- src/CodeGen_LLVM.cpp | 107 ++++++++++++++++++++++++++++++++----------- src/CodeGen_LLVM.h | 6 ++- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 56aa4c573a27..f0ebd8263248 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1930,7 +1930,13 @@ Value *CodeGen_LLVM::codegen_buffer_pointer(Value *base_address, Halide::Type ty // Promote index to 64-bit on targets that use 64-bit pointers. 
llvm::DataLayout d(module.get()); if (d.getPointerSize() == 8) { - index = builder->CreateIntCast(index, i64_t, true); + llvm::Type *index_type = index->getType(); + llvm::Type *desired_index_type = i64_t; + if (isa(index_type)) { + desired_index_type = VectorType::get(desired_index_type, + dyn_cast(index_type)->getElementCount()); + } + index = builder->CreateIntCast(index, desired_index_type, true); } return CreateInBoundsGEP(builder, load_type, base_address, index); @@ -2336,7 +2342,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *slice_mask = slice_vector(vpred, i, slice_lanes); Instruction *store; if (call_vector_predication_intrinsic("store", value_type.with_lanes(slice_lanes), slice_mask, slice_val, - vec_ptr, nullptr, alignment, ".p0")) { + vec_ptr, nullptr, alignment, ".p0", true)) { store = dyn_cast(value); } else { store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); @@ -3774,7 +3780,11 @@ void CodeGen_LLVM::visit(const Store *op) { } else { int alignment = value_type.bytes(); const Ramp *ramp = op->index.as(); - if (ramp && is_const_one(ramp->stride)) { + // TODO(zalman): consider splitting out vector predication path. Current + // code shows how vector predication would simplify things as the + // following scalarization cases would go away. + bool is_dense = ramp && is_const_one(ramp->stride); + if (use_llvm_vp_intrinsics || is_dense) { int native_bits = native_vector_bits(); int native_bytes = native_bits / 8; @@ -3802,16 +3812,41 @@ void CodeGen_LLVM::visit(const Store *op) { int store_lanes = value_type.lanes(); int native_lanes = maximum_vector_bits() / value_type.bits(); + Expr base = (ramp != nullptr) ? ramp->base : 0; + Expr stride = (ramp != nullptr) ? ramp->stride : 0; + Value *stride_val = (!is_dense && ramp != nullptr) ? codegen(stride) : nullptr; + + Value *index = (ramp == nullptr) ? 
codegen(op->index) : nullptr; + for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); - Expr slice_base = simplify(ramp->base + i); + Expr slice_base = simplify(base + i * stride); Expr slice_stride = make_one(slice_base.type()); Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes); Value *slice_val = slice_vector(val, i, slice_lanes); Value *elt_ptr = codegen_buffer_pointer(op->name, value_type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); - StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); - annotate_store(store, slice_index); + if (is_dense || slice_lanes == 1) { + if (call_vector_predication_intrinsic("store", value_type.with_lanes(slice_lanes), nullptr, slice_val, + vec_ptr, nullptr, alignment, ".p0", true)) { + add_tbaa_metadata(dyn_cast(value), op->name, slice_index); + } else { + StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); + annotate_store(store, slice_index); + } + } else if (ramp != nullptr) { + bool generated = call_vector_predication_intrinsic("strided.store", value_type.with_lanes(slice_lanes), nullptr, slice_val, + vec_ptr, stride_val, alignment, ".i64", true); + internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; + add_tbaa_metadata(dyn_cast(value), op->name, slice_index); + } else { + Value *slice_index = slice_vector(index, i, slice_lanes); + Value *vec_ptrs = codegen_buffer_pointer(op->name, value_type, slice_index); + bool generated = call_vector_predication_intrinsic("scatter", value_type.with_lanes(slice_lanes), nullptr, slice_val, + vec_ptrs, nullptr, alignment, mangle_llvm_vector_type(vec_ptrs->getType()), true); + + internal_assert(generated) << "Using vector predicated intrinsics, but code generation 
was not successful for gathering store.\n"; + } } } else if (ramp) { Type ptr_type = value_type.element_of(); @@ -4598,7 +4633,9 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::Type *intrinsic_result_type = result_type->getScalarType(); if (intrin_lanes > 1) { - if (scalable_vector_result && effective_vscale != 0) { + if (result_type == void_t) { + intrinsic_result_type = void_t; + } else if (scalable_vector_result && effective_vscale != 0) { intrinsic_result_type = get_vector_type(result_type->getScalarType(), intrin_lanes / effective_vscale, VectorTypeConstraint::VScale); } else { @@ -4618,7 +4655,9 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::Function *intrin, vector arg_values) { internal_assert(intrin); int arg_lanes = 1; - if (result_type->isVectorTy()) { + if (result_type == void_t) { + arg_lanes = intrin_lanes; + } else if (result_type->isVectorTy()) { arg_lanes = get_vector_num_elements(result_type); } @@ -4657,7 +4696,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes } } - llvm::Type *result_slice_type = + llvm::Type *result_slice_type = (result_type == void_t) ? void_t : get_vector_type(result_type->getScalarType(), intrin_lanes); results.push_back(call_intrin(result_slice_type, intrin_lanes, intrin, args)); @@ -5048,35 +5087,41 @@ llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, return ConstantVector::getSplat(ec, value); } -bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, const Type &result_type, - llvm::Value *mask, // Pass nullptr for constrant true. 
- llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, - const char *overload_suffix) { - if (!use_llvm_vp_intrinsics || - result_type.is_scalar()) { - return false; - } - - llvm::Type *llvm_result_type = llvm_type_of(result_type); - int32_t length = result_type.lanes(); - const char *type_designator = result_type.is_float() ? "f" : "i"; +std::string CodeGen_LLVM::mangle_llvm_vector_type(llvm::Type *type) { std::string type_string = "."; - bool is_scalable = isa(llvm_result_type); + bool is_scalable = isa(type); llvm::ElementCount llvm_vector_ec; if (is_scalable) { - const auto *vt = cast(llvm_result_type); + const auto *vt = cast(type); + const char *type_designator = vt->getElementType()->isIntegerTy() ? "i" : "f"; std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); llvm_vector_ec = vt->getElementCount(); type_string = ".nxv" + std::to_string(vt->getMinNumElements()) + type_designator + bits_designator; } else { - const auto *vt = cast(llvm_result_type); + const auto *vt = cast(type); + const char *type_designator = vt->getElementType()->isIntegerTy() ? "i" : "f"; std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); llvm_vector_ec = vt->getElementCount(); type_string = ".v" + std::to_string(vt->getNumElements()) + type_designator + bits_designator; } + return type_string; +} + +bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, const Type &result_type, + llvm::Value *mask, // Pass nullptr for constrant true. + llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, + const std::string &overload_suffix, + bool void_return) { + if (!use_llvm_vp_intrinsics || + result_type.is_scalar()) { + return false; + } + + llvm::Type *llvm_result_type = llvm_type_of(result_type); + int32_t length = result_type.lanes(); const char *name_base = (starts_with(name, "strided")) ? "llvm.experimental.vp." 
: "llvm.vp."; - std::string full_name = name_base + name + type_string + overload_suffix; + std::string full_name = name_base + name + mangle_llvm_vector_type(llvm_result_type) + overload_suffix; int arg_count = 3 + (b != nullptr) + (c != nullptr); std::vector args(arg_count); size_t i = 0; @@ -5097,14 +5142,24 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co if (c != nullptr) { args[i++] = c; } + bool is_scalable = isa(llvm_result_type); if (mask == nullptr) { + llvm::ElementCount llvm_vector_ec; + if (is_scalable) { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } else { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } + args[i++] = ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1)); } else { args[i++] = mask; } args[i++] = ConstantInt::get(i32_t, length); - value = call_intrin(llvm_result_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable); + value = call_intrin(void_return ? void_t : llvm_result_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable); if (alignment != 0 && ptr_index != -1 && isa(value)) { llvm::CallInst *call = dyn_cast(value); call->addParamAttr(ptr_index, Attribute::getWithAlignment(*context, llvm::Align(alignment))); diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index c2f5d5d363eb..5ca5e985f39c 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -307,6 +307,9 @@ class CodeGen_LLVM : public IRVisitor { llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, llvm::Value *index); // @} + /** Return an appropriate type string for a type which is of VectorType. 
*/ + std::string mangle_llvm_vector_type(llvm::Type *type); + /** Turn a Halide Type into an llvm::Value representing a constant halide_type_t */ llvm::Value *make_halide_type_t(const Type &); @@ -585,7 +588,8 @@ class CodeGen_LLVM : public IRVisitor { virtual bool call_vector_predication_intrinsic(const std::string &name, const Type &result_type, llvm::Value *mask, llvm::Value *a, llvm::Value *b = nullptr, llvm::Value *c = nullptr, int alignment = 0, - const char *overload_suffix = ""); + const std::string &overload_suffix = "", + bool void_return = false); virtual bool call_vector_predication_comparison(const std::string &name, const Type &result_type, llvm::Value *mask, // Pass nullptr for constrant true. From cab2f01dee84dcef43682fad9a113de2c5fe84eb Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 23 Sep 2022 23:40:59 +0000 Subject: [PATCH 03/16] Change how void type is handled with call_intrin, other vector promotion contexts. --- src/CodeGen_LLVM.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a4b793dc82ad..a6b2869955a4 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2451,7 +2451,7 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri Value *slice_mask = (vpred != nullptr) ? 
slice_vector(vpred, i, slice_lanes) : nullptr; - Instruction *load_inst; + Instruction *load_inst = nullptr; if (stride) { if (call_vector_predication_intrinsic("strided.load", type.with_lanes(slice_lanes), slice_mask, vec_ptr, stride, nullptr, align_bytes, ".i64")) { @@ -4642,9 +4642,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::Type *intrinsic_result_type = result_type->getScalarType(); if (intrin_lanes > 1) { - if (result_type == void_t) { - intrinsic_result_type = void_t; - } else if (scalable_vector_result && effective_vscale != 0) { + if (scalable_vector_result && effective_vscale != 0) { intrinsic_result_type = get_vector_type(result_type->getScalarType(), intrin_lanes / effective_vscale, VectorTypeConstraint::VScale); } else { @@ -4705,7 +4703,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes } } - llvm::Type *result_slice_type = (result_type == void_t) ? void_t : + llvm::Type *result_slice_type = get_vector_type(result_type->getScalarType(), intrin_lanes); results.push_back(call_intrin(result_slice_type, intrin_lanes, intrin, args)); @@ -5053,6 +5051,10 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n, VectorTypeConstraint type_constraint) const { bool scalable; + if (t->isVoidTy()) { + return t; + } + switch (type_constraint) { case VectorTypeConstraint::None: scalable = effective_vscale != 0 && From a6a0ba90b7328377e7ecca71a3ed00c5c9181130 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Wed, 5 Oct 2022 22:55:02 +0000 Subject: [PATCH 04/16] Fix a few issues with types, order of arguments and name mangling in vector predication intrinsics. 
--- src/CodeGen_LLVM.cpp | 50 +++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 8617c4660d1e..9bb230d1ef65 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1863,9 +1863,15 @@ void CodeGen_LLVM::visit(const Not *op) { void CodeGen_LLVM::visit(const Select *op) { Value *cmp = codegen(op->condition); + if (use_llvm_vp_intrinsics && + op->type.is_vector() && + op->condition.type().is_scalar()) { + cmp = create_broadcast(cmp, op->type.lanes()); + } + Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); - if (!call_vector_predication_intrinsic("select", op->type, nullptr, a, b, cmp)) { + if (!call_vector_predication_intrinsic("select", op->type, nullptr, cmp, a, b)) { value = builder->CreateSelect(cmp, a, b); } } @@ -2453,8 +2459,13 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri Instruction *load_inst = nullptr; if (stride) { + if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) { + stride = builder->CreateIntCast(stride, i64_t, true); + } + const char *mangle = stride->getType()->isIntegerTy(64) ? ".p0.i64" : ".p0.i32"; + if (call_vector_predication_intrinsic("strided.load", type.with_lanes(slice_lanes), slice_mask, - vec_ptr, stride, nullptr, align_bytes, ".i64")) { + vec_ptr, stride, nullptr, align_bytes, mangle)) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; @@ -3846,8 +3857,12 @@ void CodeGen_LLVM::visit(const Store *op) { annotate_store(store, slice_index); } } else if (ramp != nullptr) { + if (get_target().bits == 64 && !stride_val->getType()->isIntegerTy(64)) { + stride_val = builder->CreateIntCast(stride_val, i64_t, true); + } + const char *mangle = stride_val->getType()->isIntegerTy(64) ? 
".p0.i64" : ".p0.i32"; bool generated = call_vector_predication_intrinsic("strided.store", value_type.with_lanes(slice_lanes), nullptr, slice_val, - vec_ptr, stride_val, alignment, ".i64", true); + vec_ptr, stride_val, alignment, mangle, true); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { @@ -5133,9 +5148,12 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co llvm::Type *llvm_result_type = llvm_type_of(result_type); int32_t length = result_type.lanes(); + // TODO(zvookin): Fix the interface here to not case on names. + bool no_mask = name == "select"; const char *name_base = (starts_with(name, "strided")) ? "llvm.experimental.vp." : "llvm.vp."; + std::string full_name = name_base + name + mangle_llvm_vector_type(llvm_result_type) + overload_suffix; - int arg_count = 3 + (b != nullptr) + (c != nullptr); + int arg_count = 2 + !no_mask + (b != nullptr) + (c != nullptr); std::vector args(arg_count); size_t i = 0; @@ -5156,19 +5174,21 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co args[i++] = c; } bool is_scalable = isa(llvm_result_type); - if (mask == nullptr) { - llvm::ElementCount llvm_vector_ec; - if (is_scalable) { - const auto *vt = cast(llvm_result_type); - llvm_vector_ec = vt->getElementCount(); + if (!no_mask) { + if (mask == nullptr) { + llvm::ElementCount llvm_vector_ec; + if (is_scalable) { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } else { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } + + args[i++] = ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1)); } else { - const auto *vt = cast(llvm_result_type); - llvm_vector_ec = vt->getElementCount(); + args[i++] = mask; } - - args[i++] = ConstantVector::getSplat(llvm_vector_ec, 
ConstantInt::get(i1_t, 1)); - } else { - args[i++] = mask; } args[i++] = ConstantInt::get(i32_t, length); From ab2a68f1c7cd2bc5f66c4ead0e09e0f55b74250e Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 7 Oct 2022 01:36:59 +0000 Subject: [PATCH 05/16] Add support for using @llvm.vp.reduce.* intrinsics in vector reductions. --- src/CodeGen_LLVM.cpp | 145 ++++++++++++++++++++++++++++--------------- src/CodeGen_LLVM.h | 7 ++- 2 files changed, 98 insertions(+), 54 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 9bb230d1ef65..57dadf2be3ba 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4253,45 +4253,53 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini op->op == VectorReduce::Mul || op->op == VectorReduce::Min || op->op == VectorReduce::Max) && - // Must be a power of two lanes - (input_lanes >= 2) && - ((input_lanes & (input_lanes - 1)) == 0) && - // int versions exist up to 1024 bits - ((!op->type.is_float() && input_bytes <= 1024) || - // float versions exist up to 16 lanes - input_lanes <= 16) && - // As of the release of llvm 10, the 64-bit experimental total - // reductions don't seem to be done yet on arm. - (val.type().bits() != 64 || - target.arch != Target::ARM)); + (use_llvm_vp_intrinsics || + // Must be a power of two lanes + ((input_lanes >= 2) && + ((input_lanes & (input_lanes - 1)) == 0) && + // int versions exist up to 1024 bits + ((!op->type.is_float() && input_bytes <= 1024) || + // float versions exist up to 16 lanes + input_lanes <= 16) && + // As of the release of llvm 10, the 64-bit experimental total + // reductions don't seem to be done yet on arm. 
+ (val.type().bits() != 64 || + target.arch != Target::ARM)))); if (llvm_has_intrinsic) { - std::stringstream name; - name << "llvm.vector.reduce."; + const char *name = ""; const int bits = op->type.bits(); - bool takes_initial_value = false; + bool takes_initial_value = use_llvm_vp_intrinsics; Expr initial_value = init; if (op->type.is_float()) { switch (op->op) { case VectorReduce::Add: - name << "fadd"; + name = "fadd"; takes_initial_value = true; if (!initial_value.defined()) { initial_value = make_zero(op->type); } break; case VectorReduce::Mul: - name << "fmul"; + name = "fmul"; takes_initial_value = true; if (!initial_value.defined()) { initial_value = make_one(op->type); } break; case VectorReduce::Min: - name << "fmin"; + name = "fmin"; + // TODO(zvookin): For signed case, whether this is Inf or the max floating-point value depends on strict_float. (Or maybe it is QNaN in strict_float.) + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.max(); + } break; case VectorReduce::Max: - name << "fmax"; + name = "fmax"; + // TODO(zvookin): For signed case, whether this is -Inf or the min floating-point value depends on strict_float. (Or maybe it is -QNaN in strict_float.) + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.min(); + } break; default: break; @@ -4299,55 +4307,82 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } else if (op->type.is_int() || op->type.is_uint()) { switch (op->op) { case VectorReduce::Add: - name << "add"; + name = "add"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = make_zero(op->type); + } break; case VectorReduce::Mul: - name << "mul"; + name = "mul"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = make_one(op->type); + } break; case VectorReduce::Min: - name << (op->type.is_int() ? 's' : 'u') << "min"; + name = op->type.is_int() ? 
"smin" : "umin"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.max(); + } break; case VectorReduce::Max: - name << (op->type.is_int() ? 's' : 'u') << "max"; + name = op->type.is_int() ? "smax" : "umax"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.min(); + } break; default: break; } } - name << ".v" << val.type().lanes() << (op->type.is_float() ? 'f' : 'i') << bits; - string intrin_name = name.str(); + if (use_llvm_vp_intrinsics) { + string vp_name = "reduce."; + vp_name += name; + codegen(initial_value); + llvm::Value *init = value; + codegen(op->value); + llvm::Value *val = value; + bool generated = call_vector_predication_intrinsic(vp_name, op->value.type(), nullptr, init, val, nullptr, 0, "", false, true); + internal_assert(generated) << "Vector predication intrinsic generation failed for vector reduction " << name << "\n"; + } else { + std::stringstream build_name; + build_name << "llvm.vector.reduce."; + build_name << name; + build_name << ".v" << val.type().lanes() << (op->type.is_float() ? 'f' : 'i') << bits; - vector args; - if (takes_initial_value) { - args.push_back(initial_value); - initial_value = Expr(); - } - args.push_back(op->value); + string intrin_name = build_name.str(); - // Make sure the declaration exists, or the codegen for - // call will assume that the args should scalarize. - if (!module->getFunction(intrin_name)) { - vector arg_types; - for (const Expr &e : args) { - arg_types.push_back(llvm_type_of(e.type())); + vector args; + if (takes_initial_value) { + args.push_back(initial_value); + initial_value = Expr(); + } + args.push_back(op->value); + + // Make sure the declaration exists, or the codegen for + // call will assume that the args should scalarize. 
+ if (!module->getFunction(intrin_name)) { + vector arg_types; + for (const Expr &e : args) { + arg_types.push_back(llvm_type_of(e.type())); + } + FunctionType *func_t = FunctionType::get(llvm_type_of(op->type), arg_types, false); + llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin_name, module.get()); } - FunctionType *func_t = FunctionType::get(llvm_type_of(op->type), arg_types, false); - llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin_name, module.get()); - } - Expr equiv = Call::make(op->type, intrin_name, args, Call::PureExtern); - if (initial_value.defined()) { - equiv = binop(initial_value, equiv); + Expr equiv = Call::make(op->type, intrin_name, args, Call::PureExtern); + if (initial_value.defined()) { + equiv = binop(initial_value, equiv); + } + equiv.accept(this); } - equiv.accept(this); return; } } if (output_lanes == 1 && factor > native_lanes && - factor % native_lanes == 0) { + (use_llvm_vp_intrinsics || (factor % native_lanes == 0))) { // It's a total reduction of multiple native // vectors. Start by adding the vectors together. 
Expr equiv; @@ -4647,9 +4682,10 @@ Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes, intrin, arg_values); } + Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, const string &name, vector arg_values, - bool scalable_vector_result) { + bool scalable_vector_result, bool is_reduction) { llvm::Function *fn = module->getFunction(name); if (!fn) { vector arg_types(arg_values.size()); @@ -4658,7 +4694,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes } llvm::Type *intrinsic_result_type = result_type->getScalarType(); - if (intrin_lanes > 1) { + if (intrin_lanes > 1 && !is_reduction) { if (scalable_vector_result && effective_vscale != 0) { intrinsic_result_type = get_vector_type(result_type->getScalarType(), intrin_lanes / effective_vscale, VectorTypeConstraint::VScale); @@ -4672,11 +4708,12 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes fn->setCallingConv(CallingConv::C); } - return call_intrin(result_type, intrin_lanes, fn, arg_values); + return call_intrin(result_type, intrin_lanes, fn, arg_values, is_reduction); } Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, - llvm::Function *intrin, vector arg_values) { + llvm::Function *intrin, vector arg_values, + bool is_reduction) { internal_assert(intrin); int arg_lanes = 1; if (result_type->isVoidTy()) { @@ -4685,7 +4722,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes arg_lanes = get_vector_num_elements(result_type); } - if (intrin_lanes != arg_lanes) { + if (!is_reduction && intrin_lanes != arg_lanes) { // Cut up each arg into appropriately-sized pieces, call the // intrinsic on each, then splice together the results. vector results; @@ -5139,7 +5176,7 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co llvm::Value *mask, // Pass nullptr for constrant true. 
llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, const std::string &overload_suffix, - bool void_return) { + bool void_return, bool is_reduction) { if (!use_llvm_vp_intrinsics || result_type.is_scalar()) { return false; @@ -5192,7 +5229,13 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co } args[i++] = ConstantInt::get(i32_t, length); - value = call_intrin(void_return ? void_t : llvm_result_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable); + llvm::Type *llvm_return_type = llvm_result_type; + if (void_return) { + llvm_return_type = void_t; + } else if (is_reduction) { + llvm_return_type = llvm_result_type->getScalarType(); + } + value = call_intrin(llvm_return_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable, is_reduction); if (alignment != 0 && ptr_index != -1 && isa(value)) { llvm::CallInst *call = dyn_cast(value); call->addParamAttr(ptr_index, Attribute::getWithAlignment(*context, llvm::Align(alignment))); diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 5ca5e985f39c..f5e0d4ed41a3 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -477,9 +477,10 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *intrin, std::vector); llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes, const std::string &name, std::vector, - bool scalable_vector_result = false); + bool scalable_vector_result = false, bool is_reduction = false); llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes, - llvm::Function *intrin, std::vector); + llvm::Function *intrin, std::vector, + bool is_reduction = false); // @} /** Take a slice of lanes out of an llvm vector. 
Pads with undefs @@ -589,7 +590,7 @@ class CodeGen_LLVM : public IRVisitor { llvm::Value *mask, llvm::Value *a, llvm::Value *b = nullptr, llvm::Value *c = nullptr, int alignment = 0, const std::string &overload_suffix = "", - bool void_return = false); + bool void_return = false, bool is_reduction = false); virtual bool call_vector_predication_comparison(const std::string &name, const Type &result_type, llvm::Value *mask, // Pass nullptr for constrant true. From af55a2ee032b30f755ad10a79adb6939fe6b4f13 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 00:45:30 +0000 Subject: [PATCH 06/16] Small refactor to clean up vector predication support. Mainly improving the calling convention and naming of the new routines to generate the intrinsics. --- src/CodeGen_LLVM.cpp | 233 +++++++++++++++++++++++-------------------- src/CodeGen_LLVM.h | 71 ++++++++----- 2 files changed, 170 insertions(+), 134 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index f5ac01cc6fd5..20a36be26c9f 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1511,16 +1511,20 @@ void CodeGen_LLVM::visit(const Add *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - if (!call_vector_predication_intrinsic("fadd", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.fadd", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateFAdd(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): Figure out if vector predication needs to/can work here. + // TODO(zalman): This needs vector predication, but I can't + // see a way to do it. May go away in introducing correct + // index type instead of using int32_t. 
value = builder->CreateNSWAdd(a, b); } else { - if (!call_vector_predication_intrinsic("add", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.add", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateAdd(a, b); } } @@ -1536,16 +1540,20 @@ void CodeGen_LLVM::visit(const Sub *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - if (!call_vector_predication_intrinsic("fsub", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.fsub", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateFSub(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): Figure out if vector predication needs to/can work here. + // TODO(zalman): This needs vector predication, but I can't + // see a way to do it. May go away in introducing correct + // index type instead of using int32_t. value = builder->CreateNSWSub(a, b); } else { - if (!call_vector_predication_intrinsic("sub", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.sub", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateSub(a, b); } } @@ -1565,16 +1573,20 @@ void CodeGen_LLVM::visit(const Mul *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - if (!call_vector_predication_intrinsic("fmul", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.fmul", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateFMul(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): Figure out if vector predication needs to/can work here. 
+ // TODO(zalman): This needs vector predication, but I can't + // see a way to do it. May go away in introducing correct + // index type instead of using int32_t. value = builder->CreateNSWMul(a, b); } else { - if (!call_vector_predication_intrinsic("mul", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.mul", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateMul(a, b); } } @@ -1596,7 +1608,8 @@ void CodeGen_LLVM::visit(const Div *op) { // output hard. Value *a = codegen(op->a); Value *b = codegen(op->b); - if (!call_vector_predication_intrinsic("fdiv", t, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.fdiv", llvm_type_of(t), t.lanes(), AllEnabledMask(), + { VPArg(a, 0), VPArg(b) })) { value = builder->CreateFDiv(a, b); } } else { @@ -1668,11 +1681,11 @@ void CodeGen_LLVM::visit(const EQ *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "oeq")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "oeq")) { value = builder->CreateFCmpOEQ(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "eq")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "eq")) { value = builder->CreateICmpEQ(a, b); } } @@ -1688,11 +1701,11 @@ void CodeGen_LLVM::visit(const NE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "one")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "one")) { value = builder->CreateFCmpONE(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ne")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ne")) { value = builder->CreateICmpNE(a, b); } } @@ -1708,15 +1721,15 @@ 
void CodeGen_LLVM::visit(const LT *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "olt")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "olt")) { value = builder->CreateFCmpOLT(a, b); } } else if (t.is_int()) { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "slt")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "slt")) { value = builder->CreateICmpSLT(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ult")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ult")) { value = builder->CreateICmpULT(a, b); } } @@ -1732,15 +1745,15 @@ void CodeGen_LLVM::visit(const LE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "ole")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "ole")) { value = builder->CreateFCmpOLE(a, b); } } else if (t.is_int()) { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sle")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "sle")) { value = builder->CreateICmpSLE(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ule")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ule")) { value = builder->CreateICmpULE(a, b); } } @@ -1757,15 +1770,15 @@ void CodeGen_LLVM::visit(const GT *op) { Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "ogt")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "ogt")) { value = builder->CreateFCmpOGT(a, b); } } else if (t.is_int()) { - if (!call_vector_predication_comparison("icmp", t, 
nullptr, a, b, "sgt")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "sgt")) { value = builder->CreateICmpSGT(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "ugt")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ugt")) { value = builder->CreateICmpUGT(a, b); } } @@ -1781,15 +1794,15 @@ void CodeGen_LLVM::visit(const GE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - if (!call_vector_predication_comparison("fcmp", t, nullptr, a, b, "oge")) { + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "oge")) { value = builder->CreateFCmpOGE(a, b); } } else if (t.is_int()) { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "sge")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "sge")) { value = builder->CreateICmpSGE(a, b); } } else { - if (!call_vector_predication_comparison("icmp", t, nullptr, a, b, "uge")) { + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "uge")) { value = builder->CreateICmpUGE(a, b); } } @@ -1802,7 +1815,8 @@ void CodeGen_LLVM::visit(const And *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - if (!call_vector_predication_intrinsic("and", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateAnd(a, b); } } @@ -1814,14 +1828,16 @@ void CodeGen_LLVM::visit(const Or *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - if (!call_vector_predication_intrinsic("or", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateOr(a, b); } } void CodeGen_LLVM::visit(const Not *op) { Value *a 
= codegen(op->a); - if (!call_vector_predication_intrinsic("not", op->type, nullptr, a)) { + if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0) })) { value = builder->CreateNot(a); } } @@ -1836,7 +1852,8 @@ void CodeGen_LLVM::visit(const Select *op) { Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); - if (!call_vector_predication_intrinsic("select", op->type, nullptr, cmp, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.select", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(cmp), VPArg(a, 0), VPArg(b) })) { value = builder->CreateSelect(cmp, a, b); } } @@ -2321,8 +2338,8 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *slice_mask = slice_vector(vpred, i, slice_lanes); Instruction *store; - if (call_vector_predication_intrinsic("store", value_type.with_lanes(slice_lanes), slice_mask, slice_val, - vec_ptr, nullptr, alignment, ".p0", true)) { + if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, slice_mask, + { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment) })) { store = dyn_cast(value); } else { store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); @@ -2427,17 +2444,15 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) { stride = builder->CreateIntCast(stride, i64_t, true); } - const char *mangle = stride->getType()->isIntegerTy(64) ? 
".p0.i64" : ".p0.i32"; - - if (call_vector_predication_intrinsic("strided.load", type.with_lanes(slice_lanes), slice_mask, - vec_ptr, stride, nullptr, align_bytes, mangle)) { + if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slide_type, slice_lanes, slice_mask, + { VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1) })) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; } } else { - if (call_vector_predication_intrinsic("load", type.with_lanes(slice_lanes), slice_mask, - vec_ptr, nullptr, nullptr, align_bytes, ".p0")) { + if (try_vector_predication_intrinsic("llvm.vp.load", slice_type, slice_lanes, slice_mask, + { VPArg(vec_ptr, 0, align_bytes) })) { load_inst = dyn_cast(value); } else { if (slice_mask != nullptr) { @@ -2670,27 +2685,31 @@ void CodeGen_LLVM::visit(const Call *op) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - if (!call_vector_predication_intrinsic("and", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateAnd(a, b); } } else if (op->is_intrinsic(Call::bitwise_xor)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - if (!call_vector_predication_intrinsic("xor", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.xor", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateXor(a, b); } } else if (op->is_intrinsic(Call::bitwise_or)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - if (!call_vector_predication_intrinsic("or", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), 
op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateOr(a, b); } } else if (op->is_intrinsic(Call::bitwise_not)) { internal_assert(op->args.size() == 1); Value *a = codegen(op->args[0]); - if (!call_vector_predication_intrinsic("not", op->type, nullptr, a)) { + if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0) })) { value = builder->CreateNot(a); } } else if (op->is_intrinsic(Call::shift_left)) { @@ -2698,7 +2717,8 @@ void CodeGen_LLVM::visit(const Call *op) { if (op->args[1].type().is_uint()) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - if (!call_vector_predication_intrinsic("shl", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.shl", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateShl(a, b); } } else { @@ -2710,11 +2730,13 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (op->type.is_int()) { - if (!call_vector_predication_intrinsic("ashr", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.ashr", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateAShr(a, b); } } else { - if (!call_vector_predication_intrinsic("lshr", op->type, nullptr, a, b)) { + if (!try_vector_predication_intrinsic("llvm.vp.lshr", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { value = builder->CreateLShr(a, b); } } @@ -3814,8 +3836,8 @@ void CodeGen_LLVM::visit(const Store *op) { Value *elt_ptr = codegen_buffer_pointer(op->name, value_type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); if (is_dense || slice_lanes == 1) { - if (call_vector_predication_intrinsic("store", 
value_type.with_lanes(slice_lanes), nullptr, slice_val, - vec_ptr, nullptr, alignment, ".p0", true)) { + if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, AllEnabledMask(), + { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment) })) { add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); @@ -3825,17 +3847,15 @@ void CodeGen_LLVM::visit(const Store *op) { if (get_target().bits == 64 && !stride_val->getType()->isIntegerTy(64)) { stride_val = builder->CreateIntCast(stride_val, i64_t, true); } - const char *mangle = stride_val->getType()->isIntegerTy(64) ? ".p0.i64" : ".p0.i32"; - bool generated = call_vector_predication_intrinsic("strided.store", value_type.with_lanes(slice_lanes), nullptr, slice_val, - vec_ptr, stride_val, alignment, mangle, true); + bool generated = try_vector_predication_intrinsic("llvm.experimental.vp.strided.store", void_t, slice_lanes, AllEnabledMask(), + { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2) }); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { Value *slice_index = slice_vector(index, i, slice_lanes); Value *vec_ptrs = codegen_buffer_pointer(op->name, value_type, slice_index); - bool generated = call_vector_predication_intrinsic("scatter", value_type.with_lanes(slice_lanes), nullptr, slice_val, - vec_ptrs, nullptr, alignment, mangle_llvm_vector_type(vec_ptrs->getType()), true); - + bool generated = try_vector_predication_intrinsic("llvm.vp.scatter", void_t, slice_lanes, AllEnabledMask(), + { VPArg(slice_val, 0), VPArg(vec_ptrs, 1, alignment) }); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for gathering store.\n"; } } @@ -4301,13 +4321,14 @@ void 
CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } if (use_llvm_vp_intrinsics) { - string vp_name = "reduce."; + string vp_name = "llvm.vp.reduce."; vp_name += name; codegen(initial_value); llvm::Value *init = value; codegen(op->value); llvm::Value *val = value; - bool generated = call_vector_predication_intrinsic(vp_name, op->value.type(), nullptr, init, val, nullptr, 0, "", false, true); + bool generated = try_vector_predication_intrinsic(vp_name, llvm_type_of(op->value.type()), op->value.type().lanes(), + AllEnabledMask(), { VPArg(init), VPArg(val, 0) }); internal_assert(generated) << "Vector predication intrinsic generation failed for vector reduction " << name << "\n"; } else { std::stringstream build_name; @@ -5137,47 +5158,49 @@ std::string CodeGen_LLVM::mangle_llvm_vector_type(llvm::Type *type) { return type_string; } -bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, const Type &result_type, - llvm::Value *mask, // Pass nullptr for constrant true. - llvm::Value *a, llvm::Value *b, llvm::Value *c, int alignment, - const std::string &overload_suffix, - bool void_return, bool is_reduction) { - if (!use_llvm_vp_intrinsics || - result_type.is_scalar()) { +bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llvm::Type *llvm_result_type, + int32_t length, MaskVariant mask, std::vector vp_args) { + if (!use_llvm_vp_intrinsics) { return false; } - llvm::Type *llvm_result_type = llvm_type_of(result_type); - int32_t length = result_type.lanes(); - - // TODO(zvookin): Fix the interface here to not case on names. - bool no_mask = name == "select"; - const char *name_base = (starts_with(name, "strided")) ? "llvm.experimental.vp." 
: "llvm.vp."; - - std::string full_name = name_base + name + mangle_llvm_vector_type(llvm_result_type) + overload_suffix; - int arg_count = 2 + !no_mask + (b != nullptr) + (c != nullptr); - std::vector args(arg_count); - size_t i = 0; - - int ptr_index = -1; - if (isa(a->getType())) { - ptr_index = 0; + bool any_scalable = isa(llvm_result_type); + bool any_fixed = isa(llvm_result_type); + bool is_reduction = !any_scalable && !any_fixed; + for (const VPArg &arg : vp_args) { + any_scalable |= isa(arg.value->getType()); + any_fixed |= isa(arg.value->getType()); + } + if (!any_fixed && !any_scalable) { + return false; } - args[i++] = a; - if (b != nullptr) { - args[i++] = b; - if (isa(b->getType())) { - if (ptr_index != -1) { - ptr_index = 1; + internal_assert(!(any_scalable && any_fixed)) << "Cannot combine fixed and scalable vectors to vector predication intrinsic.\n"; + + bool is_scalable = any_scalable; + + std::vector args; + args.reserve(2 + vp_args.size()); + std::vector mangled_types(args.size()); + + for (const VPArg &arg : vp_args) { + args.push_back(arg.value); + if (arg.mangle_index != -1) { + llvm::Type *llvm_type = arg.value->getType(); + if (isa(llvm_type)) { + mangled_types[arg.mangle_index] = ".p0"; + } else { + mangled_types[arg.mangle_index] = mangle_llvm_vector_type(llvm_type); } } } - if (c != nullptr) { - args[i++] = c; + + std::string full_name = name; + for (const std::string &mangle : mangled_types) { + full_name += mangle; } - bool is_scalable = isa(llvm_result_type); - if (!no_mask) { - if (mask == nullptr) { + + if (!std::holds_alternative(mask)) { + if (std::holds_alternative(mask)) { llvm::ElementCount llvm_vector_ec; if (is_scalable) { const auto *vt = cast(llvm_result_type); @@ -5186,31 +5209,26 @@ bool CodeGen_LLVM::call_vector_predication_intrinsic(const std::string &name, co const auto *vt = cast(llvm_result_type); llvm_vector_ec = vt->getElementCount(); } - - args[i++] = ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 
1)); + args.push_back(ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1))); } else { - args[i++] = mask; + args.push_back(std::get(mask)); } } - args[i++] = ConstantInt::get(i32_t, length); + args.push_back(ConstantInt::get(i32_t, length)); - llvm::Type *llvm_return_type = llvm_result_type; - if (void_return) { - llvm_return_type = void_t; - } else if (is_reduction) { - llvm_return_type = llvm_result_type->getScalarType(); - } - value = call_intrin(llvm_return_type, get_vector_num_elements(llvm_result_type), full_name, args, is_scalable, is_reduction); - if (alignment != 0 && ptr_index != -1 && isa(value)) { - llvm::CallInst *call = dyn_cast(value); - call->addParamAttr(ptr_index, Attribute::getWithAlignment(*context, llvm::Align(alignment))); + value = call_intrin(llvm_result_type, length, full_name, args, is_scalable, is_reduction); + llvm::CallInst *call = dyn_cast(value); + for (size_t i = 0; i < args.size(); i++) { + if (vp_args[i].alignment != 0) { + call->addParamAttr(i, Attribute::getWithAlignment(*context, llvm::Align(vp_args[i].alignment))); + } } return true; } - -bool CodeGen_LLVM::call_vector_predication_comparison(const std::string &name, const Type &result_type, - llvm::Value *mask, // Pass nullptr for constrant true. - llvm::Value *a, llvm::Value *b, const char *cmp_op) { + +bool CodeGen_LLVM::try_vector_predication_comparison(const std::string &name, const Type &result_type, + MaskVariant mask, llvm::Value *a, llvm::Value *b, + const char *cmp_op) { // Early out to prevent creating useless metadata. 
if (!use_llvm_vp_intrinsics || result_type.is_scalar()) { @@ -5219,7 +5237,8 @@ bool CodeGen_LLVM::call_vector_predication_comparison(const std::string &name, c llvm::MDBuilder builder(*context); llvm::Value *md_val = llvm::MetadataAsValue::get(*context, builder.createString(cmp_op)); - return call_vector_predication_intrinsic(name, result_type, mask, a, b, md_val); + return try_vector_predication_intrinsic(name, llvm_type_of(result_type), result_type.lanes(), mask, + { VPArg(a, 0), VPArg(b), VPArg(md_val) }); } } // namespace Internal diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 778f7eb6d7bf..9df2cc322b36 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -35,6 +35,7 @@ class GlobalVariable; #include #include #include +#include #include #include "IRVisitor.h" @@ -561,38 +562,54 @@ class CodeGen_LLVM : public IRVisitor { llvm::Constant *get_splat(int lanes, llvm::Constant *value, VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; - /** Call an "@llvm.vp.*" intrinsic, forming the full overloaded name and argument list. - * A key detail here is that the length of the vector operation is taken from the - * Halide type while the size of the LLVM vector type used (fixed or scalable) is taken - * from the LLVM promotion of the vector type, which should be the same as the types used - * in the arguments. These can be different. It may become useful to pass an explict - * length as well. - * - * The method is virtual to allow backends to extend this for architecture specific - * intrinsics. (E.g. RISC V LMUL.) Unfortunately, this involves matching the name as - * as string to do much. (TODO(zalman): decide if this is the right way to go based - * on LMUL experiment. Really LLVM ought to do this automatically for these intrinsics - * on larger lengths.) - * - * The name is the simple name like "add" for "@llvm.vp.add.v16i32". - * If mask is nullptr, it is provided as constant true. 
- * If b or c is nullptr, it is assumed to be a unary or binary operator respectively. - * - * Assigns result of vp intrinsic to value and returns true if it an instuction is generated, - * otherwise returns false. + /** Support for generating LLVM vector predication intrinsics + * ("@llvm.vp.*" and "@llvm.experimental.vp.*") */ - virtual bool call_vector_predication_intrinsic(const std::string &name, const Type &result_type, - llvm::Value *mask, llvm::Value *a, llvm::Value *b = nullptr, - llvm::Value *c = nullptr, int alignment = 0, - const std::string &overload_suffix = "", - bool void_return = false, bool is_reduction = false); + // @{ + /** Struct to hold descriptor for an argument to a vector + * predicated intrinsic. This includes the value, whether the + * type of the argument should be mangled into the intrisic name + * and if so, where, and the alignment for pointer arguments. */ + struct VPArg { + llvm::Value *value; + int mangle_index; + int alignment; + VPArg(llvm::Value *value, int32_t mangle_index = -1, int32_t alignment = 0) + : value(value), mangle_index(mangle_index), alignment(alignment) { + } + }; - virtual bool call_vector_predication_comparison(const std::string &name, const Type &result_type, - llvm::Value *mask, // Pass nullptr for constrant true. - llvm::Value *a, llvm::Value *b, const char *cmp_op); + /** Type indicating an intrinsic does not take a mask. */ + struct NoMask { + }; + + /** Type indicating mask to use is all true -- all lanes enabled. */ + struct AllEnabledMask { + }; + + /** Predication mask using the above two types for special cases + * and an llvm::Value for the general one. */ + typedef std::variant MaskVariant; + + /** Generate a vector predicated comparison intrinsic call if + * use_llvm_vp_intrinsics is true and result_type is a vector + * type. If generated, assigns result of vp intrinsic to value and + * returns true if it an instuction is generated, otherwise + * returns false. 
*/ + virtual bool try_vector_predication_comparison(const std::string &name, const Type &result_type, + MaskVariant mask, llvm::Value *a, llvm::Value *b, + const char *cmp_op); + + /** Generate an intrisic call if use_llvm_vp_intrinsics is true + * and length is greater than 1. If generated, assigns result + * of vp intrinsic to value and returns true if it an instuction + * is generated, otherwise returns false. */ + bool try_vector_predication_intrinsic(const std::string &name, llvm::Type *llvm_result_type, + int32_t length, MaskVariant mask, std::vector args); /** Controls use of vector predicated intrinsics for vector operations. */ bool use_llvm_vp_intrinsics; + // @} private: /** All the values in scope at the current code location during From eaa4100f004999422c8b131f20f7ba4416b0e947 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 00:52:10 +0000 Subject: [PATCH 07/16] Typo slipped in. --- src/CodeGen_LLVM.cpp | 4 ++-- src/CodeGen_LLVM.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 20a36be26c9f..2f77b30169b0 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2444,8 +2444,8 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) { stride = builder->CreateIntCast(stride, i64_t, true); } - if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slide_type, slice_lanes, slice_mask, - { VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1) })) { + if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slice_type, slice_lanes, slice_mask, + { VPArg(vec_ptr, 0, align_bytes), VPArg(tride, 1) })) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 9df2cc322b36..c1e5d46ccbf4 100644 --- 
a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -596,7 +596,7 @@ class CodeGen_LLVM : public IRVisitor { * type. If generated, assigns result of vp intrinsic to value and * returns true if it an instuction is generated, otherwise * returns false. */ - virtual bool try_vector_predication_comparison(const std::string &name, const Type &result_type, + bool try_vector_predication_comparison(const std::string &name, const Type &result_type, MaskVariant mask, llvm::Value *a, llvm::Value *b, const char *cmp_op); From 7a8201ca66a3891a5478671cd764614fafbedd31 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 01:00:33 +0000 Subject: [PATCH 08/16] This time for sure. --- src/CodeGen_LLVM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 2f77b30169b0..55a47b2959f3 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2445,7 +2445,7 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri stride = builder->CreateIntCast(stride, i64_t, true); } if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slice_type, slice_lanes, slice_mask, - { VPArg(vec_ptr, 0, align_bytes), VPArg(tride, 1) })) { + { VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1) })) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; From c0a9679cbe8bbc73d22863ba7d687e4f0b2f70e3 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 11:47:19 +0000 Subject: [PATCH 09/16] Formatting. 
--- src/CodeGen_LLVM.cpp | 64 ++++++++++++++++++++----------------------- src/CodeGen_LLVM.h | 4 +-- src/CodeGen_RISCV.cpp | 2 +- 3 files changed, 33 insertions(+), 37 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 55a47b2959f3..73ead1c95cc4 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1512,7 +1512,7 @@ void CodeGen_LLVM::visit(const Add *op) { Value *b = codegen(op->b); if (op->type.is_float()) { if (!try_vector_predication_intrinsic("llvm.vp.fadd", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateFAdd(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { @@ -1524,7 +1524,7 @@ void CodeGen_LLVM::visit(const Add *op) { value = builder->CreateNSWAdd(a, b); } else { if (!try_vector_predication_intrinsic("llvm.vp.add", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateAdd(a, b); } } @@ -1541,7 +1541,7 @@ void CodeGen_LLVM::visit(const Sub *op) { Value *b = codegen(op->b); if (op->type.is_float()) { if (!try_vector_predication_intrinsic("llvm.vp.fsub", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateFSub(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { @@ -1553,7 +1553,7 @@ void CodeGen_LLVM::visit(const Sub *op) { value = builder->CreateNSWSub(a, b); } else { if (!try_vector_predication_intrinsic("llvm.vp.sub", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateSub(a, b); } } @@ -1574,7 +1574,7 @@ void CodeGen_LLVM::visit(const Mul *op) { Value *b = codegen(op->b); if (op->type.is_float()) { if (!try_vector_predication_intrinsic("llvm.vp.fmul", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = 
builder->CreateFMul(a, b); } } else if (op->type.is_int() && op->type.bits() >= 32) { @@ -1586,7 +1586,7 @@ void CodeGen_LLVM::visit(const Mul *op) { value = builder->CreateNSWMul(a, b); } else { if (!try_vector_predication_intrinsic("llvm.vp.mul", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateMul(a, b); } } @@ -1609,7 +1609,7 @@ void CodeGen_LLVM::visit(const Div *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (!try_vector_predication_intrinsic("llvm.vp.fdiv", llvm_type_of(t), t.lanes(), AllEnabledMask(), - { VPArg(a, 0), VPArg(b) })) { + {VPArg(a, 0), VPArg(b)})) { value = builder->CreateFDiv(a, b); } } else { @@ -1816,7 +1816,7 @@ void CodeGen_LLVM::visit(const And *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateAnd(a, b); } } @@ -1829,7 +1829,7 @@ void CodeGen_LLVM::visit(const Or *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateOr(a, b); } } @@ -1837,7 +1837,7 @@ void CodeGen_LLVM::visit(const Or *op) { void CodeGen_LLVM::visit(const Not *op) { Value *a = codegen(op->a); if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0) })) { + AllEnabledMask(), {VPArg(a, 0)})) { value = builder->CreateNot(a); } } @@ -1853,7 +1853,7 @@ void CodeGen_LLVM::visit(const Select *op) { Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); if (!try_vector_predication_intrinsic("llvm.vp.select", llvm_type_of(op->type), op->type.lanes(), - 
AllEnabledMask(), { VPArg(cmp), VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(cmp), VPArg(a, 0), VPArg(b)})) { value = builder->CreateSelect(cmp, a, b); } } @@ -2339,7 +2339,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *slice_mask = slice_vector(vpred, i, slice_lanes); Instruction *store; if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, slice_mask, - { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment) })) { + {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) { store = dyn_cast(value); } else { store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); @@ -2445,14 +2445,14 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri stride = builder->CreateIntCast(stride, i64_t, true); } if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slice_type, slice_lanes, slice_mask, - { VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1) })) { + {VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1)})) { load_inst = dyn_cast(value); } else { internal_error << "Vector predicated strided load should not be requested if not supported.\n"; } } else { if (try_vector_predication_intrinsic("llvm.vp.load", slice_type, slice_lanes, slice_mask, - { VPArg(vec_ptr, 0, align_bytes) })) { + {VPArg(vec_ptr, 0, align_bytes)})) { load_inst = dyn_cast(value); } else { if (slice_mask != nullptr) { @@ -2485,7 +2485,7 @@ void CodeGen_LLVM::codegen_predicated_load(const Load *op) { Value *vpred = codegen(op->predicate); Value *llvm_stride = codegen(stride); value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param, - op->alignment, vpred, true, llvm_stride); + op->alignment, vpred, true, llvm_stride); return; } @@ -2686,7 +2686,7 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), - 
AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateAnd(a, b); } } else if (op->is_intrinsic(Call::bitwise_xor)) { @@ -2694,7 +2694,7 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (!try_vector_predication_intrinsic("llvm.vp.xor", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateXor(a, b); } } else if (op->is_intrinsic(Call::bitwise_or)) { @@ -2702,14 +2702,14 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateOr(a, b); } } else if (op->is_intrinsic(Call::bitwise_not)) { internal_assert(op->args.size() == 1); Value *a = codegen(op->args[0]); if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0) })) { + AllEnabledMask(), {VPArg(a, 0)})) { value = builder->CreateNot(a); } } else if (op->is_intrinsic(Call::shift_left)) { @@ -2718,7 +2718,7 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (!try_vector_predication_intrinsic("llvm.vp.shl", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateShl(a, b); } } else { @@ -2731,12 +2731,12 @@ void CodeGen_LLVM::visit(const Call *op) { Value *b = codegen(op->args[1]); if (op->type.is_int()) { if (!try_vector_predication_intrinsic("llvm.vp.ashr", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 
0), VPArg(b)})) { value = builder->CreateAShr(a, b); } } else { if (!try_vector_predication_intrinsic("llvm.vp.lshr", llvm_type_of(op->type), op->type.lanes(), - AllEnabledMask(), { VPArg(a, 0), VPArg(b) })) { + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { value = builder->CreateLShr(a, b); } } @@ -3824,9 +3824,8 @@ void CodeGen_LLVM::visit(const Store *op) { Expr base = (ramp != nullptr) ? ramp->base : 0; Expr stride = (ramp != nullptr) ? ramp->stride : 0; Value *stride_val = (!is_dense && ramp != nullptr) ? codegen(stride) : nullptr; - Value *index = (ramp == nullptr) ? codegen(op->index) : nullptr; - + for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); Expr slice_base = simplify(base + i * stride); @@ -3837,7 +3836,7 @@ void CodeGen_LLVM::visit(const Store *op) { Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); if (is_dense || slice_lanes == 1) { if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, AllEnabledMask(), - { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment) })) { + {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) { add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); @@ -3848,14 +3847,14 @@ void CodeGen_LLVM::visit(const Store *op) { stride_val = builder->CreateIntCast(stride_val, i64_t, true); } bool generated = try_vector_predication_intrinsic("llvm.experimental.vp.strided.store", void_t, slice_lanes, AllEnabledMask(), - { VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2) }); + {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2)}); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { Value *slice_index = slice_vector(index, i, 
slice_lanes); Value *vec_ptrs = codegen_buffer_pointer(op->name, value_type, slice_index); bool generated = try_vector_predication_intrinsic("llvm.vp.scatter", void_t, slice_lanes, AllEnabledMask(), - { VPArg(slice_val, 0), VPArg(vec_ptrs, 1, alignment) }); + {VPArg(slice_val, 0), VPArg(vec_ptrs, 1, alignment)}); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for gathering store.\n"; } } @@ -4328,7 +4327,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini codegen(op->value); llvm::Value *val = value; bool generated = try_vector_predication_intrinsic(vp_name, llvm_type_of(op->value.type()), op->value.type().lanes(), - AllEnabledMask(), { VPArg(init), VPArg(val, 0) }); + AllEnabledMask(), {VPArg(init), VPArg(val, 0)}); internal_assert(generated) << "Vector predication intrinsic generation failed for vector reduction " << name << "\n"; } else { std::stringstream build_name; @@ -4668,7 +4667,6 @@ Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes, intrin, arg_values); } - Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, const string &name, vector arg_values, bool scalable_vector_result, bool is_reduction) { @@ -4755,11 +4753,9 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::FunctionType *intrin_type = intrin->getFunctionType(); for (int i = 0; i < (int)arg_values.size(); i++) { if (arg_values[i]->getType() != intrin_type->getParamType(i)) { - debug(0) << "Normalizing fixed/scalable.\n"; arg_values[i] = normalize_fixed_scalable_vector_type(intrin_type->getParamType(i), arg_values[i]); } if (arg_values[i]->getType() != intrin_type->getParamType(i)) { - debug(0) << "Bit casting type.\n"; // There can be some mismatches in types, such as when passing scalar Halide type T // to LLVM vector type <1 x T>. 
arg_values[i] = builder->CreateBitCast(arg_values[i], intrin_type->getParamType(i)); @@ -5188,7 +5184,7 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv llvm::Type *llvm_type = arg.value->getType(); if (isa(llvm_type)) { mangled_types[arg.mangle_index] = ".p0"; - } else { + } else { mangled_types[arg.mangle_index] = mangle_llvm_vector_type(llvm_type); } } @@ -5200,7 +5196,7 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv } if (!std::holds_alternative(mask)) { - if (std::holds_alternative(mask)) { + if (std::holds_alternative(mask)) { llvm::ElementCount llvm_vector_ec; if (is_scalable) { const auto *vt = cast(llvm_result_type); @@ -5211,7 +5207,7 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv } args.push_back(ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1))); } else { - args.push_back(std::get(mask)); + args.push_back(std::get(mask)); } } args.push_back(ConstantInt::get(i32_t, length)); diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index c1e5d46ccbf4..c6662a1b2ca7 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -597,8 +597,8 @@ class CodeGen_LLVM : public IRVisitor { * returns true if it an instuction is generated, otherwise * returns false. */ bool try_vector_predication_comparison(const std::string &name, const Type &result_type, - MaskVariant mask, llvm::Value *a, llvm::Value *b, - const char *cmp_op); + MaskVariant mask, llvm::Value *a, llvm::Value *b, + const char *cmp_op); /** Generate an intrisic call if use_llvm_vp_intrinsics is true * and length is greater than 1. 
If generated, assigns result diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index db0d10fa24ca..60a5c3feff19 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -86,7 +86,7 @@ int CodeGen_RISCV::native_vector_bits() const { int CodeGen_RISCV::maximum_vector_bits() const { return native_vector_bits() * 8; } - + int CodeGen_RISCV::target_vscale() const { if (target.vector_bits != 0 && target.has_feature(Target::RVV)) { From 100a5c1299ad1b2d6682b8ed4ebe12ee94d7c0fd Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 14:28:32 +0000 Subject: [PATCH 10/16] Formatting. --- src/CodeGen_LLVM.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 73ead1c95cc4..fe23a334bc62 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2032,7 +2032,7 @@ void CodeGen_LLVM::visit(const Load *op) { llvm::Type *load_type = llvm_type_of(op->type.element_of()); if (ramp && stride && stride->value == 1) { - value = codegen_dense_vector_load(op, nullptr); + value = codegen_dense_vector_load(op, nullptr); } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { // Try to rewrite strided loads as shuffles of dense loads, // aligned to the stride. 
This makes adjacent strided loads @@ -3848,7 +3848,7 @@ void CodeGen_LLVM::visit(const Store *op) { } bool generated = try_vector_predication_intrinsic("llvm.experimental.vp.strided.store", void_t, slice_lanes, AllEnabledMask(), {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2)}); - internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; + internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { Value *slice_index = slice_vector(index, i, slice_lanes); @@ -5173,7 +5173,7 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv internal_assert(!(any_scalable && any_fixed)) << "Cannot combine fixed and scalable vectors to vector predication intrinsic.\n"; bool is_scalable = any_scalable; - + std::vector args; args.reserve(2 + vp_args.size()); std::vector mangled_types(args.size()); From db0ea7c4331eb21e1b8ab5cf6a6474721b9c1ac7 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 21 Oct 2022 14:31:04 +0000 Subject: [PATCH 11/16] More formatting. 
--- src/CodeGen_LLVM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index fe23a334bc62..f5523ccaae45 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5234,7 +5234,7 @@ bool CodeGen_LLVM::try_vector_predication_comparison(const std::string &name, co llvm::MDBuilder builder(*context); llvm::Value *md_val = llvm::MetadataAsValue::get(*context, builder.createString(cmp_op)); return try_vector_predication_intrinsic(name, llvm_type_of(result_type), result_type.lanes(), mask, - { VPArg(a, 0), VPArg(b), VPArg(md_val) }); + {VPArg(a, 0), VPArg(b), VPArg(md_val)}); } } // namespace Internal From 96dcd93351d1ba35abfa5970a99361b9ba02ded8 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 18:06:24 +0000 Subject: [PATCH 12/16] Use std::optional instead of -1 bottom value for mangle_index. Simple caveperson programmer habits die hard. Improve comments. --- src/CodeGen_LLVM.cpp | 6 +++--- src/CodeGen_LLVM.h | 15 +++++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index f5523ccaae45..a3fe55ec6e6c 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5180,12 +5180,12 @@ bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llv for (const VPArg &arg : vp_args) { args.push_back(arg.value); - if (arg.mangle_index != -1) { + if (arg.mangle_index) { llvm::Type *llvm_type = arg.value->getType(); if (isa(llvm_type)) { - mangled_types[arg.mangle_index] = ".p0"; + mangled_types[arg.mangle_index.value()] = ".p0"; } else { - mangled_types[arg.mangle_index] = mangle_llvm_vector_type(llvm_type); + mangled_types[arg.mangle_index.value()] = mangle_llvm_vector_type(llvm_type); } } } diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index c6662a1b2ca7..029e181f5806 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -34,6 +34,7 @@ class GlobalVariable; #include #include +#include #include 
#include #include @@ -303,7 +304,11 @@ class CodeGen_LLVM : public IRVisitor { llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, llvm::Value *index); // @} - /** Return an appropriate type string for a type which is of VectorType. */ + /** Return type string for LLVM vector type using LLVM IR intrinsic type mangling. + * E.g. ".nxv4i32" for a scalable vector of four 32-bit integers, + * or ".v4f32" for a fixed vector of four 32-bit floats. + * The dot is included in the result. + */ std::string mangle_llvm_vector_type(llvm::Type *type); /** Turn a Halide Type into an llvm::Value representing a constant halide_type_t */ @@ -572,9 +577,10 @@ class CodeGen_LLVM : public IRVisitor { * and if so, where, and the alignment for pointer arguments. */ struct VPArg { llvm::Value *value; - int mangle_index; + // If provided, put argument's type into the intrinsic name via LLVM IR type mangling. + std::optional mangle_index; int alignment; - VPArg(llvm::Value *value, int32_t mangle_index = -1, int32_t alignment = 0) + VPArg(llvm::Value *value, std::optional mangle_index = std::nullopt, int32_t alignment = 0) : value(value), mangle_index(mangle_index), alignment(alignment) { } }; @@ -607,7 +613,8 @@ class CodeGen_LLVM : public IRVisitor { bool try_vector_predication_intrinsic(const std::string &name, llvm::Type *llvm_result_type, int32_t length, MaskVariant mask, std::vector args); - /** Controls use of vector predicated intrinsics for vector operations. */ + /** Controls use of vector predicated intrinsics for vector operations. + * Will be set by certain backends (e.g. RISC V) to control codegen. */ bool use_llvm_vp_intrinsics; // @} From 51f3e35b81e7f9ac6de61e8791bafec68c955d42 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 18:09:28 +0000 Subject: [PATCH 13/16] Switch to using instead of typedef per review feedback. 
--- src/CodeGen_LLVM.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 029e181f5806..10d5268ec0bd 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -595,7 +595,7 @@ class CodeGen_LLVM : public IRVisitor { /** Predication mask using the above two types for special cases * and an llvm::Value for the general one. */ - typedef std::variant MaskVariant; + using MaskVariant = std::variant; /** Generate a vector predicated comparison intrinsic call if * use_llvm_vp_intrinsics is true and result_type is a vector From cb0cbbc96e5158fd3abaab6cc03054b2b581665b Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 18:39:59 +0000 Subject: [PATCH 14/16] Address review feedback re: default arguments, moving string concatenation into one line. --- src/CodeGen_LLVM.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a3fe55ec6e6c..398849a193c5 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2032,7 +2032,7 @@ void CodeGen_LLVM::visit(const Load *op) { llvm::Type *load_type = llvm_type_of(op->type.element_of()); if (ramp && stride && stride->value == 1) { - value = codegen_dense_vector_load(op, nullptr); + value = codegen_dense_vector_load(op); } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { // Try to rewrite strided loads as shuffles of dense loads, // aligned to the stride. 
This makes adjacent strided loads @@ -2088,7 +2088,8 @@ void CodeGen_LLVM::visit(const Load *op) { Expr slice_base = simplify(base + load_base_i); Value *load_i = codegen_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, - op->image, op->param, align, nullptr, false, nullptr); + op->image, op->param, align, /*vpred=*/nullptr, + /*slice_to_native=*/false); std::vector constants; for (int j = 0; j < lanes_i; j++) { @@ -4320,8 +4321,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } if (use_llvm_vp_intrinsics) { - string vp_name = "llvm.vp.reduce."; - vp_name += name; + string vp_name = "llvm.vp.reduce." + name; codegen(initial_value); llvm::Value *init = value; codegen(op->value); From 740f1210c687947da456c0fee420cdfb6b072872 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 18:55:09 +0000 Subject: [PATCH 15/16] Add GitHub issue for fmax/fmin strict_float TODO. Change TODO(zalman) to TODO(zvookin) uniformly. Few other cleanups. --- src/CodeGen_LLVM.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 398849a193c5..ea5e681a3ab1 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1518,7 +1518,7 @@ void CodeGen_LLVM::visit(const Add *op) { } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): This needs vector predication, but I can't + // TODO(zvookin): This needs vector predication, but I can't // see a way to do it. May go away in introducing correct // index type instead of using int32_t. value = builder->CreateNSWAdd(a, b); @@ -1547,7 +1547,7 @@ void CodeGen_LLVM::visit(const Sub *op) { } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. 
- // TODO(zalman): This needs vector predication, but I can't + // TODO(zvookin): This needs vector predication, but I can't // see a way to do it. May go away in introducing correct // index type instead of using int32_t. value = builder->CreateNSWSub(a, b); @@ -1580,7 +1580,7 @@ void CodeGen_LLVM::visit(const Mul *op) { } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. - // TODO(zalman): This needs vector predication, but I can't + // TODO(zvookin): This needs vector predication, but I can't // see a way to do it. May go away in introducing correct // index type instead of using int32_t. value = builder->CreateNSWMul(a, b); @@ -3679,7 +3679,7 @@ void CodeGen_LLVM::visit(const For *op) { Value *extent = codegen(op->extent); const Acquire *acquire = op->body.as(); - // TODO(zalman): remove this after validating it doesn't happen + // TODO(zvookin): remove this after validating it doesn't happen internal_assert(!(op->for_type == ForType::Parallel || (op->for_type == ForType::Serial && acquire && @@ -3790,7 +3790,7 @@ void CodeGen_LLVM::visit(const Store *op) { } else { int alignment = value_type.bytes(); const Ramp *ramp = op->index.as(); - // TODO(zalman): consider splitting out vector predication path. Current + // TODO(zvookin): consider splitting out vector predication path. Current // code shows how vector predication would simplify things as the // following scalarization cases would go away. bool is_dense = ramp && is_const_one(ramp->stride); @@ -4274,14 +4274,14 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini break; case VectorReduce::Min: name = "fmin"; - // TODO(zvookin): For signed case, whether this is Inf or the max floating-point value depends on strict_float. (Or maybe it is QNaN in strict_float.) + // TODO(zvookin): Not correct for strict_float.
See: https://github.com/halide/Halide/issues/7118 if (takes_initial_value && !initial_value.defined()) { initial_value = op->type.max(); } break; case VectorReduce::Max: name = "fmax"; - // TODO(zvookin): For signed case, whether this is -Inf or the min floating-point value depends on strict_float. (Or maybe it is -QNaN in strict_float.) + // TODO(zvookin): Not correct for strict_float. See: https://github.com/halide/Halide/issues/7118 if (takes_initial_value && !initial_value.defined()) { initial_value = op->type.min(); } break; @@ -4321,7 +4321,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } if (use_llvm_vp_intrinsics) { - string vp_name = "llvm.vp.reduce." + name; + string vp_name = "llvm.vp.reduce." + std::string(name); codegen(initial_value); llvm::Value *init = value; codegen(op->value); From 2c0df5f5334f20806be0b1311a24e4bfd4482db8 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 24 Oct 2022 23:49:23 +0000 Subject: [PATCH 16/16] Rearrange the maze of twisty passages to not use vector predicated strided load for dense case. Add some comments. --- src/CodeGen_LLVM.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index ea5e681a3ab1..df9f331cfe15 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2441,6 +2441,11 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri Value *slice_mask = (vpred != nullptr) ? slice_vector(vpred, i, slice_lanes) : nullptr; Instruction *load_inst = nullptr; + // In this path, strided predicated loads are only handled if vector + // predication is enabled. Otherwise this would be scalarized at a higher + // level. Assume that if stride is passed, this is not dense, though + // LLVM should codegen the same thing for a constant 1 strided load as + // for a non-strided load.
if (stride) { if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) { stride = builder->CreateIntCast(stride, i64_t, true); @@ -2482,17 +2487,14 @@ void CodeGen_LLVM::codegen_predicated_load(const Load *op) { const Ramp *ramp = op->index.as(); const IntImm *stride = ramp ? ramp->stride.as() : nullptr; - if (use_llvm_vp_intrinsics && stride) { - Value *vpred = codegen(op->predicate); - Value *llvm_stride = codegen(stride); - value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param, - op->alignment, vpred, true, llvm_stride); - return; - } - if (ramp && is_const_one(ramp->stride)) { // Dense vector load Value *vpred = codegen(op->predicate); value = codegen_dense_vector_load(op, vpred); + } else if (use_llvm_vp_intrinsics && stride) { // Case only handled by vector predication, otherwise must scalarize. + Value *vpred = codegen(op->predicate); + Value *llvm_stride = codegen(stride); // Not 1 (dense) as that was caught above. + value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param, + op->alignment, vpred, true, llvm_stride); } else if (ramp && stride && stride->value == -1) { debug(4) << "Predicated dense vector load with stride -1\n\t" << Expr(op) << "\n"; vector indices(ramp->lanes);