diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 4eb29eb803c2..df9f331cfe15 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -217,6 +217,7 @@ CodeGen_LLVM::CodeGen_LLVM(const Target &t) inside_atomic_mutex_node(false), emit_atomic_stores(false), + use_llvm_vp_intrinsics(false), destructor_block(nullptr), strict_float(t.has_feature(Target::StrictFloat)), @@ -1510,13 +1511,22 @@ void CodeGen_LLVM::visit(const Add *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - value = builder->CreateFAdd(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.fadd", llvm_type_of(t), t.lanes(), AllEnabledMask(), + {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateFAdd(a, b); + } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. + // TODO(zvookin): This needs vector predication, but I can't + // see a way to do it. May go away in introducing correct + // index type instead of using int32_t. value = builder->CreateNSWAdd(a, b); } else { - value = builder->CreateAdd(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.add", llvm_type_of(t), t.lanes(), AllEnabledMask(), + {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateAdd(a, b); + } } } @@ -1530,13 +1540,22 @@ void CodeGen_LLVM::visit(const Sub *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - value = builder->CreateFSub(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.fsub", llvm_type_of(t), t.lanes(), AllEnabledMask(), + {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateFSub(a, b); + } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. + // TODO(zvookin): This needs vector predication, but I can't + // see a way to do it. May go away in introducing correct + // index type instead of using int32_t. value = builder->CreateNSWSub(a, b); } else { - value = builder->CreateSub(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.sub", llvm_type_of(t), t.lanes(), AllEnabledMask(), + {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateSub(a, b); + } } } @@ -1554,13 +1573,22 @@ void CodeGen_LLVM::visit(const Mul *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (op->type.is_float()) { - value = builder->CreateFMul(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.fmul", llvm_type_of(t), t.lanes(), AllEnabledMask(), + {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateFMul(a, b); + } } else if (op->type.is_int() && op->type.bits() >= 32) { // We tell llvm integers don't wrap, so that it generates good // code for loop indices. + // TODO(zvookin): This needs vector predication, but I can't + // see a way to do it. May go away in introducing correct + // index type instead of using int32_t. value = builder->CreateNSWMul(a, b); } else { - value = builder->CreateMul(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.mul", llvm_type_of(t), t.lanes(), AllEnabledMask(), + {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateMul(a, b); + } } } @@ -1580,7 +1608,10 @@ void CodeGen_LLVM::visit(const Div *op) { // output hard. 
Value *a = codegen(op->a); Value *b = codegen(op->b); - value = builder->CreateFDiv(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.fdiv", llvm_type_of(t), t.lanes(), AllEnabledMask(), + {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateFDiv(a, b); + } } else { value = codegen(lower_int_uint_div(op->a, op->b)); } @@ -1650,9 +1681,13 @@ void CodeGen_LLVM::visit(const EQ *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOEQ(a, b); + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "oeq")) { + value = builder->CreateFCmpOEQ(a, b); + } } else { - value = builder->CreateICmpEQ(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "eq")) { + value = builder->CreateICmpEQ(a, b); + } } } @@ -1666,9 +1701,13 @@ void CodeGen_LLVM::visit(const NE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpONE(a, b); + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "one")) { + value = builder->CreateFCmpONE(a, b); + } } else { - value = builder->CreateICmpNE(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ne")) { + value = builder->CreateICmpNE(a, b); + } } } @@ -1682,11 +1721,17 @@ void CodeGen_LLVM::visit(const LT *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOLT(a, b); + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "olt")) { + value = builder->CreateFCmpOLT(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSLT(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "slt")) { + value = builder->CreateICmpSLT(a, b); + } } else { - value = builder->CreateICmpULT(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ult")) { + value = builder->CreateICmpULT(a, b); + } } } @@ -1700,11 +1745,17 @@ void CodeGen_LLVM::visit(const LE *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOLE(a, b); + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "ole")) { + value = builder->CreateFCmpOLE(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSLE(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "sle")) { + value = builder->CreateICmpSLE(a, b); + } } else { - value = builder->CreateICmpULE(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ule")) { + value = builder->CreateICmpULE(a, b); + } } } @@ -1719,11 +1770,17 @@ void CodeGen_LLVM::visit(const GT *op) { Value *b = codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOGT(a, b); + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "ogt")) { + value = builder->CreateFCmpOGT(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSGT(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "sgt")) { + value = builder->CreateICmpSGT(a, b); + } } else { - value = builder->CreateICmpUGT(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "ugt")) { + value = builder->CreateICmpUGT(a, b); + } } } @@ -1737,11 +1794,17 @@ void CodeGen_LLVM::visit(const GE *op) { Value *a = codegen(op->a); Value *b = 
codegen(op->b); if (t.is_float()) { - value = builder->CreateFCmpOGE(a, b); + if (!try_vector_predication_comparison("llvm.vp.fcmp", t, AllEnabledMask(), a, b, "oge")) { + value = builder->CreateFCmpOGE(a, b); + } } else if (t.is_int()) { - value = builder->CreateICmpSGE(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "sge")) { + value = builder->CreateICmpSGE(a, b); + } } else { - value = builder->CreateICmpUGE(a, b); + if (!try_vector_predication_comparison("llvm.vp.icmp", t, AllEnabledMask(), a, b, "uge")) { + value = builder->CreateICmpUGE(a, b); + } } } @@ -1752,7 +1815,10 @@ void CodeGen_LLVM::visit(const And *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - value = builder->CreateAnd(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateAnd(a, b); + } } void CodeGen_LLVM::visit(const Or *op) { @@ -1762,19 +1828,34 @@ void CodeGen_LLVM::visit(const Or *op) { Value *a = codegen(op->a); Value *b = codegen(op->b); - value = builder->CreateOr(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateOr(a, b); + } } void CodeGen_LLVM::visit(const Not *op) { Value *a = codegen(op->a); - value = builder->CreateNot(a); + if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0)})) { + value = builder->CreateNot(a); + } } void CodeGen_LLVM::visit(const Select *op) { Value *cmp = codegen(op->condition); + if (use_llvm_vp_intrinsics && + op->type.is_vector() && + op->condition.type().is_scalar()) { + cmp = create_broadcast(cmp, op->type.lanes()); + } + Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); - value = builder->CreateSelect(cmp, a, b); + if (!try_vector_predication_intrinsic("llvm.vp.select", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(cmp), VPArg(a, 0), VPArg(b)})) { + value = builder->CreateSelect(cmp, a, b); + } } namespace { @@ -2006,8 +2087,9 @@ void CodeGen_LLVM::visit(const Load *op) { int lanes_i = std::min(slice_lanes, op->type.lanes() - i); Expr slice_base = simplify(base + load_base_i); - Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, - op->image, op->param, align, nullptr, false); + Value *load_i = codegen_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, + op->image, op->param, align, /*vpred=*/nullptr, + /*slice_to_native=*/false); std::vector<Constant *> constants; for (int j = 0; j < lanes_i; j++) { @@ -2221,8 +2303,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Halide::Type value_type = op->value.type(); Value *val = codegen(op->value); int alignment = value_type.bytes(); - int native_bits = native_vector_bits(); - int native_bytes = native_bits / 8; + int native_bytes = native_vector_bits() / 8; // Boost the alignment if possible, up to the native vector width. ModulusRemainder mod_rem = op->alignment; @@ -2245,7 +2326,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { // For dense vector stores wider than the native vector // width, bust them up into native vectors.
int store_lanes = value_type.lanes(); - int native_lanes = native_bits / value_type.bits(); + int native_lanes = maximum_vector_bits() / value_type.bits(); for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); @@ -2257,8 +2338,13 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); Value *slice_mask = slice_vector(vpred, i, slice_lanes); - Instruction *store = - builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); + Instruction *store; + if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, slice_mask, + {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) { + store = dyn_cast<Instruction>(value); + } else { + store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask); + } add_tbaa_metadata(store, op->name, slice_index); } } else { // It's not dense vector store, we need to scalarize it @@ -2302,9 +2388,9 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { } } -llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, - const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment, - llvm::Value *vpred, bool slice_to_native) { +llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment, + llvm::Value *vpred, bool slice_to_native, llvm::Value *stride) { debug(4) << "Vectorize predicated dense vector load:\n\t" << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n"; @@ -2341,7 +2427,7 @@ llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std // For dense vector loads wider than the native vector // width, bust them up into native vectors int load_lanes = type.lanes(); - int native_lanes = slice_to_native ? std::max(1, native_bits / type.bits()) : load_lanes; + int native_lanes = slice_to_native ? std::max(1, maximum_vector_bits() / type.bits()) : load_lanes; vector<Value *> slices; for (int i = 0; i < load_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, load_lanes - i); @@ -2352,12 +2438,35 @@ llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo()); - Instruction *load_inst; - if (vpred != nullptr) { - Value *slice_mask = slice_vector(vpred, i, slice_lanes); - load_inst = builder->CreateMaskedLoad(slice_type, vec_ptr, llvm::Align(align_bytes), slice_mask); + Value *slice_mask = (vpred != nullptr) ? slice_vector(vpred, i, slice_lanes) : nullptr; + + Instruction *load_inst = nullptr; + // In this path, strided predicated loads are only handled if vector + // predication is enabled. Otherwise this would be scalarized at a higher + // level. Assume that if stride is passed, this is not dense, though + // LLVM should codegen the same thing for a constant 1 strided load as + // for a non-strided load.
+ if (stride) { + if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) { + stride = builder->CreateIntCast(stride, i64_t, true); + } + if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load", slice_type, slice_lanes, slice_mask, + {VPArg(vec_ptr, 0, align_bytes), VPArg(stride, 1)})) { + load_inst = dyn_cast<Instruction>(value); + } else { + internal_error << "Vector predicated strided load should not be requested if not supported.\n"; + } } else { - load_inst = builder->CreateAlignedLoad(slice_type, vec_ptr, llvm::Align(align_bytes)); + if (try_vector_predication_intrinsic("llvm.vp.load", slice_type, slice_lanes, slice_mask, + {VPArg(vec_ptr, 0, align_bytes)})) { + load_inst = dyn_cast<Instruction>(value); + } else { + if (slice_mask != nullptr) { + load_inst = builder->CreateMaskedLoad(slice_type, vec_ptr, llvm::Align(align_bytes), slice_mask); + } else { + load_inst = builder->CreateAlignedLoad(slice_type, vec_ptr, llvm::Align(align_bytes)); + } + } } add_tbaa_metadata(load_inst, name, slice_index); slices.push_back(load_inst); @@ -2370,8 +2479,8 @@ Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred, b const Ramp *ramp = load->index.as<Ramp>(); internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n"; - return codegen_dense_vector_load(load->type, load->name, ramp->base, load->image, load->param, - load->alignment, vpred, slice_to_native); + return codegen_vector_load(load->type, load->name, ramp->base, load->image, load->param, + load->alignment, vpred, slice_to_native, nullptr); } void CodeGen_LLVM::codegen_predicated_load(const Load *op) { @@ -2381,6 +2490,11 @@ void CodeGen_LLVM::codegen_predicated_load(const Load *op) { if (ramp && is_const_one(ramp->stride)) { // Dense vector load Value *vpred = codegen(op->predicate); value = codegen_dense_vector_load(op, vpred); + } else if (use_llvm_vp_intrinsics && stride) { // Case only handled by vector predication, otherwise must scalarize. + Value *vpred = codegen(op->predicate); + Value *llvm_stride = codegen(stride); // Not 1 (dense) as that was caught above.
+ value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param, + op->alignment, vpred, true, llvm_stride); } else if (ramp && stride && stride->value == -1) { debug(4) << "Predicated dense vector load with stride -1\n\t" << Expr(op) << "\n"; vector<int> indices(ramp->lanes); @@ -2574,27 +2688,42 @@ void CodeGen_LLVM::visit(const Call *op) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateAnd(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateAnd(a, b); + } } else if (op->is_intrinsic(Call::bitwise_xor)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateXor(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.xor", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateXor(a, b); + } } else if (op->is_intrinsic(Call::bitwise_or)) { internal_assert(op->args.size() == 2); Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateOr(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateOr(a, b); + } } else if (op->is_intrinsic(Call::bitwise_not)) { internal_assert(op->args.size() == 1); Value *a = codegen(op->args[0]); - value = builder->CreateNot(a); + if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0)})) { + value = builder->CreateNot(a); + } } else if (op->is_intrinsic(Call::shift_left)) { internal_assert(op->args.size() == 2); if (op->args[1].type().is_uint()) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); - value = builder->CreateShl(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.shl", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateShl(a, b); + } } else { value = codegen(lower_signed_shift_left(op->args[0], op->args[1])); } @@ -2604,9 +2733,15 @@ void CodeGen_LLVM::visit(const Call *op) { Value *a = codegen(op->args[0]); Value *b = codegen(op->args[1]); if (op->type.is_int()) { - value = builder->CreateAShr(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.ashr", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateAShr(a, b); + } } else { - value = builder->CreateLShr(a, b); + if (!try_vector_predication_intrinsic("llvm.vp.lshr", llvm_type_of(op->type), op->type.lanes(), + AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) { + value = builder->CreateLShr(a, b); + } } } else { value = codegen(lower_signed_shift_right(op->args[0], op->args[1])); } @@ -3546,7 +3681,7 @@ void CodeGen_LLVM::visit(const For *op) { Value *extent = codegen(op->extent); const Acquire *acquire = op->body.as<Acquire>(); - // TODO(zalman): remove this after validating it doesn't happen + // TODO(zvookin): remove this after validating it doesn't happen internal_assert(!(op->for_type == ForType::Parallel || (op->for_type == ForType::Serial && acquire && @@ -3657,7 +3792,11 @@ void CodeGen_LLVM::visit(const Store *op) { } else { int alignment = value_type.bytes(); const Ramp *ramp = op->index.as<Ramp>(); - if (ramp && is_const_one(ramp->stride)) { + // TODO(zvookin): consider splitting out
vector predication path. Current + code shows how vector predication would simplify things as the + following scalarization cases would go away. + bool is_dense = ramp && is_const_one(ramp->stride); + if (use_llvm_vp_intrinsics || is_dense) { int native_bits = native_vector_bits(); int native_bytes = native_bits / 8; @@ -3683,18 +3822,44 @@ void CodeGen_LLVM::visit(const Store *op) { // For dense vector stores wider than the native vector // width, bust them up into native vectors. int store_lanes = value_type.lanes(); - int native_lanes = native_bits / value_type.bits(); + int native_lanes = maximum_vector_bits() / value_type.bits(); + + Expr base = (ramp != nullptr) ? ramp->base : 0; + Expr stride = (ramp != nullptr) ? ramp->stride : 0; + Value *stride_val = (!is_dense && ramp != nullptr) ? codegen(stride) : nullptr; + Value *index = (ramp == nullptr) ? codegen(op->index) : nullptr; for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); - Expr slice_base = simplify(ramp->base + i); + Expr slice_base = simplify(base + i * stride); Expr slice_stride = make_one(slice_base.type()); Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes); Value *slice_val = slice_vector(val, i, slice_lanes); Value *elt_ptr = codegen_buffer_pointer(op->name, value_type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); - StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); - annotate_store(store, slice_index); + if (is_dense || slice_lanes == 1) { + if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, AllEnabledMask(), + {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) { + add_tbaa_metadata(dyn_cast<Instruction>(value), op->name, slice_index); + } else { + StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); + annotate_store(store, slice_index); + } + } else if (ramp != nullptr) { + if (get_target().bits == 64 && !stride_val->getType()->isIntegerTy(64)) { + stride_val = builder->CreateIntCast(stride_val, i64_t, true); + } + bool generated = try_vector_predication_intrinsic("llvm.experimental.vp.strided.store", void_t, slice_lanes, AllEnabledMask(), + {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2)}); + internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; + add_tbaa_metadata(dyn_cast<Instruction>(value), op->name, slice_index); + } else { + Value *slice_index = slice_vector(index, i, slice_lanes); + Value *vec_ptrs = codegen_buffer_pointer(op->name, value_type, slice_index); + bool generated = try_vector_predication_intrinsic("llvm.vp.scatter", void_t, slice_lanes, AllEnabledMask(), + {VPArg(slice_val, 0), VPArg(vec_ptrs, 1, alignment)}); + internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for scatter store.\n"; + } } } else if (ramp) { Type ptr_type = value_type.element_of(); @@ -4002,7 +4167,7 @@ void CodeGen_LLVM::visit(const VectorReduce *op) { void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { Expr val = op->value; const int output_lanes = op->type.lanes(); - const int native_lanes = native_vector_bits() / op->type.bits(); + const int native_lanes = maximum_vector_bits() / op->type.bits(); const int factor = val.type().lanes() / output_lanes; Type elt =
op->type.element_of(); @@ -4075,45 +4240,53 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini op->op == VectorReduce::Mul || op->op == VectorReduce::Min || op->op == VectorReduce::Max) && - // Must be a power of two lanes - (input_lanes >= 2) && - ((input_lanes & (input_lanes - 1)) == 0) && - // int versions exist up to 1024 bits - ((!op->type.is_float() && input_bytes <= 1024) || - // float versions exist up to 16 lanes - input_lanes <= 16) && - // As of the release of llvm 10, the 64-bit experimental total - // reductions don't seem to be done yet on arm. - (val.type().bits() != 64 || - target.arch != Target::ARM)); + (use_llvm_vp_intrinsics || + // Must be a power of two lanes + ((input_lanes >= 2) && + ((input_lanes & (input_lanes - 1)) == 0) && + // int versions exist up to 1024 bits + ((!op->type.is_float() && input_bytes <= 1024) || + // float versions exist up to 16 lanes + input_lanes <= 16) && + // As of the release of llvm 10, the 64-bit experimental total + // reductions don't seem to be done yet on arm. + (val.type().bits() != 64 || + target.arch != Target::ARM)))); if (llvm_has_intrinsic) { - std::stringstream name; - name << "llvm.vector.reduce."; + const char *name = ""; const int bits = op->type.bits(); - bool takes_initial_value = false; + bool takes_initial_value = use_llvm_vp_intrinsics; Expr initial_value = init; if (op->type.is_float()) { switch (op->op) { case VectorReduce::Add: - name << "fadd"; + name = "fadd"; takes_initial_value = true; if (!initial_value.defined()) { initial_value = make_zero(op->type); } break; case VectorReduce::Mul: - name << "fmul"; + name = "fmul"; takes_initial_value = true; if (!initial_value.defined()) { initial_value = make_one(op->type); } break; case VectorReduce::Min: - name << "fmin"; + name = "fmin"; + // TODO(zvookin): Not correct for strict_float. See: https://github.com/halide/Halide/issues/7118 + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.max(); + } break; case VectorReduce::Max: - name << "fmax"; + name = "fmax"; + // TODO(zvookin): Not correct for strict_float. See: https://github.com/halide/Halide/issues/7118 + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.min(); + } break; default: break; @@ -4121,55 +4294,82 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } else if (op->type.is_int() || op->type.is_uint()) { switch (op->op) { case VectorReduce::Add: - name << "add"; + name = "add"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = make_zero(op->type); + } break; case VectorReduce::Mul: - name << "mul"; + name = "mul"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = make_one(op->type); + } break; case VectorReduce::Min: - name << (op->type.is_int() ? 's' : 'u') << "min"; + name = op->type.is_int() ? "smin" : "umin"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.max(); + } break; case VectorReduce::Max: - name << (op->type.is_int() ? 's' : 'u') << "max"; + name = op->type.is_int() ? "smax" : "umax"; + if (takes_initial_value && !initial_value.defined()) { + initial_value = op->type.min(); + } break; default: break; } } - name << ".v" << val.type().lanes() << (op->type.is_float() ? 'f' : 'i') << bits; - string intrin_name = name.str(); + if (use_llvm_vp_intrinsics) { + string vp_name = "llvm.vp.reduce."
+ std::string(name); + codegen(initial_value); + llvm::Value *init = value; + codegen(op->value); + llvm::Value *val = value; + bool generated = try_vector_predication_intrinsic(vp_name, llvm_type_of(op->value.type()), op->value.type().lanes(), + AllEnabledMask(), {VPArg(init), VPArg(val, 0)}); + internal_assert(generated) << "Vector predication intrinsic generation failed for vector reduction " << name << "\n"; + } else { + std::stringstream build_name; + build_name << "llvm.vector.reduce."; + build_name << name; + build_name << ".v" << val.type().lanes() << (op->type.is_float() ? 'f' : 'i') << bits; - vector<Expr> args; - if (takes_initial_value) { - args.push_back(initial_value); - initial_value = Expr(); - } - args.push_back(op->value); + string intrin_name = build_name.str(); - // Make sure the declaration exists, or the codegen for - // call will assume that the args should scalarize. - if (!module->getFunction(intrin_name)) { - vector<llvm::Type *> arg_types; - for (const Expr &e : args) { - arg_types.push_back(llvm_type_of(e.type())); + vector<Expr> args; + if (takes_initial_value) { + args.push_back(initial_value); + initial_value = Expr(); + } + args.push_back(op->value); + + // Make sure the declaration exists, or the codegen for + // call will assume that the args should scalarize. + if (!module->getFunction(intrin_name)) { + vector<llvm::Type *> arg_types; + for (const Expr &e : args) { + arg_types.push_back(llvm_type_of(e.type())); + } + FunctionType *func_t = FunctionType::get(llvm_type_of(op->type), arg_types, false); + llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin_name, module.get()); } - FunctionType *func_t = FunctionType::get(llvm_type_of(op->type), arg_types, false); - llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin_name, module.get()); - } - Expr equiv = Call::make(op->type, intrin_name, args, Call::PureExtern); - if (initial_value.defined()) { - equiv = binop(initial_value, equiv); + Expr equiv = Call::make(op->type, intrin_name, args, Call::PureExtern); + if (initial_value.defined()) { + equiv = binop(initial_value, equiv); + } + equiv.accept(this); } - equiv.accept(this); return; } } if (output_lanes == 1 && factor > native_lanes && - factor % native_lanes == 0) { + (use_llvm_vp_intrinsics || (factor % native_lanes == 0))) { // It's a total reduction of multiple native // vectors. Start by adding the vectors together.
Expr equiv; @@ -4471,7 +4671,7 @@ Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes, Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, const string &name, vector<Value *> arg_values, - bool scalable_vector_result) { + bool scalable_vector_result, bool is_reduction) { llvm::Function *fn = module->getFunction(name); if (!fn) { vector<llvm::Type *> arg_types(arg_values.size()); @@ -4480,7 +4680,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes } llvm::Type *intrinsic_result_type = result_type->getScalarType(); - if (intrin_lanes > 1) { + if (intrin_lanes > 1 && !is_reduction) { if (scalable_vector_result && effective_vscale != 0) { intrinsic_result_type = get_vector_type(result_type->getScalarType(), intrin_lanes / effective_vscale, VectorTypeConstraint::VScale); @@ -4494,11 +4694,12 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes fn->setCallingConv(CallingConv::C); } - return call_intrin(result_type, intrin_lanes, fn, arg_values); + return call_intrin(result_type, intrin_lanes, fn, arg_values, is_reduction); } Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, - llvm::Function *intrin, vector<Value *> arg_values) { + llvm::Function *intrin, vector<Value *> arg_values, + bool is_reduction) { internal_assert(intrin); int arg_lanes = 1; if (result_type->isVoidTy()) { @@ -4507,7 +4708,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes arg_lanes = get_vector_num_elements(result_type); } - if (intrin_lanes != arg_lanes) { + if (!is_reduction && intrin_lanes != arg_lanes) { // Cut up each arg into appropriately-sized pieces, call the // intrinsic on each, then splice together the results. vector<Value *> results; @@ -4935,5 +5136,108 @@ llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, return ConstantVector::getSplat(ec, value); } +std::string CodeGen_LLVM::mangle_llvm_vector_type(llvm::Type *type) { + std::string type_string = "."; + bool is_scalable = isa<llvm::ScalableVectorType>(type); + llvm::ElementCount llvm_vector_ec; + if (is_scalable) { + const auto *vt = cast<llvm::ScalableVectorType>(type); + const char *type_designator = vt->getElementType()->isIntegerTy() ? "i" : "f"; + std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); + llvm_vector_ec = vt->getElementCount(); + type_string = ".nxv" + std::to_string(vt->getMinNumElements()) + type_designator + bits_designator; + } else { + const auto *vt = cast<llvm::FixedVectorType>(type); + const char *type_designator = vt->getElementType()->isIntegerTy() ?
"i" : "f"; + std::string bits_designator = std::to_string(vt->getScalarSizeInBits()); + llvm_vector_ec = vt->getElementCount(); + type_string = ".v" + std::to_string(vt->getNumElements()) + type_designator + bits_designator; + } + return type_string; +} + +bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, llvm::Type *llvm_result_type, + int32_t length, MaskVariant mask, std::vector vp_args) { + if (!use_llvm_vp_intrinsics) { + return false; + } + + bool any_scalable = isa(llvm_result_type); + bool any_fixed = isa(llvm_result_type); + bool is_reduction = !any_scalable && !any_fixed; + for (const VPArg &arg : vp_args) { + any_scalable |= isa(arg.value->getType()); + any_fixed |= isa(arg.value->getType()); + } + if (!any_fixed && !any_scalable) { + return false; + } + internal_assert(!(any_scalable && any_fixed)) << "Cannot combine fixed and scalable vectors to vector predication intrinsic.\n"; + + bool is_scalable = any_scalable; + + std::vector args; + args.reserve(2 + vp_args.size()); + std::vector mangled_types(args.size()); + + for (const VPArg &arg : vp_args) { + args.push_back(arg.value); + if (arg.mangle_index) { + llvm::Type *llvm_type = arg.value->getType(); + if (isa(llvm_type)) { + mangled_types[arg.mangle_index.value()] = ".p0"; + } else { + mangled_types[arg.mangle_index.value()] = mangle_llvm_vector_type(llvm_type); + } + } + } + + std::string full_name = name; + for (const std::string &mangle : mangled_types) { + full_name += mangle; + } + + if (!std::holds_alternative(mask)) { + if (std::holds_alternative(mask)) { + llvm::ElementCount llvm_vector_ec; + if (is_scalable) { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } else { + const auto *vt = cast(llvm_result_type); + llvm_vector_ec = vt->getElementCount(); + } + args.push_back(ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1))); + } else { + args.push_back(std::get(mask)); + } + } + args.push_back(ConstantInt::get(i32_t, length)); + + value = call_intrin(llvm_result_type, length, full_name, args, is_scalable, is_reduction); + llvm::CallInst *call = dyn_cast(value); + for (size_t i = 0; i < args.size(); i++) { + if (vp_args[i].alignment != 0) { + call->addParamAttr(i, Attribute::getWithAlignment(*context, llvm::Align(vp_args[i].alignment))); + } + } + return true; +} + +bool CodeGen_LLVM::try_vector_predication_comparison(const std::string &name, const Type &result_type, + MaskVariant mask, llvm::Value *a, llvm::Value *b, + const char *cmp_op) { + // Early out to prevent creating useless metadata. + if (!use_llvm_vp_intrinsics || + result_type.is_scalar()) { + return false; + } + + llvm::MDBuilder builder(*context); + llvm::Value *md_val = llvm::MetadataAsValue::get(*context, builder.createString(cmp_op)); + return try_vector_predication_intrinsic(name, llvm_type_of(result_type), result_type.lanes(), mask, + {VPArg(a, 0), VPArg(b), VPArg(md_val)}); +} + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 5bf4d5a5a130..10d5268ec0bd 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -34,7 +34,9 @@ class GlobalVariable; #include #include +#include #include +#include #include #include "IRVisitor.h" @@ -136,6 +138,11 @@ class CodeGen_LLVM : public IRVisitor { /** What's the natural vector bit-width to use for loads, stores, etc. */ virtual int native_vector_bits() const = 0; + /** Used to decide whether to break a vector up into multiple smaller + * operations. 
This is the largest size the architecture supports. */ + virtual int maximum_vector_bits() const { + return native_vector_bits(); + } /** For architectures that have vscale vectors, return the constant vscale to use. * Default of 0 means do not use vscale vectors. Generally will depend on * the target flags and vector_bits settings. @@ -297,6 +304,13 @@ class CodeGen_LLVM : public IRVisitor { llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, llvm::Value *index); // @} + /** Return type string for LLVM vector type using LLVM IR intrinsic type mangling. + * E.g. ".nxv4i32" for a scalable vector of four 32-bit integers, + * or ".v4f32" for a fixed vector of four 32-bit floats. + * The dot is included in the result. + */ + std::string mangle_llvm_vector_type(llvm::Type *type); + /** Turn a Halide Type into an llvm::Value representing a constant halide_type_t */ llvm::Value *make_halide_type_t(const Type &); @@ -464,9 +478,10 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *intrin, std::vector<llvm::Value *>); llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes, const std::string &name, std::vector<llvm::Value *>, - bool scalable_vector_result = false); + bool scalable_vector_result = false, bool is_reduction = false); llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes, - llvm::Function *intrin, std::vector<llvm::Value *>); + llvm::Function *intrin, std::vector<llvm::Value *>, + bool is_reduction = false); // @} /** Take a slice of lanes out of an llvm vector. Pads with undefs @@ -552,6 +567,57 @@ class CodeGen_LLVM : public IRVisitor { llvm::Constant *get_splat(int lanes, llvm::Constant *value, VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; + /** Support for generating LLVM vector predication intrinsics + * ("@llvm.vp.*" and "@llvm.experimental.vp.*") + */ + // @{ + /** Struct to hold descriptor for an argument to a vector + * predicated intrinsic. This includes the value, whether the + * type of the argument should be mangled into the intrinsic name + * and if so, where, and the alignment for pointer arguments. */ + struct VPArg { + llvm::Value *value; + // If provided, put argument's type into the intrinsic name via LLVM IR type mangling. + std::optional<size_t> mangle_index; + int alignment; + VPArg(llvm::Value *value, std::optional<size_t> mangle_index = std::nullopt, int32_t alignment = 0) + : value(value), mangle_index(mangle_index), alignment(alignment) { + } + }; + + /** Type indicating an intrinsic does not take a mask. */ + struct NoMask { + }; + + /** Type indicating mask to use is all true -- all lanes enabled. */ + struct AllEnabledMask { + }; + + /** Predication mask using the above two types for special cases + * and an llvm::Value for the general one. */ + using MaskVariant = std::variant<NoMask, AllEnabledMask, llvm::Value *>; + + /** Generate a vector predicated comparison intrinsic call if + * use_llvm_vp_intrinsics is true and result_type is a vector + * type. If generated, assigns the result of the vp intrinsic to + * value and returns true; otherwise + * returns false. */ + bool try_vector_predication_comparison(const std::string &name, const Type &result_type, + MaskVariant mask, llvm::Value *a, llvm::Value *b, + const char *cmp_op); + + /** Generate an intrinsic call if use_llvm_vp_intrinsics is true + * and length is greater than 1. If generated, assigns the result + * of the vp intrinsic to value and returns true; otherwise + * returns false.
*/ bool try_vector_predication_intrinsic(const std::string &name, llvm::Type *llvm_result_type, + int32_t length, MaskVariant mask, std::vector<VPArg> args); + /** Controls use of vector predicated intrinsics for vector operations. + * Will be set by certain backends (e.g. RISC-V) to control codegen. */ + bool use_llvm_vp_intrinsics; + // @} + private: /** All the values in scope at the current code location during * codegen. Use sym_push and sym_pop to access. */ @@ -593,9 +659,9 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv, std::vector<bool> &arg_is_buffer); - llvm::Value *codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, - const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment, - llvm::Value *vpred = nullptr, bool slice_to_native = true); + llvm::Value *codegen_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment, + llvm::Value *vpred = nullptr, bool slice_to_native = true, llvm::Value *stride = nullptr); llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true); virtual void codegen_predicated_load(const Load *op); diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index cac8dc0f2102..60a5c3feff19 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -25,11 +25,13 @@ class CodeGen_RISCV : public CodeGen_Posix { string mabi() const override; bool use_soft_float_abi() const override; int native_vector_bits() const override; + int maximum_vector_bits() const override; int target_vscale() const override; }; CodeGen_RISCV::CodeGen_RISCV(const Target &t) : CodeGen_Posix(t) { + use_llvm_vp_intrinsics = true; } string CodeGen_RISCV::mcpu_target() const { @@ -81,6 +83,10 @@ int CodeGen_RISCV::native_vector_bits() const { return 0; } +int CodeGen_RISCV::maximum_vector_bits() const { + return native_vector_bits() * 8; +} + int CodeGen_RISCV::target_vscale() const { if (target.vector_bits != 0 && target.has_feature(Target::RVV)) {
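For reference, the calls built by try_vector_predication_intrinsic are the standard LLVM vector predication intrinsics. The following is an illustrative LLVM IR sketch based on the LLVM LangRef (not output captured from this patch), showing what the "llvm.vp.fadd" name plus the ".nxv4f32" mangling produced by mangle_llvm_vector_type resolves to:

; Declaration of the mangled intrinsic: result/operand vector type, mask, and
; an i32 explicit vector length (EVL).
declare <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, i32)

; %mask is all-true when the code above passes AllEnabledMask(), and the EVL
; operand is the full lane count, since this patch always passes `length`.
%sum = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x i1> %mask, i32 %evl)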