diff --git a/src/coreclr/src/jit/codegen.h b/src/coreclr/src/jit/codegen.h index 6227d0133ae54f..bdf9d45ed2ce32 100644 --- a/src/coreclr/src/jit/codegen.h +++ b/src/coreclr/src/jit/codegen.h @@ -980,7 +980,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode); void genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode); void genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode); - void genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode); void genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode); void genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode); void genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode); diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp index 8e8e93ab01fc3a..39bbd4264bcad5 100644 --- a/src/coreclr/src/jit/codegenarm64.cpp +++ b/src/coreclr/src/jit/codegenarm64.cpp @@ -3855,20 +3855,13 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsicNarrow(simdNode); break; - case SIMDIntrinsicAdd: case SIMDIntrinsicSub: - case SIMDIntrinsicMul: - case SIMDIntrinsicDiv: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: case SIMDIntrinsicEqual: genSIMDIntrinsicBinOp(simdNode); break; - case SIMDIntrinsicDotProduct: - genSIMDIntrinsicDotProduct(simdNode); - break; - case SIMDIntrinsicGetItem: genSIMDIntrinsicGetItem(simdNode); break; @@ -3945,9 +3938,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type { switch (intrinsicId) { - case SIMDIntrinsicAdd: - result = INS_fadd; - break; case SIMDIntrinsicBitwiseAnd: result = INS_and; break; @@ -3961,15 +3951,9 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type case SIMDIntrinsicConvertToInt64: result = INS_fcvtzs; break; - case SIMDIntrinsicDiv: - result = INS_fdiv; - break; case SIMDIntrinsicEqual: result = INS_fcmeq; break; - case SIMDIntrinsicMul: - result = INS_fmul; - break; case SIMDIntrinsicNarrow: // Use INS_fcvtn lower bytes of 
result followed by INS_fcvtn2 for upper bytes // Return lower bytes instruction here @@ -3995,9 +3979,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type switch (intrinsicId) { - case SIMDIntrinsicAdd: - result = INS_add; - break; case SIMDIntrinsicBitwiseAnd: result = INS_and; break; @@ -4014,9 +3995,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type case SIMDIntrinsicEqual: result = INS_cmeq; break; - case SIMDIntrinsicMul: - result = INS_mul; - break; case SIMDIntrinsicNarrow: // Use INS_xtn lower bytes of result followed by INS_xtn2 for upper bytes // Return lower bytes instruction here @@ -4326,9 +4304,7 @@ void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) // void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) { - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub || - simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv || - simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd || + assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual); GenTree* op1 = simdNode->gtGetOp1(); @@ -4357,90 +4333,6 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) genProduceReg(simdNode); } -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product. -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Return Value: -// None. 
-// -void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode) -{ - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct); - - GenTree* op1 = simdNode->gtGetOp1(); - GenTree* op2 = simdNode->gtGetOp2(); - var_types baseType = simdNode->gtSIMDBaseType; - var_types simdType = op1->TypeGet(); - - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - - var_types targetType = simdNode->TypeGet(); - assert(targetType == baseType); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - regNumber tmpReg = targetReg; - - if (!varTypeIsFloating(baseType)) - { - tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - } - - instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicMul, baseType); - emitAttr attr = (simdNode->gtSIMDSize > 8) ? EA_16BYTE : EA_8BYTE; - insOpts opt = genGetSimdInsOpt(attr, baseType); - - // Vector multiply - GetEmitter()->emitIns_R_R_R(ins, attr, tmpReg, op1Reg, op2Reg, opt); - - if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) - { - // For 12Byte vectors we must zero upper bits to get correct dot product - // We do not assume upper bits are zero. - GetEmitter()->emitIns_R_R_I(INS_ins, EA_4BYTE, tmpReg, REG_ZR, 3); - } - - // Vector add horizontal - if (varTypeIsFloating(baseType)) - { - if (baseType == TYP_FLOAT) - { - if (opt == INS_OPTS_4S) - { - GetEmitter()->emitIns_R_R_R(INS_faddp, EA_16BYTE, tmpReg, tmpReg, tmpReg, INS_OPTS_4S); - } - GetEmitter()->emitIns_R_R(INS_faddp, EA_8BYTE, targetReg, tmpReg, INS_OPTS_2S); - } - else - { - GetEmitter()->emitIns_R_R(INS_faddp, EA_16BYTE, targetReg, tmpReg, INS_OPTS_2D); - } - } - else - { - ins = varTypeIsUnsigned(baseType) ? 
INS_uaddlv : INS_saddlv; - - GetEmitter()->emitIns_R_R(ins, attr, tmpReg, tmpReg, opt); - - // Mov to integer register - if (varTypeIsUnsigned(baseType) || (genTypeSize(baseType) < 4)) - { - GetEmitter()->emitIns_R_R_I(INS_mov, emitTypeSize(baseType), targetReg, tmpReg, 0); - } - else - { - GetEmitter()->emitIns_R_R_I(INS_smov, emitActualTypeSize(baseType), targetReg, tmpReg, 0); - } - } - - genProduceReg(simdNode); -} - //------------------------------------------------------------------------------------ // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i. // diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h index be64fa8c3e6b4a..2078ec2f786aa7 100644 --- a/src/coreclr/src/jit/compiler.h +++ b/src/coreclr/src/jit/compiler.h @@ -2542,7 +2542,6 @@ class Compiler #ifdef FEATURE_SIMD GenTree* gtNewSIMDVectorZero(var_types simdType, var_types baseType, unsigned size); - GenTree* gtNewSIMDVectorOne(var_types simdType, var_types baseType, unsigned size); #endif GenTree* gtNewBlkOpNode(GenTree* dst, GenTree* srcOrFillVal, bool isVolatile, bool isCopyBlock); @@ -2630,6 +2629,9 @@ class Compiler var_types baseType, unsigned size); + GenTreeHWIntrinsic* gtNewSimdCreateBroadcastNode( + var_types type, GenTree* op1, var_types baseType, unsigned size, bool isSimdAsHWIntrinsic); + GenTreeHWIntrinsic* gtNewSimdAsHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID, var_types baseType, @@ -3751,7 +3753,7 @@ class Compiler CORINFO_CLASS_HANDLE clsHnd, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig, - bool mustExpand); + GenTree* newobjThis); protected: bool compSupportsHWIntrinsic(CORINFO_InstructionSet isa); @@ -3761,7 +3763,8 @@ class Compiler CORINFO_SIG_INFO* sig, var_types retType, var_types baseType, - unsigned simdSize); + unsigned simdSize, + GenTree* newobjThis); GenTree* impSimdAsHWIntrinsicCndSel(CORINFO_CLASS_HANDLE clsHnd, var_types retType, @@ -3779,7 +3782,10 @@ class Compiler var_types retType, 
unsigned simdSize); - GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass, bool expectAddr = false); + GenTree* getArgForHWIntrinsic(var_types argType, + CORINFO_CLASS_HANDLE argClass, + bool expectAddr = false, + GenTree* newobjThis = nullptr); GenTree* impNonConstFallback(NamedIntrinsic intrinsic, var_types simdType, var_types baseType); GenTree* addRangeCheckIfNeeded( NamedIntrinsic intrinsic, GenTree* immOp, bool mustExpand, int immLowerBound, int immUpperBound); diff --git a/src/coreclr/src/jit/gentree.cpp b/src/coreclr/src/jit/gentree.cpp index 6a316b1dcf2f12..3c241c0c7c3e95 100644 --- a/src/coreclr/src/jit/gentree.cpp +++ b/src/coreclr/src/jit/gentree.cpp @@ -6058,40 +6058,6 @@ GenTree* Compiler::gtNewSIMDVectorZero(var_types simdType, var_types baseType, u initVal->gtType = baseType; return gtNewSIMDNode(simdType, initVal, nullptr, SIMDIntrinsicInit, baseType, size); } - -//--------------------------------------------------------------------- -// gtNewSIMDVectorOne: create a GT_SIMD node for Vector.One -// -// Arguments: -// simdType - simd vector type -// baseType - element type of vector -// size - size of vector in bytes -GenTree* Compiler::gtNewSIMDVectorOne(var_types simdType, var_types baseType, unsigned size) -{ - GenTree* initVal; - if (varTypeIsSmallInt(baseType)) - { - unsigned baseSize = genTypeSize(baseType); - int val; - if (baseSize == 1) - { - val = 0x01010101; - } - else - { - val = 0x00010001; - } - initVal = gtNewIconNode(val); - } - else - { - initVal = gtNewOneConNode(baseType); - } - - baseType = genActualType(baseType); - initVal->gtType = baseType; - return gtNewSIMDNode(simdType, initVal, nullptr, SIMDIntrinsicInit, baseType, size); -} #endif // FEATURE_SIMD GenTreeCall* Compiler::gtNewIndCallNode(GenTree* addr, var_types type, GenTreeCall::Use* args, IL_OFFSETX ilOffset) @@ -18463,11 +18429,9 @@ bool GenTree::isCommutativeSIMDIntrinsic() assert(gtOper == GT_SIMD); switch (AsSIMD()->gtSIMDIntrinsicID) { - 
case SIMDIntrinsicAdd: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: case SIMDIntrinsicEqual: - case SIMDIntrinsicMul: return true; default: return false; @@ -18630,6 +18594,43 @@ GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types type, GenTreeHWIntrinsic(type, gtNewArgList(op1, op2, op3, op4), hwIntrinsicID, baseType, size); } +GenTreeHWIntrinsic* Compiler::gtNewSimdCreateBroadcastNode( + var_types type, GenTree* op1, var_types baseType, unsigned size, bool isSimdAsHWIntrinsic) +{ + NamedIntrinsic hwIntrinsicID = NI_Vector128_Create; + +#if defined(TARGET_XARCH) +#if defined(TARGET_X86) + if (varTypeIsLong(baseType) && !op1->IsIntegralConst()) + { + // TODO-XARCH-CQ: It may be beneficial to emit the movq + // instruction, which takes a 64-bit memory address and + // works on 32-bit x86 systems. + unreached(); + } +#endif // TARGET_X86 + + if (size == 32) + { + hwIntrinsicID = NI_Vector256_Create; + } +#elif defined(TARGET_ARM64) + if (size == 8) + { + hwIntrinsicID = NI_Vector64_Create; + } +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 + + if (isSimdAsHWIntrinsic) + { + return gtNewSimdAsHWIntrinsicNode(type, op1, hwIntrinsicID, baseType, size); + } + + return gtNewSimdHWIntrinsicNode(type, op1, hwIntrinsicID, baseType, size); +} + GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID) { SetOpLclRelatedToSIMDIntrinsic(op1); diff --git a/src/coreclr/src/jit/hwintrinsic.cpp b/src/coreclr/src/jit/hwintrinsic.cpp index 0ce3ec25b69f5c..237816e9d43ca0 100644 --- a/src/coreclr/src/jit/hwintrinsic.cpp +++ b/src/coreclr/src/jit/hwintrinsic.cpp @@ -487,14 +487,19 @@ bool HWIntrinsicInfo::isImmOp(NamedIntrinsic id, const GenTree* op) // Arguments: // argType -- the required type of argument // argClass -- the class handle of argType -// expectAddr -- if true indicates we are expecting type stack entry to be a TYP_BYREF. 
+// expectAddr -- if true indicates we are expecting type stack entry to be a TYP_BYREF. +// newobjThis -- For CEE_NEWOBJ, this is the temp grabbed for the allocated uninitialized object. // // Return Value: // the validated argument // -GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass, bool expectAddr) +GenTree* Compiler::getArgForHWIntrinsic(var_types argType, + CORINFO_CLASS_HANDLE argClass, + bool expectAddr, + GenTree* newobjThis) { GenTree* arg = nullptr; + if (varTypeIsStruct(argType)) { if (!varTypeIsSIMD(argType)) @@ -504,16 +509,32 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argType = getSIMDTypeForSize(argSizeBytes); } assert(varTypeIsSIMD(argType)); - arg = impSIMDPopStack(argType, expectAddr); - assert(varTypeIsSIMD(arg->TypeGet())); + + if (newobjThis == nullptr) + { + arg = impSIMDPopStack(argType, expectAddr); + assert(varTypeIsSIMD(arg->TypeGet())); + } + else + { + assert((newobjThis->gtOper == GT_ADDR) && (newobjThis->AsOp()->gtOp1->gtOper == GT_LCL_VAR)); + arg = newobjThis; + + // push newobj result on type stack + unsigned tmp = arg->AsOp()->gtOp1->AsLclVarCommon()->GetLclNum(); + impPushOnStack(gtNewLclvNode(tmp, lvaGetRealType(tmp)), verMakeTypeInfo(argClass).NormaliseForStack()); + } } else { assert(varTypeIsArithmetic(argType)); + arg = impPopStack().val; assert(varTypeIsArithmetic(arg->TypeGet())); + assert(genActualType(arg->gtType) == genActualType(argType)); } + return arg; } diff --git a/src/coreclr/src/jit/hwintrinsiclistarm64.h b/src/coreclr/src/jit/hwintrinsiclistarm64.h index 194929b4ac1c73..7de865284e211c 100644 --- a/src/coreclr/src/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/src/jit/hwintrinsiclistarm64.h @@ -30,6 +30,7 @@ HARDWARE_INTRINSIC(Vector64, AsUInt32, HARDWARE_INTRINSIC(Vector64, AsUInt64, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, Create, 8, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov, INS_mov, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, CreateScalarUnsafe, 8, 1, {INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_invalid, INS_invalid, INS_fmov, INS_invalid}, HW_Category_SIMD, HW_Flag_SpecialCodeGen|HW_Flag_SupportsContainment) +HARDWARE_INTRINSIC(Vector64, Dot, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, get_AllBitsSet, 8, 0, {INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, get_Count, 8, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, get_Zero, 8, 0, {INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) @@ -62,6 +63,7 @@ HARDWARE_INTRINSIC(Vector128, AsVector4, HARDWARE_INTRINSIC(Vector128, AsVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_fmov, INS_fmov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen|HW_Flag_SupportsContainment) +HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) diff --git a/src/coreclr/src/jit/hwintrinsiclistxarch.h b/src/coreclr/src/jit/hwintrinsiclistxarch.h index c6017fb12c44ca..02d1edb15fae12 100644 --- a/src/coreclr/src/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/src/jit/hwintrinsiclistxarch.h @@ -45,6 +45,7 @@ HARDWARE_INTRINSIC(Vector128, AsVector4, HARDWARE_INTRINSIC(Vector128, AsVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) 
HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) // The instruction generated for float/double depends on which ISAs are supported HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) @@ -81,6 +82,7 @@ HARDWARE_INTRINSIC(Vector256, get_Count, HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) 
+HARDWARE_INTRINSIC(Vector256, Dot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, GetLower, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, op_Equality, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) @@ -346,7 +348,7 @@ HARDWARE_INTRINSIC(SSE3, MoveLowAndDuplicate, // SSSE3 Intrinsics HARDWARE_INTRINSIC(SSSE3, Abs, 16, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(SSSE3, AlignRight, 16, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_invalid, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) 
HARDWARE_INTRINSIC(SSSE3, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSSE3, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -527,7 +529,7 @@ HARDWARE_INTRINSIC(AVX2, GatherVector128, HARDWARE_INTRINSIC(AVX2, GatherVector256, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment) HARDWARE_INTRINSIC(AVX2, GatherMaskVector128, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment) HARDWARE_INTRINSIC(AVX2, GatherMaskVector256, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment) -HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_invalid, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, HorizontalAddSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, HorizontalSubtractSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) diff --git a/src/coreclr/src/jit/hwintrinsicxarch.cpp b/src/coreclr/src/jit/hwintrinsicxarch.cpp index c7dfaf5f7311e5..3c40ac1d96d86c 100644 --- a/src/coreclr/src/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/src/jit/hwintrinsicxarch.cpp @@ -790,10 +790,93 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); - if (compExactlyDependsOn(InstructionSet_SSE) && varTypeIsFloating(baseType)) + bool isSupported = false; + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + isSupported = compExactlyDependsOn(InstructionSet_SSE2); + break; + } + + case TYP_LONG: + case TYP_ULONG: + { + isSupported = compExactlyDependsOn(InstructionSet_SSE2_X64); + break; + } + + case TYP_FLOAT: + case TYP_DOUBLE: + { + isSupported = compExactlyDependsOn(InstructionSet_SSE); + break; + } + + default: + { + unreached(); + } + } + + if (isSupported) + { + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + } + break; + } + + case NI_Vector256_ToScalar: + { + assert(sig->numArgs == 1); + + bool isSupported = false; + + switch 
(baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + isSupported = compExactlyDependsOn(InstructionSet_AVX); + break; + } + + case TYP_LONG: + case TYP_ULONG: + { + isSupported = + compExactlyDependsOn(InstructionSet_AVX) && compExactlyDependsOn(InstructionSet_SSE2_X64); + break; + } + + case TYP_FLOAT: + case TYP_DOUBLE: + { + isSupported = compExactlyDependsOn(InstructionSet_AVX); + break; + } + + default: + { + unreached(); + } + } + + if (isSupported) { op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, 16); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); } break; } @@ -846,18 +929,6 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, break; } - case NI_Vector256_ToScalar: - { - assert(sig->numArgs == 1); - - if (compExactlyDependsOn(InstructionSet_AVX) && varTypeIsFloating(baseType)) - { - op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, 32); - } - break; - } - case NI_Vector256_get_Zero: case NI_Vector256_get_AllBitsSet: { diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp index 943a64bdf93306..a549620c24f89a 100644 --- a/src/coreclr/src/jit/importer.cpp +++ b/src/coreclr/src/jit/importer.cpp @@ -3519,7 +3519,11 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis, if ((ni > NI_SIMD_AS_HWINTRINSIC_START) && (ni < NI_SIMD_AS_HWINTRINSIC_END)) { - return impSimdAsHWIntrinsic(ni, clsHnd, method, sig, mustExpand); + // These intrinsics aren't defined recursively and so they will never be mustExpand + // Instead, they provide software fallbacks that will be executed instead. 
+ + assert(!mustExpand); + return impSimdAsHWIntrinsic(ni, clsHnd, method, sig, newobjThis); } #endif // FEATURE_HW_INTRINSICS } diff --git a/src/coreclr/src/jit/lower.h b/src/coreclr/src/jit/lower.h index 49b72630eb7078..01e1e401101851 100644 --- a/src/coreclr/src/jit/lower.h +++ b/src/coreclr/src/jit/lower.h @@ -326,11 +326,14 @@ class Lowering final : public Phase void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition); void LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp); void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node); + void LowerHWIntrinsicDot(GenTreeHWIntrinsic* node); void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); -#ifdef TARGET_ARM64 +#if defined(TARGET_XARCH) + void LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node); +#elif defined(TARGET_ARM64) bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); -#endif // TARGET_ARM64 +#endif // !TARGET_XARCH && !TARGET_ARM64 union VectorConstant { int8_t i8[32]; @@ -411,11 +414,26 @@ class Lowering final : public Phase case TYP_LONG: case TYP_ULONG: { - if (arg->OperIs(GT_CNS_LNG)) +#if defined(TARGET_64BIT) + if (arg->IsCnsIntOrI()) { - vecCns.i64[argIdx] = static_cast<int64_t>(arg->AsLngCon()->gtLconVal); + vecCns.i64[argIdx] = static_cast<int64_t>(arg->AsIntCon()->gtIconVal); + return true; + } +#else + if (arg->OperIsLong() && arg->AsOp()->gtOp1->IsCnsIntOrI() && arg->AsOp()->gtOp2->IsCnsIntOrI()) + { + // 32-bit targets will decompose GT_CNS_LNG into two GT_CNS_INT + // Rebuild the 64-bit value; zero-extend the low half so its sign bit cannot clobber the high half + + INT64 gtLconVal = arg->AsOp()->gtOp2->AsIntCon()->gtIconVal; + gtLconVal <<= 32; + gtLconVal |= static_cast<uint32_t>(arg->AsOp()->gtOp1->AsIntCon()->gtIconVal); + + vecCns.i64[argIdx] = gtLconVal; return true; } +#endif // TARGET_64BIT else { // We expect the VectorConstant to have been already zeroed diff --git a/src/coreclr/src/jit/lowerarmarch.cpp b/src/coreclr/src/jit/lowerarmarch.cpp index 84c664a7606e9b..69aa22c99fe271 100644 --- 
a/src/coreclr/src/jit/lowerarmarch.cpp +++ b/src/coreclr/src/jit/lowerarmarch.cpp @@ -553,6 +553,13 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return; } + case NI_Vector64_Dot: + case NI_Vector128_Dot: + { + LowerHWIntrinsicDot(node); + return; + } + case NI_Vector64_op_Equality: case NI_Vector128_op_Equality: { @@ -773,6 +780,8 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) if ((simdSize == 8) && (simdType == TYP_DOUBLE)) { + // TODO-Cleanup: Struct retyping means we have the wrong type here. We need to + // manually fix it up so the simdType checks below are correct. simdType = TYP_SIMD8; } @@ -887,7 +896,30 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) assert((simdSize == 8) || (simdSize == 16)); - UNATIVE_OFFSET cnsSize = simdSize; + if ((argCnt == 1) || (simdSize == 8) || (vecCns.i64[0] == vecCns.i64[1])) + { + // If we are a single constant or if all parts are the same, we might be able to optimize + // this even further for certain values, such as Zero or AllBitsSet (Vector64 or Vector128 form by simdSize). + + if (vecCns.i64[0] == 0) + { + node->gtOp1 = nullptr; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = (simdSize == 8) ? NI_Vector64_get_Zero : NI_Vector128_get_Zero; + return; + } + else if (vecCns.i64[0] == -1) + { + node->gtOp1 = nullptr; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = (simdSize == 8) ? NI_Vector64_get_AllBitsSet : NI_Vector128_get_AllBitsSet; + return; + } + } + + UNATIVE_OFFSET cnsSize = (simdSize == 12) ? 16 : simdSize; UNATIVE_OFFSET cnsAlign = cnsSize; CORINFO_FIELD_HANDLE hnd = comp->GetEmitter()->emitAnyConst(&vecCns, cnsSize, cnsAlign); @@ -1013,6 +1045,230 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) node->gtHWIntrinsicId = NI_AdvSimd_Insert; } + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicDot: Lowers a Vector64 or Vector128 Dot call +// +// Arguments: +// node - The hardware intrinsic node. 
+// +void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + assert((intrinsicId == NI_Vector64_Dot) || (intrinsicId == NI_Vector128_Dot)); + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(baseType)); + assert(simdSize != 0); + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + assert(op1 != nullptr); + assert(op2 != nullptr); + assert(!op1->OperIsList()); + + // Spare GenTrees to be used for the lowering logic below + // Defined upfront to avoid naming conflicts, etc... + GenTree* idx = nullptr; + GenTree* tmp1 = nullptr; + GenTree* tmp2 = nullptr; + + if (simdSize == 12) + { + assert(baseType == TYP_FLOAT); + + // For 12 byte SIMD, we need to clear the upper 4 bytes: + // idx = CNS_INT int 0x03 + // tmp1 = * CNS_DLB float 0.0 + // /--* op1 simd16 + // +--* idx int + // +--* tmp1 simd16 + // op1 = * HWINTRINSIC simd16 T Insert + // ... + + // This is roughly the following managed code: + // op1 = AdvSimd.Insert(op1, 0x03, 0.0f); + // ... + + idx = comp->gtNewIconNode(0x03, TYP_INT); + BlockRange().InsertAfter(op1, idx); + + tmp1 = comp->gtNewZeroConNode(TYP_FLOAT); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + + op1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, op1, idx, tmp1, NI_AdvSimd_Insert, baseType, simdSize); + BlockRange().InsertAfter(tmp1, op1); + LowerNode(op1); + } + + // We will be constructing the following parts: + // ... + // /--* op1 simd16 + // +--* op2 simd16 + // tmp1 = * HWINTRINSIC simd16 T Multiply + // ... + + // This is roughly the following managed code: + // ... + // var tmp1 = AdvSimd.Multiply(op1, op2); + // ... + + NamedIntrinsic multiply = (baseType == TYP_DOUBLE) ? 
NI_AdvSimd_Arm64_Multiply : NI_AdvSimd_Multiply; + assert(!varTypeIsLong(baseType)); + + tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, op1, op2, multiply, baseType, simdSize); + BlockRange().InsertBefore(node, tmp1); + LowerNode(tmp1); + + if (varTypeIsFloating(baseType)) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // ... + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + if (simdSize == 8) + { + assert(baseType == TYP_FLOAT); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd8 + // +--* tmp2 simd8 + // tmp1 = * HWINTRINSIC simd8 T AddPairwise + // ... + + // This is roughly the following managed code: + // ... + // var tmp1 = AdvSimd.AddPairwise(tmp1, tmp2); + // ... + + tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_AddPairwise, baseType, simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + } + else + { + assert((simdSize == 12) || (simdSize == 16)); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp2 = * HWINTRINSIC simd16 T AddPairwise + // ... + + // This is roughly the following managed code: + // ... + // var tmp1 = AdvSimd.Arm64.AddPairwise(tmp1, tmp2); + // ... 
+ + tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_Arm64_AddPairwise, baseType, + simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + if (baseType == TYP_FLOAT) + { + // Float needs an additional pairwise add to finish summing the parts + // The first will have summed e0 with e1 and e2 with e3 and then repeats that for the upper half + // So, we will have a vector that looks like this: + // < e0 + e1, e2 + e3, e0 + e1, e2 + e3> + // Doing a second horizontal add with itself will then give us + // e0 + e1 + e2 + e3 in all elements of the vector + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp2 = * HWINTRINSIC simd16 T AddPairwise + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // var tmp1 = AdvSimd.Arm64.AddPairwise(tmp1, tmp2); + // ... + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_Arm64_AddPairwise, baseType, + simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + } + } + + tmp2 = tmp1; + } + else + { + assert(varTypeIsIntegral(baseType)); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // tmp2 = * HWINTRINSIC simd16 T AddAcross + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = AdvSimd.Arm64.AddAcross(tmp1); + // ... + + tmp2 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, NI_AdvSimd_Arm64_AddAcross, baseType, simdSize); + BlockRange().InsertAfter(tmp1, tmp2); + LowerNode(tmp2); + } + + // We will be constructing the following parts: + // ... 
+ // /--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T ToScalar + + // This is roughly the following managed code: + // ... + // return tmp2.ToScalar(); + + node->gtOp1 = tmp2; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = (simdSize == 8) ? NI_Vector64_ToScalar : NI_Vector128_ToScalar; + LowerNode(node); + + return; +} #endif // FEATURE_HW_INTRINSICS //------------------------------------------------------------------------ diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp index 7b861c238dc85a..bf5d71aad36f12 100644 --- a/src/coreclr/src/jit/lowerxarch.cpp +++ b/src/coreclr/src/jit/lowerxarch.cpp @@ -943,6 +943,13 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return; } + case NI_Vector128_Dot: + case NI_Vector256_Dot: + { + LowerHWIntrinsicDot(node); + return; + } + case NI_Vector128_op_Equality: case NI_Vector256_op_Equality: { @@ -957,6 +964,13 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return; } + case NI_Vector128_ToScalar: + case NI_Vector256_ToScalar: + { + LowerHWIntrinsicToScalar(node); + break; + } + case NI_SSE2_Insert: case NI_SSE41_Insert: case NI_SSE41_X64_Insert: @@ -1350,7 +1364,7 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) GenTree* tmp = comp->gtNewOperNode(GT_AND, TYP_INT, msk, mskCns); BlockRange().InsertAfter(mskCns, tmp); - LowerNode(msk); + LowerNode(tmp); msk = tmp; @@ -1386,6 +1400,13 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) unsigned simdSize = node->gtSIMDSize; VectorConstant vecCns = {}; + if ((simdSize == 8) && (simdType == TYP_DOUBLE)) + { + // TODO-Cleanup: Struct retyping means we have the wrong type here. We need to + // manually fix it up so the simdType checks below are correct. 
+ simdType = TYP_SIMD8; + } + assert(varTypeIsSIMD(simdType)); assert(varTypeIsArithmetic(baseType)); assert(simdSize != 0); @@ -1455,22 +1476,72 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest()) { - BlockRange().Remove(argList->Current()); + GenTree* arg = argList->Current(); + +#if !defined(TARGET_64BIT) + if (arg->OperIsLong()) + { + BlockRange().Remove(arg->AsOp()->gtOp1); + BlockRange().Remove(arg->AsOp()->gtOp2); + } +#endif // !TARGET_64BIT + + BlockRange().Remove(arg); } } else { +#if !defined(TARGET_64BIT) + if (op1->OperIsLong()) + { + BlockRange().Remove(op1->AsOp()->gtOp1); + BlockRange().Remove(op1->AsOp()->gtOp2); + } +#endif // !TARGET_64BIT + BlockRange().Remove(op1); if (op2 != nullptr) { +#if !defined(TARGET_64BIT) + if (op2->OperIsLong()) + { + BlockRange().Remove(op2->AsOp()->gtOp1); + BlockRange().Remove(op2->AsOp()->gtOp2); + } +#endif // !TARGET_64BIT + BlockRange().Remove(op2); } } - assert((simdSize == 16) || (simdSize == 32)); + assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32)); - UNATIVE_OFFSET cnsSize = simdSize; + if ((argCnt == 1) || + ((vecCns.i64[0] == vecCns.i64[1]) && ((simdSize <= 16) || ((vecCns.i64[1] == vecCns.i64[2]) && (vecCns.i64[2] == vecCns.i64[3]))))) + { + // If we are a single constant or if all parts are the same, we might be able to optimize + // this even further for certain values, such as Zero or AllBitsSet. + + if (vecCns.i64[0] == 0) + { + node->gtOp1 = nullptr; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_get_Zero; + return; + } + else if (vecCns.i64[0] == -1) + { + node->gtOp1 = nullptr; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_get_AllBitsSet; + return; + } + } + + UNATIVE_OFFSET cnsSize = (simdSize != 12) ? simdSize : 16; UNATIVE_OFFSET cnsAlign = (comp->compCodeOpt() != Compiler::SMALL_CODE) ? 
cnsSize : 1; CORINFO_FIELD_HANDLE hnd = comp->GetEmitter()->emitAnyConst(&vecCns, cnsSize, cnsAlign); @@ -2244,7 +2315,7 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // return Sse41.X64.Insert(tmp1, op2, 0x01); idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(op2, idx); + BlockRange().InsertBefore(node, idx); node->gtOp1 = comp->gtNewArgList(tmp1, op2, idx); node->gtOp2 = nullptr; @@ -2451,6 +2522,703 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) } } } + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicDot: Lowers a Vector128 or Vector256 Dot call +// +// Arguments: +// node - The hardware intrinsic node. +// +void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + ; + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + unsigned simd16Count = comp->getSIMDVectorLength(16, baseType); + + assert((intrinsicId == NI_Vector128_Dot) || (intrinsicId == NI_Vector256_Dot)); + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(baseType)); + assert(simdSize != 0); + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + assert(op1 != nullptr); + assert(op2 != nullptr); + assert(!op1->OperIsList()); + + // Spare GenTrees to be used for the lowering logic below + // Defined upfront to avoid naming conflicts, etc... 
+ GenTree* idx = nullptr; + GenTree* tmp1 = nullptr; + GenTree* tmp2 = nullptr; + GenTree* tmp3 = nullptr; + + NamedIntrinsic multiply = NI_Illegal; + NamedIntrinsic horizontalAdd = NI_Illegal; + NamedIntrinsic add = NI_Illegal; + NamedIntrinsic shuffle = NI_Illegal; + + if (simdSize == 32) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + switch (baseType) + { + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + multiply = NI_AVX2_MultiplyLow; + horizontalAdd = NI_AVX2_HorizontalAdd; + add = NI_AVX2_Add; + break; + } + + case TYP_FLOAT: + { + // We will be constructing the following parts: + // idx = CNS_INT int 0xF1 + // /--* op1 simd16 + // +--* op2 simd16 + // +--* idx int + // tmp1 = * HWINTRINSIC simd16 T DotProduct + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // idx = CNS_INT int 0x01 + // /--* tmp2 simd16 + // +--* idx int + // tmp2 = * HWINTRINSIC simd16 T ExtractVector128 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp3 = * HWINTRINSIC simd16 T Add + // /--* tmp3 simd16 + // node = * HWINTRINSIC simd16 T ToScalar + + // This is roughly the following managed code: + // var tmp1 = Avx.DotProduct(op1, op2, 0xF1); + // var tmp2 = Avx.ExtractVector128(tmp1, 0x01); + // var tmp3 = Sse.Add(tmp1, tmp2); + // return tmp3.ToScalar(); + + idx = comp->gtNewIconNode(0xF1, TYP_INT); + BlockRange().InsertBefore(node, idx); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_AVX_DotProduct, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + tmp2 = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, 
baseType, simdSize); + BlockRange().InsertAfter(idx, tmp2); + LowerNode(tmp2); + + tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_SSE_Add, baseType, 16); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); + + node->gtSIMDSize = 16; + + node->gtOp1 = tmp3; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_ToScalar; + LowerNode(node); + + return; + } + + case TYP_DOUBLE: + { + multiply = NI_AVX_Multiply; + horizontalAdd = NI_AVX_HorizontalAdd; + add = NI_AVX_Add; + break; + } + + default: + { + unreached(); + } + } + } + else + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + switch (baseType) + { + case TYP_SHORT: + case TYP_USHORT: + { + multiply = NI_SSE2_MultiplyLow; + horizontalAdd = NI_SSSE3_HorizontalAdd; + add = NI_SSE2_Add; + + if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSSE3)) + { + shuffle = NI_SSE2_ShuffleLow; + } + break; + } + + case TYP_INT: + case TYP_UINT: + { + multiply = NI_SSE41_MultiplyLow; + horizontalAdd = NI_SSSE3_HorizontalAdd; + add = NI_SSE2_Add; + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41)); + break; + } + + case TYP_FLOAT: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // We will be constructing the following parts: + // idx = CNS_INT int 0xF1 (0x31 for simdSize 8, 0x71 for simdSize 12) + // /--* op1 simd16 + // +--* op2 simd16 + // +--* idx int + // tmp3 = * HWINTRINSIC simd16 T DotProduct + // /--* tmp3 simd16 + // node = * HWINTRINSIC simd16 T ToScalar + + // This is roughly the following managed code: + // var tmp3 = Sse41.DotProduct(op1, op2, 0xF1); + // return tmp3.ToScalar(); + + if (simdSize == 8) + { + idx = comp->gtNewIconNode(0x31, TYP_INT); + } + else if (simdSize == 12) + { + idx = comp->gtNewIconNode(0x71, TYP_INT); + } + else + { + assert(simdSize == 16); + idx = comp->gtNewIconNode(0xF1, TYP_INT); + } + BlockRange().InsertBefore(node, idx); + + tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE41_DotProduct, baseType, + 
simdSize); + BlockRange().InsertAfter(idx, tmp3); + LowerNode(tmp3); + + node->gtOp1 = tmp3; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_ToScalar; + LowerNode(node); + + return; + } + + multiply = NI_SSE_Multiply; + horizontalAdd = NI_SSE3_HorizontalAdd; + add = NI_SSE_Add; + + if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + { + shuffle = NI_SSE_Shuffle; + } + break; + } + + case TYP_DOUBLE: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // We will be constructing the following parts: + // idx = CNS_INT int 0x31 + // /--* op1 simd16 + // +--* op2 simd16 + // +--* idx int + // tmp3 = * HWINTRINSIC simd16 T DotProduct + // /--* tmp3 simd16 + // node = * HWINTRINSIC simd16 T ToScalar + + // This is roughly the following managed code: + // var tmp3 = Avx.DotProduct(op1, op2, 0x31); + // return tmp3.ToScalar(); + + idx = comp->gtNewIconNode(0x31, TYP_INT); + BlockRange().InsertBefore(node, idx); + + tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE41_DotProduct, baseType, + simdSize); + BlockRange().InsertAfter(idx, tmp3); + LowerNode(tmp3); + + node->gtOp1 = tmp3; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_ToScalar; + LowerNode(node); + + return; + } + + multiply = NI_SSE2_Multiply; + horizontalAdd = NI_SSE3_HorizontalAdd; + add = NI_SSE2_Add; + + if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + { + shuffle = NI_SSE2_Shuffle; + } + break; + } + + default: + { + unreached(); + } + } + + if (simdSize == 8) + { + assert(baseType == TYP_FLOAT); + + // If simdSize == 8 then we have only two elements, not the 4 that we got from getSIMDVectorLength, + // which we gave a simdSize of 16. So, we set the simd16Count to 2 so that only 1 hadd will + // be emitted rather than 2, so that the upper two elements will be ignored. 
+ + simd16Count = 2; + } + else if (simdSize == 12) + { + assert(baseType == TYP_FLOAT); + + // We will be constructing the following parts: + // ... + // +--* CNS_INT int -1 + // +--* CNS_INT int -1 + // +--* CNS_INT int -1 + // +--* CNS_INT int 0 + // tmp1 = * HWINTRINSIC simd16 T Create + // /--* op1 simd16 + // +--* tmp1 simd16 + // op1 = * HWINTRINSIC simd16 T And + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Vector128.Create(-1, -1, -1, 0); + // op1 = Sse.And(op1, tmp1); + // ... + + GenTree* cns0 = comp->gtNewIconNode(-1, TYP_INT); + BlockRange().InsertAfter(op1, cns0); + + GenTree* cns1 = comp->gtNewIconNode(-1, TYP_INT); + BlockRange().InsertAfter(cns0, cns1); + + GenTree* cns2 = comp->gtNewIconNode(-1, TYP_INT); + BlockRange().InsertAfter(cns1, cns2); + + GenTree* cns3 = comp->gtNewIconNode(0, TYP_INT); + BlockRange().InsertAfter(cns2, cns3); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, cns0, cns1, cns2, cns3, NI_Vector128_Create, TYP_INT, 16); + BlockRange().InsertAfter(cns3, tmp1); + LowerNode(tmp1); + + op1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, tmp1, NI_SSE_And, baseType, simdSize); + BlockRange().InsertAfter(tmp1, op1); + LowerNode(op1); + } + } + + // We will be constructing the following parts: + // /--* op1 simd16 + // +--* op2 simd16 + // tmp1 = * HWINTRINSIC simd16 T Multiply + // ... + + // This is roughly the following managed code: + // var tmp1 = Isa.Multiply(op1, op2); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, multiply, baseType, simdSize); + BlockRange().InsertBefore(node, tmp1); + LowerNode(tmp1); + + // HorizontalAdd combines pairs so we need log2(simd16Count) passes to sum all elements together. + int haddCount = genLog2(simd16Count); + + for (int i = 0; i < haddCount; i++) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // ...
+ + // This is roughly the following managed code: + // ... + // tmp2 = tmp1; + // ... + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + if (shuffle == NI_Illegal) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 T HorizontalAdd + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Isa.HorizontalAdd(tmp1, tmp2); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, horizontalAdd, baseType, simdSize); + } + else + { + int shuffleConst = 0x00; + + switch (i) + { + case 0: + { + assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT) || varTypeIsFloating(baseType)); + + // Adds (e0 + e1, e1 + e0, e2 + e3, e3 + e2), giving: + // e0, e1, e2, e3 | e4, e5, e6, e7 + // e1, e0, e3, e2 | e5, e4, e7, e6 + // ... + + shuffleConst = 0xB1; + break; + } + + case 1: + { + assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT) || (baseType == TYP_FLOAT)); + + // Adds (e0 + e2, e1 + e3, e2 + e0, e3 + e1), giving: + // ... + // e2, e3, e0, e1 | e6, e7, e4, e5 + // e3, e2, e1, e0 | e7, e6, e5, e4 + + shuffleConst = 0x4E; + break; + } + + case 2: + { + assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT)); + + // Adds (e0 + e4, e1 + e5, e2 + e6, e3 + e7), giving: + // ... + // e4, e5, e6, e7 | e0, e1, e2, e3 + // e5, e4, e7, e6 | e1, e0, e3, e2 + // e6, e7, e4, e5 | e2, e3, e0, e1 + // e7, e6, e5, e4 | e3, e2, e1, e0 + + shuffleConst = 0x4E; + break; + } + + default: + { + unreached(); + } + } + + idx = comp->gtNewIconNode(shuffleConst, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + if (varTypeIsFloating(baseType)) + { + // We will be constructing the following parts: + // ...
+ // /--* tmp2 simd16 + // * STORE_LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // tmp3 = LCL_VAR simd16 + // idx = CNS_INT int shuffleConst + // /--* tmp2 simd16 + // +--* tmp3 simd16 + // +--* idx simd16 + // tmp2 = * HWINTRINSIC simd16 T Shuffle + // ... + + // This is roughly the following managed code: + // ... + // tmp3 = tmp2; + // tmp2 = Isa.Shuffle(tmp2, tmp3, shuffleConst); + // ... + + node->gtOp1 = tmp2; + LIR::Use tmp2Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp2Use); + tmp2 = node->gtOp1; + + tmp3 = comp->gtClone(tmp2); + BlockRange().InsertAfter(tmp2, tmp3); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, tmp3, idx, shuffle, baseType, simdSize); + } + else + { + assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT)); + + if (i < 2) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int shuffleConst + // /--* tmp2 simd16 + // +--* idx simd16 + // tmp2 = * HWINTRINSIC simd16 T ShuffleLow + // idx = CNS_INT int shuffleConst + // /--* tmp2 simd16 + // +--* idx simd16 + // tmp2 = * HWINTRINSIC simd16 T ShuffleHigh + // ... + + // This is roughly the following managed code: + // ... + // tmp2 = Isa.Shuffle(tmp1, shuffleConst); + // ... + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_ShuffleLow, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp2); + LowerNode(tmp2); + + idx = comp->gtNewIconNode(shuffleConst, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_ShuffleHigh, baseType, simdSize); + } + else + { + assert(i == 2); + + // We will be constructing the following parts: + // ... + // idx = CNS_INT int shuffleConst + // /--* tmp2 simd16 + // +--* idx simd16 + // tmp2 = * HWINTRINSIC simd16 T ShuffleLow + // ... + + // This is roughly the following managed code: + // ... + // tmp2 = Isa.Shuffle(tmp1, shuffleConst); + // ... 
+ + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_Shuffle, TYP_INT, simdSize); + } + } + + BlockRange().InsertAfter(idx, tmp2); + LowerNode(tmp2); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 T Add + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Isa.Add(tmp1, tmp2); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, add, baseType, simdSize); + } + + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + } + + if (simdSize == 32) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // idx = CNS_INT int 0x01 + // /--* tmp2 simd16 + // +--* idx int + // tmp2 = * HWINTRINSIC simd16 T ExtractVector128 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 T Add + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // tmp2 = Avx.ExtractVector128(tmp2, 0x01); + // var tmp1 = Isa.Add(tmp1, tmp2); + // ... + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp2); + LowerNode(tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, add, baseType, 16); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + node->gtSIMDSize = 16; + } + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd16 T ToScalar + + // This is roughly the following managed code: + // ... 
+ // return tmp1.ToScalar(); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_ToScalar; + LowerNode(node); + + return; +} + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicToScalar: Lowers a Vector128 or Vector256 ToScalar call +// +// Arguments: +// node - The hardware intrinsic node. +// +void Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + ; + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + assert((intrinsicId == NI_Vector128_ToScalar) || (intrinsicId == NI_Vector256_ToScalar)); + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(baseType)); + assert(simdSize != 0); + + switch (baseType) + { + case TYP_BYTE: + case TYP_SHORT: + case TYP_INT: + { + node->gtType = TYP_INT; + node->gtSIMDBaseType = TYP_INT; + node->gtHWIntrinsicId = NI_SSE2_ConvertToInt32; + break; + } + + case TYP_UBYTE: + case TYP_USHORT: + case TYP_UINT: + { + node->gtType = TYP_UINT; + node->gtSIMDBaseType = TYP_UINT; + node->gtHWIntrinsicId = NI_SSE2_ConvertToUInt32; + break; + } + +#if defined(TARGET_AMD64) + case TYP_LONG: + { + node->gtHWIntrinsicId = NI_SSE2_X64_ConvertToInt64; + break; + } + + case TYP_ULONG: + { + node->gtHWIntrinsicId = NI_SSE2_X64_ConvertToUInt64; + break; + } +#endif // TARGET_AMD64 + + case TYP_FLOAT: + case TYP_DOUBLE: + { + ContainCheckHWIntrinsic(node); + return; + } + + default: + { + unreached(); + } + } + + LowerNode(node); + + if (genTypeSize(baseType) < 4) + { + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + + GenTreeCast* cast = comp->gtNewCastNode(baseType, node, node->IsUnsigned(), baseType); + BlockRange().InsertAfter(node, cast); + + if (foundUse) + { + use.ReplaceWith(comp, cast); + } + LowerNode(cast); + } +} #endif // 
FEATURE_HW_INTRINSICS //---------------------------------------------------------------------------------------------- diff --git a/src/coreclr/src/jit/lsraarm64.cpp b/src/coreclr/src/jit/lsraarm64.cpp index df0c7113fa6799..4113c891d2aed5 100644 --- a/src/coreclr/src/jit/lsraarm64.cpp +++ b/src/coreclr/src/jit/lsraarm64.cpp @@ -841,10 +841,7 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) } break; - case SIMDIntrinsicAdd: case SIMDIntrinsicSub: - case SIMDIntrinsicMul: - case SIMDIntrinsicDiv: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: case SIMDIntrinsicEqual: @@ -896,19 +893,11 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) // We have an array and an index, which may be contained. break; - case SIMDIntrinsicDotProduct: - buildInternalFloatRegisterDefForNode(simdTree); - break; - case SIMDIntrinsicInitArrayX: case SIMDIntrinsicInitFixed: case SIMDIntrinsicCopyToArray: case SIMDIntrinsicCopyToArrayX: case SIMDIntrinsicNone: - case SIMDIntrinsicGetCount: - case SIMDIntrinsicGetOne: - case SIMDIntrinsicGetZero: - case SIMDIntrinsicGetAllOnes: case SIMDIntrinsicGetX: case SIMDIntrinsicGetY: case SIMDIntrinsicGetZ: diff --git a/src/coreclr/src/jit/lsraxarch.cpp b/src/coreclr/src/jit/lsraxarch.cpp index df8c897dc14bd5..f90412d6a8adaa 100644 --- a/src/coreclr/src/jit/lsraxarch.cpp +++ b/src/coreclr/src/jit/lsraxarch.cpp @@ -1933,67 +1933,14 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) // We have an array and an index, which may be contained. 
break; - case SIMDIntrinsicDiv: - // SSE2 has no instruction support for division on integer vectors - noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); - break; - - case SIMDIntrinsicAdd: case SIMDIntrinsicSub: - case SIMDIntrinsicMul: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: - // SSE2 32-bit integer multiplication requires two temp regs - if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT && - compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) - { - buildInternalFloatRegisterDefForNode(simdTree); - buildInternalFloatRegisterDefForNode(simdTree); - } break; case SIMDIntrinsicEqual: break; - case SIMDIntrinsicDotProduct: - // Float/Double vectors: - // For SSE, or AVX with 32-byte vectors, we also need an internal register - // as scratch. Further we need the targetReg and internal reg to be distinct - // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we - // don't need a tmpReg. - // - // 32-byte integer vector on SSE4/AVX: - // will take advantage of phaddd, which operates only on 128-bit xmm reg. - // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal - // registers since targetReg is an int type register. - // - // See genSIMDIntrinsicDotProduct() for details on code sequence generated - // and the need for scratch registers. - if (varTypeIsFloating(simdTree->gtSIMDBaseType)) - { - if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) || - (simdTree->gtGetOp1()->TypeGet() == TYP_SIMD32)) - { - buildInternalFloatRegisterDefForNode(simdTree); - setInternalRegsDelayFree = true; - } - // else don't need scratch reg(s). - } - else - { - assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported); - - // No need to setInternalRegsDelayFree since targetReg is a - // an int type reg and guaranteed to be different from xmm/ymm - // regs. 
- buildInternalFloatRegisterDefForNode(simdTree); - if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) - { - buildInternalFloatRegisterDefForNode(simdTree); - } - } - break; - case SIMDIntrinsicGetItem: { // This implements get_Item method. The sources are: @@ -2163,10 +2110,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicGetY: case SIMDIntrinsicGetZ: case SIMDIntrinsicGetW: - case SIMDIntrinsicGetOne: - case SIMDIntrinsicGetZero: - case SIMDIntrinsicGetCount: - case SIMDIntrinsicGetAllOnes: assert(!"Get intrinsics should not be seen during Lowering."); unreached(); diff --git a/src/coreclr/src/jit/simd.cpp b/src/coreclr/src/jit/simd.cpp index 345351947fba35..ed92766cc29aff 100644 --- a/src/coreclr/src/jit/simd.cpp +++ b/src/coreclr/src/jit/simd.cpp @@ -1074,14 +1074,10 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in { case SIMDIntrinsicInit: case SIMDIntrinsicGetItem: - case SIMDIntrinsicAdd: case SIMDIntrinsicSub: - case SIMDIntrinsicMul: - case SIMDIntrinsicDiv: case SIMDIntrinsicEqual: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: - case SIMDIntrinsicDotProduct: case SIMDIntrinsicCast: case SIMDIntrinsicConvertToSingle: case SIMDIntrinsicConvertToDouble: @@ -1837,7 +1833,7 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode, #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 - if (!compOpportunisticallyDependsOn(minimumIsa)) + if (!compOpportunisticallyDependsOn(minimumIsa) || !JitConfig.EnableHWIntrinsic()) { // The user disabled support for the baseline ISA so // don't emit any SIMD intrinsics as they all require @@ -1880,38 +1876,6 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode, switch (simdIntrinsicID) { - case SIMDIntrinsicGetCount: - { - int length = getSIMDVectorLength(clsHnd); - GenTreeIntCon* intConstTree = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, length); - retVal = intConstTree; - - intConstTree->gtFlags |= GTF_ICON_SIMD_COUNT; - } - break; 
- - case SIMDIntrinsicGetZero: - retVal = gtNewSIMDVectorZero(simdType, baseType, size); - break; - - case SIMDIntrinsicGetOne: - retVal = gtNewSIMDVectorOne(simdType, baseType, size); - break; - - case SIMDIntrinsicGetAllOnes: - { - // Equivalent to (Vector) new Vector(0xffffffff); - GenTree* initVal = gtNewIconNode(0xffffffff, TYP_INT); - simdTree = gtNewSIMDNode(simdType, initVal, nullptr, SIMDIntrinsicInit, TYP_INT, size); - if (baseType != TYP_INT) - { - // cast it to required baseType if different from TYP_INT - simdTree = gtNewSIMDNode(simdType, simdTree, nullptr, SIMDIntrinsicCast, baseType, size); - } - retVal = simdTree; - } - break; - case SIMDIntrinsicInit: case SIMDIntrinsicInitN: { @@ -2262,55 +2226,10 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode, } break; - case SIMDIntrinsicAdd: case SIMDIntrinsicSub: - case SIMDIntrinsicMul: - case SIMDIntrinsicDiv: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: { -#if defined(DEBUG) - // check for the cases where we don't support intrinsics. - // This check should be done before we make modifications to type stack. - // Note that this is more of a double safety check for robustness since - // we expect getSIMDIntrinsicInfo() to have filtered out intrinsics on - // unsupported base types. If getSIMdIntrinsicInfo() doesn't filter due - // to some bug, assert in chk/dbg will fire. - if (!varTypeIsFloating(baseType)) - { - if (simdIntrinsicID == SIMDIntrinsicMul) - { -#if defined(TARGET_XARCH) - if ((baseType != TYP_INT) && (baseType != TYP_SHORT)) - { - // TODO-CQ: implement mul on these integer vectors. - // Note that SSE2 has no direct support for these vectors. - assert(!"Mul not supported on long/ulong/uint/small int vectors\n"); - return nullptr; - } -#endif // TARGET_XARCH -#if defined(TARGET_ARM64) - if ((baseType == TYP_ULONG) && (baseType == TYP_LONG)) - { - // TODO-CQ: implement mul on these integer vectors. - // Note that ARM64 has no direct support for these vectors. 
- assert(!"Mul not supported on long/ulong vectors\n"); - return nullptr; - } -#endif // TARGET_ARM64 - } -#if defined(TARGET_XARCH) || defined(TARGET_ARM64) - // common to all integer type vectors - if (simdIntrinsicID == SIMDIntrinsicDiv) - { - // SSE2 doesn't support div on non-floating point vectors. - assert(!"Div not supported on integer type vectors\n"); - return nullptr; - } -#endif // defined(TARGET_XARCH) || defined(TARGET_ARM64) - } -#endif // DEBUG - // op1 is the first operand; if instance method, op1 is "this" arg // op2 is the second operand op2 = impSIMDPopStack(simdType); @@ -2362,31 +2281,6 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode, } break; - case SIMDIntrinsicDotProduct: - { -#if defined(TARGET_XARCH) - // Right now dot product is supported only for float/double vectors and - // int vectors on SSE4/AVX. - if (!varTypeIsFloating(baseType) && !(baseType == TYP_INT && getSIMDSupportLevel() >= SIMD_SSE4_Supported)) - { - return nullptr; - } -#endif // TARGET_XARCH - - // op1 is a SIMD variable that is the first source and also "this" arg. - // op2 is a SIMD variable which is the second source. 
- op2 = impSIMDPopStack(simdType); - op1 = impSIMDPopStack(simdType, instMethod); - - simdTree = gtNewSIMDNode(baseType, op1, op2, simdIntrinsicID, baseType, size); - if (simdType == TYP_SIMD12) - { - simdTree->gtFlags |= GTF_SIMD12_OP; - } - retVal = simdTree; - } - break; - case SIMDIntrinsicGetW: retVal = impSIMDGetFixed(simdType, baseType, size, 3); break; diff --git a/src/coreclr/src/jit/simdashwintrinsic.cpp b/src/coreclr/src/jit/simdashwintrinsic.cpp index 4c0af6ff950c6f..64d559e149adfb 100644 --- a/src/coreclr/src/jit/simdashwintrinsic.cpp +++ b/src/coreclr/src/jit/simdashwintrinsic.cpp @@ -169,10 +169,8 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_CLASS_HANDLE clsHnd, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig, - bool mustExpand) + GenTree* newobjThis) { - assert(!mustExpand); - if (!featureSIMD) { // We can't support SIMD intrinsics if the JIT doesn't support the feature @@ -187,7 +185,7 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic intrinsic, #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 - if (!compOpportunisticallyDependsOn(minimumIsa)) + if (!compOpportunisticallyDependsOn(minimumIsa) || !JitConfig.EnableHWIntrinsic()) { // The user disabled support for the baseline ISA so // don't emit any SIMD intrinsics as they all require @@ -274,7 +272,7 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic intrinsic, if (hwIntrinsic == intrinsic) { // The SIMD intrinsic requires special handling outside the normal code path - return impSimdAsHWIntrinsicSpecial(intrinsic, clsHnd, sig, retType, baseType, simdSize); + return impSimdAsHWIntrinsicSpecial(intrinsic, clsHnd, sig, retType, baseType, simdSize, newobjThis); } CORINFO_InstructionSet hwIntrinsicIsa = HWIntrinsicInfo::lookupIsa(hwIntrinsic); @@ -352,7 +350,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig, var_types retType, var_types baseType, - unsigned simdSize) + unsigned 
simdSize, + GenTree* newobjThis) { assert(featureSIMD); assert(retType != TYP_UNKNOWN); @@ -387,28 +386,110 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, #if defined(TARGET_XARCH) bool isVectorT256 = (SimdAsHWIntrinsicInfo::lookupClassId(intrinsic) == SimdAsHWIntrinsicClassId::VectorT256); - if ((baseType != TYP_FLOAT) && !compOpportunisticallyDependsOn(InstructionSet_SSE2)) - { - // Vector, for everything but float, requires at least SSE2 - return nullptr; - } - else if (!compOpportunisticallyDependsOn(InstructionSet_SSE)) + // We should have already exited early if SSE2 isn't supported + assert(compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + switch (intrinsic) { - // Vector requires at least SSE - return nullptr; +#if defined(TARGET_X86) + case NI_VectorT128_CreateBroadcast: + case NI_VectorT256_CreateBroadcast: + { + if (varTypeIsLong(baseType)) + { + // TODO-XARCH-CQ: It may be beneficial to emit the movq + // instruction, which takes a 64-bit memory address and + // works on 32-bit x86 systems. 
+ return nullptr; + } + break; + } +#endif // TARGET_X86 + + case NI_VectorT128_Dot: + { + if (!compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // We need to exit early if this is Vector.Dot for int or uint and SSE41 is not supported + // The other types should be handled via the table driven paths + + assert((baseType == TYP_INT) || (baseType == TYP_UINT)); + return nullptr; + } + break; + } + + default: + { + // Most intrinsics have some path that works even if only SSE2 is available + break; + } } // Vector, when 32-bytes, requires at least AVX2 assert(!isVectorT256 || compIsaSupportedDebugOnly(InstructionSet_AVX2)); -#endif +#elif defined(TARGET_ARM64) + // We should have already exited early if AdvSimd isn't supported + assert(compIsaSupportedDebugOnly(InstructionSet_AdvSimd)); +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 + + GenTree* copyBlkDst = nullptr; + GenTree* copyBlkSrc = nullptr; switch (numArgs) { case 0: { + assert(newobjThis == nullptr); + switch (intrinsic) { #if defined(TARGET_XARCH) + case NI_Vector2_get_One: + case NI_Vector3_get_One: + case NI_Vector4_get_One: + case NI_VectorT128_get_One: + case NI_VectorT256_get_One: + { + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + op1 = gtNewIconNode(1, TYP_INT); + break; + } + + case TYP_LONG: + case TYP_ULONG: + { + op1 = gtNewLconNode(1); + break; + } + + case TYP_FLOAT: + case TYP_DOUBLE: + { + op1 = gtNewDconNode(1.0, baseType); + break; + } + + default: + { + unreached(); + } + } + + return gtNewSimdCreateBroadcastNode(retType, op1, baseType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_get_Count: case NI_VectorT256_get_Count: { @@ -417,6 +498,48 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, return countNode; } #elif defined(TARGET_ARM64) + case NI_Vector2_get_One: + case NI_Vector3_get_One: + case 
NI_Vector4_get_One: + case NI_VectorT128_get_One: + { + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + op1 = gtNewIconNode(1, TYP_INT); + break; + } + + case TYP_LONG: + case TYP_ULONG: + { + op1 = gtNewLconNode(1); + break; + } + + case TYP_FLOAT: + case TYP_DOUBLE: + { + op1 = gtNewDconNode(1.0, baseType); + break; + } + + default: + { + unreached(); + } + } + + return gtNewSimdCreateBroadcastNode(retType, op1, baseType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_get_Count: { GenTreeIntCon* countNode = gtNewIconNode(getSIMDVectorLength(simdSize, baseType), TYP_INT); @@ -438,6 +561,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, case 1: { + assert(newobjThis == nullptr); + bool isOpExplicit = (intrinsic == NI_VectorT128_op_Explicit); #if defined(TARGET_XARCH) @@ -494,7 +619,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, } assert(bitMask != nullptr); - bitMask = gtNewSIMDNode(retType, bitMask, SIMDIntrinsicInit, baseType, simdSize); + bitMask = gtNewSimdCreateBroadcastNode(retType, bitMask, baseType, simdSize, + /* isSimdAsHWIntrinsic */ true); intrinsic = isVectorT256 ? NI_VectorT256_op_BitwiseAnd : NI_VectorT128_op_BitwiseAnd; intrinsic = SimdAsHWIntrinsicInfo::lookupHWIntrinsic(intrinsic, baseType); @@ -565,13 +691,27 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, argType = isInstanceMethod ? 
simdType : JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); - op1 = getArgForHWIntrinsic(argType, argClass, isInstanceMethod); + op1 = getArgForHWIntrinsic(argType, argClass, isInstanceMethod, newobjThis); assert(!SimdAsHWIntrinsicInfo::NeedsOperandsSwapped(intrinsic)); switch (intrinsic) { #if defined(TARGET_XARCH) + case NI_Vector2_CreateBroadcast: + case NI_Vector3_CreateBroadcast: + case NI_Vector4_CreateBroadcast: + case NI_VectorT128_CreateBroadcast: + case NI_VectorT256_CreateBroadcast: + { + assert(retType == TYP_VOID); + + copyBlkDst = op1; + copyBlkSrc = + gtNewSimdCreateBroadcastNode(simdType, op2, baseType, simdSize, /* isSimdAsHWIntrinsic */ true); + break; + } + case NI_Vector2_op_Division: case NI_Vector3_op_Division: { @@ -598,6 +738,13 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, return retNode; } + case NI_VectorT128_Dot: + { + assert((baseType == TYP_INT) || (baseType == TYP_UINT)); + assert(compIsaSupportedDebugOnly(InstructionSet_SSE41)); + return gtNewSimdAsHWIntrinsicNode(retType, op1, op2, NI_Vector128_Dot, baseType, simdSize); + } + case NI_VectorT128_Equals: case NI_VectorT128_GreaterThan: case NI_VectorT128_GreaterThanOrEqual: @@ -648,8 +795,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, } } - GenTree* constVector = - gtNewSIMDNode(retType, constVal, nullptr, SIMDIntrinsicInit, TYP_INT, simdSize); + GenTree* constVector = gtNewSimdCreateBroadcastNode(retType, constVal, TYP_INT, simdSize, + /* isSimdAsHWIntrinsic */ true); GenTree* constVectorDup1; constVector = impCloneExpr(constVector, &constVectorDup1, clsHnd, (unsigned)CHECK_SPILL_ALL, @@ -766,6 +913,19 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, return gtNewSimdAsHWIntrinsicNode(retType, op1, op2, hwIntrinsic, baseType, simdSize); } #elif defined(TARGET_ARM64) + case NI_Vector2_CreateBroadcast: + case NI_Vector3_CreateBroadcast: + case NI_Vector4_CreateBroadcast: 
+ case NI_VectorT128_CreateBroadcast: + { + assert(retType == TYP_VOID); + + copyBlkDst = op1; + copyBlkSrc = + gtNewSimdCreateBroadcastNode(simdType, op2, baseType, simdSize, /* isSimdAsHWIntrinsic */ true); + break; + } + case NI_VectorT128_Max: case NI_VectorT128_Min: { @@ -808,6 +968,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, case 3: { + assert(newobjThis == nullptr); + CORINFO_ARG_LIST_HANDLE arg2 = isInstanceMethod ? argList : info.compCompHnd->getArgNext(argList); CORINFO_ARG_LIST_HANDLE arg3 = info.compCompHnd->getArgNext(arg2); @@ -819,7 +981,7 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, argType = isInstanceMethod ? simdType : JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); - op1 = getArgForHWIntrinsic(argType, argClass, isInstanceMethod); + op1 = getArgForHWIntrinsic(argType, argClass, isInstanceMethod, newobjThis); assert(!SimdAsHWIntrinsicInfo::NeedsOperandsSwapped(intrinsic)); @@ -850,6 +1012,27 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, } } + if (copyBlkDst != nullptr) + { + assert(copyBlkSrc != nullptr); + + // At this point, we have a tree that we are going to store into a destination. + // TODO-1stClassStructs: This should be a simple store or assignment, and should not require + // GTF_ALL_EFFECT for the dest. This is currently emulating the previous behavior of + // block ops. 
+ + GenTree* dest = gtNewBlockVal(copyBlkDst, simdSize); + + dest->gtType = simdType; + dest->gtFlags |= GTF_GLOB_REF; + + GenTree* retNode = gtNewBlkOpNode(dest, copyBlkSrc, /* isVolatile */ false, /* isCopyBlock */ true); + retNode->gtFlags |= ((copyBlkDst->gtFlags | copyBlkSrc->gtFlags) & GTF_ALL_EFFECT); + + return retNode; + } + assert(copyBlkSrc == nullptr); + assert(!"Unexpected SimdAsHWIntrinsic"); return nullptr; } @@ -1155,8 +1338,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicRelOp(NamedIntrinsic intrinsic, } } - GenTree* constVector = - gtNewSIMDNode(retType, constVal, nullptr, SIMDIntrinsicInit, constVal->TypeGet(), simdSize); + GenTree* constVector = gtNewSimdCreateBroadcastNode(retType, constVal, constVal->TypeGet(), simdSize, + /* isSimdAsHWIntrinsic */ true); GenTree* constVectorDup; constVector = impCloneExpr(constVector, &constVectorDup, clsHnd, (unsigned)CHECK_SPILL_ALL, diff --git a/src/coreclr/src/jit/simdashwintrinsiclistarm64.h b/src/coreclr/src/jit/simdashwintrinsiclistarm64.h index ba23bcd193469b..9022642fa6bffc 100644 --- a/src/coreclr/src/jit/simdashwintrinsiclistarm64.h +++ b/src/coreclr/src/jit/simdashwintrinsiclistarm64.h @@ -39,7 +39,10 @@ // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* // Vector2 Intrinsics SIMD_AS_HWINTRINSIC_ID(Vector2, Abs, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Abs, NI_Illegal}, SimdAsHWIntrinsicFlag::None) 
+SIMD_AS_HWINTRINSIC_NM(Vector2, CreateBroadcast, ".ctor", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector2_CreateBroadcast, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector2, Dot, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector64_Dot, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_NM(Vector2, EqualsInstance, "Equals", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector64_op_Equality, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector2, get_One, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector2_get_One, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector2, get_Zero, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector64_get_Zero, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector2, Max, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Max, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector2, Min, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Min, NI_Illegal}, SimdAsHWIntrinsicFlag::None) @@ -57,7 +60,10 @@ SIMD_AS_HWINTRINSIC_ID(Vector2, SquareRoot, // 
************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* // Vector3 Intrinsics SIMD_AS_HWINTRINSIC_ID(Vector3, Abs, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Abs, NI_Illegal}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_NM(Vector3, CreateBroadcast, ".ctor", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector3_CreateBroadcast, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector3, Dot, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_Dot, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_NM(Vector3, EqualsInstance, "Equals", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_op_Equality, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector3, get_One, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector3_get_One, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector3, get_Zero, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_get_Zero, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector3, Max, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, 
NI_Illegal, NI_Illegal, NI_AdvSimd_Max, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector3, Min, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Min, NI_Illegal}, SimdAsHWIntrinsicFlag::None) @@ -75,7 +81,10 @@ SIMD_AS_HWINTRINSIC_ID(Vector3, SquareRoot, // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* // Vector4 Intrinsics SIMD_AS_HWINTRINSIC_ID(Vector4, Abs, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Abs, NI_Illegal}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_NM(Vector4, CreateBroadcast, ".ctor", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector4_CreateBroadcast, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector4, Dot, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_Dot, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_NM(Vector4, EqualsInstance, "Equals", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_op_Equality, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector4, get_One, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector4_get_One, NI_Illegal}, 
SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector4, get_Zero, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_get_Zero, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector4, Max, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Max, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector4, Min, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Min, NI_Illegal}, SimdAsHWIntrinsicFlag::None) @@ -96,11 +105,14 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, Abs, SIMD_AS_HWINTRINSIC_ID(VectorT128, AndNot, 2, {NI_AdvSimd_BitwiseClear, NI_AdvSimd_BitwiseClear, NI_AdvSimd_BitwiseClear, NI_AdvSimd_BitwiseClear, NI_AdvSimd_BitwiseClear, NI_AdvSimd_BitwiseClear, NI_AdvSimd_BitwiseClear, NI_AdvSimd_BitwiseClear, NI_AdvSimd_BitwiseClear, NI_AdvSimd_BitwiseClear}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Ceiling, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Ceiling, NI_AdvSimd_Arm64_Ceiling}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, ConditionalSelect, 3, {NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_NM(VectorT128, CreateBroadcast, ".ctor", 2, {NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, 
NI_VectorT128_CreateBroadcast}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(VectorT128, Dot, 2, {NI_Vector128_Dot, NI_Vector128_Dot, NI_Vector128_Dot, NI_Vector128_Dot, NI_Vector128_Dot, NI_Vector128_Dot, NI_Illegal, NI_Illegal, NI_Vector128_Dot, NI_Vector128_Dot}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Equals, 2, {NI_AdvSimd_CompareEqual, NI_AdvSimd_CompareEqual, NI_AdvSimd_CompareEqual, NI_AdvSimd_CompareEqual, NI_AdvSimd_CompareEqual, NI_AdvSimd_CompareEqual, NI_AdvSimd_Arm64_CompareEqual, NI_AdvSimd_Arm64_CompareEqual, NI_AdvSimd_CompareEqual, NI_AdvSimd_Arm64_CompareEqual}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_NM(VectorT128, EqualsInstance, "Equals", 2, {NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality}, SimdAsHWIntrinsicFlag::InstanceMethod) SIMD_AS_HWINTRINSIC_ID(VectorT128, Floor, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Floor, NI_AdvSimd_Arm64_Floor}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, get_AllBitsSet, 0, {NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Count, 0, {NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, get_One, 0, 
{NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Zero, 0, {NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, GreaterThan, 2, {NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_Arm64_CompareGreaterThan, NI_AdvSimd_Arm64_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_Arm64_CompareGreaterThan}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, GreaterThanOrEqual, 2, {NI_AdvSimd_CompareGreaterThanOrEqual, NI_AdvSimd_CompareGreaterThanOrEqual, NI_AdvSimd_CompareGreaterThanOrEqual, NI_AdvSimd_CompareGreaterThanOrEqual, NI_AdvSimd_CompareGreaterThanOrEqual, NI_AdvSimd_CompareGreaterThanOrEqual, NI_AdvSimd_Arm64_CompareGreaterThanOrEqual, NI_AdvSimd_Arm64_CompareGreaterThanOrEqual, NI_AdvSimd_CompareGreaterThanOrEqual, NI_AdvSimd_Arm64_CompareGreaterThanOrEqual}, SimdAsHWIntrinsicFlag::None) diff --git a/src/coreclr/src/jit/simdashwintrinsiclistxarch.h b/src/coreclr/src/jit/simdashwintrinsiclistxarch.h index d13153db4aad7b..1e77eb1804bda5 100644 --- a/src/coreclr/src/jit/simdashwintrinsiclistxarch.h +++ b/src/coreclr/src/jit/simdashwintrinsiclistxarch.h @@ -35,11 +35,14 @@ // 
************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* // ISA ID Name NumArg Instructions Flags -// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* // Vector2 Intrinsics SIMD_AS_HWINTRINSIC_ID(Vector2, Abs, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector2_Abs, NI_Illegal}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_NM(Vector2, CreateBroadcast, ".ctor", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector2_CreateBroadcast, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector2, Dot, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_Dot, 
NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_NM(Vector2, EqualsInstance, "Equals", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_op_Equality, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector2, get_One, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector2_get_One, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector2, get_Zero, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_get_Zero, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector2, Max, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE_Max, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector2, Min, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE_Min, NI_Illegal}, SimdAsHWIntrinsicFlag::None) @@ -57,7 +60,10 @@ SIMD_AS_HWINTRINSIC_ID(Vector2, SquareRoot, // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* // Vector3 Intrinsics SIMD_AS_HWINTRINSIC_ID(Vector3, Abs, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector3_Abs, NI_Illegal}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_NM(Vector3, CreateBroadcast, ".ctor", 2, {NI_Illegal, NI_Illegal, 
NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector3_CreateBroadcast, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector3, Dot, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_Dot, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_NM(Vector3, EqualsInstance, "Equals", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_op_Equality, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector3, get_One, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector3_get_One, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector3, get_Zero, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_get_Zero, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector3, Max, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE_Max, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector3, Min, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE_Min, NI_Illegal}, SimdAsHWIntrinsicFlag::None) @@ -75,7 +81,10 @@ SIMD_AS_HWINTRINSIC_ID(Vector3, SquareRoot, // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* 
// Vector4 Intrinsics SIMD_AS_HWINTRINSIC_ID(Vector4, Abs, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector4_Abs, NI_Illegal}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_NM(Vector4, CreateBroadcast, ".ctor", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector4_CreateBroadcast, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector4, Dot, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_Dot, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_NM(Vector4, EqualsInstance, "Equals", 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_op_Equality, NI_Illegal}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(Vector4, get_One, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector4_get_One, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector4, get_Zero, 0, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Vector128_get_Zero, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector4, Max, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE_Max, NI_Illegal}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(Vector4, Min, 2, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE_Min, NI_Illegal}, SimdAsHWIntrinsicFlag::None) @@ -96,11 +105,14 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, Abs, SIMD_AS_HWINTRINSIC_ID(VectorT128, AndNot, 2, {NI_SSE2_AndNot, NI_SSE2_AndNot, NI_SSE2_AndNot, NI_SSE2_AndNot, NI_SSE2_AndNot, NI_SSE2_AndNot, NI_SSE2_AndNot, NI_SSE2_AndNot, NI_SSE_AndNot, NI_SSE2_AndNot}, SimdAsHWIntrinsicFlag::NeedsOperandsSwapped) 
SIMD_AS_HWINTRINSIC_ID(VectorT128, Ceiling, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE41_Ceiling, NI_SSE41_Ceiling}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, ConditionalSelect, 3, {NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect, NI_VectorT128_ConditionalSelect}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_NM(VectorT128, CreateBroadcast, ".ctor", 2, {NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast, NI_VectorT128_CreateBroadcast}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(VectorT128, Dot, 2, {NI_Illegal, NI_Illegal, NI_Vector128_Dot, NI_Vector128_Dot, NI_VectorT128_Dot, NI_VectorT128_Dot, NI_Illegal, NI_Illegal, NI_Vector128_Dot, NI_Vector128_Dot}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Equals, 2, {NI_SSE2_CompareEqual, NI_SSE2_CompareEqual, NI_SSE2_CompareEqual, NI_SSE2_CompareEqual, NI_SSE2_CompareEqual, NI_SSE2_CompareEqual, NI_VectorT128_Equals, NI_VectorT128_Equals, NI_SSE_CompareEqual, NI_SSE2_CompareEqual}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_NM(VectorT128, EqualsInstance, "Equals", 2, {NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality, NI_Vector128_op_Equality}, SimdAsHWIntrinsicFlag::InstanceMethod) SIMD_AS_HWINTRINSIC_ID(VectorT128, Floor, 1, {NI_Illegal, 
NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE41_Floor, NI_SSE41_Floor}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, get_AllBitsSet, 0, {NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet}, SimdAsHWIntrinsicFlag::None) -SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Count, 0, {NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Count, 0, {NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, get_One, 0, {NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Zero, 0, {NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, GreaterThan, 2, {NI_SSE2_CompareGreaterThan, NI_VectorT128_GreaterThan, NI_SSE2_CompareGreaterThan, NI_VectorT128_GreaterThan, NI_SSE2_CompareGreaterThan, 
NI_VectorT128_GreaterThan, NI_VectorT128_GreaterThan, NI_VectorT128_GreaterThan, NI_SSE_CompareGreaterThan, NI_SSE2_CompareGreaterThan}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, GreaterThanOrEqual, 2, {NI_VectorT128_GreaterThanOrEqual, NI_VectorT128_GreaterThanOrEqual, NI_VectorT128_GreaterThanOrEqual, NI_VectorT128_GreaterThanOrEqual, NI_VectorT128_GreaterThanOrEqual, NI_VectorT128_GreaterThanOrEqual, NI_VectorT128_GreaterThanOrEqual, NI_VectorT128_GreaterThanOrEqual, NI_SSE_CompareGreaterThanOrEqual, NI_SSE2_CompareGreaterThanOrEqual}, SimdAsHWIntrinsicFlag::None) @@ -129,11 +141,14 @@ SIMD_AS_HWINTRINSIC_ID(VectorT256, Abs, SIMD_AS_HWINTRINSIC_ID(VectorT256, AndNot, 2, {NI_AVX2_AndNot, NI_AVX2_AndNot, NI_AVX2_AndNot, NI_AVX2_AndNot, NI_AVX2_AndNot, NI_AVX2_AndNot, NI_AVX2_AndNot, NI_AVX2_AndNot, NI_AVX_AndNot, NI_AVX_AndNot}, SimdAsHWIntrinsicFlag::NeedsOperandsSwapped) SIMD_AS_HWINTRINSIC_ID(VectorT256, Ceiling, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AVX_Ceiling, NI_AVX_Ceiling}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, ConditionalSelect, 3, {NI_VectorT256_ConditionalSelect, NI_VectorT256_ConditionalSelect, NI_VectorT256_ConditionalSelect, NI_VectorT256_ConditionalSelect, NI_VectorT256_ConditionalSelect, NI_VectorT256_ConditionalSelect, NI_VectorT256_ConditionalSelect, NI_VectorT256_ConditionalSelect, NI_VectorT256_ConditionalSelect, NI_VectorT256_ConditionalSelect}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_NM(VectorT256, CreateBroadcast, ".ctor", 2, {NI_VectorT256_CreateBroadcast, NI_VectorT256_CreateBroadcast, NI_VectorT256_CreateBroadcast, NI_VectorT256_CreateBroadcast, NI_VectorT256_CreateBroadcast, NI_VectorT256_CreateBroadcast, NI_VectorT256_CreateBroadcast, NI_VectorT256_CreateBroadcast, NI_VectorT256_CreateBroadcast, NI_VectorT256_CreateBroadcast}, SimdAsHWIntrinsicFlag::InstanceMethod) +SIMD_AS_HWINTRINSIC_ID(VectorT256, Dot, 2, 
{NI_Illegal, NI_Illegal, NI_Vector256_Dot, NI_Vector256_Dot, NI_Vector256_Dot, NI_Vector256_Dot, NI_Illegal, NI_Illegal, NI_Vector256_Dot, NI_Vector256_Dot}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, Equals, 2, {NI_AVX2_CompareEqual, NI_AVX2_CompareEqual, NI_AVX2_CompareEqual, NI_AVX2_CompareEqual, NI_AVX2_CompareEqual, NI_AVX2_CompareEqual, NI_AVX2_CompareEqual, NI_AVX2_CompareEqual, NI_AVX_CompareEqual, NI_AVX_CompareEqual}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_NM(VectorT256, EqualsInstance, "Equals", 2, {NI_Vector256_op_Equality, NI_Vector256_op_Equality, NI_Vector256_op_Equality, NI_Vector256_op_Equality, NI_Vector256_op_Equality, NI_Vector256_op_Equality, NI_Vector256_op_Equality, NI_Vector256_op_Equality, NI_Vector256_op_Equality, NI_Vector256_op_Equality}, SimdAsHWIntrinsicFlag::InstanceMethod) SIMD_AS_HWINTRINSIC_ID(VectorT256, Floor, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AVX_Floor, NI_AVX_Floor}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, get_AllBitsSet, 0, {NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet}, SimdAsHWIntrinsicFlag::None) -SIMD_AS_HWINTRINSIC_ID(VectorT256, get_Count, 0, {NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT256, get_Count, 0, {NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, 
NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT256, get_One, 0, {NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, get_Zero, 0, {NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, GreaterThan, 2, {NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX_CompareGreaterThan, NI_AVX_CompareGreaterThan}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, GreaterThanOrEqual, 2, {NI_VectorT256_GreaterThanOrEqual, NI_VectorT256_GreaterThanOrEqual, NI_VectorT256_GreaterThanOrEqual, NI_VectorT256_GreaterThanOrEqual, NI_VectorT256_GreaterThanOrEqual, NI_VectorT256_GreaterThanOrEqual, NI_VectorT256_GreaterThanOrEqual, NI_VectorT256_GreaterThanOrEqual, NI_AVX_CompareGreaterThanOrEqual, NI_AVX_CompareGreaterThanOrEqual}, SimdAsHWIntrinsicFlag::None) diff --git a/src/coreclr/src/jit/simdcodegenxarch.cpp b/src/coreclr/src/jit/simdcodegenxarch.cpp index 5147d3d912cfc0..16aee6d5849ad4 100644 --- a/src/coreclr/src/jit/simdcodegenxarch.cpp +++ b/src/coreclr/src/jit/simdcodegenxarch.cpp @@ -130,33 +130,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type } break; - case SIMDIntrinsicAdd: - if (baseType == TYP_FLOAT) - { - result = INS_addps; - } - else if (baseType == TYP_DOUBLE) - { - result = INS_addpd; 
- } - else if (baseType == TYP_INT || baseType == TYP_UINT) - { - result = INS_paddd; - } - else if (baseType == TYP_USHORT || baseType == TYP_SHORT) - { - result = INS_paddw; - } - else if (baseType == TYP_UBYTE || baseType == TYP_BYTE) - { - result = INS_paddb; - } - else if (baseType == TYP_LONG || baseType == TYP_ULONG) - { - result = INS_paddq; - } - break; - case SIMDIntrinsicSub: if (baseType == TYP_FLOAT) { @@ -184,40 +157,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type } break; - case SIMDIntrinsicMul: - if (baseType == TYP_FLOAT) - { - result = INS_mulps; - } - else if (baseType == TYP_DOUBLE) - { - result = INS_mulpd; - } - else if (baseType == TYP_SHORT) - { - result = INS_pmullw; - } - else if ((baseType == TYP_INT) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)) - { - result = INS_pmulld; - } - break; - - case SIMDIntrinsicDiv: - if (baseType == TYP_FLOAT) - { - result = INS_divps; - } - else if (baseType == TYP_DOUBLE) - { - result = INS_divpd; - } - else - { - unreached(); - } - break; - case SIMDIntrinsicEqual: if (baseType == TYP_FLOAT) { @@ -1556,9 +1495,7 @@ void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) // void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) { - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub || - simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv || - simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd || + assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr); GenTree* op1 = simdNode->gtGetOp1(); @@ -1574,156 +1511,27 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) regNumber op2Reg = op2->GetRegNum(); regNumber otherReg = op2Reg; - // Vector.Mul: - // SSE2 doesn't have an instruction to perform this operation directly - 
// whereas SSE4.1 does (pmulld). This is special cased and computed - // as follows. - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul && baseType == TYP_INT && level == SIMD_SSE2_Supported) - { - // We need a temporary register that is NOT the same as the target, - // and we MAY need another. - regNumber tmpReg = simdNode->ExtractTempReg(); - regNumber tmpReg2 = simdNode->GetSingleTempReg(); - - // The register allocator guarantees the following conditions: - // - the only registers that may be the same among op1Reg, op2Reg, tmpReg - // and tmpReg2 are op1Reg and op2Reg. - // Let's be extra-careful and assert that now. - assert((op1Reg != tmpReg) && (op1Reg != tmpReg2) && (op2Reg != tmpReg) && (op2Reg != tmpReg2) && - (tmpReg != tmpReg2)); - - // We will start by setting things up so that: - // - We have op1 in op1Reg and targetReg, and they are different registers. - // - We have op2 in op2Reg and tmpReg - // - Either we will leave the input registers (the original op1Reg and op2Reg) unmodified, - // OR they are the targetReg that will be produced. - // (Note that in the code we generate below op1Reg and op2Reg are never written.) - // We will copy things as necessary to ensure that this is the case. - // Note that we can swap op1 and op2, since multiplication is commutative. - // We will not modify the values in op1Reg and op2Reg. - // (Though note that if either op1 or op2 is the same as targetReg, we will make - // a copy and use that copy as the input register. In that case we WILL modify - // the original value in the register, but will wind up with the result in targetReg - // in the end, as expected.) - - // First, we need a tmpReg that is NOT the same as targetReg. - // Note that if we have another reg that is the same as targetReg, - // we can use tmpReg2 for that case, as we will not have hit this case. 
- if (tmpReg == targetReg) - { - tmpReg = tmpReg2; - } + instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - if (op2Reg == targetReg) - { - // We will swap the operands. - // Since the code below only deals with registers, this now becomes the case where - // op1Reg == targetReg. - op2Reg = op1Reg; - op1Reg = targetReg; - } - if (op1Reg == targetReg) - { - // Copy op1, and make tmpReg2 the new op1Reg. - // Note that those regs can't be the same, as we asserted above. - // Also, we know that tmpReg2 hasn't been used, because we couldn't have hit - // the "tmpReg == targetReg" case. - inst_RV_RV(INS_movaps, tmpReg2, op1Reg, targetType, emitActualTypeSize(targetType)); - op1Reg = tmpReg2; - inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType)); - // However, we have one more case to worry about: what if op2Reg is also targetReg - // (i.e. we have the same operand as op1 and op2)? - // In that case we will set op2Reg to the same register as op1Reg. - if (op2Reg == targetReg) - { - op2Reg = tmpReg2; - } - } - else - { - // Copy op1 to targetReg and op2 to tmpReg. - inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); - inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType)); - } - // Let's assert that things are as we expect. - // - We have op1 in op1Reg and targetReg, and they are different registers. - assert(op1Reg != targetReg); - // - We have op2 in op2Reg and tmpReg, and they are different registers. - assert(op2Reg != tmpReg); - // - Either we are going to leave op1's reg unmodified, or it is the targetReg. - assert((op1->GetRegNum() == op1Reg) || (op1->GetRegNum() == op2Reg) || (op1->GetRegNum() == targetReg)); - // - Similarly, we are going to leave op2's reg unmodified, or it is the targetReg. - assert((op2->GetRegNum() == op1Reg) || (op2->GetRegNum() == op2Reg) || (op2->GetRegNum() == targetReg)); - - // Now we can generate the code. 
- - // targetReg = op1 >> 4-bytes (op1 is already in targetReg) - GetEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), targetReg, 4); - - // tmpReg = op2 >> 4-bytes (op2 is already in tmpReg) - GetEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg, 4); - - // tmp = unsigned double word multiply of targetReg and tmpReg. Essentially - // tmpReg[63:0] = op1[1] * op2[1] - // tmpReg[127:64] = op1[3] * op2[3] - inst_RV_RV(INS_pmuludq, tmpReg, targetReg, targetType, emitActualTypeSize(targetType)); - - // Extract first and third double word results from tmpReg - // tmpReg = shuffle(0,0,2,0) of tmpReg - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, (int8_t)SHUFFLE_XXZX); - - // targetReg[63:0] = op1[0] * op2[0] - // targetReg[127:64] = op1[2] * op2[2] - inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); - inst_RV_RV(INS_pmuludq, targetReg, op2Reg, targetType, emitActualTypeSize(targetType)); - - // Extract first and third double word results from targetReg - // targetReg = shuffle(0,0,2,0) of targetReg - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, - (int8_t)SHUFFLE_XXZX); - - // pack the results into a single vector - inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); + // Currently AVX doesn't support integer. + // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX. + if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported && + !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && GetEmitter()->IsThreeOperandAVXInstruction(ins)) + { + inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType)); } else { - instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - - // Currently AVX doesn't support integer. - // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX. 
- if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported && - !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && GetEmitter()->IsThreeOperandAVXInstruction(ins)) + if (op2Reg == targetReg) { - inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType)); + otherReg = op1Reg; } - else + else if (op1Reg != targetReg) { - if (op2Reg == targetReg) - { - otherReg = op1Reg; - } - else if (op1Reg != targetReg) - { - inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); - } - - inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType)); + inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); } - } - // Vector2/3 div: since the top-most elements will be zero, we end up - // perfoming 0/0 which is a NAN. Therefore, post division we need to set the - // top-most elements to zero. This is achieved by left logical shift followed - // by right logical shift of targetReg. - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16)) - { - // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length. - unsigned shiftCount = 16 - simdNode->gtSIMDSize; - assert((shiftCount > 0) && (shiftCount <= 16)); - instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); - GetEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); - ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); - GetEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); + inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType)); } genProduceReg(simdNode); @@ -1807,290 +1615,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) genProduceReg(simdNode); } -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product. 
-// -// Arguments: -// simdNode - The GT_SIMD node -// -// Return Value: -// None. -// -void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode) -{ - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct); - - GenTree* op1 = simdNode->gtGetOp1(); - GenTree* op2 = simdNode->gtGetOp2(); - var_types baseType = simdNode->gtSIMDBaseType; - var_types simdType = op1->TypeGet(); - // TODO-1stClassStructs: Temporary to minimize asmDiffs - if (simdType == TYP_DOUBLE) - { - simdType = TYP_SIMD8; - } - var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType; - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - - var_types targetType = simdNode->TypeGet(); - assert(targetType == baseType); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - regNumber tmpReg1 = REG_NA; - regNumber tmpReg2 = REG_NA; - - SIMDLevel level = compiler->getSIMDSupportLevel(); - - // Dot product intrinsic is supported only on float/double vectors - // and 32-byte int vectors on AVX. - // - // Float/Double Vectors: - // For SSE, or AVX with 32-byte vectors, we need one additional Xmm register - // different from targetReg as scratch. Note that if this is a TYP_SIMD16 or - // smaller on AVX, then we don't need a tmpReg. - // - // 32-byte integer vector on AVX: we need two additional Xmm registers - // different from targetReg as scratch. - // - // 16-byte integer vector on SSE4: we need one additional Xmm register - // different from targetReg as scratch. 
- if (varTypeIsFloating(baseType)) - { - if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) || (simdEvalType == TYP_SIMD32)) - { - tmpReg1 = simdNode->GetSingleTempReg(); - assert(tmpReg1 != targetReg); - } - else - { - assert(simdNode->AvailableTempRegCount() == 0); - } - } - else - { - assert(baseType == TYP_INT); - assert(level >= SIMD_SSE4_Supported); - - if (level == SIMD_SSE4_Supported) - { - tmpReg1 = simdNode->GetSingleTempReg(); - } - else - { - tmpReg1 = simdNode->ExtractTempReg(); - tmpReg2 = simdNode->GetSingleTempReg(); - } - } - - if (level == SIMD_SSE2_Supported) - { - // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg - if (op1Reg == targetReg) - { - // Best case - // nothing to do, we have registers in the right place - } - else if (op2Reg == targetReg) - { - op2Reg = op1Reg; - } - else - { - inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType)); - } - - // DotProduct(v1, v2) - // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1 - if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) - { - assert(baseType == TYP_FLOAT); - // v0 = v1 * v2 - // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its - // // position - // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY) // tmp = (2, 0, 0, 1) - don't really care what's in upper - // // bits - // v0 = v0 + tmp // v0 = (3+2, 0+2, 1+0, 0+1) - // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW) // tmp = ( 1, 1, 2, 2) - // v0 = v0 + tmp // v0 = (1+2+3, 0+1+2, 0+1+2, 0+1+2) - // - inst_RV_RV(INS_mulps, targetReg, op2Reg); - inst_RV_RV(INS_movaps, tmpReg1, targetReg); - inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, (int8_t)SHUFFLE_ZXXY); - inst_RV_RV(INS_addps, targetReg, tmpReg1); - inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, (int8_t)SHUFFLE_XXWW); - inst_RV_RV(INS_addps, targetReg, tmpReg1); - } - else if (baseType == TYP_FLOAT) - { - // v0 = v1 * v2 - // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its - // // 
position - // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY) // tmp = (2, 3, 0, 1) - // v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1) - // tmp = v0 - // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW) // tmp = (0+1, 1+0, 2+3, 3+2) - // v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3) - // // Essentially horizontal addition of all elements. - // // We could achieve the same using SSEv3 instruction - // // HADDPS. - // - inst_RV_RV(INS_mulps, targetReg, op2Reg); - inst_RV_RV(INS_movaps, tmpReg1, targetReg); - inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, (int8_t)SHUFFLE_ZWXY); - inst_RV_RV(INS_addps, targetReg, tmpReg1); - inst_RV_RV(INS_movaps, tmpReg1, targetReg); - inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, (int8_t)SHUFFLE_XYZW); - inst_RV_RV(INS_addps, targetReg, tmpReg1); - } - else - { - assert(baseType == TYP_DOUBLE); - - // v0 = v1 * v2 - // tmp = v0 // v0 = (1, 0) - each element is given by its position - // tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1) - // v0 = v0 + tmp // v0 = (1+0, 0+1) - inst_RV_RV(INS_mulpd, targetReg, op2Reg); - inst_RV_RV(INS_movaps, tmpReg1, targetReg); - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg1, tmpReg1, 0x01); - inst_RV_RV(INS_addpd, targetReg, tmpReg1); - } - } - else - { - assert(level >= SIMD_SSE4_Supported); - - if (varTypeIsFloating(baseType)) - { - // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg. - // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually - // use the 3-op form, so that we can avoid these copies. - // TODO-CQ: Add inst_RV_RV_RV_IV(). 
- if (op1Reg == targetReg) - { - // Best case - // nothing to do, we have registers in the right place - } - else if (op2Reg == targetReg) - { - op2Reg = op1Reg; - } - else - { - inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType)); - } - - emitAttr emitSize = emitActualTypeSize(simdEvalType); - if (baseType == TYP_FLOAT) - { - // dpps computes the dot product of the upper & lower halves of the 32-byte register. - // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg. - unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1; - assert((mask >= 0) && (mask <= 255)); - inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, (int8_t)mask); - // dpps computes the dot product of the upper & lower halves of the 32-byte register. - // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg. - // If this is TYP_SIMD32, we need to combine the lower & upper results. - if (simdEvalType == TYP_SIMD32) - { - GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01); - inst_RV_RV(INS_addps, targetReg, tmpReg1, targetType, emitTypeSize(targetType)); - } - } - else if (baseType == TYP_DOUBLE) - { - if (simdEvalType == TYP_SIMD32) - { - // targetReg = targetReg * op2Reg - // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves - // tmpReg = vextractf128(targetReg, 1) ; Moves the upper sum into tempReg - // targetReg = targetReg + tmpReg1 - inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType)); - inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType)); - GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01); - inst_RV_RV(INS_addpd, targetReg, tmpReg1, targetType, emitTypeSize(targetType)); - } - else - { - // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use - // dppd directly. 
- assert(level == SIMD_SSE4_Supported); - inst_RV_RV_IV(INS_dppd, emitSize, targetReg, op2Reg, 0x31); - } - } - } - else - { - // Dot product of 32-byte int vector on SSE4/AVX. - assert(baseType == TYP_INT); - assert(simdEvalType == TYP_SIMD16 || simdEvalType == TYP_SIMD32); - -#ifdef DEBUG - // SSE4: We need 1 scratch register. - // AVX2: We need 2 scratch registers. - if (simdEvalType == TYP_SIMD16) - { - assert(tmpReg1 != REG_NA); - } - else - { - assert(tmpReg1 != REG_NA); - assert(tmpReg2 != REG_NA); - } -#endif - - // tmpReg1 = op1 * op2 - if (level == SIMD_AVX2_Supported) - { - // On AVX take advantage 3 operand form of pmulld - inst_RV_RV_RV(INS_pmulld, tmpReg1, op1Reg, op2Reg, emitTypeSize(simdEvalType)); - } - else - { - inst_RV_RV(ins_Copy(simdEvalType), tmpReg1, op1Reg, simdEvalType); - inst_RV_RV(INS_pmulld, tmpReg1, op2Reg, simdEvalType); - } - - if (simdEvalType == TYP_SIMD32) - { - // tmpReg2[127..0] = Upper 128-bits of tmpReg1 - GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01); - - // tmpReg1[127..0] = tmpReg1[127..0] + tmpReg2[127..0] - // This will compute - // tmpReg1[0] = op1[0]*op2[0] + op1[4]*op2[4] - // tmpReg1[1] = op1[1]*op2[1] + op1[5]*op2[5] - // tmpReg1[2] = op1[2]*op2[2] + op1[6]*op2[6] - // tmpReg1[4] = op1[4]*op2[4] + op1[7]*op2[7] - inst_RV_RV(INS_paddd, tmpReg1, tmpReg2, TYP_SIMD16, EA_16BYTE); - } - - // This horizontal add will compute - // - // TYP_SIMD16: - // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[1]*op2[1] - // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[4]*op2[4] - // - // TYP_SIMD32: - // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[4]*op2[4] + op1[1]*op2[1] + op1[5]*op2[5] - // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[6]*op2[6] + op1[4]*op2[4] + op1[7]*op2[7] - inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE); - - // DotProduct(op1, op2) = tmpReg1[0] = tmpReg1[0] + tmpReg1[1] - inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE); - - // TargetReg = 
integer result from tmpReg1 - // (Note that for mov_xmm2i, the int register is always in the reg2 position) - inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT); - } - } - - genProduceReg(simdNode); -} - //------------------------------------------------------------------------------------ // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i. // @@ -2903,10 +2427,7 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsicNarrow(simdNode); break; - case SIMDIntrinsicAdd: case SIMDIntrinsicSub: - case SIMDIntrinsicMul: - case SIMDIntrinsicDiv: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: genSIMDIntrinsicBinOp(simdNode); @@ -2916,10 +2437,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsicRelOp(simdNode); break; - case SIMDIntrinsicDotProduct: - genSIMDIntrinsicDotProduct(simdNode); - break; - case SIMDIntrinsicGetItem: genSIMDIntrinsicGetItem(simdNode); break; diff --git a/src/coreclr/src/jit/simdintrinsiclist.h b/src/coreclr/src/jit/simdintrinsiclist.h index 813a937fd056b8..399fc7d84a2259 100644 --- a/src/coreclr/src/jit/simdintrinsiclist.h +++ b/src/coreclr/src/jit/simdintrinsiclist.h @@ -39,11 +39,6 @@ ***************************************************************************************************************************************************************************************************************************/ SIMD_INTRINSIC(nullptr, false, None, "None", TYP_UNDEF, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -SIMD_INTRINSIC("get_Count", false, GetCount, "count", TYP_INT, 0, {TYP_VOID, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) -SIMD_INTRINSIC("get_One", false, GetOne, "one", TYP_STRUCT, 0, {TYP_VOID, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, 
TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) -SIMD_INTRINSIC("get_Zero", false, GetZero, "zero", TYP_STRUCT, 0, {TYP_VOID, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) -SIMD_INTRINSIC("get_AllOnes", false, GetAllOnes, "allOnes", TYP_STRUCT, 0, {TYP_VOID, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) - // .ctor call or newobj - there are four forms. // This form takes the object plus a value of the base (element) type: SIMD_INTRINSIC(".ctor", true, Init, "init", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) @@ -77,18 +72,8 @@ SIMD_INTRINSIC("set_Z", true, SetZ, SIMD_INTRINSIC("set_W", true, SetW, "setW", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) // Arithmetic Operations -SIMD_INTRINSIC("op_Addition", false, Add, "+", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) SIMD_INTRINSIC("op_Subtraction", false, Sub, "-", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) -#if defined(TARGET_XARCH) -SIMD_INTRINSIC("op_Multiply", false, Mul, "*", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_SHORT,TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -#elif defined(TARGET_ARM64) -// TODO-ARM64-CQ Investigate code sequence to accelerate LONG/ULONG vector multiply -SIMD_INTRINSIC("op_Multiply", false, Mul, "*", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, 
TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF}) -#endif - -SIMD_INTRINSIC("op_Division", false, Div, "/", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_FLOAT, TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) - // Vector Relational operators SIMD_INTRINSIC("Equals", false, Equal, "eq", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) @@ -96,15 +81,6 @@ SIMD_INTRINSIC("Equals", false, Equal, SIMD_INTRINSIC("op_BitwiseAnd", false, BitwiseAnd, "&", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) SIMD_INTRINSIC("op_BitwiseOr", false, BitwiseOr, "|", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) -// Dot Product -#if defined(TARGET_XARCH) -// Is supported only on Vector on AVX. -SIMD_INTRINSIC("Dot", false, DotProduct, "Dot", TYP_UNKNOWN, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -#elif defined(TARGET_ARM64) -// Dot Product does not support LONG/ULONG due to lack of multiply support (see TODO-ARM64-CQ above) -SIMD_INTRINSIC("Dot", false, DotProduct, "Dot", TYP_UNKNOWN, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF}) -#endif - // Cast SIMD_INTRINSIC("op_Explicit", false, Cast, "Cast", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})