diff --git a/src/coreclr/src/jit/codegen.h b/src/coreclr/src/jit/codegen.h
index 6227d0133ae54f..bdf9d45ed2ce32 100644
--- a/src/coreclr/src/jit/codegen.h
+++ b/src/coreclr/src/jit/codegen.h
@@ -980,7 +980,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     void genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode);
     void genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode);
     void genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode);
-    void genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode);
     void genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode);
     void genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode);
     void genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode);
diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp
index 8e8e93ab01fc3a..39bbd4264bcad5 100644
--- a/src/coreclr/src/jit/codegenarm64.cpp
+++ b/src/coreclr/src/jit/codegenarm64.cpp
@@ -3855,20 +3855,13 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
             genSIMDIntrinsicNarrow(simdNode);
             break;
 
-        case SIMDIntrinsicAdd:
         case SIMDIntrinsicSub:
-        case SIMDIntrinsicMul:
-        case SIMDIntrinsicDiv:
         case SIMDIntrinsicBitwiseAnd:
         case SIMDIntrinsicBitwiseOr:
         case SIMDIntrinsicEqual:
             genSIMDIntrinsicBinOp(simdNode);
             break;
 
-        case SIMDIntrinsicDotProduct:
-            genSIMDIntrinsicDotProduct(simdNode);
-            break;
-
         case SIMDIntrinsicGetItem:
             genSIMDIntrinsicGetItem(simdNode);
             break;
@@ -3945,9 +3938,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
     {
         switch (intrinsicId)
         {
-            case SIMDIntrinsicAdd:
-                result = INS_fadd;
-                break;
             case SIMDIntrinsicBitwiseAnd:
                 result = INS_and;
                 break;
@@ -3961,15 +3951,9 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
             case SIMDIntrinsicConvertToInt64:
                 result = INS_fcvtzs;
                 break;
-            case SIMDIntrinsicDiv:
-                result = INS_fdiv;
-                break;
             case SIMDIntrinsicEqual:
                 result = INS_fcmeq;
                 break;
-            case SIMDIntrinsicMul:
-                result = INS_fmul;
-                break;
             case SIMDIntrinsicNarrow:
                 // Use INS_fcvtn lower bytes of result followed by INS_fcvtn2 for upper bytes
                 // Return lower bytes instruction here
@@ -3995,9 +3979,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
 
         switch (intrinsicId)
         {
-            case SIMDIntrinsicAdd:
-                result = INS_add;
-                break;
             case SIMDIntrinsicBitwiseAnd:
                 result = INS_and;
                 break;
@@ -4014,9 +3995,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
             case SIMDIntrinsicEqual:
                 result = INS_cmeq;
                 break;
-            case SIMDIntrinsicMul:
-                result = INS_mul;
-                break;
             case SIMDIntrinsicNarrow:
                 // Use INS_xtn lower bytes of result followed by INS_xtn2 for upper bytes
                 // Return lower bytes instruction here
@@ -4326,9 +4304,7 @@ void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode)
 //
 void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
 {
-    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub ||
-           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv ||
-           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd ||
+    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd ||
            simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual);
 
     GenTree*  op1       = simdNode->gtGetOp1();
@@ -4357,90 +4333,6 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
     genProduceReg(simdNode);
 }
 
-//--------------------------------------------------------------------------------
-// genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product.
-//
-// Arguments:
-//    simdNode - The GT_SIMD node
-//
-// Return Value:
-//    None.
-//
-void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
-{
-    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct);
-
-    GenTree*  op1      = simdNode->gtGetOp1();
-    GenTree*  op2      = simdNode->gtGetOp2();
-    var_types baseType = simdNode->gtSIMDBaseType;
-    var_types simdType = op1->TypeGet();
-
-    regNumber targetReg = simdNode->GetRegNum();
-    assert(targetReg != REG_NA);
-
-    var_types targetType = simdNode->TypeGet();
-    assert(targetType == baseType);
-
-    genConsumeOperands(simdNode);
-    regNumber op1Reg = op1->GetRegNum();
-    regNumber op2Reg = op2->GetRegNum();
-    regNumber tmpReg = targetReg;
-
-    if (!varTypeIsFloating(baseType))
-    {
-        tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
-    }
-
-    instruction ins  = getOpForSIMDIntrinsic(SIMDIntrinsicMul, baseType);
-    emitAttr    attr = (simdNode->gtSIMDSize > 8) ? EA_16BYTE : EA_8BYTE;
-    insOpts     opt  = genGetSimdInsOpt(attr, baseType);
-
-    // Vector multiply
-    GetEmitter()->emitIns_R_R_R(ins, attr, tmpReg, op1Reg, op2Reg, opt);
-
-    if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
-    {
-        // For 12Byte vectors we must zero upper bits to get correct dot product
-        // We do not assume upper bits are zero.
-        GetEmitter()->emitIns_R_R_I(INS_ins, EA_4BYTE, tmpReg, REG_ZR, 3);
-    }
-
-    // Vector add horizontal
-    if (varTypeIsFloating(baseType))
-    {
-        if (baseType == TYP_FLOAT)
-        {
-            if (opt == INS_OPTS_4S)
-            {
-                GetEmitter()->emitIns_R_R_R(INS_faddp, EA_16BYTE, tmpReg, tmpReg, tmpReg, INS_OPTS_4S);
-            }
-            GetEmitter()->emitIns_R_R(INS_faddp, EA_8BYTE, targetReg, tmpReg, INS_OPTS_2S);
-        }
-        else
-        {
-            GetEmitter()->emitIns_R_R(INS_faddp, EA_16BYTE, targetReg, tmpReg, INS_OPTS_2D);
-        }
-    }
-    else
-    {
-        ins = varTypeIsUnsigned(baseType) ? INS_uaddlv : INS_saddlv;
-
-        GetEmitter()->emitIns_R_R(ins, attr, tmpReg, tmpReg, opt);
-
-        // Mov to integer register
-        if (varTypeIsUnsigned(baseType) || (genTypeSize(baseType) < 4))
-        {
-            GetEmitter()->emitIns_R_R_I(INS_mov, emitTypeSize(baseType), targetReg, tmpReg, 0);
-        }
-        else
-        {
-            GetEmitter()->emitIns_R_R_I(INS_smov, emitActualTypeSize(baseType), targetReg, tmpReg, 0);
-        }
-    }
-
-    genProduceReg(simdNode);
-}
-
 //------------------------------------------------------------------------------------
 // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
 //
diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h
index be64fa8c3e6b4a..2078ec2f786aa7 100644
--- a/src/coreclr/src/jit/compiler.h
+++ b/src/coreclr/src/jit/compiler.h
@@ -2542,7 +2542,6 @@ class Compiler
 
 #ifdef FEATURE_SIMD
     GenTree* gtNewSIMDVectorZero(var_types simdType, var_types baseType, unsigned size);
-    GenTree* gtNewSIMDVectorOne(var_types simdType, var_types baseType, unsigned size);
 #endif
 
     GenTree* gtNewBlkOpNode(GenTree* dst, GenTree* srcOrFillVal, bool isVolatile, bool isCopyBlock);
@@ -2630,6 +2629,9 @@ class Compiler
                                                  var_types      baseType,
                                                  unsigned       size);
 
+    GenTreeHWIntrinsic* gtNewSimdCreateBroadcastNode(
+        var_types type, GenTree* op1, var_types baseType, unsigned size, bool isSimdAsHWIntrinsic);
+
     GenTreeHWIntrinsic* gtNewSimdAsHWIntrinsicNode(var_types      type,
                                                    NamedIntrinsic hwIntrinsicID,
                                                    var_types      baseType,
@@ -3751,7 +3753,7 @@ class Compiler
                                   CORINFO_CLASS_HANDLE  clsHnd,
                                   CORINFO_METHOD_HANDLE method,
                                   CORINFO_SIG_INFO*     sig,
-                                  bool                  mustExpand);
+                                  GenTree*              newobjThis);
 
 protected:
     bool compSupportsHWIntrinsic(CORINFO_InstructionSet isa);
@@ -3761,7 +3763,8 @@ class Compiler
                                          CORINFO_SIG_INFO*    sig,
                                          var_types            retType,
                                          var_types            baseType,
-                                         unsigned             simdSize);
+                                         unsigned             simdSize,
+                                         GenTree*             newobjThis);
 
     GenTree* impSimdAsHWIntrinsicCndSel(CORINFO_CLASS_HANDLE clsHnd,
                                         var_types            retType,
@@ -3779,7 +3782,10 @@ class Compiler
                                  var_types             retType,
                                  unsigned              simdSize);
 
-    GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass, bool expectAddr = false);
+    GenTree* getArgForHWIntrinsic(var_types            argType,
+                                  CORINFO_CLASS_HANDLE argClass,
+                                  bool                 expectAddr = false,
+                                  GenTree*             newobjThis = nullptr);
     GenTree* impNonConstFallback(NamedIntrinsic intrinsic, var_types simdType, var_types baseType);
     GenTree* addRangeCheckIfNeeded(
         NamedIntrinsic intrinsic, GenTree* immOp, bool mustExpand, int immLowerBound, int immUpperBound);
diff --git a/src/coreclr/src/jit/gentree.cpp b/src/coreclr/src/jit/gentree.cpp
index 6a316b1dcf2f12..3c241c0c7c3e95 100644
--- a/src/coreclr/src/jit/gentree.cpp
+++ b/src/coreclr/src/jit/gentree.cpp
@@ -6058,40 +6058,6 @@ GenTree* Compiler::gtNewSIMDVectorZero(var_types simdType, var_types baseType, u
     initVal->gtType  = baseType;
     return gtNewSIMDNode(simdType, initVal, nullptr, SIMDIntrinsicInit, baseType, size);
 }
-
-//---------------------------------------------------------------------
-// gtNewSIMDVectorOne: create a GT_SIMD node for Vector<T>.One
-//
-// Arguments:
-//    simdType  -  simd vector type
-//    baseType  -  element type of vector
-//    size      -  size of vector in bytes
-GenTree* Compiler::gtNewSIMDVectorOne(var_types simdType, var_types baseType, unsigned size)
-{
-    GenTree* initVal;
-    if (varTypeIsSmallInt(baseType))
-    {
-        unsigned baseSize = genTypeSize(baseType);
-        int      val;
-        if (baseSize == 1)
-        {
-            val = 0x01010101;
-        }
-        else
-        {
-            val = 0x00010001;
-        }
-        initVal = gtNewIconNode(val);
-    }
-    else
-    {
-        initVal = gtNewOneConNode(baseType);
-    }
-
-    baseType        = genActualType(baseType);
-    initVal->gtType = baseType;
-    return gtNewSIMDNode(simdType, initVal, nullptr, SIMDIntrinsicInit, baseType, size);
-}
 #endif // FEATURE_SIMD
 
 GenTreeCall* Compiler::gtNewIndCallNode(GenTree* addr, var_types type, GenTreeCall::Use* args, IL_OFFSETX ilOffset)
@@ -18463,11 +18429,9 @@ bool GenTree::isCommutativeSIMDIntrinsic()
     assert(gtOper == GT_SIMD);
     switch (AsSIMD()->gtSIMDIntrinsicID)
     {
-        case SIMDIntrinsicAdd:
         case SIMDIntrinsicBitwiseAnd:
         case SIMDIntrinsicBitwiseOr:
         case SIMDIntrinsicEqual:
-        case SIMDIntrinsicMul:
             return true;
         default:
             return false;
@@ -18630,6 +18594,43 @@ GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types      type,
         GenTreeHWIntrinsic(type, gtNewArgList(op1, op2, op3, op4), hwIntrinsicID, baseType, size);
 }
 
+GenTreeHWIntrinsic* Compiler::gtNewSimdCreateBroadcastNode(
+    var_types type, GenTree* op1, var_types baseType, unsigned size, bool isSimdAsHWIntrinsic)
+{ // Wraps op1 in a single-operand Vector64/128/256 Create intrinsic node, selected by target and size.
+    NamedIntrinsic hwIntrinsicID = NI_Vector128_Create; // default; replaced below for other vector sizes
+
+#if defined(TARGET_XARCH)
+#if defined(TARGET_X86)
+    if (varTypeIsLong(baseType) && !op1->IsIntegralConst())
+    {
+        // TODO-XARCH-CQ: It may be beneficial to emit the movq
+        // instruction, which takes a 64-bit memory address and
+        // works on 32-bit x86 systems.
+        unreached(); // a non-constant long operand is not handled here on 32-bit x86
+    }
+#endif // TARGET_X86
+
+    if (size == 32)
+    {
+        hwIntrinsicID = NI_Vector256_Create; // 32-byte vectors use the Vector256 form
+    }
+#elif defined(TARGET_ARM64)
+    if (size == 8)
+    {
+        hwIntrinsicID = NI_Vector64_Create; // 8-byte vectors use the Vector64 form
+    }
+#else
+#error Unsupported platform
+#endif // !TARGET_XARCH && !TARGET_ARM64
+
+    if (isSimdAsHWIntrinsic)
+    {
+        return gtNewSimdAsHWIntrinsicNode(type, op1, hwIntrinsicID, baseType, size); // caller asked for the SimdAsHWIntrinsic node flavor
+    }
+
+    return gtNewSimdHWIntrinsicNode(type, op1, hwIntrinsicID, baseType, size); // otherwise a plain HW intrinsic node
+}
+
 GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID)
 {
     SetOpLclRelatedToSIMDIntrinsic(op1);
diff --git a/src/coreclr/src/jit/hwintrinsic.cpp b/src/coreclr/src/jit/hwintrinsic.cpp
index 0ce3ec25b69f5c..237816e9d43ca0 100644
--- a/src/coreclr/src/jit/hwintrinsic.cpp
+++ b/src/coreclr/src/jit/hwintrinsic.cpp
@@ -487,14 +487,19 @@ bool HWIntrinsicInfo::isImmOp(NamedIntrinsic id, const GenTree* op)
 // Arguments:
 //    argType    -- the required type of argument
 //    argClass   -- the class handle of argType
-//    expectAddr --  if true indicates we are expecting type stack entry to be a TYP_BYREF.
+//    expectAddr -- if true indicates we are expecting type stack entry to be a TYP_BYREF.
+//    newobjThis -- For CEE_NEWOBJ, this is the temp grabbed for the allocated uninitialized object.
 //
 // Return Value:
 //     the validated argument
 //
-GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass, bool expectAddr)
+GenTree* Compiler::getArgForHWIntrinsic(var_types            argType,
+                                        CORINFO_CLASS_HANDLE argClass,
+                                        bool                 expectAddr,
+                                        GenTree*             newobjThis)
 {
     GenTree* arg = nullptr;
+
     if (varTypeIsStruct(argType))
     {
         if (!varTypeIsSIMD(argType))
@@ -504,16 +509,32 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE
             argType           = getSIMDTypeForSize(argSizeBytes);
         }
         assert(varTypeIsSIMD(argType));
-        arg = impSIMDPopStack(argType, expectAddr);
-        assert(varTypeIsSIMD(arg->TypeGet()));
+
+        if (newobjThis == nullptr)
+        {
+            arg = impSIMDPopStack(argType, expectAddr);
+            assert(varTypeIsSIMD(arg->TypeGet()));
+        }
+        else
+        {
+            assert((newobjThis->gtOper == GT_ADDR) && (newobjThis->AsOp()->gtOp1->gtOper == GT_LCL_VAR));
+            arg = newobjThis;
+
+            // push newobj result on type stack
+            unsigned tmp = arg->AsOp()->gtOp1->AsLclVarCommon()->GetLclNum();
+            impPushOnStack(gtNewLclvNode(tmp, lvaGetRealType(tmp)), verMakeTypeInfo(argClass).NormaliseForStack());
+        }
     }
     else
     {
         assert(varTypeIsArithmetic(argType));
+
         arg = impPopStack().val;
         assert(varTypeIsArithmetic(arg->TypeGet()));
+
         assert(genActualType(arg->gtType) == genActualType(argType));
     }
+
     return arg;
 }
 
diff --git a/src/coreclr/src/jit/hwintrinsiclistarm64.h b/src/coreclr/src/jit/hwintrinsiclistarm64.h
index 194929b4ac1c73..7de865284e211c 100644
--- a/src/coreclr/src/jit/hwintrinsiclistarm64.h
+++ b/src/coreclr/src/jit/hwintrinsiclistarm64.h
@@ -30,6 +30,7 @@ HARDWARE_INTRINSIC(Vector64,      AsUInt32,
 HARDWARE_INTRINSIC(Vector64,      AsUInt64,                                                    8,      1,     {INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},     HW_Category_Helper,                HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(Vector64,      Create,                                                      8,     -1,     {INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_mov,            INS_mov,            INS_invalid,        INS_invalid},     HW_Category_Helper,                HW_Flag_NoCodeGen|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(Vector64,      CreateScalarUnsafe,                                          8,      1,     {INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_invalid,        INS_invalid,        INS_fmov,           INS_invalid},     HW_Category_SIMD,                  HW_Flag_SpecialCodeGen|HW_Flag_SupportsContainment)
+HARDWARE_INTRINSIC(Vector64,      Dot,                                                         8,      2,     {INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},     HW_Category_Helper,                HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector64,      get_AllBitsSet,                                              8,      0,     {INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni},        HW_Category_Helper,                HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(Vector64,      get_Count,                                                   8,      0,     {INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},     HW_Category_Helper,                HW_Flag_NoCodeGen|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(Vector64,      get_Zero,                                                    8,      0,     {INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi},        HW_Category_Helper,                HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
@@ -62,6 +63,7 @@ HARDWARE_INTRINSIC(Vector128,     AsVector4,
 HARDWARE_INTRINSIC(Vector128,     AsVector128,                                                16,      1,     {INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},     HW_Category_Helper,                HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(Vector128,     Create,                                                     16,     -1,     {INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},     HW_Category_Helper,                HW_Flag_NoCodeGen|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(Vector128,     CreateScalarUnsafe,                                         16,      1,     {INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_fmov,           INS_fmov},        HW_Category_SIMD,                  HW_Flag_SpecialCodeGen|HW_Flag_SupportsContainment)
+HARDWARE_INTRINSIC(Vector128,     Dot,                                                        16,      2,     {INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},     HW_Category_Helper,                HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128,     get_AllBitsSet,                                             16,      0,     {INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni},        HW_Category_Helper,                HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(Vector128,     get_Count,                                                  16,      0,     {INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},     HW_Category_Helper,                HW_Flag_NoCodeGen|HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(Vector128,     get_Zero,                                                   16,      0,     {INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi,           INS_movi},        HW_Category_Helper,                HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
diff --git a/src/coreclr/src/jit/hwintrinsiclistxarch.h b/src/coreclr/src/jit/hwintrinsiclistxarch.h
index c6017fb12c44ca..02d1edb15fae12 100644
--- a/src/coreclr/src/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/src/jit/hwintrinsiclistxarch.h
@@ -45,6 +45,7 @@ HARDWARE_INTRINSIC(Vector128,       AsVector4,
 HARDWARE_INTRINSIC(Vector128,       AsVector128,                                16,             1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(Vector128,       Create,                                     16,            -1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128,       CreateScalarUnsafe,                         16,             1,      {INS_mov_i2xmm,         INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_movss,              INS_movsdsse2},         HW_Category_SIMDScalar,             HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(Vector128,       Dot,                                        16,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_NoCodeGen)
 // The instruction generated for float/double depends on which ISAs are supported
 HARDWARE_INTRINSIC(Vector128,       get_AllBitsSet,                             16,             0,      {INS_pcmpeqd,           INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_cmpps,              INS_cmppd},             HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(Vector128,       get_Count,                                  16,             0,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
@@ -81,6 +82,7 @@ HARDWARE_INTRINSIC(Vector256,       get_Count,
 HARDWARE_INTRINSIC(Vector256,       get_Zero,                                   32,             0,      {INS_xorps,             INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps},             HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(Vector256,       Create,                                     32,            -1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector256,       CreateScalarUnsafe,                         32,             1,      {INS_mov_i2xmm,         INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_movss,              INS_movsdsse2},         HW_Category_SIMDScalar,             HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(Vector256,       Dot,                                        32,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector256,       GetElement,                                 32,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector256,       GetLower,                                   32,             1,      {INS_movdqu,            INS_movdqu,             INS_movdqu,             INS_movdqu,             INS_movdqu,             INS_movdqu,             INS_movdqu,             INS_movdqu,             INS_movups,             INS_movupd},            HW_Category_SimpleSIMD,             HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(Vector256,       op_Equality,                                32,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_NoCodeGen)
@@ -346,7 +348,7 @@ HARDWARE_INTRINSIC(SSE3,            MoveLowAndDuplicate,
 //  SSSE3 Intrinsics
 HARDWARE_INTRINSIC(SSSE3,           Abs,                                        16,              1,     {INS_pabsb,             INS_invalid,            INS_pabsw,              INS_invalid,            INS_pabsd,              INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(SSSE3,           AlignRight,                                 16,              3,     {INS_palignr,           INS_palignr,            INS_palignr,            INS_palignr,            INS_palignr,            INS_palignr,            INS_palignr,            INS_palignr,            INS_invalid,            INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSSE3,           HorizontalAdd,                              16,              2,     {INS_invalid,           INS_invalid,            INS_phaddw,             INS_invalid,            INS_phaddd,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSSE3,           HorizontalAdd,                              16,              2,     {INS_invalid,           INS_invalid,            INS_phaddw,             INS_phaddw,             INS_phaddd,             INS_phaddd,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSSE3,           HorizontalAddSaturate,                      16,              2,     {INS_invalid,           INS_invalid,            INS_phaddsw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSSE3,           HorizontalSubtract,                         16,              2,     {INS_invalid,           INS_invalid,            INS_phsubw,             INS_invalid,            INS_phsubd,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSSE3,           HorizontalSubtractSaturate,                 16,              2,     {INS_invalid,           INS_invalid,            INS_phsubsw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
@@ -527,7 +529,7 @@ HARDWARE_INTRINSIC(AVX2,            GatherVector128,
 HARDWARE_INTRINSIC(AVX2,            GatherVector256,                            32,              3,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpgatherdd,         INS_vpgatherdd,         INS_vpgatherdq,         INS_vpgatherdq,         INS_vgatherdps,         INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(AVX2,            GatherMaskVector128,                        16,              5,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpgatherdd,         INS_vpgatherdd,         INS_vpgatherdq,         INS_vpgatherdq,         INS_vgatherdps,         INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(AVX2,            GatherMaskVector256,                        32,              5,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpgatherdd,         INS_vpgatherdd,         INS_vpgatherdq,         INS_vpgatherdq,         INS_vgatherdps,         INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2,            HorizontalAdd,                              32,              2,     {INS_invalid,           INS_invalid,            INS_phaddw,             INS_invalid,            INS_phaddd,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2,            HorizontalAdd,                              32,              2,     {INS_invalid,           INS_invalid,            INS_phaddw,             INS_phaddw,             INS_phaddd,             INS_phaddd,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2,            HorizontalAddSaturate,                      32,              2,     {INS_invalid,           INS_invalid,            INS_phaddsw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2,            HorizontalSubtract,                         32,              2,     {INS_invalid,           INS_invalid,            INS_phsubw,             INS_invalid,            INS_phsubd,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2,            HorizontalSubtractSaturate,                 32,              2,     {INS_invalid,           INS_invalid,            INS_phsubsw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
diff --git a/src/coreclr/src/jit/hwintrinsicxarch.cpp b/src/coreclr/src/jit/hwintrinsicxarch.cpp
index c7dfaf5f7311e5..3c40ac1d96d86c 100644
--- a/src/coreclr/src/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/src/jit/hwintrinsicxarch.cpp
@@ -790,10 +790,93 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic        intrinsic,
         {
             assert(sig->numArgs == 1);
 
-            if (compExactlyDependsOn(InstructionSet_SSE) && varTypeIsFloating(baseType))
+            bool isSupported = false;
+
+            switch (baseType)
+            {
+                case TYP_BYTE:
+                case TYP_UBYTE:
+                case TYP_SHORT:
+                case TYP_USHORT:
+                case TYP_INT:
+                case TYP_UINT:
+                {
+                    isSupported = compExactlyDependsOn(InstructionSet_SSE2);
+                    break;
+                }
+
+                case TYP_LONG:
+                case TYP_ULONG:
+                {
+                    isSupported = compExactlyDependsOn(InstructionSet_SSE2_X64);
+                    break;
+                }
+
+                case TYP_FLOAT:
+                case TYP_DOUBLE:
+                {
+                    isSupported = compExactlyDependsOn(InstructionSet_SSE);
+                    break;
+                }
+
+                default:
+                {
+                    unreached();
+                }
+            }
+
+            if (isSupported)
+            {
+                op1     = impSIMDPopStack(getSIMDTypeForSize(simdSize));
+                retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+            }
+            break;
+        }
+
+        case NI_Vector256_ToScalar:
+        {
+            assert(sig->numArgs == 1);
+
+            bool isSupported = false;
+
+            switch (baseType)
+            {
+                case TYP_BYTE:
+                case TYP_UBYTE:
+                case TYP_SHORT:
+                case TYP_USHORT:
+                case TYP_INT:
+                case TYP_UINT:
+                {
+                    isSupported = compExactlyDependsOn(InstructionSet_AVX);
+                    break;
+                }
+
+                case TYP_LONG:
+                case TYP_ULONG:
+                {
+                    isSupported =
+                        compExactlyDependsOn(InstructionSet_AVX) && compExactlyDependsOn(InstructionSet_SSE2_X64);
+                    break;
+                }
+
+                case TYP_FLOAT:
+                case TYP_DOUBLE:
+                {
+                    isSupported = compExactlyDependsOn(InstructionSet_AVX);
+                    break;
+                }
+
+                default:
+                {
+                    unreached();
+                }
+            }
+
+            if (isSupported)
             {
                 op1     = impSIMDPopStack(getSIMDTypeForSize(simdSize));
-                retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, 16);
+                retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
             }
             break;
         }
@@ -846,18 +929,6 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic        intrinsic,
             break;
         }
 
-        case NI_Vector256_ToScalar:
-        {
-            assert(sig->numArgs == 1);
-
-            if (compExactlyDependsOn(InstructionSet_AVX) && varTypeIsFloating(baseType))
-            {
-                op1     = impSIMDPopStack(getSIMDTypeForSize(simdSize));
-                retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, 32);
-            }
-            break;
-        }
-
         case NI_Vector256_get_Zero:
         case NI_Vector256_get_AllBitsSet:
         {
diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp
index 943a64bdf93306..a549620c24f89a 100644
--- a/src/coreclr/src/jit/importer.cpp
+++ b/src/coreclr/src/jit/importer.cpp
@@ -3519,7 +3519,11 @@ GenTree* Compiler::impIntrinsic(GenTree*                newobjThis,
 
             if ((ni > NI_SIMD_AS_HWINTRINSIC_START) && (ni < NI_SIMD_AS_HWINTRINSIC_END))
             {
-                return impSimdAsHWIntrinsic(ni, clsHnd, method, sig, mustExpand);
+                // These intrinsics aren't defined recursively and so they will never be mustExpand
+                // Instead, they provide software fallbacks that will be executed instead.
+
+                assert(!mustExpand);
+                return impSimdAsHWIntrinsic(ni, clsHnd, method, sig, newobjThis);
             }
 #endif // FEATURE_HW_INTRINSICS
         }
diff --git a/src/coreclr/src/jit/lower.h b/src/coreclr/src/jit/lower.h
index 49b72630eb7078..01e1e401101851 100644
--- a/src/coreclr/src/jit/lower.h
+++ b/src/coreclr/src/jit/lower.h
@@ -326,11 +326,14 @@ class Lowering final : public Phase
     void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
     void LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp);
     void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
+    void LowerHWIntrinsicDot(GenTreeHWIntrinsic* node);
     void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
 
-#ifdef TARGET_ARM64
+#if defined(TARGET_XARCH)
+    void LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node);
+#elif defined(TARGET_ARM64)
     bool IsValidConstForMovImm(GenTreeHWIntrinsic* node);
-#endif // TARGET_ARM64
+#endif // !TARGET_XARCH && !TARGET_ARM64
 
     union VectorConstant {
         int8_t   i8[32];
@@ -411,11 +414,26 @@ class Lowering final : public Phase
             case TYP_LONG:
             case TYP_ULONG:
             {
-                if (arg->OperIs(GT_CNS_LNG))
+#if defined(TARGET_64BIT)
+                if (arg->IsCnsIntOrI())
                 {
-                    vecCns.i64[argIdx] = static_cast<int64_t>(arg->AsLngCon()->gtLconVal);
+                    vecCns.i64[argIdx] = static_cast<int64_t>(arg->AsIntCon()->gtIconVal);
+                    return true;
+                }
+#else
+                if (arg->OperIsLong() && arg->AsOp()->gtOp1->IsCnsIntOrI() && arg->AsOp()->gtOp2->IsCnsIntOrI())
+                {
+                    // 32-bit targets will decompose GT_CNS_LNG into two GT_CNS_INT
+                    // We need to reconstruct the 64-bit value in order to handle this. The low half (gtOp1)
+                    // is zero-extended first so a negative low half cannot overwrite the high half (gtOp2).
+                    INT64 gtLconVal = arg->AsOp()->gtOp2->AsIntCon()->gtIconVal;
+                    gtLconVal <<= 32;
+                    gtLconVal |= static_cast<uint32_t>(arg->AsOp()->gtOp1->AsIntCon()->gtIconVal);
+
+                    vecCns.i64[argIdx] = gtLconVal;
                     return true;
                 }
+#endif // TARGET_64BIT
                 else
                 {
                     // We expect the VectorConstant to have been already zeroed
diff --git a/src/coreclr/src/jit/lowerarmarch.cpp b/src/coreclr/src/jit/lowerarmarch.cpp
index 84c664a7606e9b..69aa22c99fe271 100644
--- a/src/coreclr/src/jit/lowerarmarch.cpp
+++ b/src/coreclr/src/jit/lowerarmarch.cpp
@@ -553,6 +553,13 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
             return;
         }
 
+        case NI_Vector64_Dot:
+        case NI_Vector128_Dot:
+        {
+            LowerHWIntrinsicDot(node);
+            return;
+        }
+
         case NI_Vector64_op_Equality:
         case NI_Vector128_op_Equality:
         {
@@ -773,6 +780,8 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
 
     if ((simdSize == 8) && (simdType == TYP_DOUBLE))
     {
+        // TODO-Cleanup: Struct retyping means we have the wrong type here. We need to
+        //               manually fix it up so the simdType checks below are correct.
         simdType = TYP_SIMD8;
     }
 
@@ -887,7 +896,30 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
 
         assert((simdSize == 8) || (simdSize == 16));
 
-        UNATIVE_OFFSET cnsSize  = simdSize;
+        if ((argCnt == 1) || (simdSize == 8) || (vecCns.i64[0] == vecCns.i64[1]))
+        {
+            // If we are a single constant or if all parts are the same, we might be able to optimize
+            // this even further for certain values, such as Zero or AllBitsSet.
+
+            if (vecCns.i64[0] == 0)
+            {
+                node->gtOp1 = nullptr;
+                node->gtOp2 = nullptr;
+
+                node->gtHWIntrinsicId = NI_Vector128_get_Zero;
+                return;
+            }
+            else if (vecCns.i64[0] == -1)
+            {
+                node->gtOp1 = nullptr;
+                node->gtOp2 = nullptr;
+
+                node->gtHWIntrinsicId = NI_Vector128_get_AllBitsSet;
+                return;
+            }
+        }
+
+        UNATIVE_OFFSET cnsSize  = (simdSize == 12) ? 16 : simdSize;
         UNATIVE_OFFSET cnsAlign = cnsSize;
 
         CORINFO_FIELD_HANDLE hnd = comp->GetEmitter()->emitAnyConst(&vecCns, cnsSize, cnsAlign);
@@ -1013,6 +1045,230 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
 
     node->gtHWIntrinsicId = NI_AdvSimd_Insert;
 }
+
+//----------------------------------------------------------------------------------------------
+// Lowering::LowerHWIntrinsicDot: Lowers a Vector64 or Vector128 Dot call
+//
+//  Arguments:
+//     node - The hardware intrinsic node.
+//
+void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
+{
+    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
+    var_types      baseType    = node->gtSIMDBaseType;
+    unsigned       simdSize    = node->gtSIMDSize;
+    var_types      simdType    = Compiler::getSIMDTypeForSize(simdSize);
+
+    assert((intrinsicId == NI_Vector64_Dot) || (intrinsicId == NI_Vector128_Dot));
+    assert(varTypeIsSIMD(simdType));
+    assert(varTypeIsArithmetic(baseType));
+    assert(simdSize != 0);
+
+    GenTree* op1 = node->gtGetOp1();
+    GenTree* op2 = node->gtGetOp2();
+
+    assert(op1 != nullptr);
+    assert(op2 != nullptr);
+    assert(!op1->OperIsList());
+
+    // Spare GenTrees to be used for the lowering logic below
+    // Defined upfront to avoid naming conflicts, etc...
+    GenTree* idx  = nullptr;
+    GenTree* tmp1 = nullptr;
+    GenTree* tmp2 = nullptr;
+
+    if (simdSize == 12)
+    {
+        assert(baseType == TYP_FLOAT);
+
+        // For 12 byte SIMD, we need to clear the upper 4 bytes:
+        //   idx  =    CNS_INT       int    0x03
+        //   tmp1 = *  CNS_DBL       float  0.0
+        //          /--*  op1  simd16
+        //          +--*  idx  int
+        //          +--*  tmp1 simd16
+        //   op1  = *  HWINTRINSIC   simd16 T Insert
+        //   ...
+
+        // This is roughly the following managed code:
+        //    op1 = AdvSimd.Insert(op1, 0x03, 0.0f);
+        //    ...
+
+        idx = comp->gtNewIconNode(0x03, TYP_INT);
+        BlockRange().InsertAfter(op1, idx);
+
+        tmp1 = comp->gtNewZeroConNode(TYP_FLOAT);
+        BlockRange().InsertAfter(idx, tmp1);
+        LowerNode(tmp1);
+
+        op1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, op1, idx, tmp1, NI_AdvSimd_Insert, baseType, simdSize);
+        BlockRange().InsertAfter(tmp1, op1);
+        LowerNode(op1);
+    }
+
+    // We will be constructing the following parts:
+    //   ...
+    //          /--*  op1  simd16
+    //          +--*  op2  simd16
+    //   tmp1 = *  HWINTRINSIC   simd16 T Multiply
+    //   ...
+
+    // This is roughly the following managed code:
+    //   ...
+    //   var tmp1 = AdvSimd.Multiply(op1, op2);
+    //   ...
+
+    NamedIntrinsic multiply = (baseType == TYP_DOUBLE) ? NI_AdvSimd_Arm64_Multiply : NI_AdvSimd_Multiply;
+    assert(!varTypeIsLong(baseType));
+
+    tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, op1, op2, multiply, baseType, simdSize);
+    BlockRange().InsertBefore(node, tmp1);
+    LowerNode(tmp1);
+
+    if (varTypeIsFloating(baseType))
+    {
+        // We will be constructing the following parts:
+        //   ...
+        //          /--*  tmp1 simd16
+        //          *  STORE_LCL_VAR simd16
+        //   tmp1 =    LCL_VAR       simd16
+        //   tmp2 =    LCL_VAR       simd16
+        //   ...
+
+        // This is roughly the following managed code:
+        //   ...
+        //   var tmp2 = tmp1;
+        //   ...
+
+        node->gtOp1 = tmp1;
+        LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node);
+        ReplaceWithLclVar(tmp1Use);
+        tmp1 = node->gtOp1;
+
+        tmp2 = comp->gtClone(tmp1);
+        BlockRange().InsertAfter(tmp1, tmp2);
+
+        if (simdSize == 8)
+        {
+            assert(baseType == TYP_FLOAT);
+
+            // We will be constructing the following parts:
+            //   ...
+            //          /--*  tmp1 simd8
+            //          +--*  tmp2 simd8
+            //   tmp1 = *  HWINTRINSIC   simd8  T AddPairwise
+            //   ...
+
+            // This is roughly the following managed code:
+            //   ...
+            //   var tmp1 = AdvSimd.AddPairwise(tmp1, tmp2);
+            //   ...
+
+            tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_AddPairwise, baseType, simdSize);
+            BlockRange().InsertAfter(tmp2, tmp1);
+            LowerNode(tmp1);
+        }
+        else
+        {
+            assert((simdSize == 12) || (simdSize == 16));
+
+            // We will be constructing the following parts:
+            //   ...
+            //          /--*  tmp1 simd16
+            //          +--*  tmp2 simd16
+            //   tmp1 = *  HWINTRINSIC   simd16 T AddPairwise
+            //   ...
+
+            // This is roughly the following managed code:
+            //   ...
+            //   var tmp1 = AdvSimd.Arm64.AddPairwise(tmp1, tmp2);
+            //   ...
+
+            tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_Arm64_AddPairwise, baseType,
+                                                    simdSize);
+            BlockRange().InsertAfter(tmp2, tmp1);
+            LowerNode(tmp1);
+
+            if (baseType == TYP_FLOAT)
+            {
+                // Float needs an additional pairwise add to finish summing the parts
+                // The first will have summed e0 with e1 and e2 with e3 and then repeats that for the upper half
+                // So, we will have a vector that looks like this:
+                //    < e0 + e1, e2 + e3, e0 + e1, e2 + e3>
+                // Doing a second horizontal add with itself will then give us
+                //    e0 + e1 + e2 + e3 in all elements of the vector
+
+                // We will be constructing the following parts:
+                //   ...
+                //          /--*  tmp1 simd16
+                //          *  STORE_LCL_VAR simd16
+                //   tmp1 =    LCL_VAR       simd16
+                //   tmp2 =    LCL_VAR       simd16
+                //          /--*  tmp1 simd16
+                //          +--*  tmp2 simd16
+                //   tmp1 = *  HWINTRINSIC   simd16 T AddPairwise
+                //   ...
+
+                // This is roughly the following managed code:
+                //   ...
+                //   var tmp2 = tmp1;
+                //   var tmp1 = AdvSimd.Arm64.AddPairwise(tmp1, tmp2);
+                //   ...
+
+                node->gtOp1 = tmp1;
+                LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node);
+                ReplaceWithLclVar(tmp1Use);
+                tmp1 = node->gtOp1;
+
+                tmp2 = comp->gtClone(tmp1);
+                BlockRange().InsertAfter(tmp1, tmp2);
+
+                tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_Arm64_AddPairwise, baseType,
+                                                        simdSize);
+                BlockRange().InsertAfter(tmp2, tmp1);
+                LowerNode(tmp1);
+            }
+        }
+
+        tmp2 = tmp1;
+    }
+    else
+    {
+        assert(varTypeIsIntegral(baseType));
+
+        // We will be constructing the following parts:
+        //   ...
+        //          /--*  tmp1 simd16
+        //   tmp2 = *  HWINTRINSIC   simd16 T AddAcross
+        //   ...
+
+        // This is roughly the following managed code:
+        //   ...
+        //   var tmp2 = AdvSimd.Arm64.AddAcross(tmp1);
+        //   ...
+
+        tmp2 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, NI_AdvSimd_Arm64_AddAcross, baseType, simdSize);
+        BlockRange().InsertAfter(tmp1, tmp2);
+        LowerNode(tmp2);
+    }
+
+    // We will be constructing the following parts:
+    //   ...
+    //          /--*  tmp2 simd16
+    //   node = *  HWINTRINSIC   simd16 T ToScalar
+
+    // This is roughly the following managed code:
+    //   ...
+    //   return tmp2.ToScalar();
+
+    node->gtOp1 = tmp2;
+    node->gtOp2 = nullptr;
+
+    node->gtHWIntrinsicId = (simdSize == 8) ? NI_Vector64_ToScalar : NI_Vector128_ToScalar;
+    LowerNode(node);
+
+    return;
+}
 #endif // FEATURE_HW_INTRINSICS
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp
index 7b861c238dc85a..bf5d71aad36f12 100644
--- a/src/coreclr/src/jit/lowerxarch.cpp
+++ b/src/coreclr/src/jit/lowerxarch.cpp
@@ -943,6 +943,13 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
             return;
         }
 
+        case NI_Vector128_Dot:
+        case NI_Vector256_Dot:
+        {
+            LowerHWIntrinsicDot(node);
+            return;
+        }
+
         case NI_Vector128_op_Equality:
         case NI_Vector256_op_Equality:
         {
@@ -957,6 +964,13 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
             return;
         }
 
+        case NI_Vector128_ToScalar:
+        case NI_Vector256_ToScalar:
+        {
+            LowerHWIntrinsicToScalar(node);
+            break;
+        }
+
         case NI_SSE2_Insert:
         case NI_SSE41_Insert:
         case NI_SSE41_X64_Insert:
@@ -1350,7 +1364,7 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
 
         GenTree* tmp = comp->gtNewOperNode(GT_AND, TYP_INT, msk, mskCns);
         BlockRange().InsertAfter(mskCns, tmp);
-        LowerNode(msk);
+        LowerNode(tmp);
 
         msk = tmp;
 
@@ -1386,6 +1400,13 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
     unsigned       simdSize    = node->gtSIMDSize;
     VectorConstant vecCns      = {};
 
+    if ((simdSize == 8) && (simdType == TYP_DOUBLE))
+    {
+        // TODO-Cleanup: Struct retyping means we have the wrong type here. We need to
+        //               manually fix it up so the simdType checks below are correct.
+        simdType = TYP_SIMD8;
+    }
+
     assert(varTypeIsSIMD(simdType));
     assert(varTypeIsArithmetic(baseType));
     assert(simdSize != 0);
@@ -1455,22 +1476,72 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
         {
             for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest())
             {
-                BlockRange().Remove(argList->Current());
+                GenTree* arg = argList->Current();
+
+#if !defined(TARGET_64BIT)
+                if (arg->OperIsLong())
+                {
+                    BlockRange().Remove(arg->AsOp()->gtOp1);
+                    BlockRange().Remove(arg->AsOp()->gtOp2);
+                }
+#endif // !TARGET_64BIT
+
+                BlockRange().Remove(arg);
             }
         }
         else
         {
+#if !defined(TARGET_64BIT)
+            if (op1->OperIsLong())
+            {
+                BlockRange().Remove(op1->AsOp()->gtOp1);
+                BlockRange().Remove(op1->AsOp()->gtOp2);
+            }
+#endif // !TARGET_64BIT
+
             BlockRange().Remove(op1);
 
             if (op2 != nullptr)
             {
+#if !defined(TARGET_64BIT)
+                if (op2->OperIsLong())
+                {
+                    BlockRange().Remove(op2->AsOp()->gtOp1);
+                    BlockRange().Remove(op2->AsOp()->gtOp2);
+                }
+#endif // !TARGET_64BIT
+
                 BlockRange().Remove(op2);
             }
         }
 
-        assert((simdSize == 16) || (simdSize == 32));
+        assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32));
 
-        UNATIVE_OFFSET cnsSize  = simdSize;
+        if ((argCnt == 1) ||
+            ((vecCns.i64[0] == vecCns.i64[1]) && ((simdSize <= 16) || (vecCns.i64[2] == vecCns.i64[3]))))
+        {
+            // If we are a single constant or if all parts are the same, we might be able to optimize
+            // this even further for certain values, such as Zero or AllBitsSet.
+
+            if (vecCns.i64[0] == 0)
+            {
+                node->gtOp1 = nullptr;
+                node->gtOp2 = nullptr;
+                // Pick the intrinsic whose width agrees with the node's simdSize (8/12/16 use Vector128)
+                node->gtHWIntrinsicId = (simdSize == 32) ? NI_Vector256_get_Zero : NI_Vector128_get_Zero;
+                return;
+            }
+            else if (vecCns.i64[0] == -1)
+            {
+                node->gtOp1 = nullptr;
+                node->gtOp2 = nullptr;
+                // Pick the intrinsic whose width agrees with the node's simdSize (8/12/16 use Vector128)
+                node->gtHWIntrinsicId = (simdSize == 32) ? NI_Vector256_get_AllBitsSet : NI_Vector128_get_AllBitsSet;
+                return;
+            }
+        }
+
+        UNATIVE_OFFSET cnsSize  = (simdSize != 12) ? simdSize : 16;
         UNATIVE_OFFSET cnsAlign = (comp->compCodeOpt() != Compiler::SMALL_CODE) ? cnsSize : 1;
 
         CORINFO_FIELD_HANDLE hnd = comp->GetEmitter()->emitAnyConst(&vecCns, cnsSize, cnsAlign);
@@ -2244,7 +2315,7 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
                 //   return Sse41.X64.Insert(tmp1, op2, 0x01);
 
                 idx = comp->gtNewIconNode(0x01, TYP_INT);
-                BlockRange().InsertAfter(op2, idx);
+                BlockRange().InsertBefore(node, idx);
 
                 node->gtOp1 = comp->gtNewArgList(tmp1, op2, idx);
                 node->gtOp2 = nullptr;
@@ -2451,6 +2522,703 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
         }
     }
 }
+
+//----------------------------------------------------------------------------------------------
+// Lowering::LowerHWIntrinsicDot: Lowers a Vector128 or Vector256 Dot call
+//
+//  Arguments:
+//     node - The hardware intrinsic node.
+//
+void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
+{
+    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
+
+    var_types baseType    = node->gtSIMDBaseType;
+    unsigned  simdSize    = node->gtSIMDSize;
+    var_types simdType    = Compiler::getSIMDTypeForSize(simdSize);
+    unsigned  simd16Count = comp->getSIMDVectorLength(16, baseType);
+
+    assert((intrinsicId == NI_Vector128_Dot) || (intrinsicId == NI_Vector256_Dot));
+    assert(varTypeIsSIMD(simdType));
+    assert(varTypeIsArithmetic(baseType));
+    assert(simdSize != 0);
+
+    GenTree* op1 = node->gtGetOp1();
+    GenTree* op2 = node->gtGetOp2();
+
+    assert(op1 != nullptr);
+    assert(op2 != nullptr);
+    assert(!op1->OperIsList());
+
+    // Spare GenTrees to be used for the lowering logic below
+    // Defined upfront to avoid naming conflicts, etc...
+    GenTree* idx  = nullptr;
+    GenTree* tmp1 = nullptr;
+    GenTree* tmp2 = nullptr;
+    GenTree* tmp3 = nullptr;
+
+    NamedIntrinsic multiply      = NI_Illegal;
+    NamedIntrinsic horizontalAdd = NI_Illegal;
+    NamedIntrinsic add           = NI_Illegal;
+    NamedIntrinsic shuffle       = NI_Illegal;
+
+    if (simdSize == 32)
+    {
+        assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2));
+
+        switch (baseType)
+        {
+            case TYP_SHORT:
+            case TYP_USHORT:
+            case TYP_INT:
+            case TYP_UINT:
+            {
+                multiply      = NI_AVX2_MultiplyLow;
+                horizontalAdd = NI_AVX2_HorizontalAdd;
+                add           = NI_AVX2_Add;
+                break;
+            }
+
+            case TYP_FLOAT:
+            {
+                // We will be constructing the following parts:
+                //   idx  =    CNS_INT       int    0xF1
+                //          /--*  op1  simd16
+                //          +--*  op2  simd16
+                //          +--*  idx  int
+                //   tmp1 = *  HWINTRINSIC   simd16 T DotProduct
+                //          /--*  tmp1 simd16
+                //          *  STORE_LCL_VAR simd16
+                //   tmp1 =    LCL_VAR       simd16
+                //   tmp2 =    LCL_VAR       simd16
+                //   idx  =    CNS_INT       int    0x01
+                //          /--*  tmp2 simd16
+                //          +--*  idx  int
+                //   tmp2 = *  HWINTRINSIC   simd16 T ExtractVector128
+                //          /--*  tmp1 simd16
+                //          +--*  tmp2 simd16
+                //   tmp3 = *  HWINTRINSIC   simd16 T Add
+                //          /--*  tmp3 simd16
+                //   node = *  HWINTRINSIC   simd16 T ToScalar
+
+                // This is roughly the following managed code:
+                //   var tmp1 = Avx.DotProduct(op1, op2, 0xF1);
+                //   var tmp2 = Avx.ExtractVector128(tmp1, 0x01);
+                //   var tmp3 = Sse.Add(tmp1, tmp2);
+                //   return tmp3.ToScalar();
+
+                idx = comp->gtNewIconNode(0xF1, TYP_INT);
+                BlockRange().InsertBefore(node, idx);
+
+                tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_AVX_DotProduct, baseType, simdSize);
+                BlockRange().InsertAfter(idx, tmp1);
+                LowerNode(tmp1);
+
+                node->gtOp1 = tmp1;
+                LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node);
+                ReplaceWithLclVar(tmp1Use);
+                tmp1 = node->gtOp1;
+
+                tmp2 = comp->gtClone(tmp1);
+                BlockRange().InsertAfter(tmp1, tmp2);
+
+                idx = comp->gtNewIconNode(0x01, TYP_INT);
+                BlockRange().InsertAfter(tmp2, idx);
+
+                tmp2 =
+                    comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, baseType, simdSize);
+                BlockRange().InsertAfter(idx, tmp2);
+                LowerNode(tmp2);
+
+                tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_SSE_Add, baseType, 16);
+                BlockRange().InsertAfter(tmp2, tmp3);
+                LowerNode(tmp3);
+
+                node->gtSIMDSize = 16;
+
+                node->gtOp1 = tmp3;
+                node->gtOp2 = nullptr;
+
+                node->gtHWIntrinsicId = NI_Vector128_ToScalar;
+                LowerNode(node);
+
+                return;
+            }
+
+            case TYP_DOUBLE:
+            {
+                multiply      = NI_AVX_Multiply;
+                horizontalAdd = NI_AVX_HorizontalAdd;
+                add           = NI_AVX_Add;
+                break;
+            }
+
+            default:
+            {
+                unreached();
+            }
+        }
+    }
+    else
+    {
+        assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2));
+
+        switch (baseType)
+        {
+            case TYP_SHORT:
+            case TYP_USHORT:
+            {
+                multiply      = NI_SSE2_MultiplyLow;
+                horizontalAdd = NI_SSSE3_HorizontalAdd;
+                add           = NI_SSE2_Add;
+
+                if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSSE3))
+                {
+                    shuffle = NI_SSE2_ShuffleLow;
+                }
+                break;
+            }
+
+            case TYP_INT:
+            case TYP_UINT:
+            {
+                multiply      = NI_SSE41_MultiplyLow;
+                horizontalAdd = NI_SSSE3_HorizontalAdd;
+                add           = NI_SSE2_Add;
+
+                assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41));
+                break;
+            }
+
+            case TYP_FLOAT:
+            {
+                if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+                {
+                    // We will be constructing the following parts:
+                    //   idx  =    CNS_INT       int    0xF1 (0x31 / 0x71 for 8 / 12 byte vectors)
+                    //          /--*  op1  simd16
+                    //          +--*  op2  simd16
+                    //          +--*  idx  int
+                    //   tmp3 = *  HWINTRINSIC   simd16 T DotProduct
+                    //          /--*  tmp3 simd16
+                    //   node = *  HWINTRINSIC   simd16 T ToScalar
+
+                    // This is roughly the following managed code:
+                    //   var tmp3 = Sse41.DotProduct(op1, op2, idx);
+                    //   return tmp3.ToScalar();
+
+                    if (simdSize == 8)
+                    {
+                        idx = comp->gtNewIconNode(0x31, TYP_INT);
+                    }
+                    else if (simdSize == 12)
+                    {
+                        idx = comp->gtNewIconNode(0x71, TYP_INT);
+                    }
+                    else
+                    {
+                        assert(simdSize == 16);
+                        idx = comp->gtNewIconNode(0xF1, TYP_INT);
+                    }
+                    BlockRange().InsertBefore(node, idx);
+
+                    tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE41_DotProduct, baseType,
+                                                          simdSize);
+                    BlockRange().InsertAfter(idx, tmp3);
+                    LowerNode(tmp3);
+
+                    node->gtOp1 = tmp3;
+                    node->gtOp2 = nullptr;
+
+                    node->gtHWIntrinsicId = NI_Vector128_ToScalar;
+                    LowerNode(node);
+
+                    return;
+                }
+
+                multiply      = NI_SSE_Multiply;
+                horizontalAdd = NI_SSE3_HorizontalAdd;
+                add           = NI_SSE_Add;
+
+                if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3))
+                {
+                    shuffle = NI_SSE_Shuffle;
+                }
+                break;
+            }
+
+            case TYP_DOUBLE:
+            {
+                if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+                {
+                    // We will be constructing the following parts:
+                    //   idx  =    CNS_INT       int    0x31
+                    //          /--*  op1  simd16
+                    //          +--*  op2  simd16
+                    //          +--*  idx  int
+                    //   tmp3 = *  HWINTRINSIC   simd16 T DotProduct
+                    //          /--*  tmp3 simd16
+                    //   node = *  HWINTRINSIC   simd16 T ToScalar
+
+                    // This is roughly the following managed code:
+                    //   var tmp3 = Sse41.DotProduct(op1, op2, 0x31);
+                    //   return tmp3.ToScalar();
+
+                    idx = comp->gtNewIconNode(0x31, TYP_INT);
+                    BlockRange().InsertBefore(node, idx);
+
+                    tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE41_DotProduct, baseType,
+                                                          simdSize);
+                    BlockRange().InsertAfter(idx, tmp3);
+                    LowerNode(tmp3);
+
+                    node->gtOp1 = tmp3;
+                    node->gtOp2 = nullptr;
+
+                    node->gtHWIntrinsicId = NI_Vector128_ToScalar;
+                    LowerNode(node);
+
+                    return;
+                }
+
+                multiply      = NI_SSE2_Multiply;
+                horizontalAdd = NI_SSE3_HorizontalAdd;
+                add           = NI_SSE2_Add;
+
+                if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3))
+                {
+                    shuffle = NI_SSE2_Shuffle;
+                }
+                break;
+            }
+
+            default:
+            {
+                unreached();
+            }
+        }
+
+        if (simdSize == 8)
+        {
+            assert(baseType == TYP_FLOAT);
+
+            // If simdSize == 8 then we have only two elements, not the 4 that we got from getSIMDVectorLength,
+            // which we gave a simdSize of 16. So, we set the simd16Count to 2 so that only 1 hadd will
+            // be emitted rather than 2, so that the upper two elements will be ignored.
+
+            simd16Count = 2;
+        }
+        else if (simdSize == 12)
+        {
+            assert(baseType == TYP_FLOAT);
+
+            // We will be constructing the following parts:
+            //   ...
+            //          +--*  CNS_INT    int    -1
+            //          +--*  CNS_INT    int    -1
+            //          +--*  CNS_INT    int    -1
+            //          +--*  CNS_INT    int    0
+            //   tmp1 = *  HWINTRINSIC   simd16 T Create
+            //          /--*  op1 simd16
+            //          +--*  tmp1 simd16
+            //   op1  = *  HWINTRINSIC   simd16 T And
+            //   ...
+
+            // This is roughly the following managed code:
+            //   ...
+            //   tmp1 = Vector128.Create(-1, -1, -1, 0);
+            //   op1  = Sse.And(op1, tmp1);
+            //   ...
+
+            GenTree* cns0 = comp->gtNewIconNode(-1, TYP_INT);
+            BlockRange().InsertAfter(op1, cns0);
+
+            GenTree* cns1 = comp->gtNewIconNode(-1, TYP_INT);
+            BlockRange().InsertAfter(cns0, cns1);
+
+            GenTree* cns2 = comp->gtNewIconNode(-1, TYP_INT);
+            BlockRange().InsertAfter(cns1, cns2);
+
+            GenTree* cns3 = comp->gtNewIconNode(0, TYP_INT);
+            BlockRange().InsertAfter(cns2, cns3);
+
+            tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, cns0, cns1, cns2, cns3, NI_Vector128_Create, TYP_INT, 16);
+            BlockRange().InsertAfter(cns3, tmp1);
+            LowerNode(tmp1);
+
+            op1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, tmp1, NI_SSE_And, baseType, simdSize);
+            BlockRange().InsertAfter(tmp1, op1);
+            LowerNode(op1);
+        }
+    }
+
+    // We will be constructing the following parts:
+    //          /--*  op1  simd16
+    //          +--*  op2  simd16
+    //   tmp1 = *  HWINTRINSIC   simd16 T Multiply
+    //   ...
+
+    // This is roughly the following managed code:
+    //   var tmp1 = Isa.Multiply(op1, op2);
+    //   ...
+
+    tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, multiply, baseType, simdSize);
+    BlockRange().InsertBefore(node, tmp1);
+    LowerNode(tmp1);
+
+    // HorizontalAdd combines pairs so we need log2(simd16Count) passes to sum all elements together.
+    int haddCount = genLog2(simd16Count);
+
+    for (int i = 0; i < haddCount; i++)
+    {
+        // We will be constructing the following parts:
+        //   ...
+        //          /--*  tmp1 simd16
+        //          *  STORE_LCL_VAR simd16
+        //   tmp1 =    LCL_VAR       simd16
+        //   tmp2 =    LCL_VAR       simd16
+        //   ...
+
+        // This is roughly the following managed code:
+        //   ...
+        //   tmp2 = tmp1;
+        //   ...
+
+        node->gtOp1 = tmp1;
+        LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node);
+        ReplaceWithLclVar(tmp1Use);
+        tmp1 = node->gtOp1;
+
+        tmp2 = comp->gtClone(tmp1);
+        BlockRange().InsertAfter(tmp1, tmp2);
+
+        if (shuffle == NI_Illegal)
+        {
+            // We will be constructing the following parts:
+            //   ...
+            //          /--*  tmp1 simd16
+            //          +--*  tmp2 simd16
+            //   tmp1 = *  HWINTRINSIC   simd16 T HorizontalAdd
+            //   ...
+
+            // This is roughly the following managed code:
+            //   ...
+            //   tmp1 = Isa.HorizontalAdd(tmp1, tmp2);
+            //   ...
+
+            tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, horizontalAdd, baseType, simdSize);
+        }
+        else
+        {
+            int shuffleConst = 0x00;
+
+            switch (i)
+            {
+                case 0:
+                {
+                    assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT) || varTypeIsFloating(baseType));
+
+                    // Adds (e0 + e1, e1 + e0, e2 + e3, e3 + e2), giving:
+                    //   e0, e1, e2, e3 | e4, e5, e6, e7
+                    //   e1, e0, e3, e2 | e5, e4, e7, e6
+                    //   ...
+
+                    shuffleConst = 0xB1;
+                    break;
+                }
+
+                case 1:
+                {
+                    assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT) || (baseType == TYP_FLOAT));
+
+                    // Adds (e0 + e2, e1 + e3, e2 + e0, e3 + e1), giving:
+                    //   ...
+                    //   e2, e3, e0, e1 | e6, e7, e4, e5
+                    //   e3, e2, e1, e0 | e7, e6, e5, e4
+
+                    shuffleConst = 0x4E;
+                    break;
+                }
+
+                case 2:
+                {
+                    assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT));
+
+                    // Adds (e0 + e4, e1 + e5, e2 + e6, e3 + e7), giving:
+                    //   ...
+                    //   e4, e5, e6, e7 | e0, e1, e2, e3
+                    //   e5, e4, e7, e6 | e1, e0, e3, e2
+                    //   e6, e7, e4, e5 | e2, e3, e0, e1
+                    //   e7, e6, e5, e4 | e3, e2, e1, e0
+
+                    shuffleConst = 0x4D;
+                    break;
+                }
+
+                default:
+                {
+                    unreached();
+                }
+            }
+
+            idx = comp->gtNewIconNode(shuffleConst, TYP_INT);
+            BlockRange().InsertAfter(tmp2, idx);
+
+            if (varTypeIsFloating(baseType))
+            {
+                // We will be constructing the following parts:
+                //   ...
+                //          /--*  tmp2 simd16
+                //          *  STORE_LCL_VAR simd16
+                //   tmp2 =    LCL_VAR       simd16
+                //   tmp3 =    LCL_VAR       simd16
+                //   idx  =    CNS_INT       int    shuffleConst
+                //          /--*  tmp2 simd16
+                //          +--*  tmp3 simd16
+                //          +--*  idx  simd16
+                //   tmp2 = *  HWINTRINSIC   simd16 T Shuffle
+                //   ...
+
+                // This is roughly the following managed code:
+                //   ...
+                //   tmp3 = tmp2;
+                //   tmp2 = Isa.Shuffle(tmp2, tmp3, shuffleConst);
+                //   ...
+
+                node->gtOp1 = tmp2;
+                LIR::Use tmp2Use(BlockRange(), &node->gtOp1, node);
+                ReplaceWithLclVar(tmp2Use);
+                tmp2 = node->gtOp1;
+
+                tmp3 = comp->gtClone(tmp2);
+                BlockRange().InsertAfter(tmp2, tmp3);
+
+                tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, tmp3, idx, shuffle, baseType, simdSize);
+            }
+            else
+            {
+                assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT));
+
+                if (i < 2)
+                {
+                    // We will be constructing the following parts:
+                    //   ...
+                    //   idx  =    CNS_INT       int    shuffleConst
+                    //          /--*  tmp2 simd16
+                    //          +--*  idx  simd16
+                    //   tmp2 = *  HWINTRINSIC   simd16 T ShuffleLow
+                    //   idx  =    CNS_INT       int    shuffleConst
+                    //          /--*  tmp2 simd16
+                    //          +--*  idx  simd16
+                    //   tmp2 = *  HWINTRINSIC   simd16 T ShuffleHigh
+                    //   ...
+
+                    // This is roughly the following managed code:
+                    //   ...
+                    //   tmp2 = Isa.Shuffle(tmp1, shuffleConst);
+                    //   ...
+
+                    tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_ShuffleLow, baseType, simdSize);
+                    BlockRange().InsertAfter(idx, tmp2);
+                    LowerNode(tmp2);
+
+                    idx = comp->gtNewIconNode(shuffleConst, TYP_INT);
+                    BlockRange().InsertAfter(tmp2, idx);
+
+                    tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_ShuffleHigh, baseType, simdSize);
+                }
+                else
+                {
+                    assert(i == 2);
+
+                    // We will be constructing the following parts:
+                    //   ...
+                    //   idx  =    CNS_INT       int    shuffleConst
+                    //          /--*  tmp2 simd16
+                    //          +--*  idx  simd16
+                    //   tmp2 = *  HWINTRINSIC   simd16 T Shuffle
+                    //   ...
+
+                    // This is roughly the following managed code:
+                    //   ...
+                    //   tmp2 = Isa.Shuffle(tmp1, shuffleConst);
+                    //   ...
+
+                    tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_Shuffle, TYP_INT, simdSize);
+                }
+            }
+
+            BlockRange().InsertAfter(idx, tmp2);
+            LowerNode(tmp2);
+
+            // We will be constructing the following parts:
+            //   ...
+            //          /--*  tmp1 simd16
+            //          +--*  tmp2 simd16
+            //   tmp1 = *  HWINTRINSIC   simd16 T Add
+            //   ...
+
+            // This is roughly the following managed code:
+            //   ...
+            //   tmp1 = Isa.Add(tmp1, tmp2);
+            //   ...
+
+            tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, add, baseType, simdSize);
+        }
+
+        BlockRange().InsertAfter(tmp2, tmp1);
+        LowerNode(tmp1);
+    }
+
+    if (simdSize == 32)
+    {
+        // We will be constructing the following parts:
+        //   ...
+        //          /--*  tmp1 simd16
+        //          *  STORE_LCL_VAR simd16
+        //   tmp1 =    LCL_VAR       simd16
+        //   tmp2 =    LCL_VAR       simd16
+        //   idx  =    CNS_INT       int    0x01
+        //          /--*  tmp2 simd16
+        //          +--*  idx  int
+        //   tmp2 = *  HWINTRINSIC   simd16 T ExtractVector128
+        //          /--*  tmp1 simd16
+        //          +--*  tmp2 simd16
+        //   tmp1 = *  HWINTRINSIC   simd16 T Add
+        //   ...
+
+        // This is roughly the following managed code:
+        //   ...
+        //   var tmp2 = tmp1;
+        //       tmp2 = Avx.ExtractVector128(tmp2, 0x01);
+        //   var tmp1 = Isa.Add(tmp1, tmp2);
+        //   ...
+
+        node->gtOp1 = tmp1;
+        LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node);
+        ReplaceWithLclVar(tmp1Use);
+        tmp1 = node->gtOp1;
+
+        tmp2 = comp->gtClone(tmp1);
+        BlockRange().InsertAfter(tmp1, tmp2);
+
+        idx = comp->gtNewIconNode(0x01, TYP_INT);
+        BlockRange().InsertAfter(tmp2, idx);
+
+        tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, baseType, simdSize);
+        BlockRange().InsertAfter(idx, tmp2);
+        LowerNode(tmp2);
+
+        tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, add, baseType, 16);
+        BlockRange().InsertAfter(tmp2, tmp1);
+        LowerNode(tmp1);
+
+        node->gtSIMDSize = 16;
+    }
+
+    // We will be constructing the following parts:
+    //   ...
+    //          /--*  tmp1 simd16
+    //   node = *  HWINTRINSIC   simd16 T ToScalar
+
+    // This is roughly the following managed code:
+    //   ...
+    //   return tmp1.ToScalar();
+
+    node->gtOp1 = tmp1;
+    node->gtOp2 = nullptr;
+
+    node->gtHWIntrinsicId = NI_Vector128_ToScalar;
+    LowerNode(node);
+
+    return;
+}
+
+//----------------------------------------------------------------------------------------------
+// Lowering::LowerHWIntrinsicToScalar: Lowers a Vector128 or Vector256 ToScalar call
+//
+//  Arguments:
+//     node - The hardware intrinsic node.
+//
+//  Notes:
+//     For integral base types the node is rewritten in place to one of the SSE2 ConvertTo*
+//     intrinsics and lowered again. TYP_BYTE/TYP_SHORT/TYP_INT are retyped to TYP_INT and
+//     TYP_UBYTE/TYP_USHORT/TYP_UINT to TYP_UINT; when the original base type is smaller than
+//     4 bytes a cast back to that base type is inserted after the node.
+//     TYP_LONG/TYP_ULONG are only handled when TARGET_AMD64 is defined; otherwise they reach
+//     the default case, which is unreached().
+//     For TYP_FLOAT/TYP_DOUBLE the intrinsic id is left unchanged and only a containment check
+//     is performed.
+//
+void Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node)
+{
+    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
+    var_types      baseType    = node->gtSIMDBaseType;
+    unsigned       simdSize    = node->gtSIMDSize;
+    var_types      simdType    = Compiler::getSIMDTypeForSize(simdSize);
+
+    assert((intrinsicId == NI_Vector128_ToScalar) || (intrinsicId == NI_Vector256_ToScalar));
+    assert(varTypeIsSIMD(simdType));
+    assert(varTypeIsArithmetic(baseType));
+    assert(simdSize != 0);
+
+    switch (baseType)
+    {
+        case TYP_BYTE:
+        case TYP_SHORT:
+        case TYP_INT:
+        {
+            // Signed types of up to 4 bytes: retype the node to TYP_INT and use ConvertToInt32.
+            node->gtType          = TYP_INT;
+            node->gtSIMDBaseType  = TYP_INT;
+            node->gtHWIntrinsicId = NI_SSE2_ConvertToInt32;
+            break;
+        }
+
+        case TYP_UBYTE:
+        case TYP_USHORT:
+        case TYP_UINT:
+        {
+            // Unsigned types of up to 4 bytes: retype the node to TYP_UINT and use ConvertToUInt32.
+            node->gtType          = TYP_UINT;
+            node->gtSIMDBaseType  = TYP_UINT;
+            node->gtHWIntrinsicId = NI_SSE2_ConvertToUInt32;
+            break;
+        }
+
+#if defined(TARGET_AMD64)
+        case TYP_LONG:
+        {
+            node->gtHWIntrinsicId = NI_SSE2_X64_ConvertToInt64;
+            break;
+        }
+
+        case TYP_ULONG:
+        {
+            node->gtHWIntrinsicId = NI_SSE2_X64_ConvertToUInt64;
+            break;
+        }
+#endif // TARGET_AMD64
+
+        case TYP_FLOAT:
+        case TYP_DOUBLE:
+        {
+            // The node stays a ToScalar intrinsic; nothing is rewritten, so there is no re-lowering.
+            ContainCheckHWIntrinsic(node);
+            return;
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+
+    // The intrinsic id was changed above, so lower the node again as the new intrinsic.
+    LowerNode(node);
+
+    if (genTypeSize(baseType) < 4)
+    {
+        // The node now has a 4-byte type, so add a cast back to the original small base type.
+        // The existing user of node (if any) is looked up before the cast is inserted, since the
+        // cast also consumes node.
+        LIR::Use use;
+        bool     foundUse = BlockRange().TryGetUse(node, &use);
+
+        GenTreeCast* cast = comp->gtNewCastNode(baseType, node, node->IsUnsigned(), baseType);
+        BlockRange().InsertAfter(node, cast);
+
+        if (foundUse)
+        {
+            // Redirect the original user to consume the cast instead of node.
+            use.ReplaceWith(comp, cast);
+        }
+        LowerNode(cast);
+    }
+}
 #endif // FEATURE_HW_INTRINSICS
 
 //----------------------------------------------------------------------------------------------
diff --git a/src/coreclr/src/jit/lsraarm64.cpp b/src/coreclr/src/jit/lsraarm64.cpp
index df0c7113fa6799..4113c891d2aed5 100644
--- a/src/coreclr/src/jit/lsraarm64.cpp
+++ b/src/coreclr/src/jit/lsraarm64.cpp
@@ -841,10 +841,7 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
         }
         break;
 
-        case SIMDIntrinsicAdd:
         case SIMDIntrinsicSub:
-        case SIMDIntrinsicMul:
-        case SIMDIntrinsicDiv:
         case SIMDIntrinsicBitwiseAnd:
         case SIMDIntrinsicBitwiseOr:
         case SIMDIntrinsicEqual:
@@ -896,19 +893,11 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
             // We have an array and an index, which may be contained.
             break;
 
-        case SIMDIntrinsicDotProduct:
-            buildInternalFloatRegisterDefForNode(simdTree);
-            break;
-
         case SIMDIntrinsicInitArrayX:
         case SIMDIntrinsicInitFixed:
         case SIMDIntrinsicCopyToArray:
         case SIMDIntrinsicCopyToArrayX:
         case SIMDIntrinsicNone:
-        case SIMDIntrinsicGetCount:
-        case SIMDIntrinsicGetOne:
-        case SIMDIntrinsicGetZero:
-        case SIMDIntrinsicGetAllOnes:
         case SIMDIntrinsicGetX:
         case SIMDIntrinsicGetY:
         case SIMDIntrinsicGetZ:
diff --git a/src/coreclr/src/jit/lsraxarch.cpp b/src/coreclr/src/jit/lsraxarch.cpp
index df8c897dc14bd5..f90412d6a8adaa 100644
--- a/src/coreclr/src/jit/lsraxarch.cpp
+++ b/src/coreclr/src/jit/lsraxarch.cpp
@@ -1933,67 +1933,14 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
             // We have an array and an index, which may be contained.
             break;
 
-        case SIMDIntrinsicDiv:
-            // SSE2 has no instruction support for division on integer vectors
-            noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
-            break;
-
-        case SIMDIntrinsicAdd:
         case SIMDIntrinsicSub:
-        case SIMDIntrinsicMul:
         case SIMDIntrinsicBitwiseAnd:
         case SIMDIntrinsicBitwiseOr:
-            // SSE2 32-bit integer multiplication requires two temp regs
-            if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
-                compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
-            {
-                buildInternalFloatRegisterDefForNode(simdTree);
-                buildInternalFloatRegisterDefForNode(simdTree);
-            }
             break;
 
         case SIMDIntrinsicEqual:
             break;
 
-        case SIMDIntrinsicDotProduct:
-            // Float/Double vectors:
-            // For SSE, or AVX with 32-byte vectors, we also need an internal register
-            // as scratch. Further we need the targetReg and internal reg to be distinct
-            // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
-            // don't need a tmpReg.
-            //
-            // 32-byte integer vector on SSE4/AVX:
-            // will take advantage of phaddd, which operates only on 128-bit xmm reg.
-            // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
-            // registers since targetReg is an int type register.
-            //
-            // See genSIMDIntrinsicDotProduct() for details on code sequence generated
-            // and the need for scratch registers.
-            if (varTypeIsFloating(simdTree->gtSIMDBaseType))
-            {
-                if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) ||
-                    (simdTree->gtGetOp1()->TypeGet() == TYP_SIMD32))
-                {
-                    buildInternalFloatRegisterDefForNode(simdTree);
-                    setInternalRegsDelayFree = true;
-                }
-                // else don't need scratch reg(s).
-            }
-            else
-            {
-                assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
-
-                // No need to setInternalRegsDelayFree since targetReg is a
-                // an int type reg and guaranteed to be different from xmm/ymm
-                // regs.
-                buildInternalFloatRegisterDefForNode(simdTree);
-                if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
-                {
-                    buildInternalFloatRegisterDefForNode(simdTree);
-                }
-            }
-            break;
-
         case SIMDIntrinsicGetItem:
         {
             // This implements get_Item method. The sources are:
@@ -2163,10 +2110,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
         case SIMDIntrinsicGetY:
         case SIMDIntrinsicGetZ:
         case SIMDIntrinsicGetW:
-        case SIMDIntrinsicGetOne:
-        case SIMDIntrinsicGetZero:
-        case SIMDIntrinsicGetCount:
-        case SIMDIntrinsicGetAllOnes:
             assert(!"Get intrinsics should not be seen during Lowering.");
             unreached();
 
diff --git a/src/coreclr/src/jit/simd.cpp b/src/coreclr/src/jit/simd.cpp
index 345351947fba35..ed92766cc29aff 100644
--- a/src/coreclr/src/jit/simd.cpp
+++ b/src/coreclr/src/jit/simd.cpp
@@ -1074,14 +1074,10 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in
     {
         case SIMDIntrinsicInit:
         case SIMDIntrinsicGetItem:
-        case SIMDIntrinsicAdd:
         case SIMDIntrinsicSub:
-        case SIMDIntrinsicMul:
-        case SIMDIntrinsicDiv:
         case SIMDIntrinsicEqual:
         case SIMDIntrinsicBitwiseAnd:
         case SIMDIntrinsicBitwiseOr:
-        case SIMDIntrinsicDotProduct:
         case SIMDIntrinsicCast:
         case SIMDIntrinsicConvertToSingle:
         case SIMDIntrinsicConvertToDouble:
@@ -1837,7 +1833,7 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE                opcode,
 #error Unsupported platform
 #endif // !TARGET_XARCH && !TARGET_ARM64
 
-    if (!compOpportunisticallyDependsOn(minimumIsa))
+    if (!compOpportunisticallyDependsOn(minimumIsa) || !JitConfig.EnableHWIntrinsic())
     {
         // The user disabled support for the baseline ISA so
         // don't emit any SIMD intrinsics as they all require
@@ -1880,38 +1876,6 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE                opcode,
 
     switch (simdIntrinsicID)
     {
-        case SIMDIntrinsicGetCount:
-        {
-            int            length       = getSIMDVectorLength(clsHnd);
-            GenTreeIntCon* intConstTree = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, length);
-            retVal                      = intConstTree;
-
-            intConstTree->gtFlags |= GTF_ICON_SIMD_COUNT;
-        }
-        break;
-
-        case SIMDIntrinsicGetZero:
-            retVal = gtNewSIMDVectorZero(simdType, baseType, size);
-            break;
-
-        case SIMDIntrinsicGetOne:
-            retVal = gtNewSIMDVectorOne(simdType, baseType, size);
-            break;
-
-        case SIMDIntrinsicGetAllOnes:
-        {
-            // Equivalent to (Vector<T>) new Vector<int>(0xffffffff);
-            GenTree* initVal = gtNewIconNode(0xffffffff, TYP_INT);
-            simdTree         = gtNewSIMDNode(simdType, initVal, nullptr, SIMDIntrinsicInit, TYP_INT, size);
-            if (baseType != TYP_INT)
-            {
-                // cast it to required baseType if different from TYP_INT
-                simdTree = gtNewSIMDNode(simdType, simdTree, nullptr, SIMDIntrinsicCast, baseType, size);
-            }
-            retVal = simdTree;
-        }
-        break;
-
         case SIMDIntrinsicInit:
         case SIMDIntrinsicInitN:
         {
@@ -2262,55 +2226,10 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE                opcode,
         }
         break;
 
-        case SIMDIntrinsicAdd:
         case SIMDIntrinsicSub:
-        case SIMDIntrinsicMul:
-        case SIMDIntrinsicDiv:
         case SIMDIntrinsicBitwiseAnd:
         case SIMDIntrinsicBitwiseOr:
         {
-#if defined(DEBUG)
-            // check for the cases where we don't support intrinsics.
-            // This check should be done before we make modifications to type stack.
-            // Note that this is more of a double safety check for robustness since
-            // we expect getSIMDIntrinsicInfo() to have filtered out intrinsics on
-            // unsupported base types. If getSIMdIntrinsicInfo() doesn't filter due
-            // to some bug, assert in chk/dbg will fire.
-            if (!varTypeIsFloating(baseType))
-            {
-                if (simdIntrinsicID == SIMDIntrinsicMul)
-                {
-#if defined(TARGET_XARCH)
-                    if ((baseType != TYP_INT) && (baseType != TYP_SHORT))
-                    {
-                        // TODO-CQ: implement mul on these integer vectors.
-                        // Note that SSE2 has no direct support for these vectors.
-                        assert(!"Mul not supported on long/ulong/uint/small int vectors\n");
-                        return nullptr;
-                    }
-#endif // TARGET_XARCH
-#if defined(TARGET_ARM64)
-                    if ((baseType == TYP_ULONG) && (baseType == TYP_LONG))
-                    {
-                        // TODO-CQ: implement mul on these integer vectors.
-                        // Note that ARM64 has no direct support for these vectors.
-                        assert(!"Mul not supported on long/ulong vectors\n");
-                        return nullptr;
-                    }
-#endif // TARGET_ARM64
-                }
-#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
-                // common to all integer type vectors
-                if (simdIntrinsicID == SIMDIntrinsicDiv)
-                {
-                    // SSE2 doesn't support div on non-floating point vectors.
-                    assert(!"Div not supported on integer type vectors\n");
-                    return nullptr;
-                }
-#endif // defined(TARGET_XARCH) || defined(TARGET_ARM64)
-            }
-#endif // DEBUG
-
             // op1 is the first operand; if instance method, op1 is "this" arg
             // op2 is the second operand
             op2 = impSIMDPopStack(simdType);
@@ -2362,31 +2281,6 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE                opcode,
         }
         break;
 
-        case SIMDIntrinsicDotProduct:
-        {
-#if defined(TARGET_XARCH)
-            // Right now dot product is supported only for float/double vectors and
-            // int vectors on SSE4/AVX.
-            if (!varTypeIsFloating(baseType) && !(baseType == TYP_INT && getSIMDSupportLevel() >= SIMD_SSE4_Supported))
-            {
-                return nullptr;
-            }
-#endif // TARGET_XARCH
-
-            // op1 is a SIMD variable that is the first source and also "this" arg.
-            // op2 is a SIMD variable which is the second source.
-            op2 = impSIMDPopStack(simdType);
-            op1 = impSIMDPopStack(simdType, instMethod);
-
-            simdTree = gtNewSIMDNode(baseType, op1, op2, simdIntrinsicID, baseType, size);
-            if (simdType == TYP_SIMD12)
-            {
-                simdTree->gtFlags |= GTF_SIMD12_OP;
-            }
-            retVal = simdTree;
-        }
-        break;
-
         case SIMDIntrinsicGetW:
             retVal = impSIMDGetFixed(simdType, baseType, size, 3);
             break;
diff --git a/src/coreclr/src/jit/simdashwintrinsic.cpp b/src/coreclr/src/jit/simdashwintrinsic.cpp
index 4c0af6ff950c6f..64d559e149adfb 100644
--- a/src/coreclr/src/jit/simdashwintrinsic.cpp
+++ b/src/coreclr/src/jit/simdashwintrinsic.cpp
@@ -169,10 +169,8 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic        intrinsic,
                                         CORINFO_CLASS_HANDLE  clsHnd,
                                         CORINFO_METHOD_HANDLE method,
                                         CORINFO_SIG_INFO*     sig,
-                                        bool                  mustExpand)
+                                        GenTree*              newobjThis)
 {
-    assert(!mustExpand);
-
     if (!featureSIMD)
     {
         // We can't support SIMD intrinsics if the JIT doesn't support the feature
@@ -187,7 +185,7 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic        intrinsic,
 #error Unsupported platform
 #endif // !TARGET_XARCH && !TARGET_ARM64
 
-    if (!compOpportunisticallyDependsOn(minimumIsa))
+    if (!compOpportunisticallyDependsOn(minimumIsa) || !JitConfig.EnableHWIntrinsic())
     {
         // The user disabled support for the baseline ISA so
         // don't emit any SIMD intrinsics as they all require
@@ -274,7 +272,7 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic        intrinsic,
     if (hwIntrinsic == intrinsic)
     {
         // The SIMD intrinsic requires special handling outside the normal code path
-        return impSimdAsHWIntrinsicSpecial(intrinsic, clsHnd, sig, retType, baseType, simdSize);
+        return impSimdAsHWIntrinsicSpecial(intrinsic, clsHnd, sig, retType, baseType, simdSize, newobjThis);
     }
 
     CORINFO_InstructionSet hwIntrinsicIsa = HWIntrinsicInfo::lookupIsa(hwIntrinsic);
@@ -352,7 +350,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
                                                CORINFO_SIG_INFO*    sig,
                                                var_types            retType,
                                                var_types            baseType,
-                                               unsigned             simdSize)
+                                               unsigned             simdSize,
+                                               GenTree*             newobjThis)
 {
     assert(featureSIMD);
     assert(retType != TYP_UNKNOWN);
@@ -387,28 +386,110 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
 #if defined(TARGET_XARCH)
     bool isVectorT256 = (SimdAsHWIntrinsicInfo::lookupClassId(intrinsic) == SimdAsHWIntrinsicClassId::VectorT256);
 
-    if ((baseType != TYP_FLOAT) && !compOpportunisticallyDependsOn(InstructionSet_SSE2))
-    {
-        // Vector<T>, for everything but float, requires at least SSE2
-        return nullptr;
-    }
-    else if (!compOpportunisticallyDependsOn(InstructionSet_SSE))
+    // We should have already exited early if SSE2 isn't supported
+    assert(compIsaSupportedDebugOnly(InstructionSet_SSE2));
+
+    switch (intrinsic)
     {
-        // Vector<float> requires at least SSE
-        return nullptr;
+#if defined(TARGET_X86)
+        case NI_VectorT128_CreateBroadcast:
+        case NI_VectorT256_CreateBroadcast:
+        {
+            if (varTypeIsLong(baseType))
+            {
+                // TODO-XARCH-CQ: It may be beneficial to emit the movq
+                // instruction, which takes a 64-bit memory address and
+                // works on 32-bit x86 systems.
+                return nullptr;
+            }
+            break;
+        }
+#endif // TARGET_X86
+
+        case NI_VectorT128_Dot:
+        {
+            if (!compOpportunisticallyDependsOn(InstructionSet_SSE41))
+            {
+                // We need to exit early if this is Vector<T>.Dot for int or uint and SSE41 is not supported
+                // The other types should be handled via the table driven paths
+
+                assert((baseType == TYP_INT) || (baseType == TYP_UINT));
+                return nullptr;
+            }
+            break;
+        }
+
+        default:
+        {
+            // Most intrinsics have some path that works even if only SSE2 is available
+            break;
+        }
     }
 
     // Vector<T>, when 32-bytes, requires at least AVX2
     assert(!isVectorT256 || compIsaSupportedDebugOnly(InstructionSet_AVX2));
-#endif
+#elif defined(TARGET_ARM64)
+    // We should have already exited early if AdvSimd isn't supported
+    assert(compIsaSupportedDebugOnly(InstructionSet_AdvSimd));
+#else
+#error Unsupported platform
+#endif // !TARGET_XARCH && !TARGET_ARM64
+
+    GenTree* copyBlkDst = nullptr;
+    GenTree* copyBlkSrc = nullptr;
 
     switch (numArgs)
     {
         case 0:
         {
+            assert(newobjThis == nullptr);
+
             switch (intrinsic)
             {
 #if defined(TARGET_XARCH)
+                case NI_Vector2_get_One:
+                case NI_Vector3_get_One:
+                case NI_Vector4_get_One:
+                case NI_VectorT128_get_One:
+                case NI_VectorT256_get_One:
+                {
+                    switch (baseType)
+                    {
+                        case TYP_BYTE:
+                        case TYP_UBYTE:
+                        case TYP_SHORT:
+                        case TYP_USHORT:
+                        case TYP_INT:
+                        case TYP_UINT:
+                        {
+                            op1 = gtNewIconNode(1, TYP_INT);
+                            break;
+                        }
+
+                        case TYP_LONG:
+                        case TYP_ULONG:
+                        {
+                            op1 = gtNewLconNode(1);
+                            break;
+                        }
+
+                        case TYP_FLOAT:
+                        case TYP_DOUBLE:
+                        {
+                            op1 = gtNewDconNode(1.0, baseType);
+                            break;
+                        }
+
+                        default:
+                        {
+                            unreached();
+                        }
+                    }
+
+                    return gtNewSimdCreateBroadcastNode(retType, op1, baseType, simdSize,
+                                                        /* isSimdAsHWIntrinsic */ true);
+                }
+
                 case NI_VectorT128_get_Count:
                 case NI_VectorT256_get_Count:
                 {
@@ -417,6 +498,48 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
                     return countNode;
                 }
 #elif defined(TARGET_ARM64)
+                case NI_Vector2_get_One:
+                case NI_Vector3_get_One:
+                case NI_Vector4_get_One:
+                case NI_VectorT128_get_One:
+                {
+                    switch (baseType)
+                    {
+                        case TYP_BYTE:
+                        case TYP_UBYTE:
+                        case TYP_SHORT:
+                        case TYP_USHORT:
+                        case TYP_INT:
+                        case TYP_UINT:
+                        {
+                            op1 = gtNewIconNode(1, TYP_INT);
+                            break;
+                        }
+
+                        case TYP_LONG:
+                        case TYP_ULONG:
+                        {
+                            op1 = gtNewLconNode(1);
+                            break;
+                        }
+
+                        case TYP_FLOAT:
+                        case TYP_DOUBLE:
+                        {
+                            op1 = gtNewDconNode(1.0, baseType);
+                            break;
+                        }
+
+                        default:
+                        {
+                            unreached();
+                        }
+                    }
+
+                    return gtNewSimdCreateBroadcastNode(retType, op1, baseType, simdSize,
+                                                        /* isSimdAsHWIntrinsic */ true);
+                }
+
                 case NI_VectorT128_get_Count:
                 {
                     GenTreeIntCon* countNode = gtNewIconNode(getSIMDVectorLength(simdSize, baseType), TYP_INT);
@@ -438,6 +561,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
 
         case 1:
         {
+            assert(newobjThis == nullptr);
+
             bool isOpExplicit = (intrinsic == NI_VectorT128_op_Explicit);
 
 #if defined(TARGET_XARCH)
@@ -494,7 +619,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
                         }
                         assert(bitMask != nullptr);
 
-                        bitMask = gtNewSIMDNode(retType, bitMask, SIMDIntrinsicInit, baseType, simdSize);
+                        bitMask = gtNewSimdCreateBroadcastNode(retType, bitMask, baseType, simdSize,
+                                                               /* isSimdAsHWIntrinsic */ true);
 
                         intrinsic = isVectorT256 ? NI_VectorT256_op_BitwiseAnd : NI_VectorT128_op_BitwiseAnd;
                         intrinsic = SimdAsHWIntrinsicInfo::lookupHWIntrinsic(intrinsic, baseType);
@@ -565,13 +691,27 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
 
             argType = isInstanceMethod ? simdType
                                        : JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
-            op1 = getArgForHWIntrinsic(argType, argClass, isInstanceMethod);
+            op1 = getArgForHWIntrinsic(argType, argClass, isInstanceMethod, newobjThis);
 
             assert(!SimdAsHWIntrinsicInfo::NeedsOperandsSwapped(intrinsic));
 
             switch (intrinsic)
             {
 #if defined(TARGET_XARCH)
+                case NI_Vector2_CreateBroadcast:
+                case NI_Vector3_CreateBroadcast:
+                case NI_Vector4_CreateBroadcast:
+                case NI_VectorT128_CreateBroadcast:
+                case NI_VectorT256_CreateBroadcast:
+                {
+                    assert(retType == TYP_VOID);
+
+                    copyBlkDst = op1;
+                    copyBlkSrc =
+                        gtNewSimdCreateBroadcastNode(simdType, op2, baseType, simdSize, /* isSimdAsHWIntrinsic */ true);
+                    break;
+                }
+
                 case NI_Vector2_op_Division:
                 case NI_Vector3_op_Division:
                 {
@@ -598,6 +738,13 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
                     return retNode;
                 }
 
+                case NI_VectorT128_Dot:
+                {
+                    assert((baseType == TYP_INT) || (baseType == TYP_UINT));
+                    assert(compIsaSupportedDebugOnly(InstructionSet_SSE41));
+                    return gtNewSimdAsHWIntrinsicNode(retType, op1, op2, NI_Vector128_Dot, baseType, simdSize);
+                }
+
                 case NI_VectorT128_Equals:
                 case NI_VectorT128_GreaterThan:
                 case NI_VectorT128_GreaterThanOrEqual:
@@ -648,8 +795,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
                             }
                         }
 
-                        GenTree* constVector =
-                            gtNewSIMDNode(retType, constVal, nullptr, SIMDIntrinsicInit, TYP_INT, simdSize);
+                        GenTree* constVector = gtNewSimdCreateBroadcastNode(retType, constVal, TYP_INT, simdSize,
+                                                                            /* isSimdAsHWIntrinsic */ true);
 
                         GenTree* constVectorDup1;
                         constVector = impCloneExpr(constVector, &constVectorDup1, clsHnd, (unsigned)CHECK_SPILL_ALL,
@@ -766,6 +913,19 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
                     return gtNewSimdAsHWIntrinsicNode(retType, op1, op2, hwIntrinsic, baseType, simdSize);
                 }
 #elif defined(TARGET_ARM64)
+                case NI_Vector2_CreateBroadcast:
+                case NI_Vector3_CreateBroadcast:
+                case NI_Vector4_CreateBroadcast:
+                case NI_VectorT128_CreateBroadcast:
+                {
+                    assert(retType == TYP_VOID);
+
+                    copyBlkDst = op1;
+                    copyBlkSrc =
+                        gtNewSimdCreateBroadcastNode(simdType, op2, baseType, simdSize, /* isSimdAsHWIntrinsic */ true);
+                    break;
+                }
+
                 case NI_VectorT128_Max:
                 case NI_VectorT128_Min:
                 {
@@ -808,6 +968,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
 
         case 3:
         {
+            assert(newobjThis == nullptr);
+
             CORINFO_ARG_LIST_HANDLE arg2 = isInstanceMethod ? argList : info.compCompHnd->getArgNext(argList);
             CORINFO_ARG_LIST_HANDLE arg3 = info.compCompHnd->getArgNext(arg2);
 
@@ -819,7 +981,7 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
 
             argType = isInstanceMethod ? simdType
                                        : JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
-            op1 = getArgForHWIntrinsic(argType, argClass, isInstanceMethod);
+            op1 = getArgForHWIntrinsic(argType, argClass, isInstanceMethod, newobjThis);
 
             assert(!SimdAsHWIntrinsicInfo::NeedsOperandsSwapped(intrinsic));
 
@@ -850,6 +1012,27 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic       intrinsic,
         }
     }
 
+    if (copyBlkDst != nullptr)
+    {
+        assert(copyBlkSrc != nullptr);
+
+        // At this point, we have a tree that we are going to store into a destination.
+        // TODO-1stClassStructs: This should be a simple store or assignment, and should not require
+        // GTF_ALL_EFFECT for the dest. This is currently emulating the previous behavior of
+        // block ops.
+
+        GenTree* dest = gtNewBlockVal(copyBlkDst, simdSize);
+
+        dest->gtType = simdType;
+        dest->gtFlags |= GTF_GLOB_REF;
+
+        GenTree* retNode = gtNewBlkOpNode(dest, copyBlkSrc, /* isVolatile */ false, /* isCopyBlock */ true);
+        retNode->gtFlags |= ((copyBlkDst->gtFlags | copyBlkSrc->gtFlags) & GTF_ALL_EFFECT);
+
+        return retNode;
+    }
+    assert(copyBlkSrc == nullptr);
+
     assert(!"Unexpected SimdAsHWIntrinsic");
     return nullptr;
 }
@@ -1155,8 +1338,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicRelOp(NamedIntrinsic       intrinsic,
                     }
                 }
 
-                GenTree* constVector =
-                    gtNewSIMDNode(retType, constVal, nullptr, SIMDIntrinsicInit, constVal->TypeGet(), simdSize);
+                GenTree* constVector = gtNewSimdCreateBroadcastNode(retType, constVal, constVal->TypeGet(), simdSize,
+                                                                    /* isSimdAsHWIntrinsic */ true);
 
                 GenTree* constVectorDup;
                 constVector = impCloneExpr(constVector, &constVectorDup, clsHnd, (unsigned)CHECK_SPILL_ALL,
diff --git a/src/coreclr/src/jit/simdashwintrinsiclistarm64.h b/src/coreclr/src/jit/simdashwintrinsiclistarm64.h
index ba23bcd193469b..9022642fa6bffc 100644
--- a/src/coreclr/src/jit/simdashwintrinsiclistarm64.h
+++ b/src/coreclr/src/jit/simdashwintrinsiclistarm64.h
@@ -39,7 +39,10 @@
 // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  Vector2 Intrinsics
 SIMD_AS_HWINTRINSIC_ID(Vector2,     Abs,                                                    1,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Abs,                                 NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_NM(Vector2,     CreateBroadcast,            ".ctor",                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector2_CreateBroadcast,                     NI_Illegal},                                    SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector2,     Dot,                                                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector64_Dot,                                NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_NM(Vector2,     EqualsInstance,             "Equals",                   2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector64_op_Equality,                        NI_Illegal},                                    SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector2,     get_One,                                                0,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector2_get_One,                             NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector2,     get_Zero,                                               0,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector64_get_Zero,                           NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector2,     Max,                                                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Max,                                 NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector2,     Min,                                                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Min,                                 NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
@@ -57,7 +60,10 @@ SIMD_AS_HWINTRINSIC_ID(Vector2,     SquareRoot,
 // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  Vector3 Intrinsics
 SIMD_AS_HWINTRINSIC_ID(Vector3,     Abs,                                                    1,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Abs,                                 NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_NM(Vector3,     CreateBroadcast,            ".ctor",                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector3_CreateBroadcast,                     NI_Illegal},                                    SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector3,     Dot,                                                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector128_Dot,                               NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_NM(Vector3,     EqualsInstance,             "Equals",                   2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector128_op_Equality,                       NI_Illegal},                                    SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector3,     get_One,                                                0,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector3_get_One,                             NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector3,     get_Zero,                                               0,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector128_get_Zero,                          NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector3,     Max,                                                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Max,                                 NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector3,     Min,                                                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Min,                                 NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
@@ -75,7 +81,10 @@ SIMD_AS_HWINTRINSIC_ID(Vector3,     SquareRoot,
 // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  Vector4 Intrinsics
 SIMD_AS_HWINTRINSIC_ID(Vector4,     Abs,                                                    1,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Abs,                                 NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_NM(Vector4,     CreateBroadcast,            ".ctor",                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector4_CreateBroadcast,                     NI_Illegal},                                    SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector4,     Dot,                                                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector128_Dot,                               NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_NM(Vector4,     EqualsInstance,             "Equals",                   2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector128_op_Equality,                       NI_Illegal},                                    SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector4,     get_One,                                                0,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector4_get_One,                             NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector4,     get_Zero,                                               0,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Vector128_get_Zero,                          NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector4,     Max,                                                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Max,                                 NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector4,     Min,                                                    2,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Min,                                 NI_Illegal},                                    SimdAsHWIntrinsicFlag::None)
@@ -96,11 +105,14 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128,  Abs,
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  AndNot,                                                 2,         {NI_AdvSimd_BitwiseClear,                       NI_AdvSimd_BitwiseClear,                        NI_AdvSimd_BitwiseClear,                        NI_AdvSimd_BitwiseClear,                        NI_AdvSimd_BitwiseClear,                        NI_AdvSimd_BitwiseClear,                        NI_AdvSimd_BitwiseClear,                        NI_AdvSimd_BitwiseClear,                        NI_AdvSimd_BitwiseClear,                        NI_AdvSimd_BitwiseClear},                       SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  Ceiling,                                                1,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Ceiling,                             NI_AdvSimd_Arm64_Ceiling},                      SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  ConditionalSelect,                                      3,         {NI_VectorT128_ConditionalSelect,               NI_VectorT128_ConditionalSelect,                NI_VectorT128_ConditionalSelect,                NI_VectorT128_ConditionalSelect,                NI_VectorT128_ConditionalSelect,                NI_VectorT128_ConditionalSelect,                NI_VectorT128_ConditionalSelect,                NI_VectorT128_ConditionalSelect,                NI_VectorT128_ConditionalSelect,                NI_VectorT128_ConditionalSelect},               SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_NM(VectorT128,  CreateBroadcast,            ".ctor",                    2,         {NI_VectorT128_CreateBroadcast,                 NI_VectorT128_CreateBroadcast,                  NI_VectorT128_CreateBroadcast,                  NI_VectorT128_CreateBroadcast,                  NI_VectorT128_CreateBroadcast,                  NI_VectorT128_CreateBroadcast,                  NI_VectorT128_CreateBroadcast,                  NI_VectorT128_CreateBroadcast,                  NI_VectorT128_CreateBroadcast,                  NI_VectorT128_CreateBroadcast},                 SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(VectorT128,  Dot,                                                    2,         {NI_Vector128_Dot,                              NI_Vector128_Dot,                               NI_Vector128_Dot,                               NI_Vector128_Dot,                               NI_Vector128_Dot,                               NI_Vector128_Dot,                               NI_Illegal,                                     NI_Illegal,                                     NI_Vector128_Dot,                               NI_Vector128_Dot},                              SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  Equals,                                                 2,         {NI_AdvSimd_CompareEqual,                       NI_AdvSimd_CompareEqual,                        NI_AdvSimd_CompareEqual,                        NI_AdvSimd_CompareEqual,                        NI_AdvSimd_CompareEqual,                        NI_AdvSimd_CompareEqual,                        NI_AdvSimd_Arm64_CompareEqual,                  NI_AdvSimd_Arm64_CompareEqual,                  NI_AdvSimd_CompareEqual,                        NI_AdvSimd_Arm64_CompareEqual},                 SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_NM(VectorT128,  EqualsInstance,             "Equals",                   2,         {NI_Vector128_op_Equality,                      NI_Vector128_op_Equality,                       NI_Vector128_op_Equality,                       NI_Vector128_op_Equality,                       NI_Vector128_op_Equality,                       NI_Vector128_op_Equality,                       NI_Vector128_op_Equality,                       NI_Vector128_op_Equality,                       NI_Vector128_op_Equality,                       NI_Vector128_op_Equality},                      SimdAsHWIntrinsicFlag::InstanceMethod)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  Floor,                                                  1,         {NI_Illegal,                                    NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_Illegal,                                     NI_AdvSimd_Floor,                               NI_AdvSimd_Arm64_Floor},                        SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  get_AllBitsSet,                                         0,         {NI_Vector128_get_AllBitsSet,                   NI_Vector128_get_AllBitsSet,                    NI_Vector128_get_AllBitsSet,                    NI_Vector128_get_AllBitsSet,                    NI_Vector128_get_AllBitsSet,                    NI_Vector128_get_AllBitsSet,                    NI_Vector128_get_AllBitsSet,                    NI_Vector128_get_AllBitsSet,                    NI_Vector128_get_AllBitsSet,                    NI_Vector128_get_AllBitsSet},                   SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  get_Count,                                              0,         {NI_VectorT128_get_Count,                       NI_VectorT128_get_Count,                        NI_VectorT128_get_Count,                        NI_VectorT128_get_Count,                        NI_VectorT128_get_Count,                        NI_VectorT128_get_Count,                        NI_VectorT128_get_Count,                        NI_VectorT128_get_Count,                        NI_VectorT128_get_Count,                        NI_VectorT128_get_Count},                       SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT128,  get_One,                                                0,         {NI_VectorT128_get_One,                         NI_VectorT128_get_One,                          NI_VectorT128_get_One,                          NI_VectorT128_get_One,                          NI_VectorT128_get_One,                          NI_VectorT128_get_One,                          NI_VectorT128_get_One,                          NI_VectorT128_get_One,                          NI_VectorT128_get_One,                          NI_VectorT128_get_One},                         SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  get_Zero,                                               0,         {NI_Vector128_get_Zero,                         NI_Vector128_get_Zero,                          NI_Vector128_get_Zero,                          NI_Vector128_get_Zero,                          NI_Vector128_get_Zero,                          NI_Vector128_get_Zero,                          NI_Vector128_get_Zero,                          NI_Vector128_get_Zero,                          NI_Vector128_get_Zero,                          NI_Vector128_get_Zero},                         SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  GreaterThan,                                            2,         {NI_AdvSimd_CompareGreaterThan,                 NI_AdvSimd_CompareGreaterThan,                  NI_AdvSimd_CompareGreaterThan,                  NI_AdvSimd_CompareGreaterThan,                  NI_AdvSimd_CompareGreaterThan,                  NI_AdvSimd_CompareGreaterThan,                  NI_AdvSimd_Arm64_CompareGreaterThan,            NI_AdvSimd_Arm64_CompareGreaterThan,            NI_AdvSimd_CompareGreaterThan,                  NI_AdvSimd_Arm64_CompareGreaterThan},           SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  GreaterThanOrEqual,                                     2,         {NI_AdvSimd_CompareGreaterThanOrEqual,          NI_AdvSimd_CompareGreaterThanOrEqual,           NI_AdvSimd_CompareGreaterThanOrEqual,           NI_AdvSimd_CompareGreaterThanOrEqual,           NI_AdvSimd_CompareGreaterThanOrEqual,           NI_AdvSimd_CompareGreaterThanOrEqual,           NI_AdvSimd_Arm64_CompareGreaterThanOrEqual,     NI_AdvSimd_Arm64_CompareGreaterThanOrEqual,     NI_AdvSimd_CompareGreaterThanOrEqual,           NI_AdvSimd_Arm64_CompareGreaterThanOrEqual},    SimdAsHWIntrinsicFlag::None)
diff --git a/src/coreclr/src/jit/simdashwintrinsiclistxarch.h b/src/coreclr/src/jit/simdashwintrinsiclistxarch.h
index d13153db4aad7b..1e77eb1804bda5 100644
--- a/src/coreclr/src/jit/simdashwintrinsiclistxarch.h
+++ b/src/coreclr/src/jit/simdashwintrinsiclistxarch.h
@@ -35,11 +35,14 @@
 
 // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //                     ISA          ID                          Name                        NumArg                                                                                                                                                                                                      Instructions                                                                                                                                                                                                                                           Flags
-//                                                                                                              {TYP_BYTE,                                  TYP_UBYTE,                                  TYP_SHORT,                                  TYP_USHORT,                                 TYP_INT,                                    TYP_UINT,                                   TYP_LONG,                                   TYP_ULONG,                                  TYP_FLOAT,                                  TYP_DOUBLE}
+//                                                                                                     {TYP_BYTE,                                  TYP_UBYTE,                                  TYP_SHORT,                                  TYP_USHORT,                                 TYP_INT,                                    TYP_UINT,                                   TYP_LONG,                                   TYP_ULONG,                                  TYP_FLOAT,                                  TYP_DOUBLE}
 // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  Vector2 Intrinsics
 SIMD_AS_HWINTRINSIC_ID(Vector2,     Abs,                                                    1,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector2_Abs,                             NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_NM(Vector2,     CreateBroadcast,            ".ctor",                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector2_CreateBroadcast,                 NI_Illegal},                                SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector2,     Dot,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_Dot,                           NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_NM(Vector2,     EqualsInstance,             "Equals",                   2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_op_Equality,                   NI_Illegal},                                SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector2,     get_One,                                                0,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector2_get_One,                         NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector2,     get_Zero,                                               0,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_get_Zero,                      NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector2,     Max,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_SSE_Max,                                 NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector2,     Min,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_SSE_Min,                                 NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
@@ -57,7 +60,10 @@ SIMD_AS_HWINTRINSIC_ID(Vector2,     SquareRoot,
 // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  Vector3 Intrinsics
 SIMD_AS_HWINTRINSIC_ID(Vector3,     Abs,                                                    1,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector3_Abs,                             NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_NM(Vector3,     CreateBroadcast,            ".ctor",                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector3_CreateBroadcast,                 NI_Illegal},                                SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector3,     Dot,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_Dot,                           NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_NM(Vector3,     EqualsInstance,             "Equals",                   2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_op_Equality,                   NI_Illegal},                                SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector3,     get_One,                                                0,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector3_get_One,                         NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector3,     get_Zero,                                               0,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_get_Zero,                      NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector3,     Max,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_SSE_Max,                                 NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector3,     Min,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_SSE_Min,                                 NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
@@ -75,7 +81,10 @@ SIMD_AS_HWINTRINSIC_ID(Vector3,     SquareRoot,
 // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  Vector4 Intrinsics
 SIMD_AS_HWINTRINSIC_ID(Vector4,     Abs,                                                    1,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector4_Abs,                             NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_NM(Vector4,     CreateBroadcast,            ".ctor",                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector4_CreateBroadcast,                 NI_Illegal},                                SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector4,     Dot,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_Dot,                           NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_NM(Vector4,     EqualsInstance,             "Equals",                   2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_op_Equality,                   NI_Illegal},                                SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(Vector4,     get_One,                                                0,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector4_get_One,                         NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector4,     get_Zero,                                               0,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_get_Zero,                      NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector4,     Max,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_SSE_Max,                                 NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(Vector4,     Min,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_SSE_Min,                                 NI_Illegal},                                SimdAsHWIntrinsicFlag::None)
@@ -96,11 +105,14 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128,  Abs,
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  AndNot,                                                 2,         {NI_SSE2_AndNot,                            NI_SSE2_AndNot,                             NI_SSE2_AndNot,                             NI_SSE2_AndNot,                             NI_SSE2_AndNot,                             NI_SSE2_AndNot,                             NI_SSE2_AndNot,                             NI_SSE2_AndNot,                             NI_SSE_AndNot,                              NI_SSE2_AndNot},                            SimdAsHWIntrinsicFlag::NeedsOperandsSwapped)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  Ceiling,                                                1,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_SSE41_Ceiling,                           NI_SSE41_Ceiling},                          SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  ConditionalSelect,                                      3,         {NI_VectorT128_ConditionalSelect,           NI_VectorT128_ConditionalSelect,            NI_VectorT128_ConditionalSelect,            NI_VectorT128_ConditionalSelect,            NI_VectorT128_ConditionalSelect,            NI_VectorT128_ConditionalSelect,            NI_VectorT128_ConditionalSelect,            NI_VectorT128_ConditionalSelect,            NI_VectorT128_ConditionalSelect,            NI_VectorT128_ConditionalSelect},           SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_NM(VectorT128,  CreateBroadcast,            ".ctor",                    2,         {NI_VectorT128_CreateBroadcast,             NI_VectorT128_CreateBroadcast,              NI_VectorT128_CreateBroadcast,              NI_VectorT128_CreateBroadcast,              NI_VectorT128_CreateBroadcast,              NI_VectorT128_CreateBroadcast,              NI_VectorT128_CreateBroadcast,              NI_VectorT128_CreateBroadcast,              NI_VectorT128_CreateBroadcast,              NI_VectorT128_CreateBroadcast},             SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(VectorT128,  Dot,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Vector128_Dot,                           NI_Vector128_Dot,                           NI_VectorT128_Dot,                          NI_VectorT128_Dot,                          NI_Illegal,                                 NI_Illegal,                                 NI_Vector128_Dot,                           NI_Vector128_Dot},                          SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  Equals,                                                 2,         {NI_SSE2_CompareEqual,                      NI_SSE2_CompareEqual,                       NI_SSE2_CompareEqual,                       NI_SSE2_CompareEqual,                       NI_SSE2_CompareEqual,                       NI_SSE2_CompareEqual,                       NI_VectorT128_Equals,                       NI_VectorT128_Equals,                       NI_SSE_CompareEqual,                        NI_SSE2_CompareEqual},                      SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_NM(VectorT128,  EqualsInstance,             "Equals",                   2,         {NI_Vector128_op_Equality,                  NI_Vector128_op_Equality,                   NI_Vector128_op_Equality,                   NI_Vector128_op_Equality,                   NI_Vector128_op_Equality,                   NI_Vector128_op_Equality,                   NI_Vector128_op_Equality,                   NI_Vector128_op_Equality,                   NI_Vector128_op_Equality,                   NI_Vector128_op_Equality},                  SimdAsHWIntrinsicFlag::InstanceMethod)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  Floor,                                                  1,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_SSE41_Floor,                             NI_SSE41_Floor},                            SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  get_AllBitsSet,                                         0,         {NI_Vector128_get_AllBitsSet,               NI_Vector128_get_AllBitsSet,                NI_Vector128_get_AllBitsSet,                NI_Vector128_get_AllBitsSet,                NI_Vector128_get_AllBitsSet,                NI_Vector128_get_AllBitsSet,                NI_Vector128_get_AllBitsSet,                NI_Vector128_get_AllBitsSet,                NI_Vector128_get_AllBitsSet,                NI_Vector128_get_AllBitsSet},               SimdAsHWIntrinsicFlag::None)
-SIMD_AS_HWINTRINSIC_ID(VectorT128,  get_Count,                                              0,         {NI_VectorT128_get_Count,                   NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count},                    SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT128,  get_Count,                                              0,         {NI_VectorT128_get_Count,                   NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count,                    NI_VectorT128_get_Count},                   SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT128,  get_One,                                                0,         {NI_VectorT128_get_One,                     NI_VectorT128_get_One,                      NI_VectorT128_get_One,                      NI_VectorT128_get_One,                      NI_VectorT128_get_One,                      NI_VectorT128_get_One,                      NI_VectorT128_get_One,                      NI_VectorT128_get_One,                      NI_VectorT128_get_One,                      NI_VectorT128_get_One},                     SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  get_Zero,                                               0,         {NI_Vector128_get_Zero,                     NI_Vector128_get_Zero,                      NI_Vector128_get_Zero,                      NI_Vector128_get_Zero,                      NI_Vector128_get_Zero,                      NI_Vector128_get_Zero,                      NI_Vector128_get_Zero,                      NI_Vector128_get_Zero,                      NI_Vector128_get_Zero,                      NI_Vector128_get_Zero},                     SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  GreaterThan,                                            2,         {NI_SSE2_CompareGreaterThan,                NI_VectorT128_GreaterThan,                  NI_SSE2_CompareGreaterThan,                 NI_VectorT128_GreaterThan,                  NI_SSE2_CompareGreaterThan,                 NI_VectorT128_GreaterThan,                  NI_VectorT128_GreaterThan,                  NI_VectorT128_GreaterThan,                  NI_SSE_CompareGreaterThan,                  NI_SSE2_CompareGreaterThan},                SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128,  GreaterThanOrEqual,                                     2,         {NI_VectorT128_GreaterThanOrEqual,          NI_VectorT128_GreaterThanOrEqual,           NI_VectorT128_GreaterThanOrEqual,           NI_VectorT128_GreaterThanOrEqual,           NI_VectorT128_GreaterThanOrEqual,           NI_VectorT128_GreaterThanOrEqual,           NI_VectorT128_GreaterThanOrEqual,           NI_VectorT128_GreaterThanOrEqual,           NI_SSE_CompareGreaterThanOrEqual,           NI_SSE2_CompareGreaterThanOrEqual},         SimdAsHWIntrinsicFlag::None)
@@ -129,11 +141,14 @@ SIMD_AS_HWINTRINSIC_ID(VectorT256,  Abs,
 SIMD_AS_HWINTRINSIC_ID(VectorT256,  AndNot,                                                 2,         {NI_AVX2_AndNot,                            NI_AVX2_AndNot,                             NI_AVX2_AndNot,                             NI_AVX2_AndNot,                             NI_AVX2_AndNot,                             NI_AVX2_AndNot,                             NI_AVX2_AndNot,                             NI_AVX2_AndNot,                             NI_AVX_AndNot,                              NI_AVX_AndNot},                             SimdAsHWIntrinsicFlag::NeedsOperandsSwapped)
 SIMD_AS_HWINTRINSIC_ID(VectorT256,  Ceiling,                                                1,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_AVX_Ceiling,                             NI_AVX_Ceiling},                            SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256,  ConditionalSelect,                                      3,         {NI_VectorT256_ConditionalSelect,           NI_VectorT256_ConditionalSelect,            NI_VectorT256_ConditionalSelect,            NI_VectorT256_ConditionalSelect,            NI_VectorT256_ConditionalSelect,            NI_VectorT256_ConditionalSelect,            NI_VectorT256_ConditionalSelect,            NI_VectorT256_ConditionalSelect,            NI_VectorT256_ConditionalSelect,            NI_VectorT256_ConditionalSelect},           SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_NM(VectorT256,  CreateBroadcast,            ".ctor",                    2,         {NI_VectorT256_CreateBroadcast,             NI_VectorT256_CreateBroadcast,              NI_VectorT256_CreateBroadcast,              NI_VectorT256_CreateBroadcast,              NI_VectorT256_CreateBroadcast,              NI_VectorT256_CreateBroadcast,              NI_VectorT256_CreateBroadcast,              NI_VectorT256_CreateBroadcast,              NI_VectorT256_CreateBroadcast,              NI_VectorT256_CreateBroadcast},             SimdAsHWIntrinsicFlag::InstanceMethod)
+SIMD_AS_HWINTRINSIC_ID(VectorT256,  Dot,                                                    2,         {NI_Illegal,                                NI_Illegal,                                 NI_Vector256_Dot,                           NI_Vector256_Dot,                           NI_Vector256_Dot,                           NI_Vector256_Dot,                           NI_Illegal,                                 NI_Illegal,                                 NI_Vector256_Dot,                           NI_Vector256_Dot},                          SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256,  Equals,                                                 2,         {NI_AVX2_CompareEqual,                      NI_AVX2_CompareEqual,                       NI_AVX2_CompareEqual,                       NI_AVX2_CompareEqual,                       NI_AVX2_CompareEqual,                       NI_AVX2_CompareEqual,                       NI_AVX2_CompareEqual,                       NI_AVX2_CompareEqual,                       NI_AVX_CompareEqual,                        NI_AVX_CompareEqual},                       SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_NM(VectorT256,  EqualsInstance,             "Equals",                   2,         {NI_Vector256_op_Equality,                  NI_Vector256_op_Equality,                   NI_Vector256_op_Equality,                   NI_Vector256_op_Equality,                   NI_Vector256_op_Equality,                   NI_Vector256_op_Equality,                   NI_Vector256_op_Equality,                   NI_Vector256_op_Equality,                   NI_Vector256_op_Equality,                   NI_Vector256_op_Equality},                  SimdAsHWIntrinsicFlag::InstanceMethod)
 SIMD_AS_HWINTRINSIC_ID(VectorT256,  Floor,                                                  1,         {NI_Illegal,                                NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_Illegal,                                 NI_AVX_Floor,                               NI_AVX_Floor},                              SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256,  get_AllBitsSet,                                         0,         {NI_Vector256_get_AllBitsSet,               NI_Vector256_get_AllBitsSet,                NI_Vector256_get_AllBitsSet,                NI_Vector256_get_AllBitsSet,                NI_Vector256_get_AllBitsSet,                NI_Vector256_get_AllBitsSet,                NI_Vector256_get_AllBitsSet,                NI_Vector256_get_AllBitsSet,                NI_Vector256_get_AllBitsSet,                NI_Vector256_get_AllBitsSet},               SimdAsHWIntrinsicFlag::None)
-SIMD_AS_HWINTRINSIC_ID(VectorT256,  get_Count,                                              0,         {NI_VectorT256_get_Count,                   NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count},                    SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT256,  get_Count,                                              0,         {NI_VectorT256_get_Count,                   NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count,                    NI_VectorT256_get_Count},                   SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT256,  get_One,                                                0,         {NI_VectorT256_get_One,                     NI_VectorT256_get_One,                      NI_VectorT256_get_One,                      NI_VectorT256_get_One,                      NI_VectorT256_get_One,                      NI_VectorT256_get_One,                      NI_VectorT256_get_One,                      NI_VectorT256_get_One,                      NI_VectorT256_get_One,                      NI_VectorT256_get_One},                     SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256,  get_Zero,                                               0,         {NI_Vector256_get_Zero,                     NI_Vector256_get_Zero,                      NI_Vector256_get_Zero,                      NI_Vector256_get_Zero,                      NI_Vector256_get_Zero,                      NI_Vector256_get_Zero,                      NI_Vector256_get_Zero,                      NI_Vector256_get_Zero,                      NI_Vector256_get_Zero,                      NI_Vector256_get_Zero},                     SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256,  GreaterThan,                                            2,         {NI_AVX2_CompareGreaterThan,                NI_VectorT256_GreaterThan,                  NI_AVX2_CompareGreaterThan,                 NI_VectorT256_GreaterThan,                  NI_AVX2_CompareGreaterThan,                 NI_VectorT256_GreaterThan,                  NI_AVX2_CompareGreaterThan,                 NI_VectorT256_GreaterThan,                  NI_AVX_CompareGreaterThan,                  NI_AVX_CompareGreaterThan},                 SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256,  GreaterThanOrEqual,                                     2,         {NI_VectorT256_GreaterThanOrEqual,          NI_VectorT256_GreaterThanOrEqual,           NI_VectorT256_GreaterThanOrEqual,           NI_VectorT256_GreaterThanOrEqual,           NI_VectorT256_GreaterThanOrEqual,           NI_VectorT256_GreaterThanOrEqual,           NI_VectorT256_GreaterThanOrEqual,           NI_VectorT256_GreaterThanOrEqual,           NI_AVX_CompareGreaterThanOrEqual,           NI_AVX_CompareGreaterThanOrEqual},          SimdAsHWIntrinsicFlag::None)
diff --git a/src/coreclr/src/jit/simdcodegenxarch.cpp b/src/coreclr/src/jit/simdcodegenxarch.cpp
index 5147d3d912cfc0..16aee6d5849ad4 100644
--- a/src/coreclr/src/jit/simdcodegenxarch.cpp
+++ b/src/coreclr/src/jit/simdcodegenxarch.cpp
@@ -130,33 +130,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
             }
             break;
 
-        case SIMDIntrinsicAdd:
-            if (baseType == TYP_FLOAT)
-            {
-                result = INS_addps;
-            }
-            else if (baseType == TYP_DOUBLE)
-            {
-                result = INS_addpd;
-            }
-            else if (baseType == TYP_INT || baseType == TYP_UINT)
-            {
-                result = INS_paddd;
-            }
-            else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
-            {
-                result = INS_paddw;
-            }
-            else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
-            {
-                result = INS_paddb;
-            }
-            else if (baseType == TYP_LONG || baseType == TYP_ULONG)
-            {
-                result = INS_paddq;
-            }
-            break;
-
         case SIMDIntrinsicSub:
             if (baseType == TYP_FLOAT)
             {
@@ -184,40 +157,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
             }
             break;
 
-        case SIMDIntrinsicMul:
-            if (baseType == TYP_FLOAT)
-            {
-                result = INS_mulps;
-            }
-            else if (baseType == TYP_DOUBLE)
-            {
-                result = INS_mulpd;
-            }
-            else if (baseType == TYP_SHORT)
-            {
-                result = INS_pmullw;
-            }
-            else if ((baseType == TYP_INT) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
-            {
-                result = INS_pmulld;
-            }
-            break;
-
-        case SIMDIntrinsicDiv:
-            if (baseType == TYP_FLOAT)
-            {
-                result = INS_divps;
-            }
-            else if (baseType == TYP_DOUBLE)
-            {
-                result = INS_divpd;
-            }
-            else
-            {
-                unreached();
-            }
-            break;
-
         case SIMDIntrinsicEqual:
             if (baseType == TYP_FLOAT)
             {
@@ -1556,9 +1495,7 @@ void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode)
 //
 void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
 {
-    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub ||
-           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv ||
-           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd ||
+    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd ||
            simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr);
 
     GenTree*  op1       = simdNode->gtGetOp1();
@@ -1574,156 +1511,27 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
     regNumber op2Reg   = op2->GetRegNum();
     regNumber otherReg = op2Reg;
 
-    // Vector<Int>.Mul:
-    // SSE2 doesn't have an instruction to perform this operation directly
-    // whereas SSE4.1 does (pmulld).  This is special cased and computed
-    // as follows.
-    if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul && baseType == TYP_INT && level == SIMD_SSE2_Supported)
-    {
-        // We need a temporary register that is NOT the same as the target,
-        // and we MAY need another.
-        regNumber tmpReg  = simdNode->ExtractTempReg();
-        regNumber tmpReg2 = simdNode->GetSingleTempReg();
-
-        // The register allocator guarantees the following conditions:
-        // - the only registers that may be the same among op1Reg, op2Reg, tmpReg
-        //   and tmpReg2 are op1Reg and op2Reg.
-        // Let's be extra-careful and assert that now.
-        assert((op1Reg != tmpReg) && (op1Reg != tmpReg2) && (op2Reg != tmpReg) && (op2Reg != tmpReg2) &&
-               (tmpReg != tmpReg2));
-
-        // We will start by setting things up so that:
-        //    - We have op1 in op1Reg and targetReg, and they are different registers.
-        //    - We have op2 in op2Reg and tmpReg
-        //    - Either we will leave the input registers (the original op1Reg and op2Reg) unmodified,
-        //      OR they are the targetReg that will be produced.
-        //      (Note that in the code we generate below op1Reg and op2Reg are never written.)
-        // We will copy things as necessary to ensure that this is the case.
-        // Note that we can swap op1 and op2, since multiplication is commutative.
-        // We will not modify the values in op1Reg and op2Reg.
-        // (Though note that if either op1 or op2 is the same as targetReg, we will make
-        // a copy and use that copy as the input register.  In that case we WILL modify
-        // the original value in the register, but will wind up with the result in targetReg
-        // in the end, as expected.)
-
-        // First, we need a tmpReg that is NOT the same as targetReg.
-        // Note that if we have another reg that is the same as targetReg,
-        // we can use tmpReg2 for that case, as we will not have hit this case.
-        if (tmpReg == targetReg)
-        {
-            tmpReg = tmpReg2;
-        }
+    instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
 
-        if (op2Reg == targetReg)
-        {
-            // We will swap the operands.
-            // Since the code below only deals with registers, this now becomes the case where
-            // op1Reg == targetReg.
-            op2Reg = op1Reg;
-            op1Reg = targetReg;
-        }
-        if (op1Reg == targetReg)
-        {
-            // Copy op1, and make tmpReg2 the new op1Reg.
-            // Note that those regs can't be the same, as we asserted above.
-            // Also, we know that tmpReg2 hasn't been used, because we couldn't have hit
-            // the "tmpReg == targetReg" case.
-            inst_RV_RV(INS_movaps, tmpReg2, op1Reg, targetType, emitActualTypeSize(targetType));
-            op1Reg = tmpReg2;
-            inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
-            // However, we have one more case to worry about: what if op2Reg is also targetReg
-            // (i.e. we have the same operand as op1 and op2)?
-            // In that case we will set op2Reg to the same register as op1Reg.
-            if (op2Reg == targetReg)
-            {
-                op2Reg = tmpReg2;
-            }
-        }
-        else
-        {
-            // Copy op1 to targetReg and op2 to tmpReg.
-            inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
-            inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
-        }
-        // Let's assert that things are as we expect.
-        //    - We have op1 in op1Reg and targetReg, and they are different registers.
-        assert(op1Reg != targetReg);
-        //    - We have op2 in op2Reg and tmpReg, and they are different registers.
-        assert(op2Reg != tmpReg);
-        //    - Either we are going to leave op1's reg unmodified, or it is the targetReg.
-        assert((op1->GetRegNum() == op1Reg) || (op1->GetRegNum() == op2Reg) || (op1->GetRegNum() == targetReg));
-        //    - Similarly, we are going to leave op2's reg unmodified, or it is the targetReg.
-        assert((op2->GetRegNum() == op1Reg) || (op2->GetRegNum() == op2Reg) || (op2->GetRegNum() == targetReg));
-
-        // Now we can generate the code.
-
-        // targetReg = op1 >> 4-bytes (op1 is already in targetReg)
-        GetEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), targetReg, 4);
-
-        // tmpReg  = op2 >> 4-bytes (op2 is already in tmpReg)
-        GetEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg, 4);
-
-        // tmp = unsigned double word multiply of targetReg and tmpReg. Essentially
-        // tmpReg[63:0] = op1[1] * op2[1]
-        // tmpReg[127:64] = op1[3] * op2[3]
-        inst_RV_RV(INS_pmuludq, tmpReg, targetReg, targetType, emitActualTypeSize(targetType));
-
-        // Extract first and third double word results from tmpReg
-        // tmpReg = shuffle(0,0,2,0) of tmpReg
-        GetEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, (int8_t)SHUFFLE_XXZX);
-
-        // targetReg[63:0] = op1[0] * op2[0]
-        // targetReg[127:64] = op1[2] * op2[2]
-        inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
-        inst_RV_RV(INS_pmuludq, targetReg, op2Reg, targetType, emitActualTypeSize(targetType));
-
-        // Extract first and third double word results from targetReg
-        // targetReg = shuffle(0,0,2,0) of targetReg
-        GetEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg,
-                                    (int8_t)SHUFFLE_XXZX);
-
-        // pack the results into a single vector
-        inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+    // Currently AVX doesn't support integer.
+    // If the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use the three-operand AVX form.
+    if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
+        !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && GetEmitter()->IsThreeOperandAVXInstruction(ins))
+    {
+        inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
     }
     else
     {
-        instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
-
-        // Currently AVX doesn't support integer.
-        // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
-        if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
-            !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && GetEmitter()->IsThreeOperandAVXInstruction(ins))
+        if (op2Reg == targetReg)
         {
-            inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
+            otherReg = op1Reg;
         }
-        else
+        else if (op1Reg != targetReg)
         {
-            if (op2Reg == targetReg)
-            {
-                otherReg = op1Reg;
-            }
-            else if (op1Reg != targetReg)
-            {
-                inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
-            }
-
-            inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
+            inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
         }
-    }
 
-    // Vector2/3 div: since the top-most elements will be zero, we end up
-    // perfoming 0/0 which is a NAN. Therefore, post division we need to set the
-    // top-most elements to zero. This is achieved by left logical shift followed
-    // by right logical shift of targetReg.
-    if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16))
-    {
-        // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
-        unsigned shiftCount = 16 - simdNode->gtSIMDSize;
-        assert((shiftCount > 0) && (shiftCount <= 16));
-        instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
-        GetEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
-        ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
-        GetEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
+        inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
     }
 
     genProduceReg(simdNode);
@@ -1807,290 +1615,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
     genProduceReg(simdNode);
 }
 
-//--------------------------------------------------------------------------------
-// genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product.
-//
-// Arguments:
-//    simdNode - The GT_SIMD node
-//
-// Return Value:
-//    None.
-//
-void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
-{
-    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct);
-
-    GenTree*  op1      = simdNode->gtGetOp1();
-    GenTree*  op2      = simdNode->gtGetOp2();
-    var_types baseType = simdNode->gtSIMDBaseType;
-    var_types simdType = op1->TypeGet();
-    // TODO-1stClassStructs: Temporary to minimize asmDiffs
-    if (simdType == TYP_DOUBLE)
-    {
-        simdType = TYP_SIMD8;
-    }
-    var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType;
-    regNumber targetReg    = simdNode->GetRegNum();
-    assert(targetReg != REG_NA);
-
-    var_types targetType = simdNode->TypeGet();
-    assert(targetType == baseType);
-
-    genConsumeOperands(simdNode);
-    regNumber op1Reg  = op1->GetRegNum();
-    regNumber op2Reg  = op2->GetRegNum();
-    regNumber tmpReg1 = REG_NA;
-    regNumber tmpReg2 = REG_NA;
-
-    SIMDLevel level = compiler->getSIMDSupportLevel();
-
-    // Dot product intrinsic is supported only on float/double vectors
-    // and 32-byte int vectors on AVX.
-    //
-    // Float/Double Vectors:
-    // For SSE, or AVX with 32-byte vectors, we need one additional Xmm register
-    // different from targetReg as scratch. Note that if this is a TYP_SIMD16 or
-    // smaller on AVX, then we don't need a tmpReg.
-    //
-    // 32-byte integer vector on AVX: we need two additional Xmm registers
-    // different from targetReg as scratch.
-    //
-    // 16-byte integer vector on SSE4: we need one additional Xmm register
-    // different from targetReg as scratch.
-    if (varTypeIsFloating(baseType))
-    {
-        if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) || (simdEvalType == TYP_SIMD32))
-        {
-            tmpReg1 = simdNode->GetSingleTempReg();
-            assert(tmpReg1 != targetReg);
-        }
-        else
-        {
-            assert(simdNode->AvailableTempRegCount() == 0);
-        }
-    }
-    else
-    {
-        assert(baseType == TYP_INT);
-        assert(level >= SIMD_SSE4_Supported);
-
-        if (level == SIMD_SSE4_Supported)
-        {
-            tmpReg1 = simdNode->GetSingleTempReg();
-        }
-        else
-        {
-            tmpReg1 = simdNode->ExtractTempReg();
-            tmpReg2 = simdNode->GetSingleTempReg();
-        }
-    }
-
-    if (level == SIMD_SSE2_Supported)
-    {
-        // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg
-        if (op1Reg == targetReg)
-        {
-            // Best case
-            // nothing to do, we have registers in the right place
-        }
-        else if (op2Reg == targetReg)
-        {
-            op2Reg = op1Reg;
-        }
-        else
-        {
-            inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
-        }
-
-        // DotProduct(v1, v2)
-        // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1
-        if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
-        {
-            assert(baseType == TYP_FLOAT);
-            // v0 = v1 * v2
-            // tmp = v0                                       // v0  = (3, 2, 1, 0) - each element is given by its
-            //                                                // position
-            // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY)          // tmp = (2, 0, 0, 1) - don't really care what's in upper
-            //                                                // bits
-            // v0 = v0 + tmp                                  // v0  = (3+2, 0+2, 1+0, 0+1)
-            // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW)          // tmp = (  1,   1,   2,   2)
-            // v0 = v0 + tmp                                  // v0  = (1+2+3,  0+1+2, 0+1+2, 0+1+2)
-            //
-            inst_RV_RV(INS_mulps, targetReg, op2Reg);
-            inst_RV_RV(INS_movaps, tmpReg1, targetReg);
-            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, (int8_t)SHUFFLE_ZXXY);
-            inst_RV_RV(INS_addps, targetReg, tmpReg1);
-            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, (int8_t)SHUFFLE_XXWW);
-            inst_RV_RV(INS_addps, targetReg, tmpReg1);
-        }
-        else if (baseType == TYP_FLOAT)
-        {
-            // v0 = v1 * v2
-            // tmp = v0                                       // v0  = (3, 2, 1, 0) - each element is given by its
-            //                                                // position
-            // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY)          // tmp = (2, 3, 0, 1)
-            // v0 = v0 + tmp                                  // v0  = (3+2, 2+3, 1+0, 0+1)
-            // tmp = v0
-            // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW)          // tmp = (0+1, 1+0, 2+3, 3+2)
-            // v0 = v0 + tmp                                  // v0  = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
-            //                                                // Essentially horizontal addition of all elements.
-            //                                                // We could achieve the same using SSEv3 instruction
-            //                                                // HADDPS.
-            //
-            inst_RV_RV(INS_mulps, targetReg, op2Reg);
-            inst_RV_RV(INS_movaps, tmpReg1, targetReg);
-            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, (int8_t)SHUFFLE_ZWXY);
-            inst_RV_RV(INS_addps, targetReg, tmpReg1);
-            inst_RV_RV(INS_movaps, tmpReg1, targetReg);
-            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, (int8_t)SHUFFLE_XYZW);
-            inst_RV_RV(INS_addps, targetReg, tmpReg1);
-        }
-        else
-        {
-            assert(baseType == TYP_DOUBLE);
-
-            // v0 = v1 * v2
-            // tmp = v0                                       // v0  = (1, 0) - each element is given by its position
-            // tmp = shuffle(tmp, tmp, Shuffle(0,1))          // tmp = (0, 1)
-            // v0 = v0 + tmp                                  // v0  = (1+0, 0+1)
-            inst_RV_RV(INS_mulpd, targetReg, op2Reg);
-            inst_RV_RV(INS_movaps, tmpReg1, targetReg);
-            inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg1, tmpReg1, 0x01);
-            inst_RV_RV(INS_addpd, targetReg, tmpReg1);
-        }
-    }
-    else
-    {
-        assert(level >= SIMD_SSE4_Supported);
-
-        if (varTypeIsFloating(baseType))
-        {
-            // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
-            // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
-            // use the 3-op form, so that we can avoid these copies.
-            // TODO-CQ: Add inst_RV_RV_RV_IV().
-            if (op1Reg == targetReg)
-            {
-                // Best case
-                // nothing to do, we have registers in the right place
-            }
-            else if (op2Reg == targetReg)
-            {
-                op2Reg = op1Reg;
-            }
-            else
-            {
-                inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
-            }
-
-            emitAttr emitSize = emitActualTypeSize(simdEvalType);
-            if (baseType == TYP_FLOAT)
-            {
-                // dpps computes the dot product of the upper & lower halves of the 32-byte register.
-                // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
-                unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1;
-                assert((mask >= 0) && (mask <= 255));
-                inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, (int8_t)mask);
-                // dpps computes the dot product of the upper & lower halves of the 32-byte register.
-                // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
-                // If this is TYP_SIMD32, we need to combine the lower & upper results.
-                if (simdEvalType == TYP_SIMD32)
-                {
-                    GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
-                    inst_RV_RV(INS_addps, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
-                }
-            }
-            else if (baseType == TYP_DOUBLE)
-            {
-                if (simdEvalType == TYP_SIMD32)
-                {
-                    // targetReg = targetReg * op2Reg
-                    // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
-                    // tmpReg    = vextractf128(targetReg, 1)    ; Moves the upper sum into tempReg
-                    // targetReg = targetReg + tmpReg1
-                    inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
-                    inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
-                    GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
-                    inst_RV_RV(INS_addpd, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
-                }
-                else
-                {
-                    // On AVX, we have no 16-byte vectors of double.  Note that, if we did, we could use
-                    // dppd directly.
-                    assert(level == SIMD_SSE4_Supported);
-                    inst_RV_RV_IV(INS_dppd, emitSize, targetReg, op2Reg, 0x31);
-                }
-            }
-        }
-        else
-        {
-            // Dot product of 32-byte int vector on SSE4/AVX.
-            assert(baseType == TYP_INT);
-            assert(simdEvalType == TYP_SIMD16 || simdEvalType == TYP_SIMD32);
-
-#ifdef DEBUG
-            // SSE4: We need 1 scratch register.
-            // AVX2: We need 2 scratch registers.
-            if (simdEvalType == TYP_SIMD16)
-            {
-                assert(tmpReg1 != REG_NA);
-            }
-            else
-            {
-                assert(tmpReg1 != REG_NA);
-                assert(tmpReg2 != REG_NA);
-            }
-#endif
-
-            // tmpReg1 = op1 * op2
-            if (level == SIMD_AVX2_Supported)
-            {
-                // On AVX take advantage 3 operand form of pmulld
-                inst_RV_RV_RV(INS_pmulld, tmpReg1, op1Reg, op2Reg, emitTypeSize(simdEvalType));
-            }
-            else
-            {
-                inst_RV_RV(ins_Copy(simdEvalType), tmpReg1, op1Reg, simdEvalType);
-                inst_RV_RV(INS_pmulld, tmpReg1, op2Reg, simdEvalType);
-            }
-
-            if (simdEvalType == TYP_SIMD32)
-            {
-                // tmpReg2[127..0] = Upper 128-bits of tmpReg1
-                GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
-
-                // tmpReg1[127..0] = tmpReg1[127..0] + tmpReg2[127..0]
-                // This will compute
-                //    tmpReg1[0] = op1[0]*op2[0] + op1[4]*op2[4]
-                //    tmpReg1[1] = op1[1]*op2[1] + op1[5]*op2[5]
-                //    tmpReg1[2] = op1[2]*op2[2] + op1[6]*op2[6]
-                //    tmpReg1[4] = op1[4]*op2[4] + op1[7]*op2[7]
-                inst_RV_RV(INS_paddd, tmpReg1, tmpReg2, TYP_SIMD16, EA_16BYTE);
-            }
-
-            // This horizontal add will compute
-            //
-            // TYP_SIMD16:
-            //   tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[1]*op2[1]
-            //   tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[4]*op2[4]
-            //
-            // TYP_SIMD32:
-            //   tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[4]*op2[4] + op1[1]*op2[1] + op1[5]*op2[5]
-            //   tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[6]*op2[6] + op1[4]*op2[4] + op1[7]*op2[7]
-            inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
-
-            // DotProduct(op1, op2) = tmpReg1[0] = tmpReg1[0] + tmpReg1[1]
-            inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
-
-            // TargetReg = integer result from tmpReg1
-            // (Note that for mov_xmm2i, the int register is always in the reg2 position)
-            inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
-        }
-    }
-
-    genProduceReg(simdNode);
-}
-
 //------------------------------------------------------------------------------------
 // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
 //
@@ -2903,10 +2427,7 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
             genSIMDIntrinsicNarrow(simdNode);
             break;
 
-        case SIMDIntrinsicAdd:
         case SIMDIntrinsicSub:
-        case SIMDIntrinsicMul:
-        case SIMDIntrinsicDiv:
         case SIMDIntrinsicBitwiseAnd:
         case SIMDIntrinsicBitwiseOr:
             genSIMDIntrinsicBinOp(simdNode);
@@ -2916,10 +2437,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
             genSIMDIntrinsicRelOp(simdNode);
             break;
 
-        case SIMDIntrinsicDotProduct:
-            genSIMDIntrinsicDotProduct(simdNode);
-            break;
-
         case SIMDIntrinsicGetItem:
             genSIMDIntrinsicGetItem(simdNode);
             break;
diff --git a/src/coreclr/src/jit/simdintrinsiclist.h b/src/coreclr/src/jit/simdintrinsiclist.h
index 813a937fd056b8..399fc7d84a2259 100644
--- a/src/coreclr/src/jit/simdintrinsiclist.h
+++ b/src/coreclr/src/jit/simdintrinsiclist.h
@@ -39,11 +39,6 @@
  ***************************************************************************************************************************************************************************************************************************/
 SIMD_INTRINSIC(nullptr,                     false,       None,                     "None",                   TYP_UNDEF,      0,      {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF},     {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
 
-SIMD_INTRINSIC("get_Count",                 false,       GetCount,                 "count",                  TYP_INT,        0,      {TYP_VOID, TYP_UNDEF, TYP_UNDEF},      {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
-SIMD_INTRINSIC("get_One",                   false,       GetOne,                   "one",                    TYP_STRUCT,     0,      {TYP_VOID, TYP_UNDEF, TYP_UNDEF},      {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
-SIMD_INTRINSIC("get_Zero",                  false,       GetZero,                  "zero",                   TYP_STRUCT,     0,      {TYP_VOID, TYP_UNDEF, TYP_UNDEF},      {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
-SIMD_INTRINSIC("get_AllOnes",               false,       GetAllOnes,               "allOnes",                TYP_STRUCT,     0,      {TYP_VOID, TYP_UNDEF, TYP_UNDEF},      {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
-
 // .ctor call or newobj - there are four forms.
 // This form takes the object plus a value of the base (element) type:
 SIMD_INTRINSIC(".ctor",                     true,        Init,                     "init",                   TYP_VOID,       2,      {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
@@ -77,18 +72,8 @@ SIMD_INTRINSIC("set_Z",                     true,        SetZ,
 SIMD_INTRINSIC("set_W",                     true,        SetW,                     "setW",                   TYP_VOID,       2,      {TYP_BYREF, TYP_UNKNOWN,   TYP_UNDEF},   {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
 
 // Arithmetic Operations
-SIMD_INTRINSIC("op_Addition",               false,       Add,                      "+",                      TYP_STRUCT,     2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
 SIMD_INTRINSIC("op_Subtraction",            false,       Sub,                      "-",                      TYP_STRUCT,     2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
 
-#if defined(TARGET_XARCH)
-SIMD_INTRINSIC("op_Multiply",               false,       Mul,                      "*",                      TYP_STRUCT,     2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_SHORT,TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-#elif defined(TARGET_ARM64)
-// TODO-ARM64-CQ Investigate code sequence to accelerate LONG/ULONG vector multiply
-SIMD_INTRINSIC("op_Multiply",               false,       Mul,                      "*",                      TYP_STRUCT,     2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF})
-#endif
-
-SIMD_INTRINSIC("op_Division",               false,       Div,                      "/",                      TYP_STRUCT,     2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_FLOAT, TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-
 // Vector Relational operators
 SIMD_INTRINSIC("Equals",                    false,       Equal,                    "eq",                     TYP_STRUCT,     2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
 
@@ -96,15 +81,6 @@ SIMD_INTRINSIC("Equals",                    false,       Equal,
 SIMD_INTRINSIC("op_BitwiseAnd",             false,       BitwiseAnd,               "&",                      TYP_STRUCT,     2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
 SIMD_INTRINSIC("op_BitwiseOr",              false,       BitwiseOr,                "|",                      TYP_STRUCT,     2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
 
-// Dot Product
-#if defined(TARGET_XARCH)
-// Is supported only on Vector<int> on AVX.
-SIMD_INTRINSIC("Dot",                       false,       DotProduct,               "Dot",                    TYP_UNKNOWN,    2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-#elif defined(TARGET_ARM64)
-// Dot Product does not support LONG/ULONG due to lack of multiply support (see TODO-ARM64-CQ above)
-SIMD_INTRINSIC("Dot",                       false,       DotProduct,               "Dot",                    TYP_UNKNOWN,    2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF})
-#endif
-
 // Cast
 SIMD_INTRINSIC("op_Explicit",               false,       Cast,                     "Cast",                   TYP_STRUCT,     1,      {TYP_STRUCT, TYP_UNDEF,  TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})