Background and motivation
Intel x86/x64 provides MUL/IMUL instructions that compute the low and high bits of a multiplication in a single instruction.
This would be very useful for the Math.BigMul implementation (which currently uses intrinsics on ARM64 and MULX on x64, but that currently has poor code quality (CQ)).
This would also speed up System.Decimal calculations significantly (they currently can't use Math.BigMul because it is slower than the existing hand-tuned code, which composes big multiplications from smaller 32x32 multiplications).
API Proposal
namespace System.Runtime.Intrinsics.X86
{
partial class X86Base
{
partial class X64
{
/// <summary>
/// Proposed intrinsic: multiplies two unsigned 64-bit values and returns the lower and
/// upper 64 bits of the product as a named tuple.
/// </summary>
/// <param name="left">The first unsigned 64-bit factor.</param>
/// <param name="right">The second unsigned 64-bit factor.</param>
/// <returns>The product split into <c>Lower</c> and <c>Upper</c> 64-bit halves.</returns>
// NOTE(review): presumably backed by the x64 MUL instruction - confirm in the JIT implementation.
internal static (ulong Lower, ulong Upper) Multiply(ulong left, ulong right);
/// <summary>
/// Proposed intrinsic: multiplies two signed 64-bit values and returns the lower 64 bits
/// (as an unsigned value) and the upper 64 bits (as a signed value) of the product.
/// </summary>
/// <param name="left">The first signed 64-bit factor.</param>
/// <param name="right">The second signed 64-bit factor.</param>
/// <returns>The product split into an unsigned <c>Lower</c> half and a signed <c>Upper</c> half.</returns>
// NOTE(review): presumably backed by the x64 IMUL instruction - confirm in the JIT implementation.
internal static (ulong Lower, long Upper) Multiply(long left, long right);
}
}
}
Related DivMod API: #27292
API Usage
In decimal.DecCalc.VarDecFromR8 this would allow:
// Add -power factors of 10, -power <= (29 - 15) = 14.
power = -power;
// Proposed fast path: when either 64-bit intrinsic set is supported, scale the mantissa
// with a single 64x64 multiplication through Math.BigMul.
if (X86.X86Base.X64.IsSupported || Arm.ArmBase.Arm64.IsSupported)
{
ulong low64;
ulong hi64 = Math.BigMul(mant, s_ulongPowers10[power], out low64);
// Only 32 bits of the upper half are stored (result.High is assigned from a uint cast),
// so a larger upper half cannot be represented and is routed to the overflow helper.
if (hi64 > uint.MaxValue)
Number.ThrowOverflowException(TypeCode.Decimal);
result.High = (uint)hi64;
result.Low64 = low64;
}
// Existing path for small powers: the multiplier is read as a uint, so the product is
// composed from two 32x32 multiplications of the mantissa halves.
else if (power < 10)
{
uint pow10 = s_powers10[power];
ulong low64 = UInt32x32To64((uint)mant, pow10);
ulong hi64 = UInt32x32To64((uint)(mant >> 32), pow10);
result.Low = (uint)low64;
// Fold the upper 32 bits of the low partial product into the high partial product.
hi64 += low64 >> 32;
result.Mid = (uint)hi64;
hi64 >>= 32;
result.High = (uint)hi64;
}
// Existing path for the remaining powers: the multiplier comes from the ulong table and
// the helper writes the product into result.
else
{
UInt64x64To128(mant, s_ulongPowers10[power], ref result);
}
Another example in decimal.DecCalc.VarDecMul that would only be faster on x64 (because on ARM64 BigMul is actually two expensive instructions):
// Highest 32 bits is non-zero. Calculate 5 more partial products.
// Proposed x64-only path: replaces four of the 32x32 multiplications below with two
// Math.BigMul calls; tmp and tmp2 are declared before this excerpt.
if (X86.X86Base.X64.IsSupported)
{
// Start the middle 64 bits from the value accumulated in tmp so far.
ulong mid64 = tmp;
// tmp receives BigMul's return value, tmp2 its out value (the part added into mid64).
tmp = Math.BigMul(d1.High, d2.Low64, out tmp2);
if (mid64 > (mid64 += tmp2)) // add with carry detection
tmp++;
tmp += Math.BigMul(d2.High, d1.Low64, out tmp2);
if (mid64 > (mid64 += tmp2)) // add with carry detection
tmp++;
bufProd.Mid64 = mid64;
}
// Existing path: the same partial products composed from 32x32 multiplications, with
// carries counted manually in tmp3.
else
{
tmp2 = UInt32x32To64(d1.Low, d2.High);
tmp += tmp2; // this could generate carry
uint tmp3 = 0;
if (tmp < tmp2) // detect carry
tmp3 = 1;
tmp2 = UInt32x32To64(d1.High, d2.Low);
tmp += tmp2; // this could generate carry
bufProd.U2 = (uint)tmp;
if (tmp < tmp2) // detect carry
tmp3++;
// Carry the counted overflow (tmp3) and the upper 32 bits of tmp into the next column.
tmp2 = ((ulong)tmp3 << 32) | (tmp >> 32);
tmp = UInt32x32To64(d1.Mid, d2.High);
tmp += tmp2; // this could generate carry
tmp3 = 0;
if (tmp < tmp2) // detect carry
tmp3 = 1;
tmp2 = UInt32x32To64(d1.High, d2.Mid);
tmp += tmp2; // this could generate carry
bufProd.U3 = (uint)tmp;
if (tmp < tmp2) // detect carry
tmp3++;
// Same carry hand-off as above, this time into the value added to the top product.
tmp = ((ulong)tmp3 << 32) | (tmp >> 32);
}
// Both paths leave the carry into the top 64 bits in tmp.
bufProd.High64 = UInt32x32To64(d1.High, d2.High) + tmp;
hiProd = 5;
Background and motivation
Intel x86/x64 provides MUL/IMUL instructions that compute the low and high bits of a multiplication in a single instruction.
This would be very useful for `Math.BigMul` implementations (currently using intrinsics on ARM64 and MULX on x64, but that has bad CQ at the moment).
This would also speed up `System.Decimal` calculations significantly (currently can't use `Math.BigMul` there because it's slower than the hand-tuned existing code that composes big multiplications from smaller 32x32 multiplications).
API Proposal
Related DivMod API: #27292
API Usage
In `decimal.DecCalc.VarDecFromR8` this would allow:
Another example in `decimal.DecCalc.VarDecMul` that would only be faster on x64 (because on ARM64 BigMul is actually two expensive instructions):