/**
* BMI2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
*
* Copyright: Copyright Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.bmi2intrin;

import inteli.internals;

// Attributes below apply to every declaration that follows in this module.
nothrow @nogc pure @safe:
/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
uint _bzhi_u32 (uint a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // The compiler built-in cannot run at compile time, so CTFE takes
        // the portable fallback.
        if (__ctfe)
            return bzhi!uint(a, index);
        else
            return __builtin_ia32_bzhi_si(a, index);
    }
    else
    {
        return bzhi!uint(a, index);
    }
}
unittest
{
    // `static assert` exercises the CTFE code path, plain `assert` the runtime path.
    static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
    assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
}
/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
ulong _bzhi_u64 (ulong a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // The compiler built-in cannot run at compile time, so CTFE takes
        // the portable fallback.
        if (__ctfe)
            return bzhi!ulong(a, index);
        else
        {
            version(X86_64)
            {
                // The 64-bit form of the instruction is unavailable in 32-bit x86.
                return __builtin_ia32_bzhi_di(a, index);
            }
            else
                return bzhi!ulong(a, index);
        }
    }
    else
    {
        return bzhi!ulong(a, index);
    }
}
unittest
{
    // `static assert` exercises the CTFE code path, plain `assert` the runtime path.
    static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    // Index above 32 exercises the upper half of the 64-bit value.
    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
    assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
}
// Helper function for BZHI
// Software fallback for the BZHI instruction (also used during CTFE).
private T bzhi(T)(T a, uint index)
{
    /+
    n := index[7:0]
    dst := a
    IF (n < number of bits)
        dst[MSB:n] := 0
    FI
    +/
    enum numbits = T.sizeof*8;
    // The instruction only considers the low byte of the index operand
    // (n := index[7:0] above); without this mask the fallback disagreed
    // with hardware for index values > 255 whose low byte is < numbits.
    const uint n = index & 0xFF;
    T dst = a;
    if (n < numbits)
    {
        // Keep the n lowest bits, clear everything above them.
        T mask = (T(1) << n) - 1;
        dst &= mask;
    }
    return dst;
}
/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst,
/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
/// But, those particular semantics don't exist at the level of intrinsics.
uint _mulx_u32 (uint a, uint b, uint* hi)
{
    // Note: this does NOT generate mulx with LDC, and there seems to be no way
    // to force that, even with LLVM IR. Same situation with GDC.
    const ulong product = cast(ulong) a * b;
    *hi = cast(uint) (product >>> 32);
    return cast(uint) (product & 0xFFFF_FFFF);
}
@system unittest
{
    // @system: takes the address of a local to receive the high half.
    uint hi;
    assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
    assert (hi == 0x014B_66DC);
}
/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and
/// store the high 64-bits in `hi`. This does not read or write arithmetic flags.
/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
/// But, those particular semantics don't exist at the level of intrinsics.
ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
{
    /+
    dst[63:0] := (a * b)[63:0]
    MEM[hi+63:hi] := (a * b)[127:64]
    +/

    // Select the implementation at compile time: the inline-IR path needs
    // LDC with optimizations AND a front-end of at least 2.094.
    static if (LDC_with_optimizations)
    {
        static if (__VERSION__ >= 2094)
            enum bool withLDCIR = true;
        else
            enum bool withLDCIR = false;
    }
    else
    {
        enum bool withLDCIR = false;
    }

    static if (withLDCIR)
    {
        // LDC x86: Generates mulx from -O0
        // Widen both operands to 128 bits, multiply, then split the product:
        // the high half is stored through `hi`, the low half is returned.
        enum ir = `
            %4 = zext i64 %0 to i128
            %5 = zext i64 %1 to i128
            %6 = mul nuw i128 %5, %4
            %7 = lshr i128 %6, 64
            %8 = trunc i128 %7 to i64
            store i64 %8, i64* %2, align 8
            %9 = trunc i128 %6 to i64
            ret i64 %9`;
        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
    }
    else
    {
        /+ Straight-forward implementation with `ucent`:
        ucent result = cast(ucent) a * b;
        *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
        return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
        +/

        /+
        Implementation using 64bit math is more complex...
        a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
              = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low
              = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
              = c2 << 64 + c11 << 32 + c12 << 32 + c0
              = z1 << 64 + z0
        // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
        // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
        // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
        z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32
        The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
        z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
        +/

        // Split each operand into 32-bit halves.
        const ulong a_low = a & 0xFFFF_FFFF;
        const ulong a_high = a >>> 32;
        const ulong b_low = b & 0xFFFF_FFFF;
        const ulong b_high = b >>> 32;

        // The four 32x32 -> 64-bit partial products.
        const ulong c2 = a_high*b_high;
        const ulong c11 = a_high*b_low;
        const ulong c12 = a_low*b_high;
        const ulong c0 = a_low*b_low;

        // common_term's upper 32 bits are the carry out of the low 64-bit sum.
        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);

        *hi = z1;
        return z0;
    }
}
@system unittest
{
    // @system: takes the address of a local to receive the high half.
    ulong hi;
    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
    assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
    assert (hi == 0x14b_66dc_33f6_acdc);
}
/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
uint _pdep_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // The compiler built-in cannot run at compile time, so CTFE takes
        // the portable fallback.
        if (__ctfe)
            return pdep!uint(a, mask);
        else
            return __builtin_ia32_pdep_si(a, mask);
    }
    else
    {
        return pdep!uint(a, mask);
    }
}
unittest
{
    // `static assert` exercises the CTFE code path, plain `assert` the runtime path.
    static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
    assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
}
/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
ulong _pdep_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // The compiler built-in cannot run at compile time, so CTFE takes
        // the portable fallback.
        if (__ctfe)
            return pdep!ulong(a, mask);
        else
        {
            version(X86_64)
            {
                // The 64-bit form of the instruction is unavailable in 32-bit x86.
                return __builtin_ia32_pdep_di(a, mask);
            }
            else
                return pdep!ulong(a, mask);
        }
    }
    else
    {
        return pdep!ulong(a, mask);
    }
}
unittest
{
    // `static assert` exercises the CTFE code path, plain `assert` the runtime path.
    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
    assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
}
// Helper function for PDEP
// Software fallback for the PDEP instruction (also used during CTFE).
private T pdep(T)(T a, T mask)
{
    /+
    tmp := a
    dst := 0
    m := 0
    k := 0
    DO WHILE m < number of bits in T
        IF mask[m] == 1
            dst[m] := tmp[k]
            k := k + 1
        FI
        m := m + 1
    OD
    +/
    T result = 0;
    T src_bit = 1; // next source bit of `a` to deposit
    // `probe` visits every destination bit position; the left shift
    // eventually overflows to 0, ending the loop after T.sizeof*8 rounds.
    for (T probe = 1; probe != 0; probe <<= 1)
    {
        if (mask & probe)
        {
            if (a & src_bit)
                result |= probe;
            src_bit <<= 1;
        }
    }
    return result;
}
/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
uint _pext_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // The compiler built-in cannot run at compile time, so CTFE takes
        // the portable fallback.
        if (__ctfe)
            return pext!uint(a, mask);
        else
            return __builtin_ia32_pext_si(a, mask);
    }
    else
    {
        return pext!uint(a, mask);
    }
}
unittest
{
    // `static assert` exercises the CTFE code path, plain `assert` the runtime path.
    static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
    assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
}
/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
ulong _pext_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // The compiler built-in cannot run at compile time, so CTFE takes
        // the portable fallback.
        if (__ctfe)
            return pext!ulong(a, mask);
        else
        {
            version(X86_64)
            {
                // The 64-bit form of the instruction is unavailable in 32-bit x86.
                return __builtin_ia32_pext_di(a, mask);
            }
            else
                return pext!ulong(a, mask);
        }
    }
    else
    {
        return pext!ulong(a, mask);
    }
}
unittest
{
    // `static assert` exercises the CTFE code path, plain `assert` the runtime path.
    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
    assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
}
// Helper function for PEXT
// Software fallback for the PEXT instruction (also used during CTFE).
private T pext(T)(T a, T mask)
{
    /+
    tmp := a
    dst := 0
    m := 0
    k := 0
    DO WHILE m < number of bits in T
        IF mask[m] == 1
            dst[k] := tmp[m]
            k := k + 1
        FI
        m := m + 1
    OD
    +/
    T result = 0;
    T dst_bit = 1; // next destination bit to fill in the packed result
    // `probe` visits every source bit position; the left shift eventually
    // overflows to 0, ending the loop after T.sizeof*8 rounds.
    for (T probe = 1; probe != 0; probe <<= 1)
    {
        if (mask & probe)
        {
            if (a & probe)
                result |= dst_bit;
            dst_bit <<= 1;
        }
    }
    return result;
}