// Gears/external/inteli/bmi2intrin.d (D source, 364 lines, 11 KiB)

/**
* BMI2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
*
* Copyright: Copyright Johan Engelen 2021.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.bmi2intrin;
import inteli.internals;
nothrow @nogc pure @safe:
/// Return unsigned 32-bit integer `a` with its high bits, starting at bit position `index`,
/// reset to 0. The bits below `index` are copied unchanged.
uint _bzhi_u32 (uint a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // During CTFE the software helper is used instead of the builtin.
        if (__ctfe)
            return bzhi!uint(a, index);
        return __builtin_ia32_bzhi_si(a, index);
    }
    else
        return bzhi!uint(a, index);
}
unittest
{
    // Compile-time evaluation.
    static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);

    // Run-time evaluation.
    assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
}
/// Return unsigned 64-bit integer `a` with its high bits, starting at bit position `index`,
/// reset to 0. The bits below `index` are copied unchanged.
ulong _bzhi_u64 (ulong a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // This instruction not available in 32-bit x86.
            if (!__ctfe)
                return __builtin_ia32_bzhi_di(a, index);
        }
    }
    // CTFE, 32-bit x86, or no BMI2 builtin: software helper.
    return bzhi!ulong(a, index);
}
unittest
{
    // Compile-time evaluation.
    static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);

    // Run-time evaluation.
    assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
}
// Helper function for BZHI: software implementation used by `_bzhi_u32` / `_bzhi_u64`
// during CTFE and when the BMI2 builtins are not used.
//
// Only the low 8 bits of `index` are significant, as in the instruction's definition
// (`n := index[7:0]`). For example an index of 256 behaves like an index of 0 and
// clears every bit, instead of leaving `a` untouched.
//
// Params:
//     a     = value whose high bits are to be cleared
//     index = position of the first bit to clear (taken modulo 256)
// Returns: `a` with bits [MSB:n] cleared when n < number of bits in T, else `a` unchanged.
private T bzhi(T)(T a, uint index)
{
    /+
        n := index[7:0]
        dst := a
        IF (n < number of bits)
            dst[MSB:n] := 0
        FI
    +/
    enum uint numbits = T.sizeof*8;
    const uint n = index & 0xFF; // index[7:0]
    if (n >= numbits)
        return a;
    // n < numbits here, so the shift below is always in range.
    const T mask = (T(1) << n) - 1;
    return a & mask;
}
unittest
{
    // Indices at or above the operand width (but below 256) leave the value unchanged.
    static assert (bzhi!uint(0x1234_5678, 32) == 0x1234_5678);
    static assert (bzhi!ulong(0x8765_4321_1234_5678, 64) == 0x8765_4321_1234_5678);
    static assert (bzhi!uint(0x1234_5678, 255) == 0x1234_5678);
    // Only index[7:0] is used: 0x100 acts like 0, 0x105 acts like 5.
    static assert (bzhi!uint(0x1234_5678, 0x100) == 0);
    static assert (bzhi!uint(0x1234_5678, 0x105) == 0x18);
    static assert (bzhi!ulong(0x8765_4321_1234_5678, 0x100 + 54) == 0x0025_4321_1234_5678);
}
/// Multiply unsigned 32-bit integers `a` and `b`. The low 32 bits of the 64-bit product are
/// returned and the high 32 bits are stored through `hi`.
/// Note: the MULX instruction does not read or write arithmetic flags, but this implementation
/// _does_ set them. That distinction does not exist at the level of intrinsics.
uint _mulx_u32 (uint a, uint b, uint* hi)
{
    // Note: this does NOT generate mulx with LDC, and there seems to be no way to do that for
    // some reason, even with LLVM IR.
    // Also same with GDC.
    ulong product = a;      // widen first so the multiplication is done in 64 bits
    product *= b;
    const uint lowHalf = cast(uint) product;
    *hi = cast(uint) (product >>> 32);
    return lowHalf;
}
@system unittest
{
    uint upper;
    const uint lower = _mulx_u32(0x1234_5678, 0x1234_5678, &upper);
    assert (lower == 0x1DF4_D840);
    assert (upper == 0x014B_66DC);
}
/// Multiply unsigned 64-bit integers `a` and `b`. The low 64 bits of the 128-bit product are
/// returned and the high 64 bits are stored through `hi`.
/// Note: the MULX instruction does not read or write arithmetic flags, but this implementation
/// _does_ set them. That distinction does not exist at the level of intrinsics.
ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
{
/+
dst[63:0] := (a * b)[63:0]
MEM[hi+63:hi] := (a * b)[127:64]
+/
// The inline-IR path is only selected for LDC with optimizations and a
// front-end version of at least 2.094; everything else uses the 64-bit math fallback.
static if (LDC_with_optimizations)
{
static if (__VERSION__ >= 2094)
enum bool withLDCIR = true;
else
enum bool withLDCIR = false;
}
else
{
enum bool withLDCIR = false;
}
static if (withLDCIR)
{
// LDC x86: Generates mulx from -O0
// The IR widens both operands to i128, multiplies, stores the upper 64 bits
// through the pointer argument and returns the lower 64 bits.
enum ir = `
%4 = zext i64 %0 to i128
%5 = zext i64 %1 to i128
%6 = mul nuw i128 %5, %4
%7 = lshr i128 %6, 64
%8 = trunc i128 %7 to i64
store i64 %8, i64* %2, align 8
%9 = trunc i128 %6 to i64
ret i64 %9`;
return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
}
else
{
/+ Straight-forward implementation with `ucent`:
ucent result = cast(ucent) a * b;
*hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
+/
/+
Implementation using 64-bit math is more complex...
a * b = ((a_high << 32) + a_low) * ((b_high << 32) + b_low)
      = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low*(b_high << 32) + a_low*b_low
      = ((a_high*b_high) << 64) + ((a_high*b_low) << 32) + ((a_low*b_high) << 32) + a_low*b_low
      = (c2 << 64) + (c11 << 32) + (c12 << 32) + c0
      = (z1 << 64) + z0
The sums may overflow, so the carry out of the low 64 bits must be propagated into the high
64 bits. This is done by separately forming, in 64-bit math, the sum that yields the high
32 bits of z0. The high 32 bits of that intermediate sum are the carry that is added when
computing z1.
common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF)
z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32)
carry from z0's sum = common_term >> 32
z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32)
+/
// Split both operands into 32-bit halves.
const ulong a_low = a & 0xFFFF_FFFF;
const ulong a_high = a >>> 32;
const ulong b_low = b & 0xFFFF_FFFF;
const ulong b_high = b >>> 32;
// Four 32x32 -> 64-bit partial products.
const ulong c2 = a_high*b_high;
const ulong c11 = a_high*b_low;
const ulong c12 = a_low*b_high;
const ulong c0 = a_low*b_low;
// Sum of the three contributions to bits [63:32]; its upper half is the carry into z1.
const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);
*hi = z1;
return z0;
}
}
@system unittest
{
ulong hi;
// 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
assert (hi == 0x14b_66dc_33f6_acdc);
}
/// Deposit the contiguous low bits of unsigned 32-bit integer `a` into the result at the bit
/// positions selected by `mask`; every other bit of the result is zero.
uint _pdep_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // During CTFE the software helper is used instead of the builtin.
        if (__ctfe)
            return pdep!uint(a, mask);
        return __builtin_ia32_pdep_si(a, mask);
    }
    else
        return pdep!uint(a, mask);
}
unittest
{
    // Compile-time evaluation.
    static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
    // Run-time evaluation.
    assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
}
/// Deposit the contiguous low bits of unsigned 64-bit integer `a` into the result at the bit
/// positions selected by `mask`; every other bit of the result is zero.
ulong _pdep_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // This instruction not available in 32-bit x86.
            if (!__ctfe)
                return __builtin_ia32_pdep_di(a, mask);
        }
    }
    // CTFE, 32-bit x86, or no BMI2 builtin: software helper.
    return pdep!ulong(a, mask);
}
unittest
{
    // Compile-time evaluation.
    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
    // Run-time evaluation.
    assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
}
// Helper function for PDEP: software implementation of the bit-deposit operation.
// Walks every bit position of `mask` from least to most significant; each set mask bit
// receives the next not-yet-consumed low bit of `a`.
private T pdep(T)(T a, T mask)
{
    /+
        tmp := a
        dst := 0
        m := 0
        k := 0
        DO WHILE m < number of bits in T
            IF mask[m] == 1
                dst[m] := tmp[k]
                k := k + 1
            FI
            m := m + 1
        OD
    +/
    enum uint bitCount = T.sizeof*8;
    T result = 0;
    uint consumed = 0; // `k` in the pseudo-code: index of the next source bit of `a`
    for (uint pos = 0; pos < bitCount; ++pos)
    {
        if (((mask >> pos) & 1) == 0)
            continue;
        // consumed <= pos < bitCount here, so both shifts are in range.
        if ((a >> consumed) & 1)
            result |= T(1) << pos;
        ++consumed;
    }
    return result;
}
/// Gather the bits of unsigned 32-bit integer `a` found at the positions selected by `mask`
/// and pack them into the contiguous low bits of the result; the remaining upper bits are zero.
uint _pext_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // During CTFE the software helper is used instead of the builtin.
        if (__ctfe)
            return pext!uint(a, mask);
        return __builtin_ia32_pext_si(a, mask);
    }
    else
        return pext!uint(a, mask);
}
unittest
{
    // Compile-time evaluation.
    static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
    // Run-time evaluation.
    assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
}
/// Gather the bits of unsigned 64-bit integer `a` found at the positions selected by `mask`
/// and pack them into the contiguous low bits of the result; the remaining upper bits are zero.
ulong _pext_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // This instruction not available in 32-bit x86.
            if (!__ctfe)
                return __builtin_ia32_pext_di(a, mask);
        }
    }
    // CTFE, 32-bit x86, or no BMI2 builtin: software helper.
    return pext!ulong(a, mask);
}
unittest
{
    // Compile-time evaluation.
    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
    // Run-time evaluation.
    assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
}
// Helper function for PEXT: software implementation of the bit-extract operation.
// Walks every bit position of `mask` from least to most significant; the bit of `a` under
// each set mask bit is appended to the result at the next free low position.
private T pext(T)(T a, T mask)
{
    /+
        tmp := a
        dst := 0
        m := 0
        k := 0
        DO WHILE m < number of bits in T
            IF mask[m] == 1
                dst[k] := tmp[m]
                k := k + 1
            FI
            m := m + 1
        OD
    +/
    enum uint bitCount = T.sizeof*8;
    T result = 0;
    uint packed = 0; // `k` in the pseudo-code: index of the next destination bit
    for (uint pos = 0; pos < bitCount; ++pos)
    {
        if (((mask >> pos) & 1) == 0)
            continue;
        // packed <= pos < bitCount here, so both shifts are in range.
        if ((a >> pos) & 1)
            result |= T(1) << packed;
        ++packed;
    }
    return result;
}