/**
* BMI2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
*
* Copyright: Copyright Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.bmi2intrin;

import inteli.internals;

nothrow @nogc pure @safe:

/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at `index`.
uint _bzhi_u32 (uint a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
            return __builtin_ia32_bzhi_si(a, index);
        else
            return bzhi!uint(a, index);
    }
    else
    {
        return bzhi!uint(a, index);
    }
}
unittest
{
    static assert (_bzhi_u32(0x1234_5678, 5)  == 0x18);
           assert (_bzhi_u32(0x1234_5678, 5)  == 0x18);
    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
           assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
           assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
}

/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at `index`.
ulong _bzhi_u64 (ulong a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction is not available in 32-bit x86.
                return __builtin_ia32_bzhi_di(a, index);
            }
            else
                return bzhi!ulong(a, index);
        }
        else
            return bzhi!ulong(a, index);
    }
    else
    {
        return bzhi!ulong(a, index);
    }
}
unittest
{
    static assert (_bzhi_u64(0x1234_5678, 5)  == 0x18);
           assert (_bzhi_u64(0x1234_5678, 5)  == 0x18);
    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
           assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
           assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
           assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
}

// Helper function for BZHI
private T bzhi(T)(T a, uint index)
{
    /+
        n := index[7:0]
        dst := a
        IF (n < number of bits)
            dst[MSB:n] := 0
        FI
    +/
    enum numbits = T.sizeof*8;
    index &= 0xFF; // the instruction only considers the low 8 bits of index (n := index[7:0])
    T dst = a;
    if (index < numbits)
    {
        T mask = (T(1) << index) - 1;
        dst &= mask;
    }
    return dst;
}

/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32 bits of the result in dst,
/// and store the high 32 bits in `hi`. This does not read or write arithmetic flags.
/// Note: unlike the mulx instruction, this implementation _may_ set arithmetic flags;
/// those flag semantics are not observable at the level of intrinsics anyway.
uint _mulx_u32 (uint a, uint b, uint* hi)
{
    // Note: this does NOT generate the mulx instruction with LDC, and there seems to be
    // no way to force it, even with LLVM IR. The same holds for GDC.
    ulong result = cast(ulong) a * b;
    *hi = cast(uint) (result >>> 32);
    return cast(uint) result;
}
@system unittest
{
    uint hi;
    assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
    assert (hi == 0x014B_66DC);
}
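// A small usage sketch, beyond the original tests: the hi:lo pair produced by
// _mulx_u32 recombines into the plain 64-bit product.
@system unittest
{
    uint hi;
    uint lo = _mulx_u32(0xFFFF_FFFF, 0xFFFF_FFFF, &hi);
    assert ((cast(ulong) hi << 32 | lo) == cast(ulong) 0xFFFF_FFFF * 0xFFFF_FFFF);
}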
/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64 bits of the result in dst,
/// and store the high 64 bits in `hi`. This does not read or write arithmetic flags.
/// Note: unlike the mulx instruction, this implementation _may_ set arithmetic flags;
/// those flag semantics are not observable at the level of intrinsics anyway.
ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
{
    /+
        dst[63:0] := (a * b)[63:0]
        MEM[hi+63:hi] := (a * b)[127:64]
    +/
    static if (LDC_with_optimizations)
    {
        static if (__VERSION__ >= 2094)
            enum bool withLDCIR = true;
        else
            enum bool withLDCIR = false;
    }
    else
    {
        enum bool withLDCIR = false;
    }

    static if (withLDCIR)
    {
        // LDC x86: generates mulx from -O0
        enum ir = `
            %4 = zext i64 %0 to i128
            %5 = zext i64 %1 to i128
            %6 = mul nuw i128 %5, %4
            %7 = lshr i128 %6, 64
            %8 = trunc i128 %7 to i64
            store i64 %8, i64* %2, align 8
            %9 = trunc i128 %6 to i64
            ret i64 %9`;
        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
    }
    else
    {
        /+ Straightforward implementation with `ucent`:
            ucent result = cast(ucent) a * b;
            *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
            return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
        +/
        /+ The implementation using only 64-bit math is more complex. Split both
           operands into 32-bit halves:
            a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
                  = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low*(b_high << 32) + a_low*b_low
                  = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
                  = c2 << 64 + c11 << 32 + c12 << 32 + c0
                  = z1 << 64 + z0
           The sums may overflow, so we need to propagate the carry from the low
           64 bits to the high 64 bits. We can do that by separately computing the
           sum that forms the high 32 bits of z0 using 64-bit math; the high 32 bits
           of that intermediate result are the carry to add when computing z1:
            common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF)
            z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32)
            carry = common_term >> 32
            z1 = c2 + (c11 >> 32) + (c12 >> 32) + carry
        +/
        const ulong a_low = a & 0xFFFF_FFFF;
        const ulong a_high = a >>> 32;
        const ulong b_low = b & 0xFFFF_FFFF;
        const ulong b_high = b >>> 32;
        const ulong c2 = a_high * b_high;
        const ulong c11 = a_high * b_low;
        const ulong c12 = a_low * b_high;
        const ulong c0 = a_low * b_low;
        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);
        *hi = z1;
        return z0;
    }
}
@system unittest
{
    ulong hi;
    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
    assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
    assert (hi == 0x14b_66dc_33f6_acdc);
}

/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
uint _pdep_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
            return __builtin_ia32_pdep_si(a, mask);
        else
            return pdep!uint(a, mask);
    }
    else
    {
        return pdep!uint(a, mask);
    }
}
unittest
{
    static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
           assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
}
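// An extra illustration, beyond the original tests: with an alternating mask, PDEP
// spreads the low source bits to every other position, the building block of
// Morton/Z-order encoding.
unittest
{
    static assert (_pdep_u32(0b1011, 0x5555_5555) == 0b0100_0101);
           assert (_pdep_u32(0b1011, 0x5555_5555) == 0b0100_0101);
}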
/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
ulong _pdep_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction is not available in 32-bit x86.
                return __builtin_ia32_pdep_di(a, mask);
            }
            else
                return pdep!ulong(a, mask);
        }
        else
            return pdep!ulong(a, mask);
    }
    else
    {
        return pdep!ulong(a, mask);
    }
}
unittest
{
    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
           assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
}

// Helper function for PDEP
private T pdep(T)(T a, T mask)
{
    /+
        tmp := a
        dst := 0
        m := 0
        k := 0
        DO WHILE m < number of bits in T
            IF mask[m] == 1
                dst[m] := tmp[k]
                k := k + 1
            FI
            m := m + 1
        OD
    +/
    T dst;
    T k_bitpos = 1;
    T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
    foreach (m; 0 .. T.sizeof * 8)
    {
        if (mask & m_bitpos)
        {
            dst |= (a & k_bitpos) ? m_bitpos : 0;
            k_bitpos <<= 1;
        }
        m_bitpos <<= 1;
    }
    return dst;
}

/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
uint _pext_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
            return __builtin_ia32_pext_si(a, mask);
        else
            return pext!uint(a, mask);
    }
    else
    {
        return pext!uint(a, mask);
    }
}
unittest
{
    static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
           assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
}

/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
ulong _pext_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction is not available in 32-bit x86.
                return __builtin_ia32_pext_di(a, mask);
            }
            else
                return pext!ulong(a, mask);
        }
        else
            return pext!ulong(a, mask);
    }
    else
    {
        return pext!ulong(a, mask);
    }
}
unittest
{
    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
           assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
}

// Helper function for PEXT
private T pext(T)(T a, T mask)
{
    /+
        tmp := a
        dst := 0
        m := 0
        k := 0
        DO WHILE m < number of bits in T
            IF mask[m] == 1
                dst[k] := tmp[m]
                k := k + 1
            FI
            m := m + 1
        OD
    +/
    T dst;
    T k_bitpos = 1;
    T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
    foreach (m; 0 .. T.sizeof * 8)
    {
        if (mask & m_bitpos)
        {
            dst |= (a & m_bitpos) ? k_bitpos : 0;
            k_bitpos <<= 1;
        }
        m_bitpos <<= 1;
    }
    return dst;
}
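// A round-trip sketch, beyond the original tests: over the same mask, PEXT undoes
// PDEP for any value that fits in popcnt(mask) bits.
unittest
{
    enum uint mask = 0x0F0F_0F0F;
    static assert (_pext_u32(_pdep_u32(0x2468, mask), mask) == 0x2468);
           assert (_pext_u32(_pdep_u32(0x2468, mask), mask) == 0x2468);
}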