/** * SSE4.2 intrinsics. * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSSE3 * * Copyright: Guillaume Piolat 2022. * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) */ module inteli.nmmintrin; public import inteli.types; import inteli.internals; public import inteli.smmintrin; import core.bitop: bsf, bsr; // Note: this header will work whether you have SSE4.2 enabled or not. // With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively // generate SSE4.2 instruction (they are often enabled with -O1 or greater). // - Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions. // - Since LDC 1.30, you need ["-mattr=+crc32"] on x86_64 if you want hardware CRC instructions, // it is not considered implied by sse4.2 anymore. // With GDC, use "dflags-gdc": ["-msse4.2"] or equivalent to generate SSE4.2 instructions. nothrow @nogc: // /// String contains unsigned 8-bit characters (default). enum int _SIDD_UBYTE_OPS = 0; /// String contains unsigned 16-bit characters. enum int _SIDD_UWORD_OPS = 1; /// String contains signed 8-bit characters. enum int _SIDD_SBYTE_OPS = 2; /// String contains signed 16-bit characters. enum int _SIDD_SWORD_OPS = 3; // // /// For each character in `b`, find if it is in `a` (default) /// The resulting mask has bit set at b positions that were found in a. enum int _SIDD_CMP_EQUAL_ANY = 0; /// For each character in `b`, determine if /// `a[0] <= c <= a[1] or a[1] <= c <= a[2]...` /// Contrarily to false documentation on the Internet, pairs must be in `a`! enum int _SIDD_CMP_RANGES = 4; /// The strings defined by `a` and `b` are equal enum int _SIDD_CMP_EQUAL_EACH = 8; /// Search for the defined substring in the target enum int _SIDD_CMP_EQUAL_ORDERED = 12; // // /// Do not negate results (default, no effect) enum int _SIDD_POSITIVE_POLARITY = 0; /// Negates results enum int _SIDD_NEGATIVE_POLARITY = 16; /// No effect. Do not negate results before the end of the string. (default when using `_SIDD_NEGATIVE_POLARITY`) /// You basically never want this. enum int _SIDD_MASKED_POSITIVE_POLARITY = 32; /// Negates results only before the end of the string enum int _SIDD_MASKED_NEGATIVE_POLARITY = 48; // // /// **Index only**: return the least significant bit (default). enum int _SIDD_LEAST_SIGNIFICANT = 0; /// **Index only**: return the most significant bit. enum int _SIDD_MOST_SIGNIFICANT = 64; // /// **Mask only**: return the bit mask (default). enum int _SIDD_BIT_MASK = 0; /// **Mask only**: return the byte/word mask. enum int _SIDD_UNIT_MASK = 64; /// So SSE4.2 has a lot of hard-to-understand instructions. Here is another explanation. /// /// Alternative explanation of imm8 /// /// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or /// words and the type of comparison to do. /// /// Bits [1:0]: Determine source data format. /// 00: 16 unsigned bytes /// 01: 8 unsigned words /// 10: 16 signed bytes /// 11: 8 signed words /// /// Bits [3:2]: Determine comparison type and aggregation method. /// 00: Subset: Each character in B is compared for equality with all /// the characters in A. /// 01: Ranges: Each character in B is compared to A pairs. The comparison /// basis is greater than or equal for even-indexed elements in A, /// and less than or equal for odd-indexed elements in A. /// 10: Match: Compare each pair of corresponding characters in A and /// B for equality. /// 11: Substring: Search B for substring matches of A. /// /// Bits [5:4]: Determine whether to do a one's complement on the bit /// mask of the comparison results. \n /// 00: No effect. \n /// 01: Negate the bit mask. \n /// 10: No effect. \n /// 11: Negate the bit mask only for bits with an index less than or equal /// to the size of \a A or \a B. /// /// Compare packed strings in `a` and `b` with lengths `la` and `lb` using /// the control in `imm8`, and returns 1 if `b` "does not contain a null character" /// and the resulting mask was zero, and 0 otherwise. /// Warning: actually it seems the instruction does accept \0 in input, just the length must be >= count. /// It's not clear for what purpose. int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted { static if (GDC_with_SSE42) { return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8); } else { __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb); __m128i equalZero = _mm_cmpeq_epi8(mask, _mm_setzero_si128()); int sigbits = _mm_movemask_epi8(equalZero); enum int Count = (imm8 & 1) ? 8 : 16; return (sigbits == 0xffff) && (lb >= Count); } } unittest { char[16] A = "Maximum\x00length!!"; char[16] B = "Mbximum\x00length!!"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); // string matching a-la strcmp, for 16-bytes of data // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one assert(1 == _mm_cmpestra!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, 16, mmA, 16)); assert(0 == _mm_cmpestra!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, 16, mmB, 16)); // test negative length, this will be clamped to 16 assert(1 == _mm_cmpestra!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, -160, mmA, -17)); // it seems you can't compare shorter strings for equality using _mm_cmpestra (!) // Test 16-bit format assert(1 == _mm_cmpestra!(_SIDD_SWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, 8, mmA, 8)); } /// Compare packed strings in `a` and `b` with lengths `la` and `lb` using /// the control in `imm8`, and returns 1 if the resulting mask was non-zero, /// and 0 otherwise. int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted { static if (GDC_with_SSE42) { return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); } else static if (LDC_with_SSE42) { return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8); } else { __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb); int sigbits = _mm_movemask_epi8(mask); return (sigbits != 0); } } unittest { // Compare two shorter strings { char[16] A = "Hello world"; char[16] B = "Hello moon"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); __m128i mask = _mm_cmpestrm!(_SIDD_UBYTE_OPS // match gives 0 like strcmp | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, 6, mmB, 6); assert(0 == _mm_cmpestrc!(_SIDD_UBYTE_OPS // match gives 0 like strcmp | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, 6, mmB, 6)); assert(1 == _mm_cmpestrc!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, 7, mmB, 7)); } } /// Compare packed strings in `a` and `b` with lengths `la` and `lb` using /// the control in `imm8`, and return the generated index. /// Note: if the mask is all zeroes, the returned index is always `Count` /// (8 or 16 depending on size). int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted { static if (GDC_with_SSE42) { return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8); } else { __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb); // Convert the unit mask to bit mask static if (imm8 & 1) { enum int Count = 8; mask = _mm_packs_epi16(mask, _mm_setzero_si128()); } else { enum int Count = 16; } int signbits = _mm_movemask_epi8(mask); static if (imm8 & _SIDD_MOST_SIGNIFICANT) { if (signbits == 0) return Count; else return bsr(signbits); } else { if (signbits == 0) return Count; else return bsf(signbits); } } } unittest { // Find the index of the first difference (at index 6) // v char[16] A = "Hello sun"; char[16] B = "Hello moon"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); int index = _mm_cmpestri!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT)(mmA, 9, mmB, 10); assert(index == 6); // Those string must compare equal, regardless of what happens after their length. index = _mm_cmpestri!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars assert(index == 16); index = _mm_cmpestri!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY | _SIDD_MOST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars assert(index == 16); } unittest { // Identify the last character that isn't an identifier character. // v (at index 7) char[16] A = "my_i(en)ifie"; char[16] identRanges = "__azAz09"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr); byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY | _SIDD_UNIT_MASK)(mmI, 8, mmA, 12); byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0]; assert(mask.array == correctM); int index = _mm_cmpestri!(_SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY | _SIDD_MOST_SIGNIFICANT)(mmI, 8, mmA, 12); assert(index == 7); // ')' is the last char not to be in [__azAz09] } unittest { // testing _SIDD_CMP_RANGES but with signed shorts comparison instead (this only makes sense for _SIDD_CMP_RANGES) short[8] ranges = [0, -1, 1000, 2000, 0, 0, 0, 0]; short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767]; __m128i mmRanges = _mm_loadu_si128(cast(__m128i*)ranges.ptr); __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr); short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8); short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1]; mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8); short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0]; assert(mask.array == correctZ); } unittest { // Find a substring char[16] A = "def"; char[16] B = "abcdefghdefff"; char[16] C = "no substring"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_UNIT_MASK)(mmA, 3, mmB, 13); byte[16] correctM = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0]; assert(mask.array == correctM); int firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED)(mmA, 3, mmB, 13); assert(firstMatch == 3); int lastMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_MOST_SIGNIFICANT)(mmA, 3, mmB, 13); assert(lastMatch == 8); firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED)(mmA, -3, mmC, -12); assert(firstMatch == 16); // no substring found } /// Compare packed strings in `a` and `b` with lengths `la` and `lb` using /// the control in `imm8`, and return the generated mask. __m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted { static if (GDC_with_SSE42) { return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); } else static if (LDC_with_SSE42) { return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8); } else { __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb); static if (imm8 & _SIDD_UNIT_MASK) { return mask; } else { // _SIDD_BIT_MASK static if (imm8 & 1) { mask = _mm_packs_epi16(mask, _mm_setzero_si128()); } return _mm_cvtsi32_si128( _mm_movemask_epi8(mask)); } } } unittest { char[16] A = "Hello world!"; char[16] B = "aeiou!"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); // Find which letters from B where found in A. byte16 R = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK)(mmA, -12, mmB, -6); // because 'e', 'o', and '!' were found byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; assert(R.array == correctR); byte16 M = cast(byte16) _mm_cmpestrm!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK)(mmA, 12, mmB, 6); byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; assert(M.array == correctM); } /// Compare packed strings in `a` and `b` with lengths `la` and `lb` using /// the control in `imm8`, and returns bit 0 of the resulting bit mask. int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted { static if (GDC_with_SSE42) { return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8); } else { int4 mask = cast(int4) cmpstrMaskExplicit!imm8(a, la, b, lb); return mask.array[0] & 1; } } unittest { char[16] A = "Hallo world!"; char[16] B = "aeiou!"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); // Find which letters from B were found in A. int res = _mm_cmpestro!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK)(mmA, 12, mmB, -6); // because 'a' was found in "Hallo world!" assert(res == 1); } /// Returns 1 if "any character in a was null", and 0 otherwise. /// Warning: what they mean is it returns 1 if the given length `la` is < Count. int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted { static if (GDC_with_SSE42) { return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8); } else { // Yes, this intrinsic is there for symmetrical reasons and probably useless. // saturates lengths (the Intrinsics Guide doesn't tell this) if (la < 0) la = -la; if (la > 16) la = 16; enum int Count = (imm8 & 1) ? 8 : 16; return (la < Count); } } unittest { __m128i a; a = 0; assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 15, a, 8) == 1); assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 16, a, 8) == 0); assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -15, a, 8) == 1); assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -16, a, 8) == 0); } /// Returns 1 if "any character in b was null", and 0 otherwise. /// Warning: what they mean is it returns 1 if the given length `lb` is < Count. int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted { static if (GDC_with_SSE42) { return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8); } else { // Yes, this intrinsic is there for symmetrical reasons and probably useless. // saturates lengths (the Intrinsics Guide doesn't tell this) if (lb < 0) lb = -lb; if (lb > 16) lb = 16; enum int Count = (imm8 & 1) ? 8 : 16; return (lb < Count); } } unittest { __m128i b; b = 0; assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(b, 15, b, 15) == 1); assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(b, 16, b, 16) == 0); assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(b, -15, b, -15) == 1); assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(b, -16, b, -16) == 0); } /// Compare packed signed 64-bit integers in a and b for greater-than. __m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) pure @trusted { long2 la = cast(long2)a; long2 lb = cast(long2)b; // PERF: with DMD, enabling this requires SSE4.2, hence D_AVX /*static if (SIMD_COMPARISON_MASKS_16B) { return cast(__m128i)(la > lb); } else*/ static if (GDC_with_SSE42) { return cast(__m128i) __builtin_ia32_pcmpgtq(la, lb); } else version(LDC) { // LDC x86: Optimized since LDC 1.1.0 -O1 // arm64: Optimized since LDC 1.8.0 -O1 // When SSE4.2 is disabled, this gives same sequence than below. static if (SIMD_COMPARISON_MASKS_16B) return cast(__m128i)(la > lb); else return cast(__m128i)( greaterMask!long2(la, lb)); } else { long2 r; r.ptr[0] = (la.array[0] > lb.array[0]) ? 0xffffffff_ffffffff : 0; r.ptr[1] = (la.array[1] > lb.array[1]) ? 0xffffffff_ffffffff : 0; return cast(__m128i)r; } } unittest { __m128i A = _mm_setr_epi64(-3, 2); __m128i B = _mm_setr_epi64(4, -2); long[2] correct = [ 0, -1 ]; long2 R = cast(long2)(_mm_cmpgt_epi32(A, B)); assert(R.array == correct); } /// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`, /// and returns 1 if `b` did not contain a null character and the resulting mask was zero, /// and 0 otherwise. int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted { static if (GDC_with_SSE42) { return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8); } else { static if (imm8 & 1) { int la = findLengthShort(a); int lb = findLengthShort(b); } else { int la = findLengthByte(a); int lb = findLengthByte(b); } return _mm_cmpestra!imm8(a, la, b, lb); } } unittest { char[16] A = "Maximum\x00one"; char[16] B = "Maximum\x00four"; char[16] C = "Mbximum\x00length!"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); // string matching a-la strcmp, for 16-bytes of data // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one assert(0 == _mm_cmpistra!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY)(mmA, mmB)); // match, but b is too short assert(0 == _mm_cmpistra!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, mmC)); // do not match } /// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`, /// and returns 1 if the resulting mask was non-zero, and 0 otherwise. int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted { static if (GDC_with_SSE42) { return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8); } else static if (LDC_with_SSE42) { return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8); } else { static if (imm8 & 1) { int la = findLengthShort(a); int lb = findLengthShort(b); } else { int la = findLengthByte(a); int lb = findLengthByte(b); } return _mm_cmpestrc!imm8(a, la, b, lb); } } unittest { // Compare two shorter strings { char[16] A = "Hello"; char[16] B = "Hello moon"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); assert(0 == _mm_cmpistrc!(_SIDD_UBYTE_OPS // match gives 0 like strcmp | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, mmA)); assert(1 == _mm_cmpistrc!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)(mmA, mmB)); } } /// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8` /// and return the generated index. /// Note: if the mask is all zeroes, the returned index is always `Count` /// (8 or 16 depending on size). int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted { static if (GDC_with_SSE42) { return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8); } else { static if (imm8 & 1) { int la = findLengthShort(a); int lb = findLengthShort(b); } else { int la = findLengthByte(a); int lb = findLengthByte(b); } return _mm_cmpestri!imm8(a, la, b, lb); } } unittest { // Identify the last character that isn't an identifier character. // v (at index 7) char[16] A = "my_i(en)ifie"; char[16] identRanges = "__azAz09"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr); byte16 mask = cast(byte16)_mm_cmpistrm!(_SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY | _SIDD_UNIT_MASK)(mmI, mmA); byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0]; assert(mask.array == correctM); int index = _mm_cmpistri!(_SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY | _SIDD_MOST_SIGNIFICANT)(mmI, mmA); assert(index == 7); // ')' is the last char not to be in [__azAz09] } /// Compare packed strings with implicit lengths in `a` and `b` using the control in /// `imm8`, and return the generated mask. __m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted { static if (GDC_with_SSE42) { return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8); } else static if (LDC_with_SSE42) { return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8); } else { static if (imm8 & 1) { int la = findLengthShort(a); int lb = findLengthShort(b); } else { int la = findLengthByte(a); int lb = findLengthByte(b); } return _mm_cmpestrm!imm8(a, la, b, lb); } } unittest { char[16] A = "Hello world!"; char[16] B = "aeiou!"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); // Find which letters from B where found in A. byte16 R = cast(byte16)_mm_cmpistrm!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK)(mmA, mmB); // because 'e', 'o', and '!' were found byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; assert(R.array == correctR); byte16 M = cast(byte16) _mm_cmpistrm!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK)(mmA, mmB); byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; assert(M.array == correctM); } /// Compare packed strings in `a` and `b` with lengths `la` and `lb` using /// the control in `imm8`, and returns bit 0 of the resulting bit mask. int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted { static if (GDC_with_SSE42) { return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8); } else { static if (imm8 & 1) { int la = findLengthShort(a); int lb = findLengthShort(b); } else { int la = findLengthByte(a); int lb = findLengthByte(b); } return _mm_cmpestro!imm8(a, la, b, lb); } } unittest { char[16] A = "Hallo world!"; char[16] B = "aeiou!"; char[16] C = "Z"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); // Find which letters from B where found in A. int res = _mm_cmpistro!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK)(mmA, mmB); // because 'a' was found in "Hallo world!" assert(res == 1); res = _mm_cmpistro!(_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK)(mmA, mmC); assert(res == 0); // because 'Z' wasn't found in A } /// Returns 1 if any character in `a` was null, and 0 otherwise. int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted { static if (GDC_with_SSE42) { return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8); } else { static if (imm8 & 1) { int la = findLengthShort(a); return la != 8; } else { int la = findLengthByte(a); return la != 16; } } } unittest { char[16] A = ""; char[16] B = "hello"; char[16] C = "Maximum length!!"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); assert(_mm_cmpistrs!_SIDD_UBYTE_OPS(mmA, mmA) == 1); assert(_mm_cmpistrs!_SIDD_SBYTE_OPS(mmB, mmB) == 1); assert(_mm_cmpistrs!_SIDD_UWORD_OPS(mmC, mmC) == 0); } /// Returns 1 if any character in `b` was null, and 0 otherwise. int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted { static if (GDC_with_SSE42) { return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8); } else static if (LDC_with_SSE42) { return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8); } else { static if (imm8 & 1) { int lb = findLengthShort(b); return lb != 8; } else { int lb = findLengthByte(b); return lb != 16; } } } unittest { char[16] A = ""; char[16] B = "hello"; char[16] C = "Maximum length!!"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr); assert(_mm_cmpistrz!_SIDD_UBYTE_OPS(mmC, mmA) == 1); assert(_mm_cmpistrz!_SIDD_SBYTE_OPS(mmC, mmB) == 1); assert(_mm_cmpistrz!_SIDD_UWORD_OPS(mmA, mmC) == 0); } /// Starting with the initial value in `crc`, accumulates a CR32 value /// for unsigned 16-bit integer `v`. /// Warning: this is computing CRC-32C (Castagnoli), not CRC-32. uint _mm_crc32_u16 (uint crc, ushort v) @safe { static if (GDC_with_SSE42) { return __builtin_ia32_crc32hi(crc, v); } else static if (LDC_with_CRC32) { return __builtin_ia32_crc32hi(crc, v); } else static if (LDC_with_ARM64_CRC) { return __crc32ch(crc, v); } else { crc = _mm_crc32_u8(crc, v & 0xff); crc = _mm_crc32_u8(crc, v >> 8); return crc; } } unittest { uint A = _mm_crc32_u16(0x12345678, 0x4512); uint B = _mm_crc32_u16(0x76543210, 0xf50f); uint C = _mm_crc32_u16(0xDEADBEEF, 0x0017); assert(A == 0x39c3f0ff); assert(B == 0xcffbcf07); assert(C == 0xc7e3fe85); } /// Starting with the initial value in `crc`, accumulates a CRC32 value /// for unsigned 32-bit integer `v`. /// Warning: this is computing CRC-32C (Castagnoli), not CRC-32. uint _mm_crc32_u32 (uint crc, uint v) @safe { static if (GDC_with_SSE42) { return __builtin_ia32_crc32si(crc, v); } else static if (LDC_with_CRC32) { return __builtin_ia32_crc32si(crc, v); } else static if (LDC_with_ARM64_CRC) { return __crc32cw(crc, v); } else { crc = _mm_crc32_u8(crc, v & 0xff); crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); crc = _mm_crc32_u8(crc, (v >> 16) & 0xff); crc = _mm_crc32_u8(crc, (v >> 24) & 0xff); return crc; } } unittest { uint A = _mm_crc32_u32(0x12345678, 0x45123563); uint B = _mm_crc32_u32(0x76543210, 0xf50f9993); uint C = _mm_crc32_u32(0xDEADBEEF, 0x00170017); assert(A == 0x22a6ec54); assert(B == 0x7019a6cf); assert(C == 0xbc552c27); } /// Starting with the initial value in `crc`, accumulates a CRC32 /// value for unsigned 64-bit integer `v`. /// Warning: this is computing CRC-32C (Castagnoli), not CRC-32. ulong _mm_crc32_u64 (ulong crc, ulong v) { version(X86_64) enum bool hasX86Intrin = GDC_with_SSE42 || LDC_with_CRC32; else enum bool hasX86Intrin = false; // intrinsics not available in 32-bit static if (hasX86Intrin) { return __builtin_ia32_crc32di(crc, v); } else static if (LDC_with_ARM64_CRC) { return __crc32cd(cast(uint)crc, v); } else { uint crc32 = cast(uint)crc; crc32 = _mm_crc32_u8(crc32, (v >> 0) & 0xff); crc32 = _mm_crc32_u8(crc32, (v >> 8) & 0xff); crc32 = _mm_crc32_u8(crc32, (v >> 16) & 0xff); crc32 = _mm_crc32_u8(crc32, (v >> 24) & 0xff); crc32 = _mm_crc32_u8(crc32, (v >> 32) & 0xff); crc32 = _mm_crc32_u8(crc32, (v >> 40) & 0xff); crc32 = _mm_crc32_u8(crc32, (v >> 48) & 0xff); crc32 = _mm_crc32_u8(crc32, (v >> 56) & 0xff); return crc32; } } unittest { ulong A = _mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07); ulong B = _mm_crc32_u64(0x7654321001234567, 0xFACEFEED); ulong C = _mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017); assert(A == 0xd66b1074); assert(B == 0xac12f9c6); assert(C == 0xa2d13dd8); } /// Starting with the initial value in `crc`, accumulates a CRC32 value /// for unsigned 8-bit integer `v`. /// Warning: this is computing CRC-32C (Castagnoli), not CRC-32. uint _mm_crc32_u8 (uint crc, ubyte v) @safe { static if (GDC_with_SSE42) { return __builtin_ia32_crc32qi(crc, v); } else static if (LDC_with_CRC32) { return __builtin_ia32_crc32qi(crc, v); } else static if (LDC_with_ARM64_CRC) { return __crc32cb(crc, v); } else { return CRC32cTable[(crc ^ v) & 0xFF] ^ (crc >> 8); } } unittest { uint A = _mm_crc32_u8(0x12345678, 0x45); uint B = _mm_crc32_u8(0x76543210, 0xf5); uint C = _mm_crc32_u8(0xDEADBEEF, 0x00); assert(A == 0x8fd93134); assert(B == 0xd6b7e834); assert(C == 0xbdfd3980); } // Utilities for this file private: static if (GDC_with_SSE42) { version(X86_64) enum bool NeedCRC32CTable = false; else enum bool NeedCRC32CTable = true; } else static if (LDC_with_CRC32) { version(X86_64) enum bool NeedCRC32CTable = false; else enum bool NeedCRC32CTable = true; } else static if (LDC_with_ARM64_CRC) { enum bool NeedCRC32CTable = false; } else { enum bool NeedCRC32CTable = true; } static if (NeedCRC32CTable) { static immutable uint[256] CRC32cTable = [ 0x0, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, 0xf165b798, 0x30e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x61c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x5125dad, 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0xc38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0xf36e6f7, 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0xa24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, 0xfb410cc2, 0x92a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x82f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0xb21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0xe330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0xd3d3e1a, 0x1e6dcdee, 0xec064eed, 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x7198540, 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x20bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, 0xf36e6f75, 0x105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, ]; } int findLengthByte(__m128i a) pure @safe { const __m128i zero = _mm_setzero_si128(); const __m128i zeroMask = _mm_cmpeq_epi8(a, zero); // 0xff where a byte is zero int mask = _mm_movemask_epi8(zeroMask); // the lowest set bit is the zero index if (mask == 0) return 16; else return bsf(mask); } unittest { char[16] A = "Hel!o"; char[16] B = "Maximum length!!"; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); assert(findLengthByte(mmA) == 5); assert(findLengthByte(mmB) == 16); } int findLengthShort(__m128i a) pure @safe { const __m128i zero = _mm_setzero_si128(); const __m128i zeroMask = _mm_cmpeq_epi16(a, zero); // 0xffff where a short is zero int mask = _mm_movemask_epi8(zeroMask); // the lowest set bit is the zero index if (mask == 0) return 8; else return bsf(mask) >> 1; } unittest { short[8] A = [10, 5423, 475, 0, 1, 1, 1, 1 ]; short[8] B = [-1, -2, -3, 4, 5, 6, -32768, 1]; __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr); __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr); assert(findLengthShort(mmA) == 3); assert(findLengthShort(mmB) == 8); } static immutable byte[32] MASK_DATA = [ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; // Makes a byte validity mask with a given explicit length string. __m128i validMask8e(int len) @trusted { return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16-len]); } unittest { char[16] A = ""; char[16] B = "0123456789abcdef"; byte[16] correctA = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; byte[16] correctB = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]; byte16 MA = cast(byte16) validMask8e(0); byte16 MB = cast(byte16) validMask8e(16); assert(MA.array == correctA); assert(MB.array == correctB); } // Makes a short validity mask with a given explicit length string. __m128i validMask16e(int len) @trusted { return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16-len*2]); } unittest { short[8] A = [3, 4, 5, 0, 3, 4, 5, 6]; short[8] correctA = [-1, -1, -1, 0, 0, 0, 0, 0]; short8 MA = cast(short8) validMask16e(3); assert(MA.array == correctA); } // Internal implementation for non-SSE4.2 // Compare 8-bit or 16-bit strings, get a mask. // `aValid` and `bValid` are byte-mask or word-mask of the valid // zone in `a` and `b`. __m128i cmpstrMaskExplicit(int imm8)(__m128i a, ref int la, __m128i b, ref int lb) @safe { // saturates lengths (the Intrinsics Guide doesn't tell this) if (la < 0) la = -la; if (lb < 0) lb = -lb; if (la > 16) la = 16; if (lb > 16) lb = 16; static if (imm8 & 1) { __m128i aValid = validMask16e(la); __m128i bValid = validMask16e(lb); } else { __m128i aValid = validMask8e(la); __m128i bValid = validMask8e(lb); } return cmpstrMask!imm8(a, aValid, b, bValid); } //ditto __m128i cmpstrMask(int imm8)(__m128i a, __m128i aValid, __m128i b, const __m128i bValid) @safe { enum bool chars16Bits = imm8 & 1; enum int Mode = (imm8 >> 2) & 3; static if (Mode == 0) // equal any { __m128i R = _mm_setzero_si128(); static if (chars16Bits) // 64 comparisons { for (int k = 0; k < 8; ++k) { __m128i eqMask = _mm_cmpeq_epi16(a, b); eqMask = _mm_and_si128(eqMask, aValid); R = _mm_or_si128(R, eqMask); // rotate a and aValid a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a)); aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid)); } } else { for (int k = 0; k < 16; ++k) { __m128i eqMask = _mm_cmpeq_epi8(a, b); eqMask = _mm_and_si128(eqMask, aValid); R = _mm_or_si128(R, eqMask); // rotate a and aValid a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a)); aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid)); } } R = _mm_and_si128(R, bValid); } else static if (Mode == 1) // ranges { enum bool signed = (imm8 & 2) != 0; // For each character in b, the returned mask says if it was found in a range-pair in `a`. __m128i R = _mm_setzero_si128(); static if (chars16Bits) { for (int pos = 0; pos < 8; pos += 2) { short min = (cast(short8)a).array[pos]; short max = (cast(short8)a).array[pos+1]; static if (signed) { __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min)); __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max)); } else { // No SSE way to do 16-bit unsigned comparisons, // but flipping the sign bit let us used signed comp __m128i firstBits = _mm_set1_epi16(-32768); __m128i reverseB = _mm_xor_si128(b, firstBits); __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits); __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits); __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin); __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax); } __m128i inRange = _mm_and_si128(le, ge); // Not considered in range a is invalid here. short aValidHere = (cast(short8)aValid).array[pos+1]; __m128i mmAValidHere = _mm_set1_epi16(aValidHere); inRange = _mm_and_si128(inRange, mmAValidHere); R = _mm_or_si128(R, inRange); } } else // 8-bits { for (int pos = 0; pos < 16; pos += 2) { byte min = (cast(byte16)a).array[pos]; byte max = (cast(byte16)a).array[pos+1]; static if (signed) { __m128i ge = _mm_xor_si128(_mm_cmplt_epi8(b, _mm_set1_epi8(min))); __m128i le = _mm_xor_si128(_mm_cmpgt_epi8(b, _mm_set1_epi8(max))); } else { // No SSE way to do 16-bit unsigned comparisons, // but flipping the sign bit let us used signed comp __m128i firstBits = _mm_set1_epi8(-128); __m128i reverseB = _mm_xor_si128(b, firstBits); __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits); __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits); __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin); __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax); } __m128i inRange = _mm_and_si128(le, ge); // Not considered in range a is invalid here. byte aValidHere = (cast(byte16)aValid).array[pos+1]; __m128i mmAValidHere = _mm_set1_epi8(aValidHere); inRange = _mm_and_si128(inRange, mmAValidHere); R = _mm_or_si128(R, inRange); } } // invalid b part is not in range R = _mm_and_si128(R, bValid); } else static if (Mode == 2) // equal each, just 16 comparisons not 256 { static if (chars16Bits) { __m128i R = _mm_cmpeq_epi16(a, b); } else { __m128i R = _mm_cmpeq_epi8(a, b); } // if only a or b is invalid, consider not equal R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R); // if a and b are both invalid, consider equal R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid)); } else static if (Mode == 3) // equal ordered { // a is searched in b. __m128i bValidShift = bValid; __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a static if (chars16Bits) { for (int pos = 0; pos < 8; ++pos) { // compare character k of a, where can it go in b? short charK = (cast(short8)a).array[pos]; __m128i mmcharK = _mm_set1_epi16(charK); short aValidHere = (cast(short8)aValid).array[pos]; __m128i mmAValidHere = _mm_set1_epi16(aValidHere); __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1)); __m128i eqMask = _mm_cmpeq_epi16(mmcharK, b); // Where A is invalid, the comparison always holds "equal" eqMask = _mm_or_si128(eqMask, mmAInvalidHere); // Where B is invalid, and A is valid, the comparison is forced to false eqMask = _mm_and_si128(eqMask, _mm_or_si128(bValidShift, mmAInvalidHere)); R = _mm_and_si128(eqMask); // drop first char of b b = _mm_srli_si128!2(b); bValidShift = _mm_srli_si128!2(bValidShift); } } else { for (int pos = 0; pos < 16; ++pos) { // compare character k of a, where can it go in b? byte charK = (cast(byte16)a).array[pos]; __m128i mmcharK = _mm_set1_epi8(charK); byte aValidHere = (cast(byte16)aValid).array[pos]; __m128i mmAValidHere = _mm_set1_epi8(aValidHere); __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1)); __m128i eqMask = _mm_cmpeq_epi8(mmcharK, b); // Where A is invalid, the comparison always holds "equal" eqMask = _mm_or_si128(eqMask, mmAInvalidHere); // Where B is invalid, and A is valid, the comparison is forced to false eqMask = _mm_and_si128(eqMask, _mm_or_si128(bValidShift, mmAInvalidHere)); R = _mm_and_si128(R, eqMask); // drop first char of b b = _mm_srli_si128!1(b); bValidShift = _mm_srli_si128!1(bValidShift); } } } else static assert(0); // Optionally negate result static if (imm8 & _SIDD_NEGATIVE_POLARITY) { static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY) { R = _mm_xor_si128(R, bValid); // only negate valid b } else { R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all } } return R; }