| author | John Naylor | 2022-08-03 16:49:04 +0000 |
|---|---|---|
| committer | John Naylor | 2022-08-10 03:48:29 +0000 |
| commit | b6ef167564905e8dab8af1b1dba0997ec61204ca (patch) | |
| tree | 9a2d2cc792c48588214a9e9dc8f7adfd68905667 | /src/include |
| parent | 356dd2ce5b7b0f3ea11e016f73b1f173604b8452 (diff) | |
Introduce optimized routine for linear searches of arrays
Use SSE2 intrinsics to speed up the search, where available. Otherwise,
use a simple 'for' loop. The motivation to add this now is to speed up
XidInMVCCSnapshot(), which is the reason only unsigned 32-bit integer
arrays are optimized. Other types are left for future work, as is the
extension of this technique to non-x86 platforms.
Nathan Bossart
Reviewed by: Andres Freund, Bharath Rupireddy, Masahiko Sawada
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
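
As a caller-side illustration (not part of this commit; the function and parameter names below are hypothetical), the new helper replaces an open-coded loop over an array of 32-bit xids with a single call:

```c
#include "port/pg_lfind.h"

/*
 * Hypothetical membership test, for illustration only: roughly how a
 * caller such as XidInMVCCSnapshot() could use the new helper.  The
 * parameter names are placeholders, not actual PostgreSQL fields.
 */
static bool
xid_in_progress(uint32 xid, uint32 *xip, uint32 xcnt)
{
	/* previously an open-coded 'for' loop over xip[0..xcnt-1] */
	return pg_lfind32(xid, xip, xcnt);
}
```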
Diffstat (limited to 'src/include')
| -rw-r--r-- | src/include/port/pg_lfind.h | 103 |
1 files changed, 103 insertions, 0 deletions
```diff
diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
new file mode 100644
index 00000000000..fb125977b2e
--- /dev/null
+++ b/src/include/port/pg_lfind.h
@@ -0,0 +1,103 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_lfind.h
+ *	  Optimized linear search routines.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/include/port/pg_lfind.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_LFIND_H
+#define PG_LFIND_H
+
+#include "port/simd.h"
+
+/*
+ * pg_lfind32
+ *
+ * Return true if there is an element in 'base' that equals 'key', otherwise
+ * return false.
+ */
+static inline bool
+pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
+{
+	uint32		i = 0;
+
+	/* Use SIMD intrinsics where available. */
+#ifdef USE_SSE2
+
+	/*
+	 * A 16-byte register only has four 4-byte lanes. For better
+	 * instruction-level parallelism, each loop iteration operates on a block
+	 * of four registers. Testing has showed this is ~40% faster than using a
+	 * block of two registers.
+	 */
+	const __m128i keys = _mm_set1_epi32(key);	/* load 4 copies of key */
+	uint32		iterations = nelem & ~0xF;	/* round down to multiple of 16 */
+
+#if defined(USE_ASSERT_CHECKING)
+	bool		assert_result = false;
+
+	/* pre-compute the result for assert checking */
+	for (i = 0; i < nelem; i++)
+	{
+		if (key == base[i])
+		{
+			assert_result = true;
+			break;
+		}
+	}
+#endif
+
+	for (i = 0; i < iterations; i += 16)
+	{
+		/* load the next block into 4 registers holding 4 values each */
+		const __m128i vals1 = _mm_loadu_si128((__m128i *) & base[i]);
+		const __m128i vals2 = _mm_loadu_si128((__m128i *) & base[i + 4]);
+		const __m128i vals3 = _mm_loadu_si128((__m128i *) & base[i + 8]);
+		const __m128i vals4 = _mm_loadu_si128((__m128i *) & base[i + 12]);
+
+		/* compare each value to the key */
+		const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
+		const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
+		const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
+		const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+
+		/* combine the results into a single variable */
+		const __m128i tmp1 = _mm_or_si128(result1, result2);
+		const __m128i tmp2 = _mm_or_si128(result3, result4);
+		const __m128i result = _mm_or_si128(tmp1, tmp2);
+
+		/* see if there was a match */
+		if (_mm_movemask_epi8(result) != 0)
+		{
+#if defined(USE_ASSERT_CHECKING)
+			Assert(assert_result == true);
+#endif
+			return true;
+		}
+	}
+#endif							/* USE_SSE2 */
+
+	/* Process the remaining elements one at a time. */
+	for (; i < nelem; i++)
+	{
+		if (key == base[i])
+		{
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+			Assert(assert_result == true);
+#endif
+			return true;
+		}
+	}
+
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+	Assert(assert_result == false);
+#endif
+	return false;
+}
+
+#endif							/* PG_LFIND_H */
```
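
For readers outside the PostgreSQL tree, here is a rough, self-contained sketch of the same SSE2 technique using only standard headers (it assumes an x86 target built with SSE2 enabled, e.g. `-msse2`). It checks one 16-byte register per iteration instead of the four-register blocks used above, so it trades away the instruction-level parallelism the commit relies on in exchange for brevity:

```c
#include <emmintrin.h>			/* SSE2 intrinsics */
#include <stdbool.h>
#include <stdint.h>

/* Return true if 'key' occurs in the first 'nelem' entries of 'base'. */
static bool
lfind32_sse2(uint32_t key, const uint32_t *base, uint32_t nelem)
{
	const __m128i keys = _mm_set1_epi32((int32_t) key); /* 4 copies of key */
	uint32_t	i = 0;
	uint32_t	aligned = nelem & ~0x3; /* round down to a multiple of 4 */

	for (; i < aligned; i += 4)
	{
		__m128i		vals = _mm_loadu_si128((const __m128i *) &base[i]);
		__m128i		cmp = _mm_cmpeq_epi32(keys, vals);

		/* any equal lane sets some byte of the movemask */
		if (_mm_movemask_epi8(cmp) != 0)
			return true;
	}

	/* scalar tail for the remaining 0-3 elements */
	for (; i < nelem; i++)
	{
		if (base[i] == key)
			return true;
	}
	return false;
}
```

The scalar tail loop handles the elements left over after rounding down, mirroring the fallback path that pg_lfind32() uses both for non-SSE2 builds and for the remainder after its 16-element blocks.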
