diff options
| author | Nathan Bossart | 2024-04-07 02:56:23 +0000 |
|---|---|---|
| committer | Nathan Bossart | 2024-04-07 02:56:23 +0000 |
| commit | 792752af4eb5cf7b5b8b0470dbf22901c5178fe5 (patch) | |
| tree | 2090baf57be8c2bf773386571587aa6511d8cb27 /meson.build | |
| parent | 158f5819236806b7c9cab323658c231e9371c458 (diff) | |
Optimize pg_popcount() with AVX-512 instructions.
Presently, pg_popcount() processes data in 32-bit or 64-bit chunks
when possible. Newer hardware that supports AVX-512 instructions
can use 512-bit chunks, which provides a nice speedup, especially
for larger buffers. This commit introduces the infrastructure
required to detect compiler and CPU support for the required
AVX-512 intrinsic functions, and it adds a new pg_popcount()
implementation that uses these functions. If CPU support for this
optimized implementation is detected at runtime, a function pointer
is updated so that it is used by subsequent calls to pg_popcount().
Most of the existing in-tree calls to pg_popcount() should benefit
from these instructions, and calls with smaller buffers should at
least not regress compared to v16. The new infrastructure
introduced by this commit can also be used to optimize
visibilitymap_count(), but that is left for a follow-up commit.
Co-authored-by: Paul Amonson, Ants Aasma
Reviewed-by: Matthias van de Meent, Tom Lane, Noah Misch, Akash Shankaran, Alvaro Herrera, Andres Freund, David Rowley
Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
Diffstat (limited to 'meson.build')
| -rw-r--r-- | meson.build | 87 |
1 files changed, 87 insertions, 0 deletions
diff --git a/meson.build b/meson.build index 87437960bc3..5acf083ce3c 100644 --- a/meson.build +++ b/meson.build @@ -1783,6 +1783,30 @@ elif cc.links(''' endif +# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +if cc.links(''' + #include <cpuid.h> + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + } + ''', name: '__get_cpuid_count', + args: test_c_args) + cdata.set('HAVE__GET_CPUID_COUNT', 1) +elif cc.links(''' + #include <intrin.h> + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + } + ''', name: '__cpuidex', + args: test_c_args) + cdata.set('HAVE__CPUIDEX', 1) +endif + + # Defend against clang being used on x86-32 without SSE2 enabled. As current # versions of clang do not understand -fexcess-precision=standard, the use of # x87 floating point operations leads to problems like isinf possibly returning @@ -1996,6 +2020,69 @@ int main(void) endif +############################################################### +# Check for the availability of XSAVE intrinsics. +############################################################### + +cflags_xsave = [] +if host_cpu == 'x86' or host_cpu == 'x86_64' + + prog = ''' +#include <immintrin.h> + +int main(void) +{ + return _xgetbv(0) & 0xe0; +} +''' + + if cc.links(prog, name: 'XSAVE intrinsics without -mxsave', + args: test_c_args) + cdata.set('HAVE_XSAVE_INTRINSICS', 1) + elif cc.links(prog, name: 'XSAVE intrinsics with -mxsave', + args: test_c_args + ['-mxsave']) + cdata.set('HAVE_XSAVE_INTRINSICS', 1) + cflags_xsave += '-mxsave' + endif + +endif + + +############################################################### +# Check for the availability of AVX-512 popcount intrinsics. +############################################################### + +cflags_popcnt = [] +if host_cpu == 'x86_64' + + prog = ''' +#include <immintrin.h> + +int main(void) +{ + const char buf[sizeof(__m512i)]; + INT64 popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; +} +''' + + if cc.links(prog, name: 'AVX-512 popcount without -mavx512vpopcntdq -mavx512bw', + args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))]) + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + elif cc.links(prog, name: 'AVX-512 popcount with -mavx512vpopcntdq -mavx512bw', + args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq'] + ['-mavx512bw']) + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + cflags_popcnt += ['-mavx512vpopcntdq'] + ['-mavx512bw'] + endif + +endif + ############################################################### # Select CRC-32C implementation. |
