|
20 | 20 | #include "pycore_hashtable.h"
|
21 | 21 | #include "pycore_strhex.h" // _Py_strhex()
|
22 | 22 |
|
| 23 | +/* |
| 24 | + * Taken from blake2module.c. In the future, detection of SIMD support |
| 25 | + * should be delegated to https://github.com/python/cpython/pull/125011. |
| 26 | + */ |
| 27 | +#if defined(__x86_64__) && defined(__GNUC__) |
| 28 | +# include <cpuid.h> |
| 29 | +#elif defined(_M_X64) |
| 30 | +# include <intrin.h> |
| 31 | +#endif |
| 32 | + |
| 33 | +#if defined(__APPLE__) && defined(__arm64__) |
| 34 | +# undef HACL_CAN_COMPILE_SIMD128 |
| 35 | +# undef HACL_CAN_COMPILE_SIMD256 |
| 36 | +#endif |
| 37 | + |
23 | 38 | // Small mismatch between the variable names Python defines as part of configure
|
24 | 39 | // and the ones HACL* expects to be set in order to enable those headers.
|
25 | 40 | #define HACL_CAN_COMPILE_VEC128 HACL_CAN_COMPILE_SIMD128
|
@@ -1667,17 +1682,73 @@ hmacmodule_init_strings(hmacmodule_state *state)
|
1667 | 1682 | static void
|
1668 | 1683 | hmacmodule_init_cpu_features(hmacmodule_state *state)
|
1669 | 1684 | {
|
| 1685 | + int eax1 = 0, ebx1 = 0, ecx1 = 0, edx1 = 0; |
| 1686 | + int eax7 = 0, ebx7 = 0, ecx7 = 0, edx7 = 0; |
| 1687 | +#if defined(__x86_64__) && defined(__GNUC__) |
| 1688 | + __cpuid_count(1, 0, eax1, ebx1, ecx1, edx1); |
| 1689 | + __cpuid_count(7, 0, eax7, ebx7, ecx7, edx7); |
| 1690 | +#elif defined(_M_X64) |
| 1691 | + int info1[4] = { 0 }; |
| 1692 | + __cpuidex(info1, 1, 0); |
| 1693 | + eax1 = info1[0], ebx1 = info1[1], ecx1 = info1[2], edx1 = info1[3]; |
| 1694 | + |
| 1695 | + int info7[4] = { 0 }; |
| 1696 | + __cpuidex(info7, 7, 0); |
| 1697 | + eax7 = info7[0], ebx7 = info7[1], ecx7 = info7[2], edx7 = info7[3]; |
| 1698 | +#endif |
| 1699 | + // fmt: off |
| 1700 | + (void)eax1; (void)ebx1; (void)ecx1; (void)edx1; |
| 1701 | + (void)eax7; (void)ebx7; (void)ecx7; (void)edx7; |
| 1702 | + // fmt: on |
| 1703 | + |
| 1704 | +#define EBX_AVX2 (1 << 5) |
| 1705 | +#define ECX_SSE3 (1 << 0) |
| 1706 | +#define ECX_SSSE3 (1 << 9) |
| 1707 | +#define ECX_SSE4_1 (1 << 19) |
| 1708 | +#define ECX_SSE4_2 (1 << 20) |
| 1709 | +#define ECX_AVX (1 << 28) |
| 1710 | +#define EDX_SSE (1 << 25) |
| 1711 | +#define EDX_SSE2 (1 << 26) |
| 1712 | +#define EDX_CMOV (1 << 15) |
| 1713 | + |
| 1714 | + bool avx = (ecx1 & ECX_AVX) != 0; |
| 1715 | + bool avx2 = (ebx7 & EBX_AVX2) != 0; |
| 1716 | + |
| 1717 | + bool sse = (edx1 & EDX_SSE) != 0; |
| 1718 | + bool sse2 = (edx1 & EDX_SSE2) != 0; |
| 1719 | + bool cmov = (edx1 & EDX_CMOV) != 0; |
| 1720 | + |
| 1721 | + bool sse3 = (ecx1 & ECX_SSE3) != 0; |
| 1722 | + bool sse41 = (ecx1 & ECX_SSE4_1) != 0; |
| 1723 | + bool sse42 = (ecx1 & ECX_SSE4_2) != 0; |
| 1724 | + |
| 1725 | +#undef EDX_CMOV |
| 1726 | +#undef EDX_SSE2 |
| 1727 | +#undef EDX_SSE |
| 1728 | +#undef ECX_AVX |
| 1729 | +#undef ECX_SSE4_2 |
| 1730 | +#undef ECX_SSE4_1 |
| 1731 | +#undef ECX_SSSE3 |
| 1732 | +#undef ECX_SSE3 |
| 1733 | +#undef EBX_AVX2 |
| 1734 | + |
1670 | 1735 | #if HACL_CAN_COMPILE_SIMD128
|
1671 |
| - // TODO: use py_cpuid_features (gh-125022) to deduce what we want |
1672 |
| - state->can_run_simd128 = false; |
| 1736 | + // TODO(picnixz): use py_cpuid_features (gh-125022) to improve detection |
| 1737 | + state->can_run_simd128 = sse && sse2 && sse3 && sse41 && sse42 && cmov; |
1673 | 1738 | #else
|
| 1739 | + // fmt: off |
| 1740 | + (void)sse; (void)sse2; (void)sse3; (void)sse41; (void)sse42; (void)cmov; |
| 1741 | + // fmt: on |
1674 | 1742 | state->can_run_simd128 = false;
|
1675 | 1743 | #endif
|
1676 | 1744 |
|
1677 | 1745 | #if HACL_CAN_COMPILE_SIMD256
|
1678 |
| - // TODO: use py_cpuid_features (gh-125022) to deduce what we want |
1679 |
| - state->can_run_simd256 = false; |
| 1746 | + // TODO(picnixz): use py_cpuid_features (gh-125022) to improve detection |
| 1747 | + state->can_run_simd256 = state->can_run_simd128 && avx && avx2; |
1680 | 1748 | #else
|
| 1749 | + // fmt: off |
| 1750 | + (void)avx; (void)avx2; |
| 1751 | + // fmt: on |
1681 | 1752 | state->can_run_simd256 = false;
|
1682 | 1753 | #endif
|
1683 | 1754 | }
|
|
0 commit comments