From a71922b6402a198c37eb93eef9a472e3e3db9eb8 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 18 Feb 2026 11:07:02 +0100 Subject: [PATCH] Small complexity reduction --- include/xsimd/arch/xsimd_avx2.hpp | 90 ++++++++++--------------------- 1 file changed, 27 insertions(+), 63 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index bf6d9e7de..dd548ab8c 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -1225,11 +1225,9 @@ namespace xsimd __m256i r0 = _mm256_shuffle_epi8(self, half_mask); __m256i r1 = _mm256_shuffle_epi8(swapped, half_mask); - // select lane by the mask index divided by 16 - constexpr auto lane = batch_constant< - uint8_t, A, - 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16> {}; + // select lane by the mask index divided by 16, first lane is 0, second is 16. + constexpr auto lane_size = make_batch_constant(); + constexpr auto lane = (make_iota_batch_constant() / lane_size) * lane_size; batch_bool blend_mask = (mask & 0b10000u) != lane; return _mm256_blendv_epi8(r0, r1, blend_mask); } @@ -1259,66 +1257,32 @@ namespace xsimd namespace detail { - template - constexpr T swizzle_val_none() - { - // Most significant bit of the byte must be 1 - return 0x80; - } - - template - constexpr bool swizzle_val_is_cross_lane(T val, T idx, T size) - { - return (idx < (size / 2)) != (val < (size / 2)); - } - - template - constexpr bool swizzle_val_is_defined(T val, T size) + template + struct swizzle_mask { - return (0 <= val) && (val < size); - } - - template - constexpr T swizzle_self_val(T val, T idx, T size) - { - return (swizzle_val_is_defined(val, size) && !swizzle_val_is_cross_lane(val, idx, size)) - ? val % (size / 2) - : swizzle_val_none(); - } + static constexpr auto values = std::array { Vals... }; - template - constexpr batch_constant(sizeof...(Vals)))...> - swizzle_make_self_batch_impl(std::index_sequence) - { - return {}; - } + static constexpr T get(std::size_t idx_, std::size_t size_) noexcept + { + const T size = static_cast(size_); + const T idx = static_cast(idx_); + const T val = values[idx_]; - template - constexpr auto swizzle_make_self_batch() - { - return swizzle_make_self_batch_impl(std::make_index_sequence()); - } + // Check if value in bounds + if ((T(0) <= val) && (val < size)) + { + // Whether we need to access the value from the other lane + const bool val_is_cross_lane = (idx < (size / 2)) != (val < (size / 2)); + if (val_is_cross_lane == cross_batch) + { + return val % (size / 2); + } + } - template - constexpr T swizzle_cross_val(T val, T idx, T size) - { - return (swizzle_val_is_defined(val, size) && swizzle_val_is_cross_lane(val, idx, size)) - ? val % (size / 2) - : swizzle_val_none(); - } - - template - constexpr batch_constant(sizeof...(Vals)))...> - swizzle_make_cross_batch_impl(std::index_sequence) - { - return {}; - } - - template - constexpr auto swizzle_make_cross_batch() - { - return swizzle_make_cross_batch_impl(std::make_index_sequence()); - } + // Out of bounds with most significant bit set to 1 will set the swizzle target to 0 + return ~T {}; + } + }; } // swizzle (constant mask) @@ -1354,8 +1318,8 @@ namespace xsimd // We can outsmart the dynamic version by creating a compile-time mask that leaves zeros // where it does not need to select data, resulting in a simple OR merge of the two batches. - constexpr auto self_mask = detail::swizzle_make_self_batch(); - constexpr auto cross_mask = detail::swizzle_make_cross_batch(); + constexpr auto self_mask = make_batch_constant, A>(); + constexpr auto cross_mask = make_batch_constant, A>(); // permute bytes within each lane (AVX2 only) __m256i r0 = _mm256_shuffle_epi8(self, self_mask.as_batch());