From a743ba65316f2fe7099a3155f8e979a60c8832c2 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Mon, 7 Oct 2019 17:19:32 +0200 Subject: [PATCH] [dxvk] Use memcmp replacement for pipeline state lookup Measured to be over twice as fast as memcmp on Ryzen for the 512-byte graphics pipeline state struct, achieving two cycles per iteration. --- src/dxvk/dxvk_graphics_state.h | 8 +++--- src/util/util_bit.h | 52 ++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/src/dxvk/dxvk_graphics_state.h b/src/dxvk/dxvk_graphics_state.h index bd050a5e..938a8384 100644 --- a/src/dxvk/dxvk_graphics_state.h +++ b/src/dxvk/dxvk_graphics_state.h @@ -641,11 +641,11 @@ namespace dxvk { } bool operator == (const DxvkGraphicsPipelineStateInfo& other) const { - return !std::memcmp(this, &other, sizeof(*this)); + return bit::bcmpeq(this, &other); } bool operator != (const DxvkGraphicsPipelineStateInfo& other) const { - return std::memcmp(this, &other, sizeof(*this)); + return !bit::bcmpeq(this, &other); } bool useDynamicStencilRef() const { @@ -709,11 +709,11 @@ namespace dxvk { } bool operator == (const DxvkComputePipelineStateInfo& other) const { - return !std::memcmp(this, &other, sizeof(*this)); + return bit::bcmpeq(this, &other); } bool operator != (const DxvkComputePipelineStateInfo& other) const { - return std::memcmp(this, &other, sizeof(*this)); + return !bit::bcmpeq(this, &other); } DxvkBindingMask bsBindingMask; diff --git a/src/util/util_bit.h b/src/util/util_bit.h index 1bad55c3..b8b2cee8 100644 --- a/src/util/util_bit.h +++ b/src/util/util_bit.h @@ -79,5 +79,57 @@ namespace dxvk::bit { shift += count; return shift > Bits ? 
/**
 * \brief Compares two aligned structs bit by bit
 *
 * Tests all \c sizeof(T) bytes of both objects for equality, including
 * any padding bytes, and returns as soon as a differing chunk is found.
 * NOTE(review): callers presumably keep padding in a defined state
 * (e.g. zero-initialized) so that equal values compare equal - confirm.
 *
 * On x86 targets with SSE2 the comparison is done with aligned 16-byte
 * vector loads; on every other target it falls back to \c std::memcmp.
 *
 * \tparam T Struct type, must be aligned to at least 16 bytes
 * \param [in] a First struct
 * \param [in] b Second struct
 * \returns \c true if the structs are equal
 */
template<typename T>
bool bcmpeq(const T* a, const T* b) {
  // Aligned vector loads are used below. Since sizeof(T) is always a
  // multiple of alignof(T), this also guarantees that the object size
  // is a multiple of 16 bytes, so no partial chunk is left over.
  static_assert(alignof(T) >= 16,
    "bcmpeq requires types aligned to at least 16 bytes");

  // The intrinsics below only exist on x86 with SSE2, so check the
  // target architecture in addition to the compiler.
  #if (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) \
   && (defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) \
    || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
  auto ai = reinterpret_cast<const __m128i*>(a);
  auto bi = reinterpret_cast<const __m128i*>(b);

  size_t i = 0;

  // Ask the compiler not to unroll the main loop any further; it is
  // already manually unrolled to process 32 bytes per iteration.
  #if defined(__clang__)
  #pragma nounroll
  #elif defined(__GNUC__)
  #pragma GCC unroll 0
  #endif

  for ( ; i < 2 * (sizeof(T) / 32); i += 2) {
    __m128i eq0 = _mm_cmpeq_epi8(
      _mm_load_si128(ai + i),
      _mm_load_si128(bi + i));
    __m128i eq1 = _mm_cmpeq_epi8(
      _mm_load_si128(ai + i + 1),
      _mm_load_si128(bi + i + 1));

    // A byte of the combined mask is 0xFF only if the corresponding
    // bytes matched in both 16-byte chunks.
    __m128i eq = _mm_and_si128(eq0, eq1);

    int mask = _mm_movemask_epi8(eq);
    if (mask != 0xFFFF)
      return false;
  }

  // Handles the one remaining 16-byte chunk if sizeof(T) is not
  // a multiple of 32.
  for ( ; i < sizeof(T) / 16; i++) {
    __m128i eq = _mm_cmpeq_epi8(
      _mm_load_si128(ai + i),
      _mm_load_si128(bi + i));

    int mask = _mm_movemask_epi8(eq);
    if (mask != 0xFFFF)
      return false;
  }

  return true;
  #else
  return !std::memcmp(a, b, sizeof(T));
  #endif
}