[dxvk] Implement shared allocation cache

Allows refilling local caches in constant time.
2025-03-13 19:29:14 +01:00 · 2024-09-24 19:57:16 +02:00 · 2024-09-24 19:57:16 +02:00 · 2722a41675
commit 2722a41675
parent 9a51849920
2 changed files with 292 additions and 48 deletions
--- a/src/dxvk/dxvk_memory.cpp
+++ b/src/dxvk/dxvk_memory.cpp
@ -169,6 +169,14 @@ namespace dxvk {
  }


+  void DxvkResourceAllocation::destroyBufferViews() {
+    if (m_bufferViews) {
+      delete m_bufferViews;
+      m_bufferViews = nullptr;
+    }
+  }
+
+


  DxvkResourceAllocationPool::DxvkResourceAllocationPool() {
@ -231,9 +239,9 @@ namespace dxvk {
  uint32_t DxvkLocalAllocationCache::computePreferredAllocationCount(
          VkDeviceSize                size) {
    uint32_t poolIndex = computePoolIndex(size);
-    uint32_t count = (DxvkPageAllocator::PageSize / MinSize) >> poolIndex;
+    uint32_t count = (PoolCapacityInBytes / MinSize) >> poolIndex;

-    return std::max(count, MinAllocationCountPerPool);
+    return std::max(count, 1u);
  }


@ -251,6 +259,95 @@ namespace dxvk {



+  DxvkSharedAllocationCache::DxvkSharedAllocationCache(
+          DxvkMemoryAllocator*        allocator)
+  : m_allocator(allocator) {
+    for (uint32_t i = 0; i < m_pools.size(); i++) {
+      VkDeviceSize size = DxvkLocalAllocationCache::computeAllocationSize(i);
+      m_freeLists[i].capacity = DxvkLocalAllocationCache::computePreferredAllocationCount(size);
+    }
+  }
+
+
+  DxvkSharedAllocationCache::~DxvkSharedAllocationCache() {
+    for (const auto& freeList : m_freeLists)
+      m_allocator->freeCachedAllocations(freeList.head);
+
+    for (const auto& pool : m_pools) {
+      for (auto list : pool.lists)
+        m_allocator->freeCachedAllocations(list);
+    }
+  }
+
+
+  DxvkResourceAllocation* DxvkSharedAllocationCache::getAllocationList(
+          VkDeviceSize                allocationSize) {
+    uint32_t poolIndex = DxvkLocalAllocationCache::computePoolIndex(allocationSize);
+
+    // If there's a list ready for us, take the whole thing
+    std::unique_lock poolLock(m_poolMutex);
+    auto& pool = m_pools[poolIndex];
+
+    if (!pool.listCount)
+      return nullptr;
+
+    if (!(--pool.listCount))
+      pool.drainTime = high_resolution_clock::now();
+
+    return std::exchange(pool.lists[pool.listCount], nullptr);
+  }
+
+
+  DxvkResourceAllocation* DxvkSharedAllocationCache::freeAllocation(
+          DxvkResourceAllocation*     allocation) {
+    uint32_t poolIndex = DxvkLocalAllocationCache::computePoolIndex(allocation->m_size);
+
+    { std::unique_lock freeLock(m_freeMutex);
+      auto& list = m_freeLists[poolIndex];
+
+      allocation->m_next = list.head;
+      list.head = allocation;
+
+      if (++list.size < list.capacity)
+        return nullptr;
+
+      // Free list is full, try to add it to the list array
+      // so that subsequent allocations can use it.
+      list.head = nullptr;
+      list.size = 0u;
+    }
+
+    // Add free list to the pool if possible.
+    { std::unique_lock poolLock(m_poolMutex);
+      auto& pool = m_pools[poolIndex];
+
+      if (likely(pool.listCount < PoolSize)) {
+        pool.lists[pool.listCount++] = allocation;
+        return nullptr;
+      }
+
+      // If the pool is full, destroy the entire free list
+      return allocation;
+    }
+  }
+
+
+  void DxvkSharedAllocationCache::cleanupUnusedFromLockedAllocator(
+          high_resolution_clock::time_point time) {
+    std::unique_lock poolLock(m_poolMutex);
+
+    for (auto& pool : m_pools) {
+      if (pool.listCount && time - pool.drainTime >= std::chrono::seconds(1u)) {
+        m_allocator->freeCachedAllocationsLocked(std::exchange(
+          pool.lists[--pool.listCount], nullptr));
+        pool.drainTime = time;
+      }
+    }
+  }
+
+
+
+
  DxvkMemoryAllocator::DxvkMemoryAllocator(DxvkDevice* device)
  : m_device(device) {
    VkPhysicalDeviceMemoryProperties memInfo = device->adapter()->memoryProperties();
@ -299,6 +396,14 @@ namespace dxvk {

    m_worker.join();

+    // Destroy shared caches so that any allocations
+    // that are still alive get returned to the device
+    for (uint32_t i = 0; i < m_memTypeCount; i++) {
+      if (m_memTypes[i].sharedCache)
+        delete m_memTypes[i].sharedCache;
+    }
+
+    // Now that no allocations are alive, we can free chunks
    for (uint32_t i = 0; i < m_memHeapCount; i++)
      freeEmptyChunksInHeap(m_memHeaps[i], VkDeviceSize(-1), high_resolution_clock::time_point());
  }
@ -485,8 +590,11 @@ namespace dxvk {
      if (likely(memoryRequirements.memoryTypeBits)) {
        // If the given allocation cache supports the memory types and usage
        // flags that we need, try to use it to service this allocation.
+        // Only use the allocation cache for mappable allocations since those
+        // are expected to happen frequently.
        if (allocationCache && createInfo.size <= DxvkLocalAllocationCache::MaxSize
-         && allocationCache->m_memoryTypes && !(allocationCache->m_memoryTypes & ~memoryRequirements.memoryTypeBits)) {
+         && allocationCache->m_memoryTypes && !(allocationCache->m_memoryTypes & ~memoryRequirements.memoryTypeBits)
+         && (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
          allocation = allocationCache->allocateFromCache(createInfo.size);

          if (likely(allocation))
@ -924,25 +1032,37 @@ namespace dxvk {

  void DxvkMemoryAllocator::freeAllocation(
          DxvkResourceAllocation* allocation) {
-    std::unique_lock lock(m_mutex);
+    if (allocation->m_flags.test(DxvkAllocationFlag::Cacheable)) {
+      // Return cacheable allocations to the shared cache
+      allocation->destroyBufferViews();

-    if (likely(allocation->m_type)) {
-      allocation->m_type->stats.memoryUsed -= allocation->m_size;
+      if (allocation->m_type->sharedCache)
+        allocation = allocation->m_type->sharedCache->freeAllocation(allocation);

-      if (unlikely(allocation->m_flags.test(DxvkAllocationFlag::OwnsMemory))) {
-        // We free the actual allocation later, just update stats here.
-        allocation->m_type->stats.memoryAllocated -= allocation->m_size;
-      } else {
-        auto& pool = allocation->m_mapPtr
-          ? allocation->m_type->mappedPool
-          : allocation->m_type->devicePool;
+      // If we get a list of allocations back from the
+      // shared cache, free all of them in one go
+      freeCachedAllocations(allocation);
+    } else {
+      std::unique_lock lock(m_mutex);

-        if (unlikely(pool.free(allocation->m_address, allocation->m_size)))
-          freeEmptyChunksInPool(*allocation->m_type, pool, 0, high_resolution_clock::now());
+      if (likely(allocation->m_type)) {
+        allocation->m_type->stats.memoryUsed -= allocation->m_size;
+
+        if (unlikely(allocation->m_flags.test(DxvkAllocationFlag::OwnsMemory))) {
+          // We free the actual allocation later, just update stats here.
+          allocation->m_type->stats.memoryAllocated -= allocation->m_size;
+        } else {
+          auto& pool = allocation->m_mapPtr
+            ? allocation->m_type->mappedPool
+            : allocation->m_type->devicePool;
+
+          if (unlikely(pool.free(allocation->m_address, allocation->m_size)))
+            freeEmptyChunksInPool(*allocation->m_type, pool, 0, high_resolution_clock::now());
+        }
      }
-    }

-    m_allocationPool.free(allocation);
+      m_allocationPool.free(allocation);
+    }
  }


@ -955,6 +1075,15 @@ namespace dxvk {
  }


+  void DxvkMemoryAllocator::freeCachedAllocations(
+          DxvkResourceAllocation* allocation) {
+    if (allocation) {
+      std::unique_lock lock(m_mutex);
+      freeCachedAllocationsLocked(allocation);
+    }
+  }
+
+
  void DxvkMemoryAllocator::freeCachedAllocationsLocked(
          DxvkResourceAllocation* allocation) {
    while (allocation) {
@ -1105,42 +1234,58 @@ namespace dxvk {


  bool DxvkMemoryAllocator::refillAllocationCache(
-          DxvkLocalAllocationCache* cache,
-    const VkMemoryRequirements& requirements,
-          VkMemoryPropertyFlags properties) {
+          DxvkLocalAllocationCache*   cache,
+    const VkMemoryRequirements&       requirements,
+          VkMemoryPropertyFlags       properties) {
+    // Ensure that all cached allocations report a power-of-two size.
+    // The shared cache implementation currently relies on this.
    VkDeviceSize allocationSize = (VkDeviceSize(-1) >> bit::lzcnt(requirements.size - 1u)) + 1u;
    allocationSize = std::max(allocationSize, DxvkLocalAllocationCache::MinSize);

-    // TODO implement shared caches per memory pool
-
-    // No suitable allocations available from the shared cache, create some
-    // new ones so that subsequent allocations of this size category can be
-    // handled without locking the allocator.
+    // Maximum number of allocations when we miss in the shared cache
    uint32_t allocationCount = DxvkLocalAllocationCache::computePreferredAllocationCount(allocationSize);

-    DxvkResourceAllocation* head = nullptr;
-    DxvkResourceAllocation* tail = nullptr;
-
-    std::unique_lock lock(m_mutex);
-
    for (auto typeIndex : bit::BitMask(cache->m_memoryTypes)) {
-      auto& pool = (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
-        ? m_memTypes[typeIndex].mappedPool
-        : m_memTypes[typeIndex].devicePool;
+      auto& memoryType = m_memTypes[typeIndex];
+
+      // Initialize shared cache on demand only
+      if (unlikely(!memoryType.sharedCache)) {
+        std::unique_lock lock(m_mutex);
+
+        if (!memoryType.sharedCache)
+          memoryType.sharedCache = new DxvkSharedAllocationCache(this);
+      }
+
+      // Try to grab a list of allocations from the shared cache first. If
+      // this succeeds, allocating several pages of memory is near instant.
+      DxvkResourceAllocation* allocation = memoryType.sharedCache->getAllocationList(allocationSize);
+
+      if (likely(allocation)) {
+        allocation = cache->assignCache(allocationSize, allocation);
+        freeCachedAllocations(allocation);
+        return true;
+      }
+
+      // Fill cache with the preferred allocation count of this size category so
+      // that subsequent allocations can be handled without locking the allocator.
+      DxvkResourceAllocation* head = nullptr;
+      DxvkResourceAllocation* tail = nullptr;
+
+      std::unique_lock lock(m_mutex);
+      auto& memoryPool = memoryType.mappedPool;

      while (allocationCount) {
        // Try to suballocate from existing chunks, but do not create
        // any new chunks. Let the regular code path handle that case
        // as necessary.
-        int64_t address = pool.alloc(allocationSize, requirements.alignment);
+        int64_t address = memoryPool.alloc(allocationSize, requirements.alignment);

        if (address < 0)
          break;

        // Add allocation to the list and mark it as cacheable,
        // so it will get recycled as-is after use.
-        DxvkResourceAllocation* allocation = createAllocation(
-          m_memTypes[typeIndex], pool, address, allocationSize);
+        allocation = createAllocation(memoryType, memoryPool, address, allocationSize);
        allocation->m_flags.set(DxvkAllocationFlag::Cacheable);

        if (tail) {
@ -1154,15 +1299,13 @@ namespace dxvk {
        allocationCount--;
      }

-      if (!allocationCount)
-        break;
+      if (tail) {
+        tail->m_next = cache->assignCache(allocationSize, head);
+        return true;
+      }
    }

-    if (!tail)
-      return false;
-
-    tail->m_next = cache->assignCache(allocationSize, head);
-    return true;
+    return false;
  }


@ -1588,6 +1731,12 @@ namespace dxvk {

        heapStats[i] = stats;
      }
+
+      // Periodically clean up unused cached allocations
+      for (uint32_t i = 0; i < m_memTypeCount; i++) {
+        if (m_memTypes[i].sharedCache)
+          m_memTypes[i].sharedCache->cleanupUnusedFromLockedAllocator(currentTime);
+      }
    }

    // Ensure adapter allocation statistics are consistent
--- a/src/dxvk/dxvk_memory.h
+++ b/src/dxvk/dxvk_memory.h
@ -13,6 +13,7 @@ namespace dxvk {
  class DxvkMemoryAllocator;
  class DxvkMemoryChunk;
  class DxvkSparsePageTable;
+  class DxvkSharedAllocationCache;

  /**
   * \brief Resource access flags
@ -160,6 +161,8 @@ namespace dxvk {

    DxvkMemoryPool    devicePool;
    DxvkMemoryPool    mappedPool;
+
+    DxvkSharedAllocationCache* sharedCache = nullptr;
  };


@ -584,6 +587,8 @@ namespace dxvk {

    DxvkResourceAllocation*     m_next = nullptr;

+    void destroyBufferViews();
+
    void free();

    static force_inline uint64_t getIncrement(DxvkAccess access) {
@ -765,10 +770,12 @@ namespace dxvk {
   * context classes in order to reduce lock contention.
   */
  class DxvkLocalAllocationCache {
-    friend class DxvkMemoryAllocator;
+    constexpr static VkDeviceSize PoolCapacityInBytes = 4u * DxvkPageAllocator::PageSize;
+
+    friend DxvkMemoryAllocator;
  public:
-    constexpr static uint32_t PoolCount = 8u;
-    constexpr static uint32_t MinAllocationCountPerPool = 8u;
+    // Cache allocations up to 128 kiB
+    constexpr static uint32_t PoolCount = 10u;

    constexpr static VkDeviceSize MinSize = DxvkPoolAllocator::MinSize;
    constexpr static VkDeviceSize MaxSize = MinSize << (PoolCount - 1u);
@ -816,6 +823,24 @@ namespace dxvk {
    static uint32_t computePreferredAllocationCount(
            VkDeviceSize                size);

+    /**
+     * \brief Computes pool index for a given allocation size
+     *
+     * \param [in] size Allocation size
+     * \returns Pool index
+     */
+    static uint32_t computePoolIndex(
+            VkDeviceSize                size);
+
+    /**
+     * \brief Computes allocation size for a given index
+     *
+     * \param [in] poolIndex Pool index
+     * \returns Allocation size for the pool
+     */
+    static VkDeviceSize computeAllocationSize(
+            uint32_t                    index);
+
  private:

    DxvkMemoryAllocator*  m_allocator   = nullptr;
@ -832,8 +857,74 @@ namespace dxvk {

    void freeCache();

-    static uint32_t computePoolIndex(
-            VkDeviceSize                size);
+  };
+
+
+  /**
+   * \brief Shared allocation cache
+   *
+   * Accumulates small allocations in free lists
+   * that can be allocated in their entirety.
+   */
+  class DxvkSharedAllocationCache {
+    constexpr static uint32_t PoolCount = DxvkLocalAllocationCache::PoolCount;
+    constexpr static uint32_t PoolSize = env::is32BitHostPlatform() ? 6u : 12u;
+
+    friend DxvkMemoryAllocator;
+  public:
+
+    DxvkSharedAllocationCache(
+            DxvkMemoryAllocator*        allocator);
+
+    ~DxvkSharedAllocationCache();
+
+    /**
+     * \brief Retrieves list of cached allocations
+     *
+     * \param [in] allocationSize Required allocation size
+     * \returns Pointer to head of allocation list,
+     *    or \c nullptr if the cache is empty.
+     */
+    DxvkResourceAllocation* getAllocationList(
+            VkDeviceSize                allocationSize);
+
+    /**
+     * \brief Frees cacheable allocation
+     *
+     * \param [in] allocation Allocation to free
+     * \returns List to destroy if the cache is full. Usually,
+     *    \c nullptr if the allocation was successfully added.
+     */
+    DxvkResourceAllocation* freeAllocation(
+            DxvkResourceAllocation*     allocation);
+
+  private:
+
+    struct FreeList {
+      uint16_t size = 0u;
+      uint16_t capacity = 0u;
+
+      DxvkResourceAllocation* head = nullptr;
+    };
+
+    struct Pool {
+      uint32_t listCount = 0u;
+      std::array<DxvkResourceAllocation*, PoolSize> lists = { };
+      high_resolution_clock::time_point drainTime = { };
+    };
+
+    alignas(CACHE_LINE_SIZE)
+    DxvkMemoryAllocator*        m_allocator = nullptr;
+
+    dxvk::mutex                 m_freeMutex;
+    std::array<FreeList, PoolCount> m_freeLists = { };
+
+    alignas(CACHE_LINE_SIZE)
+    dxvk::mutex                 m_poolMutex;
+    std::array<Pool, PoolCount> m_pools = { };
+
+    void cleanupUnusedFromLockedAllocator(
+            high_resolution_clock::time_point time);

  };

@ -848,6 +939,7 @@ namespace dxvk {
    friend DxvkMemory;
    friend DxvkResourceAllocation;
    friend DxvkLocalAllocationCache;
+    friend DxvkSharedAllocationCache;

    constexpr static uint64_t DedicatedChunkAddress = 1ull << 63u;

@ -1045,6 +1137,9 @@ namespace dxvk {
    void freeLocalCache(
            DxvkLocalAllocationCache* cache);

+    void freeCachedAllocations(
+            DxvkResourceAllocation* allocation);
+
    void freeCachedAllocationsLocked(
            DxvkResourceAllocation* allocation);